src/vm/utf8.c

   1 /* src/vm/utf8.c - utf8 functions
   2
   3    Copyright (C) 1996-2005 R. Grafl, A. Krall, C. Kruegel, C. Oates,
   4    R. Obermaisser, M. Platter, M. Probst, S. Ring, E. Steiner,
   5    C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich, J. Wenninger,
   6    Institut f. Computersprachen - TU Wien
   7
   8    This file is part of CACAO.
   9
  10    This program is free software; you can redistribute it and/or
  11    modify it under the terms of the GNU General Public License as
  12    published by the Free Software Foundation; either version 2, or (at
  13    your option) any later version.
  14
  15    This program is distributed in the hope that it will be useful, but
  16    WITHOUT ANY WARRANTY; without even the implied warranty of
  17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18    General Public License for more details.
  19
  20    You should have received a copy of the GNU General Public License
  21    along with this program; if not, write to the Free Software
  22    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
  23    02111-1307, USA.
  24
  25    Contact: cacao@complang.tuwien.ac.at
  26
  27    Authors: Reinhard Grafl
  28
  29    Changes: Mark Probst
  30             Andreas Krall
  31             Christian Thalinger
  32
  33    $Id: utf8.c 2136 2005-03-30 10:03:03Z twisti $
  34
  35 */
  36
  37
  38 #include <string.h>
  39
  40 #include "mm/memory.h"
  41 #include "vm/exceptions.h"
  42 #include "vm/options.h"
  43 #include "vm/statistics.h"
  44 #include "vm/tables.h"
  45 #include "vm/utf8.h"
  46
  47
  48 hashtable utf_hash;                     /* hashtable for utf8-symbols         */
  49
  50
  51 /* utf-symbols for pointer comparison of frequently used strings **************/
  52
  53 utf *utf_java_lang_Object;              /* java/lang/Object                   */
  54
  55 utf *utf_java_lang_Class;               /* java/lang/Class                    */
  56 utf *utf_java_lang_ClassLoader;         /* java/lang/ClassLoader              */
  57 utf *utf_java_lang_Cloneable;           /* java/lang/Cloneable                */
  58 utf *utf_java_lang_SecurityManager;     /* java/lang/SecurityManager          */
  59 utf *utf_java_lang_String;              /* java/lang/String                   */
  60 utf *utf_java_lang_System;              /* java/lang/System                   */
  61 utf *utf_java_io_Serializable;          /* java/io/Serializable               */
  62
  63 utf *utf_java_lang_Throwable;           /* java/lang/Throwable                */
  64 utf *utf_java_lang_VMThrowable;         /* java/lang/VMThrowable              */
  65 utf *utf_java_lang_Exception;           /* java/lang/Exception                */
  66 utf *utf_java_lang_Error;               /* java/lang/Error                    */
  67 utf *utf_java_lang_OutOfMemoryError;    /* java/lang/OutOfMemoryError         */
  68
  69 utf* utf_java_lang_Void;                /* java/lang/Void                     */
  70 utf* utf_java_lang_Boolean;             /* java/lang/Boolean                  */
  71 utf* utf_java_lang_Byte;                /* java/lang/Byte                     */
  72 utf* utf_java_lang_Character;           /* java/lang/Character                */
  73 utf* utf_java_lang_Short;               /* java/lang/Short                    */
  74 utf* utf_java_lang_Integer;             /* java/lang/Integer                  */
  75 utf* utf_java_lang_Long;                /* java/lang/Long                     */
  76 utf* utf_java_lang_Float;               /* java/lang/Float                    */
  77 utf* utf_java_lang_Double;              /* java/lang/Double                   */
  78
  79 utf *utf_java_util_Vector;              /* java/util/Vector                   */
  80
  81 utf *utf_InnerClasses;                  /* InnerClasses                       */
  82 utf *utf_ConstantValue;                 /* ConstantValue                      */
  83 utf *utf_Code;                          /* Code                               */
  84 utf *utf_Exceptions;                    /* Exceptions                         */
  85 utf *utf_LineNumberTable;               /* LineNumberTable                    */
  86 utf *utf_SourceFile;                    /* SourceFile                         */
  87
  88 utf *utf_init;                          /* <init>                             */
  89 utf *utf_clinit;                        /* <clinit>                           */
  90 utf *utf_finalize;                      /* finalize                           */
  91
  92 utf *utf_printStackTrace;               /* printStackTrace                    */
  93 utf *utf_fillInStackTrace;              /* fillInStackTrace                   */
  94 utf *utf_loadClass;                     /* loadClass                          */
  95
  96 utf *utf_void__void;                    /* ()V                                */
  97 utf *utf_boolean__void;                 /* (Z)V                               */
  98 utf *utf_byte__void;                    /* (B)V                               */
  99 utf *utf_char__void;                    /* (C)V                               */
 100 utf *utf_short__void;                   /* (S)V                               */
 101 utf *utf_int__void;                     /* (I)V                               */
 102 utf *utf_long__void;                    /* (J)V                               */
 103 utf *utf_float__void;                   /* (F)V                               */
 104 utf *utf_double__void;                  /* (D)V                               */
 105 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
 106 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
 107 utf *utf_java_lang_String__java_lang_Class; /* (Ljava/lang/String;)Ljava/lang/Class; */
 108 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
 109
 110 utf *array_packagename;                 /* \t<the array package>              */
 111
 112
 113 /* utf_init ********************************************************************
 114
 115    Initializes the utf8 subsystem.
 116
 117 *******************************************************************************/
 118
 119 void utf8_init(void)
 120 {
 121         /* create utf-symbols for pointer comparison of frequently used strings */
 122
 123         utf_java_lang_Object           = utf_new_char("java/lang/Object");
 124
 125         utf_java_lang_Class            = utf_new_char("java/lang/Class");
 126         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
 127         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
 128         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
 129         utf_java_lang_String           = utf_new_char("java/lang/String");
 130         utf_java_lang_System           = utf_new_char("java/lang/System");
 131         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
 132
 133         utf_java_lang_Throwable        = utf_new_char("java/lang/Throwable");
 134         utf_java_lang_VMThrowable      = utf_new_char("java/lang/VMThrowable");
 135         utf_java_lang_Exception        = utf_new_char("java/lang/Exception");
 136         utf_java_lang_Error            = utf_new_char("java/lang/Error");
 137         utf_java_lang_OutOfMemoryError = utf_new_char("java/lang/OutOfMemoryError");
 138
 139         utf_java_lang_Void             = utf_new_char("java/lang/Void");
 140         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
 141         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
 142         utf_java_lang_Character        = utf_new_char("java/lang/Character");
 143         utf_java_lang_Short            = utf_new_char("java/lang/Short");
 144         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
 145         utf_java_lang_Long             = utf_new_char("java/lang/Long");
 146         utf_java_lang_Float            = utf_new_char("java/lang/Float");
 147         utf_java_lang_Double           = utf_new_char("java/lang/Double");
 148
 149         utf_java_util_Vector           = utf_new_char("java/util/Vector");
 150
 151         utf_InnerClasses               = utf_new_char("InnerClasses");
 152         utf_ConstantValue              = utf_new_char("ConstantValue");
 153         utf_Code                       = utf_new_char("Code");
 154         utf_Exceptions                 = utf_new_char("Exceptions");
 155         utf_LineNumberTable            = utf_new_char("LineNumberTable");
 156         utf_SourceFile                 = utf_new_char("SourceFile");
 157
 158         utf_init                           = utf_new_char("<init>");
 159         utf_clinit                         = utf_new_char("<clinit>");
 160         utf_finalize                   = utf_new_char("finalize");
 161
 162         utf_printStackTrace            = utf_new_char("printStackTrace");
 163         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
 164         utf_loadClass                  = utf_new_char("loadClass");
 165
 166         utf_void__void                 = utf_new_char("()V");
 167         utf_boolean__void              = utf_new_char("(Z)V");
 168         utf_byte__void                 = utf_new_char("(B)V");
 169         utf_char__void                 = utf_new_char("(C)V");
 170         utf_short__void                = utf_new_char("(S)V");
 171         utf_int__void                  = utf_new_char("(I)V");
 172         utf_long__void                 = utf_new_char("(J)V");
 173         utf_float__void                = utf_new_char("(F)V");
 174         utf_double__void               = utf_new_char("(D)V");
 175         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
 176         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
 177
 178         utf_java_lang_String__java_lang_Class =
 179                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
 180
 181         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
 182
 183         array_packagename              = utf_new_char("\t<the array package>");
 184 }
 185
 186
 /* utf_hashkey *****************************************************************

    Computes a hashkey from a small, fixed selection of the bytes of the
    utf-text.  Which bytes are used, and by how many bits each one is
    shifted before being XORed into the key, depends only on the length:

    - length 0 yields 0,
    - lengths 1 to 15 each use their own hand-picked byte positions,
    - longer texts use bytes 0, 3 and 4 from the beginning, bytes
      length/2 and length/2 + 3 from the middle and bytes length - 4,
      length - 2 and length - 1 from the end.

    Every byte is read with an explicit index.  The earlier formulation
    chained several `++text' side effects inside one expression, which
    leaves the order of the reads undefined in C; the indices and shifts
    used here are exactly those obtained by evaluating that expression
    from left to right, so the resulting keys are the same.

    text ...... bytes to hash (need not be zero terminated)
    length .... number of bytes in text

    Bytes are converted from plain char to u4, so the key for bytes
    >= 0x80 depends on the signedness of char on the platform.

 *******************************************************************************/

 /* NOTE: utf_hashkey() does not use these two macros any more.  They are
    kept in case code outside this block still relies on them; both read
    (and nbs also advances) a variable named `text', so never use nbs more
    than once within a single expression. */

 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */

 /* byte at position `index', left shifted by `shift' (no side effects) */
 #define UTF_HASH_BYTE(index, shift) ((u4) text[(index)] << (shift))

 u4 utf_hashkey(const char *text, u4 length)
 {
     u4 mid;                             /* index of the middle byte           */
     u4 a;                               /* hashkey under construction         */

     switch (length) {
     case 0: /* empty string */
         return 0;

     case 1:
         return UTF_HASH_BYTE(0, 0);

     case 2:
         return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(1, 3);

     case 3:
         return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(1, 3) ^ UTF_HASH_BYTE(2, 5);

     case 4:
         return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(1, 2) ^ UTF_HASH_BYTE(2, 4)
              ^ UTF_HASH_BYTE(3, 6);

     case 5:
         return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(1, 2) ^ UTF_HASH_BYTE(2, 3)
              ^ UTF_HASH_BYTE(3, 4) ^ UTF_HASH_BYTE(4, 6);

     case 6:
         return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(1, 1) ^ UTF_HASH_BYTE(2, 2)
              ^ UTF_HASH_BYTE(3, 3) ^ UTF_HASH_BYTE(4, 5) ^ UTF_HASH_BYTE(5, 6);

     case 7:
         return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(1, 1) ^ UTF_HASH_BYTE(2, 2)
              ^ UTF_HASH_BYTE(3, 3) ^ UTF_HASH_BYTE(4, 4) ^ UTF_HASH_BYTE(5, 5)
              ^ UTF_HASH_BYTE(6, 6);

     case 8:
         return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(1, 1) ^ UTF_HASH_BYTE(2, 2)
              ^ UTF_HASH_BYTE(3, 3) ^ UTF_HASH_BYTE(4, 4) ^ UTF_HASH_BYTE(5, 5)
              ^ UTF_HASH_BYTE(6, 6) ^ UTF_HASH_BYTE(7, 7);

     case 9: /* byte 3 is skipped */
         return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(1, 1) ^ UTF_HASH_BYTE(2, 2)
              ^ UTF_HASH_BYTE(4, 4) ^ UTF_HASH_BYTE(5, 5) ^ UTF_HASH_BYTE(6, 6)
              ^ UTF_HASH_BYTE(7, 7) ^ UTF_HASH_BYTE(8, 8);

     case 10: /* bytes 1 and 5 are skipped */
         return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(2, 2) ^ UTF_HASH_BYTE(3, 3)
              ^ UTF_HASH_BYTE(4, 4) ^ UTF_HASH_BYTE(6, 6) ^ UTF_HASH_BYTE(7, 7)
              ^ UTF_HASH_BYTE(8, 8) ^ UTF_HASH_BYTE(9, 9);

     case 11: /* bytes 1 and 5 are skipped */
         return UTF_HASH_BYTE(0, 0) ^ UTF_HASH_BYTE(2, 2) ^ UTF_HASH_BYTE(3, 3)
              ^ UTF_HASH_BYTE(4, 4) ^ UTF_HASH_BYTE(6, 6) ^ UTF_HASH_BYTE(7, 7)
              ^ UTF_HASH_BYTE(8, 8) ^ UTF_HASH_BYTE(9, 9) ^ UTF_HASH_BYTE(10, 10);

     case 12: /* note: shift and index differ for this length */
         return UTF_HASH_BYTE(0, 0)  ^ UTF_HASH_BYTE(3, 2)  ^ UTF_HASH_BYTE(4, 3)
              ^ UTF_HASH_BYTE(6, 5)  ^ UTF_HASH_BYTE(7, 6)  ^ UTF_HASH_BYTE(8, 7)
              ^ UTF_HASH_BYTE(10, 9) ^ UTF_HASH_BYTE(11, 10);

     case 13:
         return UTF_HASH_BYTE(0, 0)  ^ UTF_HASH_BYTE(1, 1)  ^ UTF_HASH_BYTE(3, 3)
              ^ UTF_HASH_BYTE(4, 4)  ^ UTF_HASH_BYTE(7, 7)  ^ UTF_HASH_BYTE(8, 8)
              ^ UTF_HASH_BYTE(11, 9) ^ UTF_HASH_BYTE(12, 10);

     case 14:
     case 15: /* both lengths use the same positions; byte 14 is never read */
         return UTF_HASH_BYTE(0, 0)  ^ UTF_HASH_BYTE(3, 3)   ^ UTF_HASH_BYTE(4, 4)
              ^ UTF_HASH_BYTE(7, 7)  ^ UTF_HASH_BYTE(8, 8)   ^ UTF_HASH_BYTE(11, 9)
              ^ UTF_HASH_BYTE(12, 10) ^ UTF_HASH_BYTE(13, 11);

     default:
         /* 3 characters from beginning */
         a  = UTF_HASH_BYTE(0, 0);
         a ^= UTF_HASH_BYTE(3, 3);
         a ^= UTF_HASH_BYTE(4, 4);

         /* 2 characters from middle */
         mid = length / 2;
         a ^= UTF_HASH_BYTE(mid, 5);
         a ^= UTF_HASH_BYTE(mid + 3, 6);

         /* 3 characters from end */
         a ^= UTF_HASH_BYTE(length - 4, 7);
         a ^= UTF_HASH_BYTE(length - 2, 10);
         a ^= UTF_HASH_BYTE(length - 1, 11);

         return a;
     }
 }

 #undef UTF_HASH_BYTE
 309
 310
 311 /* utf_hashkey *****************************************************************
 312
 313    Compute the hashkey of a unicode string.
 314
 315 *******************************************************************************/
 316
 317 u4 unicode_hashkey(u2 *text, u2 len)
 318 {
 319         return utf_hashkey((char *) text, len);
 320 }
 321
 322
 323 /* utf_new *********************************************************************
 324
 325    Creates a new utf-symbol, the text of the symbol is passed as a
 326    u1-array. The function searches the utf-hashtable for a utf-symbol
 327    with this text. On success the element returned, otherwise a new
 328    hashtable element is created.
 329
 330    If the number of entries in the hashtable exceeds twice the size of
 331    the hashtable slots a reorganization of the hashtable is done and
 332    the utf symbols are copied to a new hashtable with doubled size.
 333
 334 *******************************************************************************/
 335
 336 utf *utf_new_intern(const char *text, u2 length);
 337
 338 utf *utf_new(const char *text, u2 length)
 339 {
 340     utf *r;
 341
 342 #if defined(USE_THREADS) && defined(NATIVE_THREADS)
 343     tables_lock();
 344 #endif
 345
 346     r = utf_new_intern(text, length);
 347
 348 #if defined(USE_THREADS) && defined(NATIVE_THREADS)
 349     tables_unlock();
 350 #endif
 351
 352     return r;
 353 }
 354
 355
 356 utf *utf_new_intern(const char *text, u2 length)
 357 {
 358         u4 key;                             /* hashkey computed from utf-text     */
 359         u4 slot;                            /* slot in hashtable                  */
 360         utf *u;                             /* hashtable element                  */
 361         u2 i;
 362
 363 #ifdef STATISTICS
 364         if (opt_stat)
 365                 count_utf_new++;
 366 #endif
 367
 368         key  = utf_hashkey(text, length);
 369         slot = key & (utf_hash.size - 1);
 370         u    = utf_hash.ptr[slot];
 371
 372         /* search external hash chain for utf-symbol */
 373         while (u) {
 374                 if (u->blength == length) {
 375
 376                         /* compare text of hashtable elements */
 377                         for (i = 0; i < length; i++)
 378                                 if (text[i] != u->text[i]) goto nomatch;
 379
 380 #ifdef STATISTICS
 381                         if (opt_stat)
 382                                 count_utf_new_found++;
 383 #endif
 384
 385                         /* symbol found in hashtable */
 386                         return u;
 387                 }
 388         nomatch:
 389                 u = u->hashlink; /* next element in external chain */
 390         }
 391
 392 #ifdef STATISTICS
 393         if (opt_stat)
 394                 count_utf_len += sizeof(utf) + length;
 395 #endif
 396
 397         /* location in hashtable found, create new utf element */
 398         u = NEW(utf);
 399         u->blength  = length;               /* length in bytes of utfstring       */
 400         u->hashlink = utf_hash.ptr[slot];   /* link in external hashchain         */
 401         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
 402         memcpy(u->text, text, length);      /* copy utf-text                      */
 403         u->text[length] = '\0';
 404         utf_hash.ptr[slot] = u;             /* insert symbol into table           */
 405
 406         utf_hash.entries++;                 /* update number of entries           */
 407
 408         if (utf_hash.entries > (utf_hash.size * 2)) {
 409
 410         /* reorganization of hashtable, average length of
 411            the external chains is approx. 2                */
 412
 413                 u4 i;
 414                 utf *u;
 415                 hashtable newhash; /* the new hashtable */
 416
 417                 /* create new hashtable, double the size */
 418                 init_hashtable(&newhash, utf_hash.size * 2);
 419                 newhash.entries = utf_hash.entries;
 420
 421 #ifdef STATISTICS
 422                 if (opt_stat)
 423                         count_utf_len += sizeof(utf*) * utf_hash.size;
 424 #endif
 425
 426                 /* transfer elements to new hashtable */
 427                 for (i = 0; i < utf_hash.size; i++) {
 428                         u = (utf *) utf_hash.ptr[i];
 429                         while (u) {
 430                                 utf *nextu = u->hashlink;
 431                                 u4 slot = utf_hashkey(u->text, u->blength) & (newhash.size - 1);
 432
 433                                 u->hashlink = (utf *) newhash.ptr[slot];
 434                                 newhash.ptr[slot] = u;
 435
 436                                 /* follow link in external hash chain */
 437                                 u = nextu;
 438                         }
 439                 }
 440
 441                 /* dispose old table */
 442                 MFREE(utf_hash.ptr, void*, utf_hash.size);
 443                 utf_hash = newhash;
 444         }
 445
 446         return u;
 447 }
 448
 449
 450 /* utf_new_u2 ******************************************************************
 451
 452    Make utf symbol from u2 array, if isclassname is true '.' is
 453    replaced by '/'.
 454
 455 *******************************************************************************/
 456
 457 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
 458 {
 459         char *buffer;                   /* memory buffer for  unicode characters  */
 460         char *pos;                      /* pointer to current position in buffer  */
 461         u4 left;                        /* unicode characters left                */
 462         u4 buflength;                   /* utf length in bytes of the u2 array    */
 463         utf *result;                    /* resulting utf-string                   */
 464         int i;
 465
 466         /* determine utf length in bytes and allocate memory */
 467
 468         buflength = u2_utflength(unicode_pos, unicode_length);
 469         buffer    = MNEW(char, buflength);
 470
 471         left = buflength;
 472         pos  = buffer;
 473
 474         for (i = 0; i++ < unicode_length; unicode_pos++) {
 475                 /* next unicode character */
 476                 u2 c = *unicode_pos;
 477
 478                 if ((c != 0) && (c < 0x80)) {
 479                         /* 1 character */
 480                         left--;
 481                 if ((int) left < 0) break;
 482                         /* convert classname */
 483                         if (isclassname && c == '.')
 484                                 *pos++ = '/';
 485                         else
 486                                 *pos++ = (char) c;
 487
 488                 } else if (c < 0x800) {
 489                         /* 2 characters */
 490                 unsigned char high = c >> 6;
 491                 unsigned char low  = c & 0x3F;
 492                         left = left - 2;
 493                 if ((int) left < 0) break;
 494                 *pos++ = high | 0xC0;
 495                 *pos++ = low  | 0x80;
 496
 497                 } else {
 498                 /* 3 characters */
 499                 char low  = c & 0x3f;
 500                 char mid  = (c >> 6) & 0x3F;
 501                 char high = c >> 12;
 502                         left = left - 3;
 503                 if ((int) left < 0) break;
 504                 *pos++ = high | 0xE0;
 505                 *pos++ = mid  | 0x80;
 506                 *pos++ = low  | 0x80;
 507                 }
 508         }
 509
 510         /* insert utf-string into symbol-table */
 511         result = utf_new(buffer,buflength);
 512
 513         MFREE(buffer, char, buflength);
 514
 515         return result;
 516 }
 517
 518
 519 /* utf_new_char ****************************************************************
 520
 521    Creates a new utf symbol, the text for this symbol is passed as a
 522    c-string ( = char* ).
 523
 524 *******************************************************************************/
 525
 526 utf *utf_new_char(const char *text)
 527 {
 528         return utf_new(text, strlen(text));
 529 }
 530
 531
 532 /* utf_new_char_classname ******************************************************
 533
 534    Creates a new utf symbol, the text for this symbol is passed as a
 535    c-string ( = char* ) "." characters are going to be replaced by
 536    "/". Since the above function is used often, this is a separte
 537    function, instead of an if.
 538
 539 *******************************************************************************/
 540
 541 utf *utf_new_char_classname(const char *text)
 542 {
 543         if (strchr(text, '.')) {
 544                 char *txt = strdup(text);
 545                 char *end = txt + strlen(txt);
 546                 char *c;
 547                 utf *tmpRes;
 548
 549                 for (c = txt; c < end; c++)
 550                         if (*c == '.') *c = '/';
 551
 552                 tmpRes = utf_new(txt, strlen(txt));
 553                 FREE(txt, 0);
 554
 555                 return tmpRes;
 556
 557         } else
 558                 return utf_new(text, strlen(text));
 559 }
 560
 561
 /* utf_nextu2 ******************************************************************

    Read the next unicode character from the utf string and increment
    the utf-string pointer accordingly.

    utf_ptr ... address of the current position in the utf text; on
                return it points behind the bytes that were consumed

    The high nibble of the first byte selects the encoding:

    - 0xC, 0xD: two bytes, - 0xE: three bytes, - anything else: the byte
      itself is returned and one byte is consumed.

    If a two or three byte sequence is malformed, 0 is returned.  When
    the second byte is not a continuation byte (10xxxxxx) only the lead
    byte is consumed; when just the third byte of a three byte sequence
    is bad, two bytes are consumed.  The pointer therefore always
    advances by at least one byte, so loops of the form
    `while (utf_ptr < endpos) utf_nextu2(&utf_ptr);' terminate even on
    malformed input (earlier the pointer was left unchanged in the first
    case and such loops never finished).

    Up to two bytes behind the lead byte are inspected; a zero byte
    following the text stops that inspection.

 *******************************************************************************/

 u2 utf_nextu2(char **utf_ptr)
 {
     /* current position in utf text */
     unsigned char *bytes = (unsigned char *) (*utf_ptr);
     /* bytes representing the unicode character */
     unsigned char b0 = bytes[0];
     unsigned char b1, b2;
     /* uncompressed unicode character, 0 for a malformed sequence */
     u2 unicode_char = 0;
     /* number of bytes consumed; a malformed lead byte alone counts as 1 */
     int len = 1;

     switch (b0 >> 4) {
     case 0xC:
     case 0xD: /* 2 bytes */
         b1 = bytes[1];

         if ((b1 & 0xC0) == 0x80) {
             unicode_char = ((b0 & 0x1F) << 6) + (b1 & 0x3F);
             len = 2;
         }
         break;

     case 0xE: /* 2 or 3 bytes */
         b1 = bytes[1];

         if ((b1 & 0xC0) == 0x80) {
             b2 = bytes[2];

             if ((b2 & 0xC0) == 0x80) {
                 unicode_char =
                     ((((b0 & 0x0F) << 6) + (b1 & 0x3F)) << 6) + (b2 & 0x3F);
                 len = 3;
             } else {
                 /* third byte is bad: skip lead and second byte */
                 len = 2;
             }
         }
         break;

     default: /* 1 byte */
         unicode_char = (u2) b0;
         break;
     }

     /* update position in utf-text */
     *utf_ptr = (char *) (bytes + len);

     return unicode_char;
 }
 613
 614
 615 /* utf_strlen ******************************************************************
 616
 617    Determine number of unicode characters in the utf string.
 618
 619 *******************************************************************************/
 620
 621 u4 utf_strlen(utf *u)
 622 {
 623         char *endpos;                       /* points behind utf string           */
 624         char *utf_ptr;                      /* current position in utf text       */
 625         u4 len = 0;                         /* number of unicode characters       */
 626
 627         if (!u) {
 628                 *exceptionptr = new_nullpointerexception();
 629                 return 0;
 630         }
 631
 632         endpos = utf_end(u);
 633         utf_ptr = u->text;
 634
 635         while (utf_ptr < endpos) {
 636                 len++;
 637                 /* next unicode character */
 638                 utf_nextu2(&utf_ptr);
 639         }
 640
 641         if (utf_ptr != endpos)
 642                 /* string ended abruptly */
 643                 throw_cacao_exception_exit(string_java_lang_InternalError,
 644                                                                    "Illegal utf8 string");
 645
 646         return len;
 647 }
 648
 649
 650 /* u2_utflength ****************************************************************
 651
 652    Returns the utf length in bytes of a u2 array.
 653
 654 *******************************************************************************/
 655
 656 u4 u2_utflength(u2 *text, u4 u2_length)
 657 {
 658         u4 result_len = 0;                  /* utf length in bytes                */
 659         u2 ch;                              /* current unicode character          */
 660         u4 len;
 661
 662         for (len = 0; len < u2_length; len++) {
 663                 /* next unicode character */
 664                 ch = *text++;
 665
 666                 /* determine bytes required to store unicode character as utf */
 667                 if (ch && (ch < 0x80))
 668                         result_len++;
 669                 else if (ch < 0x800)
 670                         result_len += 2;
 671                 else
 672                         result_len += 3;
 673         }
 674
 675     return result_len;
 676 }
 677
 678
 679 /* utf_display *****************************************************************
 680
 681    Write utf symbol to stdout (for debugging purposes).
 682
 683 *******************************************************************************/
 684
 685 void utf_display(utf *u)
 686 {
 687         char *endpos;                       /* points behind utf string           */
 688         char *utf_ptr;                      /* current position in utf text       */
 689
 690         if (!u) {
 691                 printf("NULL");
 692                 fflush(stdout);
 693                 return;
 694         }
 695
 696         endpos = utf_end(u);
 697         utf_ptr = u->text;
 698
 699         while (utf_ptr < endpos) {
 700                 /* read next unicode character */
 701                 u2 c = utf_nextu2(&utf_ptr);
 702                 if (c >= 32 && c <= 127) printf("%c", c);
 703                 else printf("?");
 704         }
 705
 706         fflush(stdout);
 707 }
 708
 709
 710 /* utf_display_classname *******************************************************
 711
 712    Write utf symbol to stdout with `/' converted to `.' (for debugging
 713    purposes).
 714
 715 *******************************************************************************/
 716
 717 void utf_display_classname(utf *u)
 718 {
 719         char *endpos;                       /* points behind utf string           */
 720         char *utf_ptr;                      /* current position in utf text       */
 721
 722         if (!u) {
 723                 printf("NULL");
 724                 fflush(stdout);
 725                 return;
 726         }
 727
 728         endpos = utf_end(u);
 729         utf_ptr = u->text;
 730
 731         while (utf_ptr < endpos) {
 732                 /* read next unicode character */
 733                 u2 c = utf_nextu2(&utf_ptr);
 734                 if (c == '/') c = '.';
 735                 if (c >= 32 && c <= 127) printf("%c", c);
 736                 else printf("?");
 737         }
 738
 739         fflush(stdout);
 740 }
 741
 742
 743 /* utf_sprint ******************************************************************
 744
 745    Write utf symbol into c-string (for debugging purposes).
 746
 747 *******************************************************************************/
 748
 749 void utf_sprint(char *buffer, utf *u)
 750 {
 751         char *endpos;                       /* points behind utf string           */
 752         char *utf_ptr;                      /* current position in utf text       */
 753         u2 pos = 0;                         /* position in c-string               */
 754
 755         if (!u) {
 756                 strcpy(buffer, "NULL");
 757                 return;
 758         }
 759
 760         endpos = utf_end(u);
 761         utf_ptr = u->text;
 762
 763         while (utf_ptr < endpos)
 764                 /* copy next unicode character */
 765                 buffer[pos++] = utf_nextu2(&utf_ptr);
 766
 767         /* terminate string */
 768         buffer[pos] = '\0';
 769 }
 770
 771
 772 /* utf_sprint_classname ********************************************************
 773
 774    Write utf symbol into c-string with `/' converted to `.' (for debugging
 775    purposes).
 776
 777 *******************************************************************************/
 778
 779 void utf_sprint_classname(char *buffer, utf *u)
 780 {
 781         char *endpos;                       /* points behind utf string           */
 782         char *utf_ptr;                      /* current position in utf text       */
 783         u2 pos = 0;                         /* position in c-string               */
 784
 785         if (!u) {
 786                 strcpy(buffer, "NULL");
 787                 return;
 788         }
 789
 790         endpos = utf_end(u);
 791         utf_ptr = u->text;
 792
 793         while (utf_ptr < endpos) {
 794                 /* copy next unicode character */
 795                 u2 c = utf_nextu2(&utf_ptr);
 796                 if (c == '/') c = '.';
 797                 buffer[pos++] = c;
 798         }
 799
 800         /* terminate string */
 801         buffer[pos] = '\0';
 802 }
 803
 804
 805 /* utf_strcat ******************************************************************
 806
 807    Like libc strcat, but uses an utf8 string.
 808
 809 *******************************************************************************/
 810
 811 void utf_strcat(char *buffer, utf *u)
 812 {
 813         utf_sprint(buffer + strlen(buffer), u);
 814 }
 815
 816
 817 /* utf_strcat_classname ********************************************************
 818
 819    Like libc strcat, but uses an utf8 string.
 820
 821 *******************************************************************************/
 822
 823 void utf_strcat_classname(char *buffer, utf *u)
 824 {
 825         utf_sprint_classname(buffer + strlen(buffer), u);
 826 }
 827
 828
 829 /* utf_fprint ******************************************************************
 830
 831    Write utf symbol into file.
 832
 833 *******************************************************************************/
 834
 835 void utf_fprint(FILE *file, utf *u)
 836 {
 837         char *endpos;                       /* points behind utf string           */
 838         char *utf_ptr;                      /* current position in utf text       */
 839
 840         if (!u)
 841                 return;
 842
 843         endpos = utf_end(u);
 844         utf_ptr = u->text;
 845
 846         while (utf_ptr < endpos) {
 847                 /* read next unicode character */
 848                 u2 c = utf_nextu2(&utf_ptr);
 849
 850                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
 851                 else fprintf(file, "?");
 852         }
 853 }
 854
 855
 856 /* utf_fprint_classname ********************************************************
 857
 858    Write utf symbol into file with `/' converted to `.'.
 859
 860 *******************************************************************************/
 861
 862 void utf_fprint_classname(FILE *file, utf *u)
 863 {
 864         char *endpos;                       /* points behind utf string           */
 865         char *utf_ptr;                      /* current position in utf text       */
 866
 867     if (!u)
 868                 return;
 869
 870         endpos = utf_end(u);
 871         utf_ptr = u->text;
 872
 873         while (utf_ptr < endpos) {
 874                 /* read next unicode character */
 875                 u2 c = utf_nextu2(&utf_ptr);
 876                 if (c == '/') c = '.';
 877
 878                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
 879                 else fprintf(file, "?");
 880         }
 881 }
 882
 883
 884 /* is_valid_utf ****************************************************************
 885
 886    Return true if the given string is a valid UTF-8 string.
 887
 888    utf_ptr...points to first character
 889    end_pos...points after last character
 890
 891 *******************************************************************************/
 892
 893 static unsigned long min_codepoint[6] = {0,1L<<7,1L<<11,1L<<16,1L<<21,1L<<26};
 894
 895 bool is_valid_utf(char *utf_ptr, char *end_pos)
 896 {
 897         int bytes;
 898         int len,i;
 899         char c;
 900         unsigned long v;
 901
 902         if (end_pos < utf_ptr) return false;
 903         bytes = end_pos - utf_ptr;
 904         while (bytes--) {
 905                 c = *utf_ptr++;
 906
 907                 if (!c) return false;                     /* 0x00 is not allowed */
 908                 if ((c & 0x80) == 0) continue;            /* ASCII */
 909
 910                 if      ((c & 0xe0) == 0xc0) len = 1;     /* 110x xxxx */
 911                 else if ((c & 0xf0) == 0xe0) len = 2;     /* 1110 xxxx */
 912                 else if ((c & 0xf8) == 0xf0) len = 3;     /* 1111 0xxx */
 913                 else if ((c & 0xfc) == 0xf8) len = 4;     /* 1111 10xx */
 914                 else if ((c & 0xfe) == 0xfc) len = 5;     /* 1111 110x */
 915                 else return false;                        /* invalid leading byte */
 916
 917                 if (len > 2) return false;                /* Java limitation */
 918
 919                 v = (unsigned long)c & (0x3f >> len);
 920
 921                 if ((bytes -= len) < 0) return false;     /* missing bytes */
 922
 923                 for (i = len; i--; ) {
 924                         c = *utf_ptr++;
 925                         if ((c & 0xc0) != 0x80)               /* 10xx xxxx */
 926                                 return false;
 927                         v = (v << 6) | (c & 0x3f);
 928                 }
 929
 930                 if (v == 0) {
 931                         if (len != 1) return false;           /* Java special */
 932
 933                 } else {
 934                         /* Sun Java seems to allow overlong UTF-8 encodings */
 935
 936                         if (v < min_codepoint[len]) { /* overlong UTF-8 */
 937                                 if (!opt_liberalutf)
 938                                         fprintf(stderr,"WARNING: Overlong UTF-8 sequence found.\n");
 939                                 /* XXX change this to panic? */
 940                         }
 941                 }
 942
 943                 /* surrogates in UTF-8 seem to be allowed in Java classfiles */
 944                 /* if (v >= 0xd800 && v <= 0xdfff) return false; */ /* surrogates */
 945
 946                 /* even these seem to be allowed */
 947                 /* if (v == 0xfffe || v == 0xffff) return false; */ /* invalid codepoints */
 948         }
 949
 950         return true;
 951 }
 952
 953
 954 /* is_valid_name ***************************************************************
 955
 956    Return true if the given string may be used as a class/field/method
 957    name. (Currently this only disallows empty strings and control
 958    characters.)
 959
 960    NOTE: The string is assumed to have passed is_valid_utf!
 961
 962    utf_ptr...points to first character
 963    end_pos...points after last character
 964
 965 *******************************************************************************/
 966
 967 bool is_valid_name(char *utf_ptr, char *end_pos)
 968 {
 969         if (end_pos <= utf_ptr) return false; /* disallow empty names */
 970
 971         while (utf_ptr < end_pos) {
 972                 unsigned char c = *utf_ptr++;
 973
 974                 if (c < 0x20) return false; /* disallow control characters */
 975                 if (c == 0xc0 && (unsigned char) *utf_ptr == 0x80)  /* disallow zero */
 976                         return false;
 977         }
 978
 979         return true;
 980 }
 981
 982 bool is_valid_name_utf(utf *u)
 983 {
 984         return is_valid_name(u->text,utf_end(u));
 985 }
 986
 987
 988 /* utf_show ********************************************************************
 989
 990    Writes the utf symbols in the utfhash to stdout and displays the
 991    number of external hash chains grouped according to the chainlength
 992    (for debugging purposes).
 993
 994 *******************************************************************************/
 995
 996 void utf_show(void)
 997 {
 998
 999 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1000
1001         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1002         u4 max_chainlength = 0;      /* maximum length of the chains */
1003         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1004         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1005         u4 i;
1006
1007         printf ("UTF-HASH:\n");
1008
1009         /* show element of utf-hashtable */
1010         for (i=0; i<utf_hash.size; i++) {
1011                 utf *u = utf_hash.ptr[i];
1012                 if (u) {
1013                         printf ("SLOT %d: ", (int) i);
1014                         while (u) {
1015                                 printf ("'");
1016                                 utf_display (u);
1017                                 printf ("' ");
1018                                 u = u->hashlink;
1019                         }
1020                         printf ("\n");
1021                 }
1022
1023         }
1024
1025         printf ("UTF-HASH: %d slots for %d entries\n",
1026                         (int) utf_hash.size, (int) utf_hash.entries );
1027
1028
1029         if (utf_hash.entries == 0)
1030                 return;
1031
1032         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1033
1034         for (i=0;i<CHAIN_LIMIT;i++)
1035                 chain_count[i]=0;
1036
1037         /* count numbers of hashchains according to their length */
1038         for (i=0; i<utf_hash.size; i++) {
1039
1040                 utf *u = (utf*) utf_hash.ptr[i];
1041                 u4 chain_length = 0;
1042
1043                 /* determine chainlength */
1044                 while (u) {
1045                         u = u->hashlink;
1046                         chain_length++;
1047                 }
1048
1049                 /* update sum of all chainlengths */
1050                 sum_chainlength+=chain_length;
1051
1052                 /* determine the maximum length of the chains */
1053                 if (chain_length>max_chainlength)
1054                         max_chainlength = chain_length;
1055
1056                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1057                 if (chain_length>=CHAIN_LIMIT) {
1058                         beyond_limit+=chain_length;
1059                         chain_length=CHAIN_LIMIT-1;
1060                 }
1061
1062                 /* update number of hashchains of current length */
1063                 chain_count[chain_length]++;
1064         }
1065
1066         /* display results */
1067         for (i=1;i<CHAIN_LIMIT-1;i++)
1068                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/utf_hash.entries));
1069
1070         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/utf_hash.entries);
1071
1072
1073         printf("max. chainlength:%5d\n",max_chainlength);
1074
1075         /* avg. chainlength = sum of chainlengths / number of chains */
1076         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (utf_hash.size-chain_count[0]));
1077 }
1078
1079
1080 /*
1081  * These are local overrides for various environment variables in Emacs.
1082  * Please do not remove this and leave it at the end of the file, where
1083  * Emacs will automagically detect them.
1084  * ---------------------------------------------------------------------
1085  * Local variables:
1086  * mode: c
1087  * indent-tabs-mode: t
1088  * c-basic-offset: 4
1089  * tab-width: 4
1090  * End:
1091  */