mono/utils/strenc.c

   1 /*
   2  * strenc.c: string encoding conversions
   3  *
   4  * Author:
   5  *      Dick Porter (dick@ximian.com)
   6  *
   7  * (C) 2003 Ximian, Inc.
   8  */
   9
  10 #include <config.h>
  11 #include <glib.h>
  12 #include <string.h>
  13
  14 #include "strenc.h"
  15
  16 static const char trailingBytesForUTF8[256] = {
  17         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  18         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  19         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  20         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  21         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  22         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  23         1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  24         2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,0,0
  25 };
  26
  27 /**
  28  * mono_unicode_from_external:
  29  * @in: pointers to the buffer.
  30  * @bytes: number of bytes in the string.
  31  *
  32  * Tries to turn a NULL-terminated string into UTF16.
  33  *
  34  * First, see if it's valid UTF8, in which case just turn it directly
  35  * into UTF16.  Next, run through the colon-separated encodings in
  36  * MONO_EXTERNAL_ENCODINGS and do an iconv conversion on each,
  37  * returning the first successful conversion to UTF16.  If no
  38  * conversion succeeds, return NULL.
  39  *
  40  * Callers must free the returned string if not NULL. bytes holds the number
  41  * of bytes in the returned string, not including the terminator.
  42  */
  43 gunichar2 *
  44 mono_unicode_from_external (const gchar *in, gsize *bytes)
  45 {
  46         gchar *res=NULL;
  47         gchar **encodings;
  48         gchar *encoding_list;
  49         int i;
  50         glong lbytes;
  51
  52         if(in==NULL) {
  53                 return(NULL);
  54         }
  55
  56         encoding_list=g_getenv ("MONO_EXTERNAL_ENCODINGS");
  57         if(encoding_list==NULL) {
  58                 encoding_list = g_strdup("");
  59         }
  60
  61         encodings=g_strsplit (encoding_list, ":", 0);
  62         g_free (encoding_list);
  63         for(i=0;encodings[i]!=NULL; i++) {
  64                 /* "default_locale" is a special case encoding */
  65                 if(!strcmp (encodings[i], "default_locale")) {
  66                         gchar *utf8=g_locale_to_utf8 (in, -1, NULL, NULL, NULL);
  67                         if(utf8!=NULL) {
  68                                 res=(gchar *) g_utf8_to_utf16 (utf8, -1, NULL, &lbytes, NULL);
  69                                 *bytes = (gsize) lbytes;
  70                         }
  71                         g_free (utf8);
  72                 } else {
  73                         /* Don't use UTF16 here. It returns the <FF FE> prepended to the string */
  74                         res = g_convert (in, strlen (in), "UTF8", encodings[i], NULL, bytes, NULL);
  75                         if (res != NULL) {
  76                                 gchar *ptr = res;
  77                                 res = (gchar *) g_utf8_to_utf16 (res, -1, NULL, &lbytes, NULL);
  78                                 *bytes = (gsize) lbytes;
  79                                 g_free (ptr);
  80                         }
  81                 }
  82
  83                 if(res!=NULL) {
  84                         g_strfreev (encodings);
  85                         *bytes *= 2;
  86                         return((gunichar2 *)res);
  87                 }
  88         }
  89
  90         g_strfreev (encodings);
  91
  92         if(g_utf8_validate (in, -1, NULL)) {
  93                 gunichar2 *unires=g_utf8_to_utf16 (in, -1, NULL, (glong *)bytes, NULL);
  94                 *bytes *= 2;
  95                 return(unires);
  96         }
  97
  98         return(NULL);
  99 }
 100
 101 /**
 102  * mono_utf8_from_external:
 103  * @in: pointer to the string buffer.
 104  *
 105  * Tries to turn a NULL-terminated string into UTF8.
 106  *
 107  * First, see if it's valid UTF8, in which case there's nothing more
 108  * to be done.  Next, run through the colon-separated encodings in
 109  * MONO_EXTERNAL_ENCODINGS and do an iconv conversion on each,
 110  * returning the first successful conversion to utf8.  If no
 111  * conversion succeeds, return NULL.
 112  *
 113  * Callers must free the returned string if not NULL.
 114  *
 115  * This function is identical to mono_unicode_from_external, apart
 116  * from returning utf8 not utf16; it's handy in a few places to work
 117  * in utf8.
 118  */
 119 gchar *mono_utf8_from_external (const gchar *in)
 120 {
 121         gchar *res=NULL;
 122         gchar **encodings;
 123         gchar *encoding_list;
 124         int i;
 125
 126         if(in==NULL) {
 127                 return(NULL);
 128         }
 129
 130         encoding_list=g_getenv ("MONO_EXTERNAL_ENCODINGS");
 131         if(encoding_list==NULL) {
 132                 encoding_list = g_strdup("");
 133         }
 134
 135         encodings=g_strsplit (encoding_list, ":", 0);
 136         g_free (encoding_list);
 137         for(i=0;encodings[i]!=NULL; i++) {
 138
 139                 /* "default_locale" is a special case encoding */
 140                 if(!strcmp (encodings[i], "default_locale")) {
 141                         res=g_locale_to_utf8 (in, -1, NULL, NULL, NULL);
 142                         if(res!=NULL && !g_utf8_validate (res, -1, NULL)) {
 143                                 g_free (res);
 144                                 res=NULL;
 145                         }
 146                 } else {
 147                         res=g_convert (in, -1, "UTF8", encodings[i], NULL,
 148                                        NULL, NULL);
 149                 }
 150
 151                 if(res!=NULL) {
 152                         g_strfreev (encodings);
 153                         return(res);
 154                 }
 155         }
 156
 157         g_strfreev (encodings);
 158
 159         if(g_utf8_validate (in, -1, NULL)) {
 160                 return(g_strdup (in));
 161         }
 162
 163         return(NULL);
 164 }
 165
 166 /**
 167  * mono_unicode_to_external:
 168  * @uni: an UTF16 string to conver to an external representation.
 169  *
 170  * Turns NULL-terminated UTF16 into either UTF8, or the first
 171  * working item in MONO_EXTERNAL_ENCODINGS if set.  If no conversions
 172  * work, then UTF8 is returned.
 173  *
 174  * Callers must free the returned string.
 175  */
 176 gchar *mono_unicode_to_external (const gunichar2 *uni)
 177 {
 178         gchar *utf8;
 179         gchar *encoding_list;
 180
 181         /* Turn the unicode into utf8 to start with, because its
 182          * easier to work with gchar * than gunichar2 *
 183          */
 184         utf8=g_utf16_to_utf8 (uni, -1, NULL, NULL, NULL);
 185         g_assert (utf8!=NULL);
 186
 187         encoding_list=g_getenv ("MONO_EXTERNAL_ENCODINGS");
 188         if(encoding_list==NULL) {
 189                 /* Do UTF8 */
 190                 return(utf8);
 191         } else {
 192                 gchar *res, **encodings;
 193                 int i;
 194
 195                 encodings=g_strsplit (encoding_list, ":", 0);
 196                 g_free (encoding_list);
 197                 for(i=0; encodings[i]!=NULL; i++) {
 198                         if(!strcmp (encodings[i], "default_locale")) {
 199                                 res=g_locale_from_utf8 (utf8, -1, NULL, NULL,
 200                                                         NULL);
 201                         } else {
 202                                 res=g_convert (utf8, -1, encodings[i], "UTF8",
 203                                                NULL, NULL, NULL);
 204                         }
 205
 206                         if(res!=NULL) {
 207                                 g_free (utf8);
 208                                 g_strfreev (encodings);
 209
 210                                 return(res);
 211                         }
 212                 }
 213
 214                 g_strfreev (encodings);
 215         }
 216
 217         /* Nothing else worked, so just return the utf8 */
 218         return(utf8);
 219 }
 220
 221 /**
 222  * mono_utf8_validate_and_len
 223  * @source: Pointer to putative UTF-8 encoded string.
 224  *
 225  * Checks @source for being valid UTF-8. @utf is assumed to be
 226  * null-terminated.
 227  *
 228  * Return value: true if @source is valid.
 229  * oEnd : will equal the null terminator at the end of the string if valid.
 230  *            if not valid, it will equal the first charater of the invalid sequence.
 231  * oLengh : will equal the length to @oEnd
 232  **/
 233 gboolean
 234 mono_utf8_validate_and_len (const gchar *source, glong* oLength, const gchar** oEnd)
 235 {
 236         gboolean retVal = TRUE;
 237         gboolean lastRet = TRUE;
 238         guchar* ptr = (guchar*) source;
 239         guchar* srcPtr;
 240         guint length;
 241         guchar a;
 242         *oLength = 0;
 243         while (*ptr != 0) {
 244                 length = trailingBytesForUTF8 [*ptr] + 1;
 245                 srcPtr = (guchar*) ptr + length;
 246                 switch (length) {
 247                 default: retVal = FALSE;
 248                 /* Everything else falls through when "TRUE"... */
 249                 case 4: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE;
 250                                 if ((a == (guchar) 0xBF || a == (guchar) 0xBE) && *(srcPtr-1) == (guchar) 0xBF) {
 251                                 if (*(srcPtr-2) == (guchar) 0x8F || *(srcPtr-2) == (guchar) 0x9F ||
 252                                         *(srcPtr-2) == (guchar) 0xAF || *(srcPtr-2) == (guchar) 0xBF)
 253                                         retVal = FALSE;
 254                                 }
 255                 case 3: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE;
 256                 case 2: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE;
 257
 258                 switch (*ptr) {
 259                 /* no fall-through in this inner switch */
 260                 case 0xE0: if (a < (guchar) 0xA0) retVal = FALSE; break;
 261                 case 0xED: if (a > (guchar) 0x9F) retVal = FALSE; break;
 262                 case 0xEF: if (a == (guchar)0xB7 && (*(srcPtr+1) > (guchar) 0x8F && *(srcPtr+1) < 0xB0)) retVal = FALSE;
 263                                    if (a == (guchar)0xBF && (*(srcPtr+1) == (guchar) 0xBE || *(srcPtr+1) == 0xBF)) retVal = FALSE; break;
 264                 case 0xF0: if (a < (guchar) 0x90) retVal = FALSE; break;
 265                 case 0xF4: if (a > (guchar) 0x8F) retVal = FALSE; break;
 266                 default:   if (a < (guchar) 0x80) retVal = FALSE;
 267                 }
 268
 269                 case 1: if (*ptr >= (guchar ) 0x80 && *ptr < (guchar) 0xC2) retVal = FALSE;
 270                 }
 271                 if (*ptr > (guchar) 0xF4)
 272                         retVal = FALSE;
 273                 //If the string is invalid, set the end to the invalid byte.
 274                 if (!retVal && lastRet) {
 275                         if (oEnd != NULL)
 276                                 *oEnd = (gchar*) ptr;
 277                         lastRet = FALSE;
 278                 }
 279                 ptr += length;
 280                 (*oLength)++;
 281         }
 282         if (retVal && oEnd != NULL)
 283                 *oEnd = (gchar*) ptr;
 284         return retVal;
 285 }
 286
 287
 288 /**
 289  * mono_utf8_validate_and_len_with_bounds
 290  * @source: Pointer to putative UTF-8 encoded string.
 291  * @max_bytes: Max number of bytes that can be decoded. This function returns FALSE if
 292  * it needs to decode characters beyond that.
 293  *
 294  * Checks @source for being valid UTF-8. @utf is assumed to be
 295  * null-terminated.
 296  *
 297  * Return value: true if @source is valid.
 298  * oEnd : will equal the null terminator at the end of the string if valid.
 299  *            if not valid, it will equal the first charater of the invalid sequence.
 300  * oLengh : will equal the length to @oEnd
 301  **/
 302 gboolean
 303 mono_utf8_validate_and_len_with_bounds (const gchar *source, glong max_bytes, glong* oLength, const gchar** oEnd)
 304 {
 305         gboolean retVal = TRUE;
 306         gboolean lastRet = TRUE;
 307         guchar* ptr = (guchar*) source;
 308         guchar *end = ptr + max_bytes;
 309         guchar* srcPtr;
 310         guint length;
 311         guchar a;
 312         *oLength = 0;
 313
 314         if (max_bytes < 1) {
 315                 if (oEnd)
 316                         *oEnd = (gchar*) ptr;
 317                 return FALSE;
 318         }
 319
 320         while (*ptr != 0) {
 321                 length = trailingBytesForUTF8 [*ptr] + 1;
 322                 srcPtr = (guchar*) ptr + length;
 323
 324                 /* since *ptr is not zero we must ensure that we can decode the current char + the byte after
 325                    srcPtr points to the first byte after the current char.*/
 326                 if (srcPtr >= end) {
 327                         retVal = FALSE;
 328                         break;
 329                 }
 330                 switch (length) {
 331                 default: retVal = FALSE;
 332                 /* Everything else falls through when "TRUE"... */
 333                 case 4: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE;
 334                                 if ((a == (guchar) 0xBF || a == (guchar) 0xBE) && *(srcPtr-1) == (guchar) 0xBF) {
 335                                 if (*(srcPtr-2) == (guchar) 0x8F || *(srcPtr-2) == (guchar) 0x9F ||
 336                                         *(srcPtr-2) == (guchar) 0xAF || *(srcPtr-2) == (guchar) 0xBF)
 337                                         retVal = FALSE;
 338                                 }
 339                 case 3: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE;
 340                 case 2: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE;
 341
 342                 switch (*ptr) {
 343                 /* no fall-through in this inner switch */
 344                 case 0xE0: if (a < (guchar) 0xA0) retVal = FALSE; break;
 345                 case 0xED: if (a > (guchar) 0x9F) retVal = FALSE; break;
 346                 case 0xEF: if (a == (guchar)0xB7 && (*(srcPtr+1) > (guchar) 0x8F && *(srcPtr+1) < 0xB0)) retVal = FALSE;
 347                                    if (a == (guchar)0xBF && (*(srcPtr+1) == (guchar) 0xBE || *(srcPtr+1) == 0xBF)) retVal = FALSE; break;
 348                 case 0xF0: if (a < (guchar) 0x90) retVal = FALSE; break;
 349                 case 0xF4: if (a > (guchar) 0x8F) retVal = FALSE; break;
 350                 default:   if (a < (guchar) 0x80) retVal = FALSE;
 351                 }
 352
 353                 case 1: if (*ptr >= (guchar ) 0x80 && *ptr < (guchar) 0xC2) retVal = FALSE;
 354                 }
 355                 if (*ptr > (guchar) 0xF4)
 356                         retVal = FALSE;
 357                 //If the string is invalid, set the end to the invalid byte.
 358                 if (!retVal && lastRet) {
 359                         if (oEnd != NULL)
 360                                 *oEnd = (gchar*) ptr;
 361                         lastRet = FALSE;
 362                 }
 363                 ptr += length;
 364                 (*oLength)++;
 365         }
 366         if (retVal && oEnd != NULL)
 367                 *oEnd = (gchar*) ptr;
 368         return retVal;
 369 }
 370