3 // Copyright (c) Microsoft Corporation. All rights reserved.
6 ////////////////////////////////////////////////////////////////////////////
8 // Class: CharUnicodeInfo
10 // Purpose: This class implements a set of methods for retrieving
11 // character type information. Character type information is
12 // independent of culture and region.
14 // Date: August 12, 1998
16 ////////////////////////////////////////////////////////////////////////////
18 namespace System.Globalization {
20 //This class has only static members and therefore doesn't need to be serialized.
23 using System.Threading;
24 using System.Runtime.InteropServices;
25 using System.Runtime.CompilerServices;
26 using System.Runtime.Versioning;
27 using System.Reflection;
28 using System.Security;
29 using System.Diagnostics.Contracts;
32 public static class CharUnicodeInfo
34 //--------------------------------------------------------------------//
35 // Internal Information //
36 //--------------------------------------------------------------------//
39 // Constants and native pointers used to access the Unicode category data tables in charinfo.nlp.
// Bounds of the UTF-16 high and low surrogate ranges.
41 internal const char HIGH_SURROGATE_START = '\ud800';
42 internal const char HIGH_SURROGATE_END = '\udbff';
43 internal const char LOW_SURROGATE_START = '\udc00';
44 internal const char LOW_SURROGATE_END = '\udfff';
// Values passed as the "offset" argument of InternalGetCategoryValue to select which
// byte of a two-byte s_pCategoriesValue entry is read.
46 internal const int UNICODE_CATEGORY_OFFSET = 0;
47 internal const int BIDI_CATEGORY_OFFSET = 1;
// Static field initializer: runs InitTable() to set up the table pointers below.
49 static bool s_initialized = InitTable();
51 // The native pointer to the 12:4:4 index table of the Unicode category data.
53 unsafe static ushort* s_pCategoryLevel1Index;
// The category value table. InternalGetCategoryValue reads it as s_pCategoriesValue[valueIndex * 2 + offset].
55 unsafe static byte* s_pCategoriesValue;
57 // The native pointer to the 12:4:4 index table of the Unicode numeric data.
58 // The value of this index table is an index into the real value table stored in s_pNumericValues.
60 unsafe static ushort* s_pNumericLevel1Index;
62 // The numeric value table, which is indexed by s_pNumericLevel1Index.
63 // Every item contains a numeric value (read as a double by InternalGetNumericValue).
64 // unsafe static double* s_pNumericValues;
65 // To get around the IA64 alignment issue: our double data is aligned on an 8-byte boundary, but the loader loads the embedded table starting
66 // at a 4-byte boundary. This causes an alignment issue since double is 8 bytes. The table is therefore declared as byte* rather than double*.
68 unsafe static byte* s_pNumericValues;
70 // The digit value table, which is indexed by s_pNumericLevel1Index. It shares the same indices as s_pNumericValues.
71 // Every item contains the decimal digit value and the digit value (see DigitValues).
73 unsafe static DigitValues* s_pDigitValues;
// Name of the resource passed to GlobalizationAssembly.GetGlobalizationResourceBytePtr in InitTable.
75 internal const String UNICODE_INFO_FILE_NAME = "charinfo.nlp";
76 // The starting codepoint for Unicode plane 1. Plane 1 contains 0x010000 ~ 0x01ffff.
77 internal const int UNICODE_PLANE01_START = 0x10000;
81 // This is the header for the native data table that we load from UNICODE_INFO_FILE_NAME.
83 // Explicit layout is used here since a syntax like char[16] cannot be used in sequential layout.
84 [StructLayout(LayoutKind.Explicit)]
85 internal unsafe struct UnicodeDataHeader {
87 internal char TableName; // WCHAR[16]
89 internal ushort version; // WORD[4]
// The offsets below are added to the byte pointer of the table start in InitTable.
91 internal uint OffsetToCategoriesIndex; // DWORD: offset to the category index table (s_pCategoryLevel1Index)
93 internal uint OffsetToCategoriesValue; // DWORD: offset to the category value table (s_pCategoriesValue)
95 internal uint OffsetToNumbericIndex; // DWORD: offset to the numeric index table (s_pNumericLevel1Index)
97 internal uint OffsetToDigitValue; // DWORD: offset to the digit value table (s_pDigitValues)
99 internal uint OffsetToNumbericValue; // DWORD: offset to the numeric value table (s_pNumericValues)
103 // NOTE: It's important to specify the pack size here, since the size of the structure is 2 bytes. Otherwise,
104 // the default pack size will be 4.
// One entry of the s_pDigitValues table, returned by pointer from InternalGetDigitValues.
106 [StructLayout(LayoutKind.Sequential, Pack=2)]
107 internal struct DigitValues {
108 internal sbyte decimalDigit; // returned by InternalGetDecimalDigitValue
109 internal sbyte digit; // returned by InternalGetDigitValue
113 //We need to set up the pointers into the underlying table that provides us with the information that we
114 //use. This is done once, from the static initializer of s_initialized, and then we don't need to worry about it again.
117 [System.Security.SecuritySafeCritical] // auto-generated
118 [ResourceExposure(ResourceScope.None)]
119 [ResourceConsumption(ResourceScope.Process, ResourceScope.Process)]
120 unsafe static bool InitTable() {
122 // Go to native side and get pointer to the native table
123 byte * pDataTable = GlobalizationAssembly.GetGlobalizationResourceBytePtr(typeof(CharUnicodeInfo).Assembly, UNICODE_INFO_FILE_NAME);
// The table begins with a UnicodeDataHeader that holds the offset of each sub-table.
125 UnicodeDataHeader* mainHeader = (UnicodeDataHeader*)pDataTable;
127 // Set up the native pointers to the different parts of the table. Each header offset is added to the byte pointer pDataTable.
128 s_pCategoryLevel1Index = (ushort*) (pDataTable + mainHeader->OffsetToCategoriesIndex);
129 s_pCategoriesValue = (byte*) (pDataTable + mainHeader->OffsetToCategoriesValue);
130 s_pNumericLevel1Index = (ushort*) (pDataTable + mainHeader->OffsetToNumbericIndex);
131 s_pNumericValues = (byte*) (pDataTable + mainHeader->OffsetToNumbericValue);
132 s_pDigitValues = (DigitValues*) (pDataTable + mainHeader->OffsetToDigitValue);
138 ////////////////////////////////////////////////////////////////////////
141 // Convert the BMP character or surrogate pair pointed to by index to a UTF32 value.
142 // This is similar to Char.ConvertToUtf32, but the difference is that
143 // it does not throw exceptions when invalid surrogate characters are passed in.
145 // WARNING: since it doesn't throw an exception it CAN return a value
146 // in the surrogate range D800-DFFF, which are not legal Unicode values.
148 ////////////////////////////////////////////////////////////////////////
150 internal static int InternalConvertToUtf32(String s, int index) {
151 Contract.Assert(s != null, "s != null");
152 Contract.Assert(index >= 0 && index < s.Length, "index < s.Length");
// A pair needs two chars, so only look ahead when index is not the last position.
153 if (index < s.Length - 1) {
// temp1 is within 0..0x3ff exactly when s[index] is in the high surrogate range.
154 int temp1 = (int)s[index] - HIGH_SURROGATE_START;
155 if (temp1 >= 0 && temp1 <= 0x3ff) {
// temp2 is within 0..0x3ff exactly when s[index+1] is in the low surrogate range.
156 int temp2 = (int)s[index+1] - LOW_SURROGATE_START;
157 if (temp2 >= 0 && temp2 <= 0x3ff) {
158 // Convert the surrogate pair to UTF32 and return the result.
159 return ((temp1 * 0x400) + temp2 + UNICODE_PLANE01_START);
// Not a valid surrogate pair: return the single UTF-16 code unit as is.
163 return ((int)s[index]);
166 ////////////////////////////////////////////////////////////////////////
168 // Convert a character or a surrogate pair starting at index of string s to a UTF32 value.
173 // index The starting index. It can point to a BMP character or a surrogate pair.
175 // len The length of the string. NOTE(review): this overload has no len parameter; the code uses s.Length.
176 // charLength [out] If the index points to a BMP char, charLength
177 // will be 1. If the index points to a surrogate pair,
178 // charLength will be 2.
180 // WARNING: since it doesn't throw an exception it CAN return a value
181 // in the surrogate range D800-DFFF, which are not legal Unicode values.
186 ////////////////////////////////////////////////////////////////////////
188 internal static int InternalConvertToUtf32(String s, int index, out int charLength) {
189 Contract.Assert(s != null, "s != null");
190 Contract.Assert(s.Length > 0, "s.Length > 0");
191 Contract.Assert(index >= 0 && index < s.Length, "index >= 0 && index < s.Length");
// A pair needs two chars, so only look ahead when index is not the last position.
193 if (index < s.Length - 1) {
// temp1 is within 0..0x3ff exactly when s[index] is in the high surrogate range.
194 int temp1 = (int)s[index] - HIGH_SURROGATE_START;
195 if (temp1 >= 0 && temp1 <= 0x3ff) {
// temp2 is within 0..0x3ff exactly when s[index+1] is in the low surrogate range.
196 int temp2 = (int)s[index+1] - LOW_SURROGATE_START;
197 if (temp2 >= 0 && temp2 <= 0x3ff) {
198 // Convert the surrogate pair to UTF32 and return the result.
200 return ((temp1 * 0x400) + temp2 + UNICODE_PLANE01_START);
// Not a valid surrogate pair: return the single UTF-16 code unit as is.
204 return ((int)s[index]);
207 ////////////////////////////////////////////////////////////////////////
211 // Determines if the character at the given index of s is a white space character.
213 ////////////////////////////////////////////////////////////////////////
215 internal static bool IsWhiteSpace(String s, int index)
217 Contract.Assert(s != null, "s!=null");
218 Contract.Assert(index >= 0 && index < s.Length, "index >= 0 && index < s.Length");
220 UnicodeCategory uc = GetUnicodeCategory(s, index);
221 // In Unicode 3.0, U+2028 is the only character which is under the category "LineSeparator".
222 // And U+2029 is the only character which is under the category "ParagraphSeparator".
224 case (UnicodeCategory.SpaceSeparator):
225 case (UnicodeCategory.LineSeparator):
226 case (UnicodeCategory.ParagraphSeparator):
// Determines if the character c is a white space character, based on its Unicode category.
233 internal static bool IsWhiteSpace(char c)
235 UnicodeCategory uc = GetUnicodeCategory(c);
236 // In Unicode 3.0, U+2028 is the only character which is under the category "LineSeparator".
237 // And U+2029 is the only character which is under the category "ParagraphSeparator".
239 case (UnicodeCategory.SpaceSeparator):
240 case (UnicodeCategory.LineSeparator):
241 case (UnicodeCategory.ParagraphSeparator):
249 // This is called by the public char and (string, index) versions of GetNumericValue.
251 // Note that for ch in the range D800-DFFF we just treat it as any other non-numeric character
253 [System.Security.SecuritySafeCritical] // auto-generated
254 internal unsafe static double InternalGetNumericValue(int ch) {
255 Contract.Assert(ch >= 0 && ch <= 0x10ffff, "ch is not in valid Unicode range.");
256 // Get the level 2 item from the high bits of ch (ch >> 8).
257 ushort index = s_pNumericLevel1Index[ch >> 8];
258 // Get the level 2 WORD offset from bits 4 - 7 of ch. This provides the base offset of the level 3 table.
259 // NOTE(review): the original comment said this offset refers to a float item in m_pNumericFloatData; below it is used as a WORD index into s_pNumericLevel1Index.
260 // Note that & has lower precedence than addition, so don't forget the parentheses.
261 index = s_pNumericLevel1Index[index + ((ch >> 4) & 0x000f)];
// The level 3 table is read as bytes starting at that WORD offset.
262 byte* pBytePtr = (byte*)&(s_pNumericLevel1Index[index]);
263 // Get the result from bits 0 - 3 of ch.
265 // To get around the IA64 alignment issue: our double data is aligned on an 8-byte boundary, but the loader loads the embedded table starting
266 // at a 4-byte boundary. This causes an alignment issue since double is 8 bytes.
267 byte* pSourcePtr = &(s_pNumericValues[pBytePtr[(ch & 0x000f)] * sizeof(double)]);
268 if (((long)pSourcePtr % 8) != 0) {
269 // We are not aligned on an 8-byte boundary. Do a copy.
271 byte* retPtr = (byte*)&ret;
272 Buffer.Memcpy(retPtr, pSourcePtr, sizeof(double));
// NOTE(review): two identical return statements follow; presumably they belong to alternative branches not visible here - confirm.
275 return (((double*)s_pNumericValues)[pBytePtr[(ch & 0x000f)]]);
277 return (((double*)s_pNumericValues)[pBytePtr[(ch & 0x000f)]]);
282 // This is called by InternalGetDecimalDigitValue and InternalGetDigitValue.
284 // Note that for ch in the range D800-DFFF we just treat it as any other non-numeric character
286 [System.Security.SecuritySafeCritical] // auto-generated
287 internal unsafe static DigitValues* InternalGetDigitValues(int ch) {
288 Contract.Assert(ch >= 0 && ch <= 0x10ffff, "ch is not in valid Unicode range.");
289 // Get the level 2 item from the high bits of ch (ch >> 8).
290 ushort index = s_pNumericLevel1Index[ch >> 8];
291 // Get the level 2 WORD offset from bits 4 - 7 of ch. This provides the base offset of the level 3 table.
292 // NOTE(review): the original comment said this offset refers to a float item in m_pNumericFloatData; below it is used as a WORD index into s_pNumericLevel1Index.
293 // Note that & has lower precedence than addition, so don't forget the parentheses.
294 index = s_pNumericLevel1Index[index + ((ch >> 4) & 0x000f)];
// The level 3 table is read as bytes starting at that WORD offset.
295 byte* pBytePtr = (byte*)&(s_pNumericLevel1Index[index]);
296 // Get the result from bits 0 - 3 of ch: the byte found there indexes the s_pDigitValues table.
297 return &(s_pDigitValues[pBytePtr[(ch & 0x000f)]]);
// Returns the decimalDigit field of the DigitValues entry that InternalGetDigitValues finds for ch.
300 [System.Security.SecuritySafeCritical] // auto-generated
301 internal unsafe static sbyte InternalGetDecimalDigitValue(int ch) {
302 return (InternalGetDigitValues(ch)->decimalDigit);
// Returns the digit field of the DigitValues entry that InternalGetDigitValues finds for ch.
305 [System.Security.SecuritySafeCritical] // auto-generated
306 internal unsafe static sbyte InternalGetDigitValue(int ch) {
307 return (InternalGetDigitValues(ch)->digit);
311 ////////////////////////////////////////////////////////////////////////
313 //Returns the numeric value associated with the character ch. If the character is a fraction,
314 // the return value will not be an integer. If the character does not have a numeric value, the return value is -1.
317 // Returns: the numeric value for the specified Unicode character. If the character does not have a numeric value, the return value is -1.
319 // Arguments: ch a Unicode character
321 // Exceptions (String, index overload): ArgumentNullException
322 // ArgumentOutOfRangeException
324 ////////////////////////////////////////////////////////////////////////
327 public static double GetNumericValue(char ch) {
328 return (InternalGetNumericValue(ch));
// Returns the numeric value of the character, or surrogate pair, starting at s[index].
// Throws ArgumentNullException("s") (presumably when s is null; the guard is not visible here)
// and ArgumentOutOfRangeException("index") when index is outside [0, s.Length).
332 public static double GetNumericValue(String s, int index) {
334 throw new ArgumentNullException("s");
336 if (index < 0 || index >= s.Length) {
337 throw new ArgumentOutOfRangeException("index", Environment.GetResourceString("ArgumentOutOfRange_Index"));
339 Contract.EndContractBlock();
// Combine a surrogate pair into one UTF32 value before the table lookup.
340 return (InternalGetNumericValue(InternalConvertToUtf32(s, index)));
344 ////////////////////////////////////////////////////////////////////////
346 //Returns the decimal digit value associated with the character ch.
348 // The value should be from 0 ~ 9.
349 // If the character does not have a decimal digit value, the return value is -1.
350 // From Unicode.org: Decimal Digits. Digits that can be used to form decimal-radix numbers.
352 // Returns: the decimal digit value for the specified Unicode character. If the character does not have a decimal digit value, the return value is -1.
354 // Arguments: ch a Unicode character
356 // Exceptions (String, index overload): ArgumentNullException
357 // ArgumentOutOfRangeException
359 ////////////////////////////////////////////////////////////////////////
362 public static int GetDecimalDigitValue(char ch) {
363 return (InternalGetDecimalDigitValue(ch));
// Returns the decimal digit value of the character, or surrogate pair, starting at s[index].
// Throws ArgumentNullException("s") (presumably when s is null; the guard is not visible here)
// and ArgumentOutOfRangeException("index") when index is outside [0, s.Length).
367 public static int GetDecimalDigitValue(String s, int index) {
369 throw new ArgumentNullException("s");
371 if (index < 0 || index >= s.Length) {
372 throw new ArgumentOutOfRangeException("index", Environment.GetResourceString("ArgumentOutOfRange_Index"));
374 Contract.EndContractBlock();
// Combine a surrogate pair into one UTF32 value before the table lookup.
376 return (InternalGetDecimalDigitValue(InternalConvertToUtf32(s, index)));
379 ////////////////////////////////////////////////////////////////////////
381 //Action: Returns the digit value associated with the character ch.
382 // If the character does not have a digit value, the return value is -1.
383 // From Unicode.org: If the character represents a digit, not necessarily a decimal digit,
384 // the value is here. This covers digits which do not form decimal radix forms, such as the compatibility superscript digits.
386 // An example is: U+2460 CIRCLED DIGIT ONE. This character has digit value 1, but does not have an associated decimal digit value.
389 // Returns: the digit value for the specified Unicode character. If the character does not have a digit value, the return value is -1.
391 // Arguments: ch a Unicode character
393 // Exceptions (String, index overload): ArgumentNullException
394 // ArgumentOutOfRangeException
396 ////////////////////////////////////////////////////////////////////////
399 public static int GetDigitValue(char ch) {
400 return (InternalGetDigitValue(ch));
// Returns the digit value of the character, or surrogate pair, starting at s[index].
// Throws ArgumentNullException("s") (presumably when s is null; the guard is not visible here)
// and ArgumentOutOfRangeException("index") when index is outside [0, s.Length).
404 public static int GetDigitValue(String s, int index) {
406 throw new ArgumentNullException("s");
408 if (index < 0 || index >= s.Length) {
409 throw new ArgumentOutOfRangeException("index", Environment.GetResourceString("ArgumentOutOfRange_Index"));
411 Contract.EndContractBlock();
// Combine a surrogate pair into one UTF32 value before the table lookup.
412 return (InternalGetDigitValue(InternalConvertToUtf32(s, index)));
// Returns the Unicode category of the single UTF-16 code unit ch.
415 public static UnicodeCategory GetUnicodeCategory(char ch)
417 return (InternalGetUnicodeCategory(ch)) ;
// Returns the Unicode category of the character, or surrogate pair, starting at s[index].
// Throws ArgumentNullException("s") (presumably when s is null; the guard is not visible here)
// and ArgumentOutOfRangeException("index") when index is out of range.
420 public static UnicodeCategory GetUnicodeCategory(String s, int index)
423 throw new ArgumentNullException("s");
// The unsigned comparison rejects both a negative index and index >= s.Length in one test.
424 if (((uint)index)>=((uint)s.Length)) {
425 throw new ArgumentOutOfRangeException("index");
427 Contract.EndContractBlock();
428 return InternalGetUnicodeCategory(s, index);
// Returns the Unicode category of the UTF32 value ch: looks up the byte at
// UNICODE_CATEGORY_OFFSET via InternalGetCategoryValue and casts it to UnicodeCategory.
431 internal unsafe static UnicodeCategory InternalGetUnicodeCategory(int ch) {
432 return ((UnicodeCategory)InternalGetCategoryValue(ch, UNICODE_CATEGORY_OFFSET));
435 ////////////////////////////////////////////////////////////////////////
437 //Action: Returns the category byte for the character ch. The offset argument (UNICODE_CATEGORY_OFFSET or BIDI_CATEGORY_OFFSET) selects which byte of the table entry is read.
439 // Returns: a byte that callers cast to the UnicodeCategory or BidiCategory enum
441 // Arguments: ch a Unicode character (UTF32 value)
445 //Note that this API will return values for D800-DFFF surrogate halves.
447 ////////////////////////////////////////////////////////////////////////
449 [System.Security.SecuritySafeCritical] // auto-generated
450 internal unsafe static byte InternalGetCategoryValue(int ch, int offset) {
451 Contract.Assert(ch >= 0 && ch <= 0x10ffff, "ch is not in valid Unicode range.");
452 // Get the level 2 item from the high bits of ch (ch >> 8).
453 ushort index = s_pCategoryLevel1Index[ch >> 8];
454 // Get the level 2 WORD offset from bits 4 - 7 of ch. This provides the base offset of the level 3 table.
455 // Note that & has lower precedence than addition, so don't forget the parentheses.
456 index = s_pCategoryLevel1Index[index + ((ch >> 4) & 0x000f)];
// The level 3 table is read as bytes starting at that WORD offset.
457 byte* pBytePtr = (byte*)&(s_pCategoryLevel1Index[index]);
458 // Get the result from bits 0 - 3 of ch.
459 byte valueIndex = pBytePtr[(ch & 0x000f)];
// Each s_pCategoriesValue entry is two bytes wide; offset picks the byte within the entry.
460 byte uc = s_pCategoriesValue[valueIndex * 2 + offset];
462 // Make sure that OtherNotAssigned is the last category in UnicodeCategory.
463 // If that changes, change the following assertion as well.
465 //Contract.Assert(uc >= 0 && uc <= UnicodeCategory.OtherNotAssigned, "Table returns incorrect Unicode category");
469 // internal static BidiCategory GetBidiCategory(char ch) {
470 // return ((BidiCategory)InternalGetCategoryValue(c, BIDI_CATEGORY_OFFSET));
// Returns the bidi category of the character, or surrogate pair, starting at s[index].
// Throws ArgumentNullException("s") (presumably when s is null; the guard is not visible here)
// and ArgumentOutOfRangeException("index") when index is out of range.
473 internal static BidiCategory GetBidiCategory(String s, int index) {
475 throw new ArgumentNullException("s");
// The unsigned comparison rejects both a negative index and index >= s.Length in one test.
476 if (((uint)index)>=((uint)s.Length)) {
477 throw new ArgumentOutOfRangeException("index");
479 Contract.EndContractBlock();
// Same table lookup as for the Unicode category, but reading the byte at BIDI_CATEGORY_OFFSET.
480 return ((BidiCategory)InternalGetCategoryValue(InternalConvertToUtf32(s, index), BIDI_CATEGORY_OFFSET));
483 ////////////////////////////////////////////////////////////////////////
485 //Action: Returns the Unicode Category property for the character (or surrogate pair) starting at value[index].
487 // Returns: a value in the UnicodeCategory enum
489 // Arguments: value a Unicode String
490 // index Index for the specified string.
494 ////////////////////////////////////////////////////////////////////////
496 internal static UnicodeCategory InternalGetUnicodeCategory(String value, int index) {
497 Contract.Assert(value != null, "value can not be null");
498 Contract.Assert(index < value.Length, "index < value.Length");
// Combine a surrogate pair into one UTF32 value before the category lookup.
500 return (InternalGetUnicodeCategory(InternalConvertToUtf32(value, index)));
503 ////////////////////////////////////////////////////////////////////////
505 // Get the Unicode category of the character starting at index. If the character is in the BMP, charLength will return 1.
506 // If the character is a valid surrogate pair, charLength will return 2.
508 ////////////////////////////////////////////////////////////////////////
510 internal static UnicodeCategory InternalGetUnicodeCategory(String str, int index, out int charLength) {
511 Contract.Assert(str != null, "str can not be null");
512 Contract.Assert(str.Length > 0, "str.Length > 0");;
513 Contract.Assert(index >= 0 && index < str.Length, "index >= 0 && index < str.Length");
// charLength is passed through as the out argument of InternalConvertToUtf32.
515 return (InternalGetUnicodeCategory(InternalConvertToUtf32(str, index, out charLength)));
518 internal static bool IsCombiningCategory(UnicodeCategory uc) {
519 Contract.Assert(uc >= 0, "uc >= 0");
521 uc == UnicodeCategory.NonSpacingMark ||
522 uc == UnicodeCategory.SpacingCombiningMark ||
523 uc == UnicodeCategory.EnclosingMark