1 #define USE_MANAGED_RESOURCE
5 // MSCompatUnicodeTable.cs : Handles Windows-like sortkey tables.
8 // Atsushi Enomoto <atsushi@ximian.com>
10 // Copyright (C) 2005 Novell, Inc (http://www.novell.com)
12 // Permission is hereby granted, free of charge, to any person obtaining
13 // a copy of this software and associated documentation files (the
14 // "Software"), to deal in the Software without restriction, including
15 // without limitation the rights to use, copy, modify, merge, publish,
16 // distribute, sublicense, and/or sell copies of the Software, and to
17 // permit persons to whom the Software is furnished to do so, subject to
18 // the following conditions:
20 // The above copyright notice and this permission notice shall be
21 // included in all copies or substantial portions of the Software.
23 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
27 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
28 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
29 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
33 using System.Collections.Generic;
34 using System.Globalization;
35 using System.Reflection;
36 using System.Runtime.CompilerServices;
37 using System.Runtime.InteropServices;
39 using UUtil = Mono.Globalization.Unicode.MSCompatUnicodeTableUtil;
41 namespace Mono.Globalization.Unicode
// Immutable record describing one culture's slice of the tailoring table.
internal class TailoringInfo
{
	// Culture identifier; GetTailoringInfo () matches entries on this value.
	public readonly int LCID;
	// Offset of the culture's first entry in the tailoring character table.
	public readonly int TailoringIndex;
	// Number of tailoring characters that belong to the culture
	// (BuildTailoringTables () stops at TailoringIndex + TailoringCount).
	public readonly int TailoringCount;
	// Flag stored alongside the entry in the tailoring resource.
	public readonly bool FrenchSort;

	public TailoringInfo (int lcid, int tailoringIndex, int tailoringCount, bool frenchSort)
	{
		// LCID has to be stored as well; otherwise every entry would
		// report 0 and lookups by LCID could never succeed.
		LCID = lcid;
		TailoringIndex = tailoringIndex;
		TailoringCount = tailoringCount;
		FrenchSort = frenchSort;
	}
}
59 #region Tailoring support classes
60 // Possible mapping types are:
62 // - string to string (ReplacementMap)
63 // - string to SortKey (SortKeyMap)
64 // - diacritical byte to byte (DiacriticalMap)
66 // There could be mapping from string to sortkeys, but
67 // for now there is none as such.
// One tailoring entry keyed by a source character sequence. The entry maps
// the sequence either to a replacement string or to a raw sort key.
internal class Contraction
{
	// Position at which the entry was read; ContractionComparer uses it
	// as the last tie-breaker.
	public readonly int Index;
	// Source character sequence the entry applies to.
	public readonly char [] Source;
	// only either of them is used.
	public readonly string Replacement;
	public readonly byte [] SortKey;

	public Contraction (int index, char [] source,
		string replacement, byte [] sortkey)
	{
		// Every member must be stored: the comparer reads Index and
		// Source, and consumers read Replacement or SortKey.
		Index = index;
		Source = source;
		Replacement = replacement;
		SortKey = sortkey;
	}
}
// Orders Contraction entries by their source characters, then by source
// length, and finally by Index.
internal class ContractionComparer : IComparer<Contraction>
{
	public static readonly ContractionComparer Instance =
		new ContractionComparer ();

	// Returns a negative value, zero, or a positive value when c1 sorts
	// before, equal to, or after c2.
	public int Compare (Contraction c1, Contraction c2)
	{
		char [] a1 = c1.Source;
		char [] a2 = c2.Source;
		int min = a1.Length > a2.Length ?
			a2.Length : a1.Length;
		// Only the first differing character decides the order; equal
		// characters must fall through to the next position.
		for (int i = 0; i < min; i++)
			if (a1 [i] != a2 [i])
				return a1 [i] - a2 [i];
		// One source is a prefix of the other: the shorter one goes first.
		if (a1.Length != a2.Length)
			return a1.Length - a2.Length;
		// Identical sources: fall back to Index so the result does not
		// depend on the sort algorithm being stable.
		return c1.Index - c2.Index;
	}
}
// Pairs a source byte with a replacement byte; instances are created by
// BuildTailoringTables () for DiacriticalMap entries.
// NOTE(review): the member declarations and constructor body are not
// visible in this excerpt; BuildTailoringTables () sorts instances by a
// member named Source - confirm the remaining members against the full file.
109 internal class Level2Map
114 public Level2Map (byte source, byte replace)
123 unsafe internal class MSCompatUnicodeTable
// Publicly adjustable static value, initialized to 3.
125 public static int MaxExpansionLength = 3;
// Raw pointers to the core tables. Each one is indexed through the matching
// UUtil indexer (see IsIgnorable (), Category (), Level1 (), Level2 (), Level3 ()).
127 static readonly byte* ignorableFlags;
128 static readonly byte* categories;
129 static readonly byte* level1;
130 static readonly byte* level2;
131 static readonly byte* level3;
132 // static readonly ushort* widthCompat;
// Pointer form of the tailoring character table (assigned by one of the
// static constructor variants).
134 static readonly char* tailoring;
// Per-culture CJK tables. They are not readonly: they are assigned either by
// a static constructor variant or later by FillCJKCore ().
136 static byte* cjkCHScategory;
137 static byte* cjkCHTcategory;
138 static byte* cjkJAcategory;
139 static byte* cjkKOcategory;
140 static byte* cjkCHSlv1;
141 static byte* cjkCHTlv1;
142 static byte* cjkJAlv1;
143 static byte* cjkKOlv1;
144 static byte* cjkKOlv2;
// Number of bytes skipped at the start of a resource before its payload is
// read (used as "idx += ResourceVersionSize").
146 const int ResourceVersionSize = 1;
// Finds the tailoring entry registered for the given LCID.
// Returns the matching TailoringInfo, or null when no loaded entry has
// that LCID (including the case where no entries were loaded at all).
public static TailoringInfo GetTailoringInfo (int lcid)
{
	// The table stays null if the static initializer could not load it.
	if (tailoringInfos == null)
		return null;
	for (int i = 0; i < tailoringInfos.Length; i++)
		if (tailoringInfos [i].LCID == lcid)
			return tailoringInfos [i];
	// No entry for this culture.
	return null;
}
// Walks a range of the tailoring character table and produces the
// contraction and diacritical arrays handed back through the ref
// parameters. The range starts at TailoringIndex and spans TailoringCount
// characters of a TailoringInfo named "t".
// NOTE(review): part of the parameter list, the declaration of "t" and
// several loop/bookkeeping lines are not visible in this excerpt.
156 unsafe public static void BuildTailoringTables (CultureInfo culture,
158 ref Contraction [] contractions,
159 ref Level2Map [] diacriticals)
161 // collect tailoring entries.
162 var cmaps = new List<Contraction> ();
163 var dmaps = new List<Level2Map> ();
165 fixed (char* tarr = tailoringArr){
166 int idx = t.TailoringIndex;
167 int end = idx + t.TailoringCount;
// The character at idx is the entry tag that selects the layout below.
171 switch (tarr [idx]) {
// Tag 1: a NUL-terminated source sequence followed by four characters,
// each narrowed to a byte, forming the sort key.
172 case '\x1': // SortKeyMap
174 while (tarr [ss] != 0)
176 src = new char [ss - idx];
177 // Array.Copy (tarr, idx, src, 0, ss - idx);
178 Marshal.Copy ((IntPtr) (tarr + idx), src, 0, ss - idx);
179 byte [] sortkey = new byte [4];
180 for (int i = 0; i < 4; i++)
181 sortkey [i] = (byte) tarr [ss + 1 + i];
182 cmaps.Add (new Contraction (iindex,
183 src, null, sortkey));
// Tag 2: the two characters after the tag, narrowed to bytes, become the
// source and replacement of a Level2Map.
188 case '\x2': // DiacriticalMap
189 dmaps.Add (new Level2Map (
190 (byte) tarr [idx + 1],
191 (byte) tarr [idx + 2]));
// Tag 3: a NUL-terminated source sequence, then a second run scanned up to
// the next NUL that becomes the replacement string.
194 case '\x3': // ReplacementMap
196 while (tarr [ss] != 0)
198 src = new char [ss - idx];
199 // Array.Copy (tarr, idx, src, 0, ss - idx);
200 Marshal.Copy ((IntPtr) (tarr + idx), src, 0, ss - idx);
203 while (tarr [l] != 0)
205 string r = new string (tarr, ss, l - ss);
206 cmaps.Add (new Contraction (iindex,
// Presumably the default arm: any other tag is reported as a broken table.
212 throw new NotImplementedException (String.Format ("Mono INTERNAL ERROR (Should not happen): Collation tailoring table is broken for culture {0} ({1}) at 0x{2:X}", culture.LCID, culture.Name, idx));
// Contractions are ordered with ContractionComparer, diacritical maps by
// ascending Source, and both lists are returned as arrays.
216 cmaps.Sort (ContractionComparer.Instance);
217 dmaps.Sort ((a, b) => a.Source - b.Source);
218 contractions = cmaps.ToArray ();
219 diacriticals = dmaps.ToArray ();
// Hands the already-loaded CJK tables and indexers for one culture back to
// the caller through the ref parameters.
// NOTE(review): the switch/case lines that select the group by "name" and
// some assignments are not visible in this excerpt; the groups below
// presumably correspond to zh-CHS, zh-CHT, ja and ko - confirm.
222 static void SetCJKReferences (string name,
223 ref CodePointIndexer cjkIndexer,
224 ref byte* catTable, ref byte* lv1Table,
225 ref CodePointIndexer lv2Indexer, ref byte* lv2Table)
227 // as a part of mscorlib.dll, this invocation is
228 // somewhat extraneous (pointers were already assigned).
// CHS tables; the only group that uses the dedicated UUtil.CjkCHS indexer.
232 catTable = cjkCHScategory;
233 lv1Table = cjkCHSlv1;
234 cjkIndexer = UUtil.CjkCHS;
// CHT tables with the shared UUtil.Cjk indexer.
237 catTable = cjkCHTcategory;
238 lv1Table = cjkCHTlv1;
239 cjkIndexer = UUtil.Cjk;
// JA tables with the shared UUtil.Cjk indexer.
242 catTable = cjkJAcategory;
244 cjkIndexer = UUtil.Cjk;
// KO tables; the only group that also sets the level-2 indexer.
247 catTable = cjkKOcategory;
250 cjkIndexer = UUtil.Cjk;
251 lv2Indexer = UUtil.Cjk;
// Returns the entry of the "categories" table for the given code point,
// located through the UUtil.Category indexer.
public static byte Category (int cp)
{
	int slot = UUtil.Category.ToIndex (cp);
	return categories [slot];
}
// Returns the entry of the "level1" table for the given code point,
// located through the UUtil.Level1 indexer.
public static byte Level1 (int cp)
{
	int slot = UUtil.Level1.ToIndex (cp);
	return level1 [slot];
}
// Returns the entry of the "level2" table for the given code point,
// located through the UUtil.Level2 indexer.
public static byte Level2 (int cp)
{
	int slot = UUtil.Level2.ToIndex (cp);
	return level2 [slot];
}
// Returns the entry of the "level3" table for the given code point,
// located through the UUtil.Level3 indexer.
public static byte Level3 (int cp)
{
	int slot = UUtil.Level3.ToIndex (cp);
	return level3 [slot];
}
// String overload: iterates over every char of "s".
// NOTE(review): the loop body and the final return are not visible in this
// excerpt; presumably each char is checked with IsSortable (int) - confirm.
276 public static bool IsSortable (string s)
278 foreach (char c in s)
// Code point overload: decides sortability starting from IsIgnorable (cp),
// then whitelists a set of code point ranges.
// NOTE(review): the statements between the IsIgnorable () test and the
// final return are not visible in this excerpt.
284 public static bool IsSortable (int cp)
286 // LAMESPEC: they should strictly match with
287 // IsIgnorable() result, but sometimes it does not.
288 if (!IsIgnorable (cp))
// Ranges accepted by the final return. The 0x200C-0x200F range appears
// twice in the chain; the second occurrence is redundant.
296 return 0x180B <= cp && cp <= 0x180E ||
297 0x200C <= cp && cp <= 0x200F ||
298 0x202A <= cp && cp <= 0x202E ||
299 0x206A <= cp && cp <= 0x206F ||
300 0x200C <= cp && cp <= 0x200F ||
301 0xFFF9 <= cp && cp <= 0xFFFD;
// Convenience overload: tests the code point against ignorable flag bit 1.
public static bool IsIgnorable (int cp)
{
	const byte flag = 1;
	return IsIgnorable (cp, flag);
}
// Tests the ignorable-flags table entry of a code point against the bit
// mask "flag". Callers in this class pass 1 (IsIgnorable), 2
// (IsIgnorableSymbol) and 4 (IsIgnorableNonSpacing).
// NOTE(review): the return statements of the early-exit branches are not
// visible in this excerpt.
309 public static bool IsIgnorable (int cp, byte flag)
// Extra checks that apply only when bit 1 is requested.
313 if ((flag & 1) != 0) {
// The cast truncates cp to 16 bits before the Unicode category lookup.
314 UnicodeCategory uc = Char.GetUnicodeCategory ((char) cp);
315 // This check eliminates some extraneous code areas
316 if (uc == UnicodeCategory.OtherNotAssigned)
318 // Some characters in Surrogate area are ignored.
319 if (0xD880 <= cp && cp < 0xDB80)
// A negative index means the indexer has no slot for cp; the result is then false.
322 int i = UUtil.Ignorable.ToIndex (cp);
323 return i >= 0 && (ignorableFlags [i] & flag) != 0;
// Disabled diagnostic loop kept from the original source.
326 // for (int i = 0; i <= char.MaxValue; i++)
327 // if (Char.GetUnicodeCategory ((char) i)
328 // == UnicodeCategory.OtherNotAssigned
329 // && ignorableFlags [i] != 7)
330 // Console.WriteLine ("{0:X04}", i);
// Tests the code point against ignorable flag bit 2, going through the
// shared IsIgnorable (int, byte) check rather than reading the flags
// table directly.
public static bool IsIgnorableSymbol (int cp)
{
	const byte symbolFlag = 2;
	return IsIgnorable (cp, symbolFlag);
}
// Tests the code point against ignorable flag bit 4, going through the
// shared IsIgnorable (int, byte) check. Comparing the "categories" entry
// with 1 was considered as an alternative, but the flag lookup is the
// implementation in use.
public static bool IsIgnorableNonSpacing (int cp)
{
	const byte nonSpacingFlag = 4;
	return IsIgnorable (cp, nonSpacingFlag);
}
// Folds the range 0x3041-0x3094 onto the range 0x60 code points above it;
// every other value is returned unchanged. Half-width katakana is
// deliberately not folded, because IgnoreKanaType does not treat it as
// equivalent to the full-width forms.
public static int ToKanaTypeInsensitive (int i)
{
	bool inFoldedRange = i >= 0x3041 && i <= 0x3094;
	if (inFoldedRange)
		return i + 0x60;
	return i;
}
// Maps a code point to its counterpart of the other width by adding a
// fixed offset per range.
// NOTE(review): the range tests that guard each return are not visible in
// this excerpt, so only the offsets themselves can be read from here.
359 // Note that currently indexer optimizes this table a lot,
360 // which might have resulted in bugs.
361 public static int ToWidthCompat (int i)
// Offset between 0xFF00 and 0x20.
367 return i - 0xFF00 + 0x20;
// Offset between 0x2190 and 0xFFE9.
389 return 0xFFE9 - 0x2190 + i;
420 // Other Kana compat characters' width
421 // compatibility is considered in special weight.
// Values below 0x3164 in this branch are shifted from 0x3130 to 0xFFA0.
426 if (i < 0x3164) { // Hangul compat
427 return i - 0x3130 + 0xFFA0;
431 // 0x32D0-0x32FE are Kana compat characters, whose
432 // width compatibility is considered in special weight.
436 #region Level 4 properties (Kana)
// Classifies a char through an if/else-if ladder of range tests.
// NOTE(review): the first condition and most return statements are not
// visible in this excerpt; only the two returns shown below can be read.
438 public static bool HasSpecialWeight (char c)
442 else if ('\uFF66' <= c && c < '\uFF9E')
444 else if ('\u3300' <= c)
// Below U+309D the result is true only for chars below U+3099.
446 else if (c < '\u309D')
447 return (c < '\u3099');
// From U+309D up to U+30FF the result is true except for U+30FB.
448 else if (c < '\u3100')
449 return c != '\u30FB';
450 else if (c < '\u32D0')
452 else if (c < '\u32FF')
// Returns a byte code for the given char.
// NOTE(review): the method body is not visible in this excerpt, so the
// meaning of the returned values cannot be documented from here.
457 // FIXME: it should be removed at some stage
458 // (will become unused).
459 public static byte GetJapaneseDashType (char c)
// True when the char lies in the inclusive range U+FF66..U+FF9D.
public static bool IsHalfWidthKana (char c)
{
	if (c < '\uFF66')
		return false;
	return c <= '\uFF9D';
}
// True when the char lies in the inclusive range U+3041..U+3094.
public static bool IsHiragana (char c)
{
	if (c < '\u3041')
		return false;
	return c <= '\u3094';
}
// Range-based test on a single char.
// NOTE(review): the bodies of both branches and the final return are not
// visible in this excerpt.
484 public static bool IsJapaneseSmallLetter (char c)
// First test: inclusive range U+FF67..U+FF6F.
486 if ('\uFF67' <= c && c <= '\uFF6F')
// Second test: exclusive range U+3040..U+30FA.
488 if ('\u3040' < c && c < '\u30FA') {
// Build variant that takes the tables from arrays declared in this class.
// NOTE(review): the preprocessor condition that selects this variant is not
// visible in this excerpt.
522 public static readonly bool IsReady = true; // always
// Static initializer for this variant. As shown it throws as its first
// statement, so the pointer assignments that follow are unreachable.
524 static MSCompatUnicodeTable ()
526 throw new Exception ("This code should not be used");
// NOTE(review): each pointer obtained inside a "fixed" statement is stored
// in a static field and therefore outlives the pinning scope - confirm this
// is safe before re-enabling this code.
528 fixed (byte* tmp = ignorableFlagsArr) {
529 ignorableFlags = tmp;
531 fixed (byte* tmp = categoriesArr) {
534 fixed (byte* tmp = level1Arr) {
537 fixed (byte* tmp = level2Arr) {
540 fixed (byte* tmp = level3Arr) {
543 // fixed (ushort* tmp = widthCompatArr) {
544 // widthCompat = tmp;
546 fixed (char* tmp = tailoringArr) {
// Each CJK array holds the category table first; the level-1 table starts
// at the offset given by the matching ...ArrLength value.
549 fixed (byte* tmp = cjkCHSArr) {
550 cjkCHScategory = tmp;
551 cjkCHSlv1 = tmp + cjkCHSArrLength;
553 fixed (byte* tmp = cjkCHTArr) {
554 cjkCHTcategory = tmp;
555 cjkCHTlv1 = tmp + cjkCHTArrLength;
557 fixed (byte* tmp = cjkJAArr) {
559 cjkJAlv1 = tmp + cjkJAArrLength;
561 fixed (byte* tmp = cjkKOArr) {
563 cjkKOlv1 = tmp + cjkKOArrLength;
565 fixed (byte* tmp = cjkKOlv2Arr) {
// Variant of FillCJK that only forwards to SetCJKReferences (), passing
// every ref argument through unchanged.
// NOTE(review): the last parameter line (the one declaring lv2Table) is not
// visible in this excerpt.
570 public static void FillCJK (string name,
571 ref CodePointIndexer cjkIndexer,
572 ref byte* catTable, ref byte* lv1Table,
573 ref CodePointIndexer cjkLv2Indexer,
576 SetCJKReferences (name, ref cjkIndexer,
577 ref catTable, ref lv1Table,
578 ref cjkLv2Indexer, ref lv2Table);
// Managed copy of the tailoring characters; pinned and scanned by
// BuildTailoringTables ().
583 static readonly char [] tailoringArr;
// One entry per culture; searched linearly by GetTailoringInfo ().
585 static readonly TailoringInfo [] tailoringInfos;
// Lock object shared by the loading code.
586 static object forLock = new object ();
// Backing field of IsReady.
// NOTE(review): the statement that assigns it is not visible in this excerpt.
587 public static readonly bool isReady;
// Read-only accessor for isReady.
589 public static bool IsReady {
590 get { return isReady; }
593 #if USE_MANAGED_RESOURCE
// Looks up a manifest resource of the executing assembly by name and
// returns the IntPtr produced by GetManifestResourceInternal (). Callers in
// this class treat IntPtr.Zero as "resource not available".
// NOTE(review): the declarations of the "size" and "module" out variables
// are not visible in this excerpt.
594 static IntPtr GetResource (string name)
598 return Assembly.GetExecutingAssembly ().GetManifestResourceInternal (name, out size, out module);
// Table indices passed to the two-argument load_collation_resource () icall.
601 const int CollationTableIdxIgnorables = 0;
602 const int CollationTableIdxCategory = 1;
603 const int CollationTableIdxLevel1 = 2;
604 const int CollationTableIdxLevel2 = 3;
605 const int CollationTableIdxLevel3 = 4;
606 const int CollationTableIdxTailoringInfos = 5;
607 const int CollationTableIdxTailoringChars = 6;
608 const int CollationTableIdxCjkCHS = 7;
609 const int CollationTableIdxCjkCHT = 8;
610 const int CollationTableIdxCjkJA = 9;
611 const int CollationTableIdxCjkKO = 10;
612 const int CollationTableIdxCjkKOLv2 = 11;
// Runtime-implemented call; the requested table is returned through "data".
614 [MethodImplAttribute (MethodImplOptions.InternalCall)]
615 static extern void load_collation_resource (int resource_index, byte** data);
// Location of the executing assembly, passed to the path-based icall below.
617 static readonly string corlibPath = Assembly.GetExecutingAssembly ().Location;
// Resource indices passed to the path-based load_collation_resource () icall.
// NOTE(review): a static constructor variant also uses a name
// CollationResourceTailoringChars, which is not declared in this list.
619 const int CollationResourceCore = 0;
620 const int CollationResourceCJKCHS = 1;
621 const int CollationResourceCJKCHT = 2;
622 const int CollationResourceCJKJA = 3;
623 const int CollationResourceCJKKO = 4;
624 const int CollationResourceCJKKOlv2 = 5;
625 const int CollationResourceTailoring = 6;
// Runtime-implemented call; returns the resource through "data" and "size".
627 [MethodImplAttribute (MethodImplOptions.InternalCall)]
628 static extern void load_collation_resource (string path, int resource_index, byte** data, int* size);
// Reads four consecutive bytes starting at raw [idx] and combines them into
// an unsigned 32-bit value, least significant byte first.
// The caller must guarantee that raw [idx] .. raw [idx + 3] are readable.
static uint UInt32FromBytePtr (byte* raw, uint idx)
{
	// Each byte is widened to uint before shifting, so a top byte >= 0x80
	// never passes through a negative int and the result does not depend
	// on the code being compiled in an unchecked context.
	return (uint) raw [idx]
		| ((uint) raw [idx + 1] << 8)
		| ((uint) raw [idx + 2] << 16)
		| ((uint) raw [idx + 3] << 24);
}
// Static initializer for the resource-backed build variants: obtains the
// core and tailoring data, points the table fields into it and builds
// tailoringInfos (and, in the byte-stream variant, tailoringArr).
// NOTE(review): local declarations, early returns and the #else/#endif
// lines that separate the variants are not visible in this excerpt.
637 static MSCompatUnicodeTable ()
// Variant 1: every table is requested separately by index.
646 load_collation_resource (CollationTableIdxIgnorables, &raw);
647 ignorableFlags = raw;
648 load_collation_resource (CollationTableIdxCategory, &raw);
650 load_collation_resource (CollationTableIdxLevel1, &raw);
652 load_collation_resource (CollationTableIdxLevel2, &raw);
654 load_collation_resource (CollationTableIdxLevel3, &raw);
656 load_collation_resource (CollationTableIdxTailoringInfos, &raw);
657 tailor = (uint*) raw;
658 load_collation_resource (CollationTableIdxTailoringChars, &raw);
659 tailoring = (char*) raw;
// The tailoring info table is a count followed by four uint values per
// entry: LCID, index, count and a non-zero-means-true flag.
663 uint count = tailor [idx++];
664 tailoringInfos = new TailoringInfo [count];
665 for (int i = 0; i < count; i++) {
666 int i1 = (int) tailor [idx++];
667 int i2 = (int) tailor [idx++];
668 int i3 = (int) tailor [idx++];
669 TailoringInfo ti = new TailoringInfo (
670 i1, i2, i3, tailor [idx++] != 0);
671 tailoringInfos [i] = ti;
// Variant 2: the data comes from two manifest resources of this assembly.
682 #if USE_MANAGED_RESOURCE
683 IntPtr ptr = GetResource ("collation.core.bin");
684 if (ptr == IntPtr.Zero)
686 raw = (byte*) ((void*) ptr);
687 ptr = GetResource ("collation.tailoring.bin");
688 if (ptr == IntPtr.Zero)
690 tailor = (byte*) ((void*) ptr);
// Variant 3: the data is loaded by the runtime from the corlib path.
696 load_collation_resource (corlibPath, CollationResourceCore, &raw, &rawsize);
697 load_collation_resource (corlibPath, CollationResourceTailoring, &tailor, &trawsize);
// NOTE(review): CollationResourceTailoringChars and tailorChars have no
// visible declaration in this excerpt - confirm before enabling this variant.
698 load_collation_resource (corlibPath, CollationResourceTailoringChars, &tailorChars, &trawsize);
702 if (raw == null || tailor == null)
704 // check resource version
705 if (raw [0] != UUtil.ResourceVersion ||
706 tailor [0] != UUtil.ResourceVersion)
// The core data is a sequence of sections, each introduced by a 32-bit size
// read with UInt32FromBytePtr (); the table pointer is set to the data
// that follows the size.
710 size = UInt32FromBytePtr (raw, idx);
712 ignorableFlags = raw + idx;
715 size = UInt32FromBytePtr (raw, idx);
717 categories = raw + idx;
720 size = UInt32FromBytePtr (raw, idx);
725 size = UInt32FromBytePtr (raw, idx);
730 size = UInt32FromBytePtr (raw, idx);
735 // size = UInt32FromBytePtr (raw, idx);
737 // widthCompat = (ushort*) (raw + idx);
// Tailoring infos in the byte stream: a 32-bit count, then for each entry
// three 32-bit values followed by a single flag byte.
743 uint count = UInt32FromBytePtr (tailor, idx);
745 tailoringInfos = new TailoringInfo [count];
746 for (int i = 0; i < count; i++) {
747 int i1 = (int) UInt32FromBytePtr (tailor, idx);
749 int i2 = (int) UInt32FromBytePtr (tailor, idx);
751 int i3 = (int) UInt32FromBytePtr (tailor, idx);
753 TailoringInfo ti = new TailoringInfo (
754 i1, i2, i3, tailor [idx++] != 0);
755 tailoringInfos [i] = ti;
// Tailoring characters: a 32-bit count, then that many 16-bit values stored
// low byte first, copied into a managed char array.
759 count = UInt32FromBytePtr (tailor, idx);
762 tailoringArr = new char [count];
763 for (int i = 0; i < count; i++, idx += 2)
764 tailoringArr [i] = (char) (tailor [idx] + (tailor [idx + 1] << 8));
// Variant of FillCJK that first calls FillCJKCore () and then
// SetCJKReferences () with the same ref arguments.
// NOTE(review): some parameter lines and the statement that encloses the
// two calls are not visible in this excerpt.
769 public static void FillCJK (string culture,
770 ref CodePointIndexer cjkIndexer,
773 ref CodePointIndexer lv2Indexer,
777 FillCJKCore (culture, ref cjkIndexer,
778 ref catTable, ref lv1Table,
779 ref lv2Indexer, ref lv2Table);
780 SetCJKReferences (culture, ref cjkIndexer,
781 ref catTable, ref lv1Table,
782 ref lv2Indexer, ref lv2Table);
// Loads the CJK category and level-1 tables for one culture unless they are
// already cached in the static cjk* fields, then caches the loaded pointers.
// The "cjkKOlv2" resource is loaded as well near the end.
// NOTE(review): the switch headers, the declaration of the local "name" and
// the #else/#endif lines are not visible in this excerpt.
786 static void FillCJKCore (string culture,
787 ref CodePointIndexer cjkIndexer,
788 ref byte* catTable, ref byte* lv1Table,
789 ref CodePointIndexer cjkLv2Indexer, ref byte* lv2Table)
// Start from whatever is already cached for the culture.
798 catTable = cjkCHScategory;
799 lv1Table = cjkCHSlv1;
803 catTable = cjkCHTcategory;
804 lv1Table = cjkCHTlv1;
808 catTable = cjkJAcategory;
813 catTable = cjkKOcategory;
// Tested before any loading: no resource name was chosen, or the level-1
// table is already available.
818 if (name == null || lv1Table != null)
// Managed-resource variant: the resource is named "collation.<name>.bin".
823 #if USE_MANAGED_RESOURCE
825 String.Format ("collation.{0}.bin", name);
826 IntPtr ptr = GetResource (filename);
827 if (ptr == IntPtr.Zero)
829 raw = (byte*) ((void*) ptr);
830 idx += ResourceVersionSize;
// Index-based icall variant.
834 case "zh-CHS": residx = CollationTableIdxCjkCHS; break;
835 case "zh-CHT": residx = CollationTableIdxCjkCHT; break;
836 case "ja": residx = CollationTableIdxCjkJA; break;
837 case "ko": residx = CollationTableIdxCjkKO; break;
841 load_collation_resource (residx, &raw);
// Path-based icall variant.
846 case "zh-CHS": residx = CollationResourceCJKCHS; break;
847 case "zh-CHT": residx = CollationResourceCJKCHT; break;
848 case "ja": residx = CollationResourceCJKJA; break;
849 case "ko": residx = CollationResourceCJKKO; break;
853 load_collation_resource (corlibPath, residx, &raw, &size);
854 idx += ResourceVersionSize;
// A 32-bit count is read first; the level-1 table is placed "count" bytes
// after the category table.
856 uint count = UInt32FromBytePtr (raw, idx);
858 catTable = (byte*) raw + idx;
859 lv1Table = (byte*) raw + idx + count;
// Cache the freshly loaded pointers for later calls.
863 cjkCHScategory = catTable;
864 cjkCHSlv1 = lv1Table;
867 cjkCHTcategory = catTable;
868 cjkCHTlv1 = lv1Table;
871 cjkJAcategory = catTable;
875 cjkKOcategory = catTable;
// Level-2 table: its data is read from offset ResourceVersionSize + 4.
882 #if USE_MANAGED_RESOURCE
883 ptr = GetResource ("collation.cjkKOlv2.bin");
884 if (ptr == IntPtr.Zero)
886 raw = (byte*) ((void*) ptr);
887 idx = ResourceVersionSize + 4;
889 load_collation_resource (CollationTableIdxCjkKOLv2, &raw);
891 load_collation_resource (corlibPath, CollationResourceCJKKOlv2, &raw, &size);
892 idx = ResourceVersionSize + 4;
894 cjkKOlv2 = raw + idx;
902 // For "categories", 0 means no primary weight. 6 means
904 // For expanded character the value is never filled (i.e. 0).
905 // Those arrays will be split into blocks (<3400 and >F800)
906 // level 4 is computed.
908 // public static bool HasSpecialWeight (char c)
909 // { return level1 [(int) c] == 6; }
912 // autogenerated code or icall to fill array runs here