2005-06-23 Atsushi Enomoto <atsushi@ximian.com>
author Atsushi Eno <atsushieno@gmail.com>
Thu, 23 Jun 2005 09:52:52 +0000 (09:52 -0000)
committer Atsushi Eno <atsushieno@gmail.com>
Thu, 23 Jun 2005 09:52:52 +0000 (09:52 -0000)
* SimpleCollator.cs : forgot to commit in the last checkin.
* create-mscompat-collation-table.cs : fixed arabic shift weight chars.
* TestDriver.cs : switch table dumper and collator testing.
* SortKey.cs : for now comment out internal indexes (not in use).

svn path=/trunk/mcs/; revision=46415

mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog
mcs/class/corlib/Mono.Globalization.Unicode/SimpleCollator.cs
mcs/class/corlib/Mono.Globalization.Unicode/SortKey.cs
mcs/class/corlib/Mono.Globalization.Unicode/TestDriver.cs
mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs

index 1d59654a5e0b56d086d529724d7f246cb913ea64..a6033c0d7dec85e745093c4c2ed54855d7c39e5a 100644 (file)
@@ -1,3 +1,10 @@
+2005-06-23  Atsushi Enomoto  <atsushi@ximian.com>
+
+       * SimpleCollator.cs : forgot to commit in the last checkin.
+       * create-mscompat-collation-table.cs : fixed arabic shift weight chars.
+       * TestDriver.cs : switch table dumper and collator testing.
+       * SortKey.cs : for now comment out internal indexes (not in use).
+
 2005-06-23  Atsushi Enomoto  <atsushi@ximian.com>
 
        * MSCompatUnicodeTable.template,
index 11a327a579e23b8a1407c7f4ba5af263acef96e4..2bd95826af56cbd6efcff9231bf6559a567e6ef0 100644 (file)
@@ -8,6 +8,7 @@ using System;
 using System.Globalization;
 
 using Uni = Mono.Globalization.Unicode.MSCompatUnicodeTable;
+using Util = Mono.Globalization.Unicode.MSCompatUnicodeTableUtil;
 
 namespace Mono.Globalization.Unicode
 {
@@ -22,14 +23,72 @@ namespace Mono.Globalization.Unicode
                bool ignoreKanaType;
                TextInfo textInfo; // for ToLower().
                bool frenchSort;
+               readonly ushort [] cjkTable;
+               readonly CodePointIndexer cjkIndexer;
+               readonly byte [] cjkLv2Table;
+               readonly CodePointIndexer cjkLv2Indexer;
 
                public SimpleCollator (CultureInfo culture)
                {
                        textInfo = culture.TextInfo;
+                       buf = new SortKeyBuffer (culture.LCID);
 
                        // FIXME: fill frenchSort from CultureInfo.
 
-                       buf = new SortKeyBuffer (culture.LCID);
+                       // custom CJK table support.
+                       switch (GetNeutralCulture (culture).Name) {
+                       case "zh-CHS":
+                               cjkTable = Uni.CjkCHS;
+                               cjkIndexer = Util.CjkCHS;
+                               break;
+                       case "zh-CHT":
+                               cjkTable = Uni.CjkCHT;
+                               cjkIndexer = Util.Cjk;
+                               break;
+                       case "ja":
+                               cjkTable = Uni.CjkJA;
+                               cjkIndexer = Util.Cjk;
+                               break;
+                       case "ko":
+                               cjkTable = Uni.CjkKO;
+                               cjkLv2Table = Uni.CjkKOLv2;
+                               cjkIndexer = Util.Cjk;
+                               cjkLv2Indexer = Util.Cjk;
+                               break;
+                       }
+               }
+
+               static CultureInfo GetNeutralCulture (CultureInfo info) // climbs info's Parent chain and returns the highest ancestor below the invariant culture
+               {
+                       CultureInfo ret = info;
+                       while (ret.Parent != null && ret.Parent.LCID != 127) // stop when there is no parent or the parent has LCID 127 (0x7F, the invariant culture)
+                               ret = ret.Parent;
+                       return ret; // info itself when the loop body never ran
+               }
+
+               byte Category (int cp) // category byte for cp, preferring the culture-specific CJK table when one was selected
+               {
+                       if (cp < 0x3000 || cjkTable == null) // below U+3000, or no custom CJK table: use the shared table
+                               return Uni.Categories (cp);
+                       ushort cjk = cjkTable [cjkIndexer.ToIndex (cp)]; // NOTE(review): assumes ToIndex yields a valid index for every cp >= 0x3000 - confirm
+                       return cjk != 0 ? (byte) ((cjk & 0xFF00) >> 8) : // non-zero entry: its high byte is returned as the category
+                               Uni.Categories (cp); // zero entry: fall back to the shared table
+               }
+
+               byte Level1 (int cp) // level 1 weight for cp, preferring the culture-specific CJK table when one was selected
+               {
+                       if (cp < 0x3000 || cjkTable == null) // below U+3000, or no custom CJK table: use the shared table
+                               return Uni.Level1 (cp);
+                       ushort cjk = cjkTable [cjkIndexer.ToIndex (cp)]; // same 16-bit entry that Category reads
+                       return cjk != 0 ? (byte) (cjk & 0xFF) : Uni.Level1 (cp); // non-zero entry: its low byte; zero entry: shared table
+               }
+
+               byte Level2 (int cp) // level 2 weight for cp, preferring the culture-specific CJK level 2 table when one was selected
+               {
+                       if (cp < 0x3000 || cjkLv2Table == null) // below U+3000, or no level 2 table (the constructor assigns one only for "ko")
+                               return Uni.Level2 (cp);
+                       byte cjk = cjkLv2Table [cjkLv2Indexer.ToIndex (cp)]; // entries are single bytes, unlike the 16-bit cjkTable entries
+                       return cjk != 0 ? cjk : Uni.Level2 (cp); // zero entry: fall back to the shared table
                }
 
                void SetOptions (CompareOptions options)
@@ -126,9 +185,9 @@ namespace Mono.Globalization.Unicode
 
                        if (Uni.HasSpecialWeight ((char) i))
                                buf.AppendKana (
-                                       Uni.Categories (i),
-                                       Uni.Level1 (i),
-                                       Uni.Level2 (i),
+                                       Category (i),
+                                       Level1 (i),
+                                       Level2 (i),
                                        Uni.Level3 (i),
                                        Uni.IsJapaneseSmallLetter ((char) i),
                                        Uni.GetJapaneseDashType ((char) i),
@@ -137,9 +196,9 @@ namespace Mono.Globalization.Unicode
                                        );
                        else
                                buf.AppendNormal (
-                                       Uni.Categories (i),
-                                       Uni.Level1 (i),
-                                       Uni.Level2 (i),
+                                       Category (i),
+                                       Level1 (i),
+                                       Level2 (i),
                                        Uni.Level3 (i));
                }
 
@@ -244,9 +303,9 @@ namespace Mono.Globalization.Unicode
                                if (expansion != null)
                                        return false;
 
-                               if (Uni.Categories (ci) != Uni.Categories (cj) ||
-                                       Uni.Level1 (ci) != Uni.Level1 (cj) ||
-                                       !ignoreNonSpace && Uni.Level2 (ci) != Uni.Level2 (cj) ||
+                               if (Category (ci) != Category (cj) ||
+                                       Level1 (ci) != Level1 (cj) ||
+                                       !ignoreNonSpace && Level2 (ci) != Level2 (cj) ||
                                        Uni.Level3 (ci) != Uni.Level3 (cj))
                                        return false;
                                if (!Uni.HasSpecialWeight ((char) ci))
@@ -315,9 +374,9 @@ namespace Mono.Globalization.Unicode
                                if (expansion != null)
                                        return false;
 
-                               if (Uni.Categories (ci) != Uni.Categories (cj) ||
-                                       Uni.Level1 (ci) != Uni.Level1 (cj) ||
-                                       !ignoreNonSpace && Uni.Level2 (ci) != Uni.Level2 (cj) ||
+                               if (Category (ci) != Category (cj) ||
+                                       Level1 (ci) != Level1 (cj) ||
+                                       !ignoreNonSpace && Level2 (ci) != Level2 (cj) ||
                                        Uni.Level3 (ci) != Uni.Level3 (cj))
                                        return false;
                                if (!Uni.HasSpecialWeight ((char) ci))
@@ -385,9 +444,9 @@ namespace Mono.Globalization.Unicode
                                if (s [idx] == target)
                                        return idx;
                                int si = FilterOptions ((int) s [idx]);
-                               if (Uni.Categories (si) != Uni.Categories (ti) ||
-                                       Uni.Level1 (si) != Uni.Level1 (ti) ||
-                                       !ignoreNonSpace && Uni.Level2 (si) != Uni.Level2 (ti) ||
+                               if (Category (si) != Category (ti) ||
+                                       Level1 (si) != Level1 (ti) ||
+                                       !ignoreNonSpace && Level2 (si) != Level2 (ti) ||
                                        Uni.Level3 (si) != Uni.Level3 (ti))
                                        continue;
                                if (!Uni.HasSpecialWeight ((char) si))
@@ -468,9 +527,9 @@ namespace Mono.Globalization.Unicode
                                if (s [idx] == target)
                                        return idx;
                                int si = FilterOptions ((int) s [idx]);
-                               if (Uni.Categories (si) != Uni.Categories (ti) ||
-                                       Uni.Level1 (si) != Uni.Level1 (ti) ||
-                                       !ignoreNonSpace && Uni.Level2 (si) != Uni.Level2 (ti) ||
+                               if (Category (si) != Category (ti) ||
+                                       Level1 (si) != Level1 (ti) ||
+                                       !ignoreNonSpace && Level2 (si) != Level2 (ti) ||
                                        Uni.Level3 (si) != Uni.Level3 (ti))
                                        continue;
                                if (!Uni.HasSpecialWeight ((char) si))
index f7d8279d1b892d344fcb819b15914d095671c3d1..db83622f82dda47d4ce196a96dc4794f4a3a3e15 100644 (file)
@@ -27,6 +27,7 @@ namespace System.Globalization
 
                readonly string source;
                readonly byte [] key;
+               /*
                readonly int lv1Length;
                readonly int lv2Length;
                readonly int lv3Length;
@@ -35,6 +36,7 @@ namespace System.Globalization
                readonly int katakanaLength;
                readonly int kanaWidthLength;
                readonly int identLength;
+               */
                readonly CompareOptions options;
                readonly int lcid;
 
@@ -56,6 +58,7 @@ namespace System.Globalization
                        this.source = source;
                        this.key = buffer;
                        this.options = opt;
+                       /*
                        this.lv1Length = lv1Length;
                        this.lv2Length = lv2Length;
                        this.lv3Length = lv3Length;
@@ -64,6 +67,7 @@ namespace System.Globalization
                        this.katakanaLength = katakanaLength;
                        this.kanaWidthLength = kanaWidthLength;
                        this.identLength = identLength;
+                       */
                }
 
                public string OriginalString {
@@ -73,7 +77,7 @@ namespace System.Globalization
                public byte [] KeyData {
                        get { return key; }
                }
-
+/*
                internal int Level1Length {
                        get { return lv1Length; }
                }
@@ -117,6 +121,7 @@ namespace System.Globalization
                internal int IdenticalLength {
                        get { return identLength; }
                }
+*/
 
                // copy from original SortKey.cs
                public override bool Equals (object value)
index d9cdc3b96efc94c9f060cbbcb4ddaf52dccfa99d..b6f9a7ac3365deb4eac1191a585e13530f0ee0de 100644 (file)
@@ -11,9 +11,12 @@ namespace Mono.Globalization.Unicode
 
                #region Testing bits
 
-               static void Main ()
+               static void Main (string [] args)
                {
-                       new TestDriver ().Run ();
+                       if (args.Length > 0 && args [0] == "--generate")
+                               new TestDriver ().Generate ();
+                       else
+                               new TestDriver ().Run ();
                }
 
                void Run ()
@@ -70,8 +73,10 @@ namespace Mono.Globalization.Unicode
                        LastIndexOf ("BBCBBC", "BC", CompareOptions.IgnoreCase);
                        LastIndexOf ("original", "rig", CompareOptions.None);
                        Console.WriteLine ("original".LastIndexOf ("rig"));
+               }
 
-/*
+               void Generate ()
+               {
                        // dump sortkey for every single character.
                        for (int i = 0; i <= char.MaxValue; i++) {
                                byte [] data = coll.GetSortKey (new string ((char) i, 1)).KeyData;
@@ -79,12 +84,10 @@ namespace Mono.Globalization.Unicode
                                        data [2] == 1 && data [3] == 1 && data [4] == 0)
                                        continue;
                                foreach (byte b in data)
-                                       Output.Write ("{0:X02} ", b);
-                               Output.WriteLine (" : {0:X04}, {1}",
+                                       Console.Write ("{0:X02} ", b);
+                               Console.WriteLine (" : {0:X04}, {1}",
                                        i, Char.GetUnicodeCategory ((char) i));
                        }
-                       Output.Close ();
-*/
                }
 
                void Compare (string s1, string s2)
index 171074096c17f3f309c1e95ca097f5bd639907bc..780ae05e135bc29b7841b3144b608d4e29f158d9 100644 (file)
@@ -1254,6 +1254,14 @@ sw.Close ();
                                        if (Char.IsNumber ((char) cp))
                                                diacritical [cp] = weight;
 
+                       // Modify some decomposition equivalence
+                       decompType [0xFE31] = 0;
+                       decompIndex [0xFE31] = 0;
+                       decompLength [0xFE31] = 0;
+                       decompType [0xFE32] = 0;
+                       decompIndex [0xFE32] = 0;
+                       decompLength [0xFE32] = 0;
+
                        // Korean parens numbers
                        for (int i = 0x3200; i <= 0x321C; i++)
                                diacritical [i] = 0xA;
@@ -1326,17 +1334,26 @@ sw.Close ();
 
                        // Hyphen/Dash : 06 81 - 06 90
                        for (int i = 0; i < char.MaxValue; i++) {
-                               if (Char.GetUnicodeCategory ((char) i)
-                                       == UnicodeCategory.DashPunctuation)
-//                                     AddCharMapGroupTail ((char) i, 6, 1);
-                                       AddCharMapGroup ((char) i, 6, 1, 0);
+                               if (!IsIgnorable (i) &&
+                                       Char.GetUnicodeCategory ((char) i) ==
+                                       UnicodeCategory.DashPunctuation) {
+                                       AddCharMapGroup2 ((char) i, 6, 1, 0);
+                                       if (i == 0x2011) {
+                                               // SPECIAL: add 2027 and 2043
+                                               // Maybe they are regarded as the
+                                               // same hyphens in "central"
+                                               // position.
+                                               AddCharMap ('\u2027', 6, 1);
+                                               AddCharMap ('\u2043', 6, 1);
+                                       }
+                               }
                        }
 
                        // Arabic variable weight chars 06 A0 -
                        fillIndex [6] = 0xA0;
                        // vowels
                        for (int i = 0x64B; i <= 0x650; i++)
-                               AddCharMapGroupTail ((char) i, 6, 1);
+                               AddArabicCharMap ((char) i);
                        // sukun
                        AddCharMapGroup ('\u0652', 6, 1, 0);
                        // shadda
@@ -2509,6 +2526,28 @@ sw.Close ();
                                        AddCharMap (vertical, category, updateCount, level2);
                }
 
+               private void AddArabicCharMap (char c) // calls AddCharMap under category 6 for c and for every char whose decomposition ends with c, then bumps fillIndex once
+               {
+                       byte category = 6;
+                       byte updateCount = 1; // applied once at the end, not once per AddCharMap call
+                       byte level2 = 0;
+
+                       // itself
+                       AddCharMap (c, category, 0, level2); // third argument 0 - presumably the per-call fillIndex increment; confirm against AddCharMap
+
+                       // nfkdMap is problematic when two or more characters have
+                       // NFKD to an identical character, so iterate over all here.
+                       for (int c2 = 0; c2 < char.MaxValue; c2++) { // NOTE(review): the bound excludes U+FFFF itself
+                               if (decompLength [c2] == 0) // skip entries whose decomposition length is zero
+                                       continue;
+                               int idx = decompIndex [c2] + decompLength [c2] - 1; // presumably the last element of c2's run in decompValues - confirm
+                               if ((int) (decompValues [idx]) == (int) c)
+                                       AddCharMap ((char) c2, category,
+                                               0, level2);
+                       }
+                       fillIndex [category] += updateCount; // single increment after all AddCharMap calls above
+               }
+
                char ToFullWidth (char c)
                {
                        return ToDecomposed (c, DecompositionFull, false);