* SimpleCollator.cs : forgot to commit in the last checkin.
* create-mscompat-collation-table.cs : fixed arabic shift weight chars.
* TestDriver.cs : switch table dumper and collator testing.
* SortKey.cs : for now comment out internal indexes (not in use).
svn path=/trunk/mcs/; revision=46415
+2005-06-23 Atsushi Enomoto <atsushi@ximian.com>
+
+ * SimpleCollator.cs : forgot to commit in the last checkin.
+ * create-mscompat-collation-table.cs : fixed arabic shift weight chars.
+ * TestDriver.cs : switch table dumper and collator testing.
+ * SortKey.cs : for now comment out internal indexes (not in use).
+
2005-06-23 Atsushi Enomoto <atsushi@ximian.com>
* MSCompatUnicodeTable.template,
using System.Globalization;
using Uni = Mono.Globalization.Unicode.MSCompatUnicodeTable;
+using Util = Mono.Globalization.Unicode.MSCompatUnicodeTableUtil;
namespace Mono.Globalization.Unicode
{
bool ignoreKanaType;
TextInfo textInfo; // for ToLower().
bool frenchSort;
+ readonly ushort [] cjkTable;
+ readonly CodePointIndexer cjkIndexer;
+ readonly byte [] cjkLv2Table;
+ readonly CodePointIndexer cjkLv2Indexer;
public SimpleCollator (CultureInfo culture)
{
textInfo = culture.TextInfo;
+ buf = new SortKeyBuffer (culture.LCID);
// FIXME: fill frenchSort from CultureInfo.
- buf = new SortKeyBuffer (culture.LCID);
+ // Custom CJK table support: the tables and indexers are chosen by the Name of the culture returned by GetNeutralCulture(); the switch has no default case, so for any other name the cjk* fields are never assigned here.
+ switch (GetNeutralCulture (culture).Name) {
+ case "zh-CHS":
+ cjkTable = Uni.CjkCHS;
+ cjkIndexer = Util.CjkCHS; // zh-CHS is the only case with its own indexer; the other cases share Util.Cjk
+ break;
+ case "zh-CHT":
+ cjkTable = Uni.CjkCHT;
+ cjkIndexer = Util.Cjk;
+ break;
+ case "ja":
+ cjkTable = Uni.CjkJA;
+ cjkIndexer = Util.Cjk;
+ break;
+ case "ko":
+ cjkTable = Uni.CjkKO;
+ cjkLv2Table = Uni.CjkKOLv2; // "ko" is the only case that assigns a level 2 table and indexer
+ cjkIndexer = Util.Cjk;
+ cjkLv2Indexer = Util.Cjk;
+ break;
+ }
+ }
+
+		// Climbs the Parent chain starting at info and returns the last
+		// culture reached before the parent is either null or has
+		// LCID 127 (the LCID of the invariant culture). When info's own
+		// parent already satisfies that condition, info itself is returned.
+		static CultureInfo GetNeutralCulture (CultureInfo info)
+		{
+			CultureInfo current = info;
+			for (;;) {
+				CultureInfo parent = current.Parent;
+				if (parent == null || parent.LCID == 127)
+					return current;
+				current = parent;
+			}
+		}
+
+		// Returns the category value for the code point cp.
+		// When a custom CJK table is set and cp is 0x3000 or above, the
+		// table entry found through cjkIndexer is consulted first: a
+		// non-zero entry supplies the category in its upper 8 bits.
+		// In every other case the value comes from Uni.Categories().
+		byte Category (int cp)
+		{
+			if (cjkTable != null && cp >= 0x3000) {
+				ushort entry = cjkTable [cjkIndexer.ToIndex (cp)];
+				if (entry != 0)
+					return (byte) (entry >> 8);
+			}
+			return Uni.Categories (cp);
+		}
+
+		// Returns the level 1 value for the code point cp.
+		// When a custom CJK table is set and cp is 0x3000 or above, the
+		// table entry found through cjkIndexer is consulted first: a
+		// non-zero entry supplies the level 1 value in its lower 8 bits.
+		// In every other case the value comes from Uni.Level1().
+		byte Level1 (int cp)
+		{
+			if (cjkTable != null && cp >= 0x3000) {
+				ushort entry = cjkTable [cjkIndexer.ToIndex (cp)];
+				if (entry != 0)
+					return (byte) (entry & 0xFF);
+			}
+			return Uni.Level1 (cp);
+		}
+
+		// Returns the level 2 value for the code point cp.
+		// When a custom CJK level 2 table is set and cp is 0x3000 or
+		// above, a non-zero entry found through cjkLv2Indexer is used
+		// as is. In every other case the value comes from Uni.Level2().
+		byte Level2 (int cp)
+		{
+			if (cjkLv2Table != null && cp >= 0x3000) {
+				byte entry = cjkLv2Table [cjkLv2Indexer.ToIndex (cp)];
+				if (entry != 0)
+					return entry;
+			}
+			return Uni.Level2 (cp);
}
void SetOptions (CompareOptions options)
if (Uni.HasSpecialWeight ((char) i))
buf.AppendKana (
- Uni.Categories (i),
- Uni.Level1 (i),
- Uni.Level2 (i),
+ Category (i),
+ Level1 (i),
+ Level2 (i),
Uni.Level3 (i),
Uni.IsJapaneseSmallLetter ((char) i),
Uni.GetJapaneseDashType ((char) i),
);
else
buf.AppendNormal (
- Uni.Categories (i),
- Uni.Level1 (i),
- Uni.Level2 (i),
+ Category (i),
+ Level1 (i),
+ Level2 (i),
Uni.Level3 (i));
}
if (expansion != null)
return false;
- if (Uni.Categories (ci) != Uni.Categories (cj) ||
- Uni.Level1 (ci) != Uni.Level1 (cj) ||
- !ignoreNonSpace && Uni.Level2 (ci) != Uni.Level2 (cj) ||
+ if (Category (ci) != Category (cj) ||
+ Level1 (ci) != Level1 (cj) ||
+ !ignoreNonSpace && Level2 (ci) != Level2 (cj) ||
Uni.Level3 (ci) != Uni.Level3 (cj))
return false;
if (!Uni.HasSpecialWeight ((char) ci))
if (expansion != null)
return false;
- if (Uni.Categories (ci) != Uni.Categories (cj) ||
- Uni.Level1 (ci) != Uni.Level1 (cj) ||
- !ignoreNonSpace && Uni.Level2 (ci) != Uni.Level2 (cj) ||
+ if (Category (ci) != Category (cj) ||
+ Level1 (ci) != Level1 (cj) ||
+ !ignoreNonSpace && Level2 (ci) != Level2 (cj) ||
Uni.Level3 (ci) != Uni.Level3 (cj))
return false;
if (!Uni.HasSpecialWeight ((char) ci))
if (s [idx] == target)
return idx;
int si = FilterOptions ((int) s [idx]);
- if (Uni.Categories (si) != Uni.Categories (ti) ||
- Uni.Level1 (si) != Uni.Level1 (ti) ||
- !ignoreNonSpace && Uni.Level2 (si) != Uni.Level2 (ti) ||
+ if (Category (si) != Category (ti) ||
+ Level1 (si) != Level1 (ti) ||
+ !ignoreNonSpace && Level2 (si) != Level2 (ti) ||
Uni.Level3 (si) != Uni.Level3 (ti))
continue;
if (!Uni.HasSpecialWeight ((char) si))
if (s [idx] == target)
return idx;
int si = FilterOptions ((int) s [idx]);
- if (Uni.Categories (si) != Uni.Categories (ti) ||
- Uni.Level1 (si) != Uni.Level1 (ti) ||
- !ignoreNonSpace && Uni.Level2 (si) != Uni.Level2 (ti) ||
+ if (Category (si) != Category (ti) ||
+ Level1 (si) != Level1 (ti) ||
+ !ignoreNonSpace && Level2 (si) != Level2 (ti) ||
Uni.Level3 (si) != Uni.Level3 (ti))
continue;
if (!Uni.HasSpecialWeight ((char) si))
readonly string source;
readonly byte [] key;
+ /*
readonly int lv1Length;
readonly int lv2Length;
readonly int lv3Length;
readonly int katakanaLength;
readonly int kanaWidthLength;
readonly int identLength;
+ */
readonly CompareOptions options;
readonly int lcid;
this.source = source;
this.key = buffer;
this.options = opt;
+ /*
this.lv1Length = lv1Length;
this.lv2Length = lv2Length;
this.lv3Length = lv3Length;
this.katakanaLength = katakanaLength;
this.kanaWidthLength = kanaWidthLength;
this.identLength = identLength;
+ */
}
public string OriginalString {
public byte [] KeyData {
get { return key; }
}
-
+/*
internal int Level1Length {
get { return lv1Length; }
}
internal int IdenticalLength {
get { return identLength; }
}
+*/
// copy from original SortKey.cs
public override bool Equals (object value)
#region Testing bits
- static void Main ()
+ static void Main (string [] args) // entry point; the first argument selects what a new TestDriver does
{
- new TestDriver ().Run ();
+ if (args.Length > 0 && args [0] == "--generate") // "--generate" as the first argument: call Generate()
+ new TestDriver ().Generate ();
+ else // no arguments, or any other first argument: call Run()
+ new TestDriver ().Run ();
}
void Run ()
LastIndexOf ("BBCBBC", "BC", CompareOptions.IgnoreCase);
LastIndexOf ("original", "rig", CompareOptions.None);
Console.WriteLine ("original".LastIndexOf ("rig"));
+ }
-/*
+ void Generate ()
+ {
// dump sortkey for every single character.
for (int i = 0; i <= char.MaxValue; i++) {
byte [] data = coll.GetSortKey (new string ((char) i, 1)).KeyData;
data [2] == 1 && data [3] == 1 && data [4] == 0)
continue;
foreach (byte b in data)
- Output.Write ("{0:X02} ", b);
- Output.WriteLine (" : {0:X04}, {1}",
+ Console.Write ("{0:X02} ", b);
+ Console.WriteLine (" : {0:X04}, {1}",
i, Char.GetUnicodeCategory ((char) i));
}
- Output.Close ();
-*/
}
void Compare (string s1, string s2)
if (Char.IsNumber ((char) cp))
diacritical [cp] = weight;
+ // Modify some decomposition equivalence
+ decompType [0xFE31] = 0;
+ decompIndex [0xFE31] = 0;
+ decompLength [0xFE31] = 0;
+ decompType [0xFE32] = 0;
+ decompIndex [0xFE32] = 0;
+ decompLength [0xFE32] = 0;
+
// Korean parens numbers
for (int i = 0x3200; i <= 0x321C; i++)
diacritical [i] = 0xA;
// Hyphen/Dash : 06 81 - 06 90
for (int i = 0; i < char.MaxValue; i++) {
- if (Char.GetUnicodeCategory ((char) i)
- == UnicodeCategory.DashPunctuation)
-// AddCharMapGroupTail ((char) i, 6, 1);
- AddCharMapGroup ((char) i, 6, 1, 0);
+ if (!IsIgnorable (i) &&
+ Char.GetUnicodeCategory ((char) i) ==
+ UnicodeCategory.DashPunctuation) {
+ AddCharMapGroup2 ((char) i, 6, 1, 0);
+ if (i == 0x2011) {
+ // SPECIAL: add 2027 and 2043
+ // Maybe they are regarded the
+ // same hyphens in "central"
+ // position.
+ AddCharMap ('\u2027', 6, 1);
+ AddCharMap ('\u2043', 6, 1);
+ }
+ }
}
// Arabic variable weight chars 06 A0 -
fillIndex [6] = 0xA0;
// vowels
for (int i = 0x64B; i <= 0x650; i++)
- AddCharMapGroupTail ((char) i, 6, 1);
+ AddArabicCharMap ((char) i);
// sukun
AddCharMapGroup ('\u0652', 6, 1, 0);
// shadda
AddCharMap (vertical, category, updateCount, level2);
}
+		// Maps the character c, and every character whose decomposition
+		// ends with c, into category 6 with level 2 value 0. All of these
+		// AddCharMap() calls pass 0 as the update count; fillIndex [6] is
+		// advanced by one only once, after the whole group has been added.
+		private void AddArabicCharMap (char c)
+		{
+			byte cat = 6;
+			byte lv2 = 0;
+			byte step = 1;
+
+			// The character itself goes first.
+			AddCharMap (c, cat, 0, lv2);
+
+			// nfkdMap is problematic when two or more characters have
+			// an NFKD mapping to the same character, so every code
+			// point is scanned instead (the bound is exclusive, so
+			// char.MaxValue itself is not visited).
+			for (int cp = 0; cp < char.MaxValue; cp++) {
+				int len = decompLength [cp];
+				if (len != 0) {
+					// Index of the last value in cp's decomposition.
+					int last = decompIndex [cp] + len - 1;
+					if ((int) (decompValues [last]) == (int) c)
+						AddCharMap ((char) cp, cat, 0, lv2);
+				}
+			}
+
+			fillIndex [cat] += step;
+		}
+
+
char ToFullWidth (char c)
{
return ToDecomposed (c, DecompositionFull, false);