// // // There are two kind of sort keys : which are computed and which are laid out // as an indexed array. Computed sort keys are: // // - Surrogate // - PrivateUse // // Also, for composite characters it should prepare different index table. // // Though it is possible to "compute" level 3 weights, they are still dumped // to an array to avoid execution cost. // // // * sortkey getter signature // // int GetSortKey (string s, int index, SortKeyBuffer buf) // Stores sort key for corresponding character element into buf and // returns the length of the consumed _source_ character element in s. // // * character length to consume // // If there are characters whose primary weight is 0, they are consumed // and considered as a part of the character element. // #define Binary using System; using System.IO; using System.Collections; using System.Globalization; using System.Text; using System.Xml; namespace Mono.Globalization.Unicode { internal class MSCompatSortKeyTableGenerator { public static void Main (string [] args) { new MSCompatSortKeyTableGenerator ().Run (args); } const int DecompositionWide = 1; // fixed const int DecompositionSub = 2; // fixed const int DecompositionSmall = 3; const int DecompositionIsolated = 4; const int DecompositionInitial = 5; const int DecompositionFinal = 6; const int DecompositionMedial = 7; const int DecompositionNoBreak = 8; const int DecompositionVertical = 9; const int DecompositionFraction = 0xA; const int DecompositionFont = 0xB; const int DecompositionSuper = 0xC; // fixed const int DecompositionFull = 0xE; const int DecompositionNarrow = 0xD; const int DecompositionCircle = 0xF; const int DecompositionSquare = 0x10; const int DecompositionCompat = 0x11; const int DecompositionCanonical = 0x12; TextWriter Result = Console.Out; byte [] fillIndex = new byte [256]; // by category CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1]; char [] specialIgnore = new char [] { '\u3099', '\u309A', '\u309B', '\u309C', '\u0BCD', '\u0E47', '\u0E4C', '\uFF9E', '\uFF9F' }; // FIXME: need more love (as always) char [] alphabets = new char [] {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '\u0292', '\u01BE', '\u0298'}; byte [] alphaWeights = new byte [] { 2, 9, 0xA, 0x1A, 0x21, 0x23, 0x25, 0x2C, 0x32, 0x35, 0x36, 0x48, 0x51, 0x70, 0x7C, 0x7E, 0x89, 0x8A, 0x91, 0x99, 0x9F, 0xA2, 0xA4, 0xA6, 0xA7, 0xA9, 0xAA, 0xB3, 0xB4}; bool [] isSmallCapital = new bool [char.MaxValue + 1]; bool [] isUppercase = new bool [char.MaxValue + 1]; byte [] decompType = new byte [char.MaxValue + 1]; int [] decompIndex = new int [char.MaxValue + 1]; int [] decompLength = new int [char.MaxValue + 1]; int [] decompValues; decimal [] decimalValue = new decimal [char.MaxValue + 1]; byte [] diacritical = new byte [char.MaxValue + 1]; string [] diacritics = new string [] { // LATIN, CYRILLIC etc. "UPTURN", "DOUBLE-STRUCK", "MIDDLE HOOK", "WITH VERTICAL LINE ABOVE;", "WITH TONOS", "WITH ACUTE ACCENT;", "WITH GRAVE ACCENT;", "WITH ACUTE;", "WITH GRAVE;", // "WITH DOT ABOVE;", " MIDDLE DOT;", "WITH CIRCUMFLEX ACCENT;", "WITH CIRCUMFLEX;", "WITH DIALYTIKA;", "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;", "DIALYTIKA TONOS", "DIALYTIKA AND TONOS", "WITH MACRON;", "WITH TILDE;", "WITH RING ABOVE;", "WITH OGONEK;", "WITH CEDILLA;", // " DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;", "WITH STROKE;", " CIRCUMFLEX AND ACUTE;", "STROKE OVERLAY", " DIAERESIS AND ACUTE;", "WITH CIRCUMFLEX AND GRAVE;", " L SLASH;", " DIAERESIS AND GRAVE;", " BREVE AND ACUTE;", " CARON AND DOT ABOVE;", " BREVE AND GRAVE;", " MACRON AND ACUTE;", " MACRON AND GRAVE;", // " DIAERESIS AND CARON", " DOT ABOVE AND MACRON", " TILDE AND ACUTE", " RING ABOVE AND ACUTE", " DIAERESIS AND MACRON", " CEDILLA AND ACUTE", " MACRON AND DIAERESIS", " CIRCUMFLEX AND TILDE", " TILDE AND DIAERESIS", " STROKE AND ACUTE", " BREVE AND TILDE", " CEDILLA AND BREVE", " OGONEK AND MACRON", // "WITH OVERLINE", "WITH HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;", " DOUBLE GRAVE", " INVERTED BREVE", "ROMAN NUMERAL", " PRECEDED BY APOSTROPHE", "WITH HORN;", " LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE", " PALATAL HOOK", " DOT BELOW;", " RETROFLEX;", "DIAERESIS BELOW", " RING BELOW", // " CIRCUMFLEX BELOW", "HORN AND ACUTE", " BREVE BELOW;", " HORN AND GRAVE", " TILDE BELOW", " TOPBAR", " DOT BELOW AND DOT ABOVE", " RIGHT HALF RING", " HORN AND TILDE", " CIRCUMFLEX AND DOT BELOW", " BREVE AND DOT BELOW", " DOT BELOW AND MACRON", " TONE TWO", " HORN AND HOOK ABOVE", " HORN AND DOT", // CIRCLED, PARENTHESIZED and so on "CIRCLED DIGIT", "CIRCLED NUMBER", "CIRCLED LATIN", "CIRCLED KATAKANA", "CIRCLED SANS-SERIF", "PARENTHESIZED DIGIT", "PARENTHESIZED NUMBER", "PARENTHESIZED LATIN", }; byte [] diacriticWeights = new byte [] { // LATIN. 3, 3, 5, 5, 5, 0xE, 0xF, 0xE, 0xF, // 0x10, 0x11, 0x12, 0x12, 0x13, 0x13, 0x14, 0x15, 0x16, 0x16, 0x17, 0x19, 0x1A, 0x1B, 0x1C, // 0x1D, 0x1D, 0x1E, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F, 0x20, 0x21, 0x22, 0x22, 0x23, 0x24, // 0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30, // 0x40, 0x43, 0x43, 0x43, 0x44, 0x46, 0x47, 0x48, 0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x5A, // 0x60, 0x60, 0x61, 0x61, 0x63, 0x68, 0x68, 0x69, 0x69, 0x6A, 0x6D, 0x6E, 0x87, 0x95, 0xAA, // CIRCLED, PARENTHESIZED and so on. 0xEE, 0xEE, 0xEE, 0xEE, 0xEE, 0xF3, 0xF3, 0xF3 }; int [] numberSecondaryWeightBounds = new int [] { 0x660, 0x680, 0x6F0, 0x700, 0x960, 0x970, 0x9E0, 0x9F0, 0x9F4, 0xA00, 0xA60, 0xA70, 0xAE0, 0xAF0, 0xB60, 0xB70, 0xBE0, 0xC00, 0xC60, 0xC70, 0xCE0, 0xCF0, 0xD60, 0xD70, 0xE50, 0xE60, 0xED0, 0xEE0 }; char [] orderedGurmukhi; char [] orderedGujarati; char [] orderedGeorgian; char [] orderedThaana; static readonly char [] orderedTamilConsonants = new char [] { // based on traditional Tamil consonants, except for // Grantha (where Microsoft breaks traditionalism). // http://www.angelfire.com/empire/thamizh/padanGaL '\u0B95', '\u0B99', '\u0B9A', '\u0B9E', '\u0B9F', '\u0BA3', '\u0BA4', '\u0BA8', '\u0BAA', '\u0BAE', '\u0BAF', '\u0BB0', '\u0BB2', '\u0BB5', '\u0BB4', '\u0BB3', '\u0BB1', '\u0BA9', '\u0B9C', '\u0BB8', '\u0BB7', '\u0BB9'}; // cp -> character name (only for some characters) ArrayList sortableCharNames = new ArrayList (); // cp -> arrow value (int) ArrayList arrowValues = new ArrayList (); // cp -> box value (int) ArrayList boxValues = new ArrayList (); // cp -> level1 value Hashtable arabicLetterPrimaryValues = new Hashtable (); // letterName -> cp Hashtable arabicNameMap = new Hashtable (); // cp -> Hashtable [decompType] -> cp Hashtable nfkdMap = new Hashtable (); // Latin letter -> ArrayList [int] Hashtable latinMap = new Hashtable (); ArrayList jisJapanese = new ArrayList (); ArrayList nonJisJapanese = new ArrayList (); ushort [] cjkJA = new ushort [char.MaxValue +1];// - 0x4E00]; ushort [] cjkCHS = new ushort [char.MaxValue +1];// - 0x3100]; ushort [] cjkCHT = new ushort [char.MaxValue +1];// - 0x4E00]; ushort [] cjkKO = new ushort [char.MaxValue +1];// - 0x4E00]; byte [] cjkKOlv2 = new byte [char.MaxValue +1];// - 0x4E00]; byte [] ignorableFlags = new byte [char.MaxValue + 1]; static double [] unicodeAge = new double [char.MaxValue + 1]; ArrayList tailorings = new ArrayList (); void Run (string [] args) { string dirname = args.Length == 0 ? "downloaded" : args [0]; ParseSources (dirname); Console.Error.WriteLine ("parse done."); ModifyParsedValues (); GenerateCore (); Console.Error.WriteLine ("generation done."); Serialize (); Console.Error.WriteLine ("serialization done."); /* StreamWriter sw = new StreamWriter ("agelog.txt"); for (int i = 0; i < char.MaxValue; i++) { bool shouldBe = false; switch (Char.GetUnicodeCategory ((char) i)) { case UnicodeCategory.Format: case UnicodeCategory.OtherNotAssigned: shouldBe = true; break; } if (unicodeAge [i] >= 3.1) shouldBe = true; //if (IsIgnorable (i) != shouldBe) sw.WriteLine ("{1} {2} {3} {0:X04} {4} {5}", i, unicodeAge [i], IsIgnorable (i), IsIgnorableSymbol (i), char.GetUnicodeCategory ((char) i), IsIgnorable (i) != shouldBe ? '!' : ' '); } sw.Close (); */ } byte [] CompressArray (byte [] source, CodePointIndexer i) { return (byte []) CodePointIndexer.CompressArray ( source, typeof (byte), i); } ushort [] CompressArray (ushort [] source, CodePointIndexer i) { return (ushort []) CodePointIndexer.CompressArray ( source, typeof (ushort), i); } void Serialize () { // Tailorings SerializeTailorings (); byte [] categories = new byte [map.Length]; byte [] level1 = new byte [map.Length]; byte [] level2 = new byte [map.Length]; byte [] level3 = new byte [map.Length]; ushort [] widthCompat = new ushort [map.Length]; for (int i = 0; i < map.Length; i++) { categories [i] = map [i].Category; level1 [i] = map [i].Level1; level2 [i] = map [i].Level2; level3 [i] = ComputeLevel3Weight ((char) i); // For Japanese Half-width characters, don't // map widthCompat. It is IgnoreKanaType that // handles those width differences. if (0xFF6D <= i && i <= 0xFF9D) continue; switch (decompType [i]) { case DecompositionNarrow: case DecompositionWide: case DecompositionSuper: case DecompositionSub: // they are always 1 char widthCompat [i] = (ushort) decompValues [decompIndex [i]]; break; } } // compress ignorableFlags = CompressArray (ignorableFlags, MSCompatUnicodeTableUtil.Ignorable); categories = CompressArray (categories, MSCompatUnicodeTableUtil.Category); level1 = CompressArray (level1, MSCompatUnicodeTableUtil.Level1); level2 = CompressArray (level2, MSCompatUnicodeTableUtil.Level2); level3 = CompressArray (level3, MSCompatUnicodeTableUtil.Level3); widthCompat = (ushort []) CodePointIndexer.CompressArray ( widthCompat, typeof (ushort), MSCompatUnicodeTableUtil.WidthCompat); cjkCHS = CompressArray (cjkCHS, MSCompatUnicodeTableUtil.CjkCHS); cjkCHT = CompressArray (cjkCHT, MSCompatUnicodeTableUtil.Cjk); cjkJA = CompressArray (cjkJA, MSCompatUnicodeTableUtil.Cjk); cjkKO = CompressArray (cjkKO, MSCompatUnicodeTableUtil.Cjk); cjkKOlv2 = CompressArray (cjkKOlv2, MSCompatUnicodeTableUtil.Cjk); // Ignorables Result.WriteLine ("internal static readonly byte [] ignorableFlags = new byte [] {"); #if Binary MemoryStream ms = new MemoryStream (); BinaryWriter binary = new BinaryWriter (ms); binary.Write (ignorableFlags.Length); #endif for (int i = 0; i < ignorableFlags.Length; i++) { byte value = ignorableFlags [i]; if (value < 10) Result.Write ("{0},", value); else Result.Write ("0x{0:X02},", value); #if Binary binary.Write (value); #endif if ((i & 0xF) == 0xF) Result.WriteLine ("// {0:X04}", i - 0xF); } Result.WriteLine ("};"); Result.WriteLine (); // Primary category Result.WriteLine ("internal static readonly byte [] categories = new byte [] {"); #if Binary binary.Write (categories.Length); #endif for (int i = 0; i < categories.Length; i++) { byte value = categories [i]; if (value < 10) Result.Write ("{0},", value); else Result.Write ("0x{0:X02},", value); #if Binary binary.Write (value); #endif if ((i & 0xF) == 0xF) Result.WriteLine ("// {0:X04}", i - 0xF); } Result.WriteLine ("};"); Result.WriteLine (); // Primary weight value Result.WriteLine ("internal static readonly byte [] level1 = new byte [] {"); #if Binary binary.Write (level1.Length); #endif for (int i = 0; i < level1.Length; i++) { byte value = level1 [i]; if (value < 10) Result.Write ("{0},", value); else Result.Write ("0x{0:X02},", value); #if Binary binary.Write (value); #endif if ((i & 0xF) == 0xF) Result.WriteLine ("// {0:X04}", i - 0xF); } Result.WriteLine ("};"); Result.WriteLine (); // Secondary weight Result.WriteLine ("internal static readonly byte [] level2 = new byte [] {"); #if Binary binary.Write (level2.Length); #endif for (int i = 0; i < level2.Length; i++) { byte value = level2 [i]; if (value < 10) Result.Write ("{0},", value); else Result.Write ("0x{0:X02},", value); #if Binary binary.Write (value); #endif if ((i & 0xF) == 0xF) Result.WriteLine ("// {0:X04}", i - 0xF); } Result.WriteLine ("};"); Result.WriteLine (); // Thirtiary weight Result.WriteLine ("internal static readonly byte [] level3 = new byte [] {"); #if Binary binary.Write (level3.Length); #endif for (int i = 0; i < level3.Length; i++) { byte value = level3 [i]; if (value < 10) Result.Write ("{0},", value); else Result.Write ("0x{0:X02},", value); #if Binary binary.Write (value); #endif if ((i & 0xF) == 0xF) Result.WriteLine ("// {0:X04}", i - 0xF); } Result.WriteLine ("};"); Result.WriteLine (); // Width insensitivity mappings // (for now it is more lightweight than dumping the // entire NFKD table). Result.WriteLine ("internal static readonly ushort [] widthCompat = new ushort [] {"); #if Binary binary.Write (widthCompat.Length); #endif for (int i = 0; i < widthCompat.Length; i++) { ushort value = widthCompat [i]; if (value < 10) Result.Write ("{0},", value); else Result.Write ("0x{0:X02},", value); #if Binary binary.Write (value); #endif if ((i & 0xF) == 0xF) Result.WriteLine ("// {0:X04}", i - 0xF); } Result.WriteLine ("};"); Result.WriteLine (); #if Binary using (FileStream fs = File.Create ("../collation.core.bin")) { byte [] array = ms.ToArray (); fs.Write (array, 0, array.Length); } #endif // CJK SerializeCJK ("cjkCHS", cjkCHS, char.MaxValue); SerializeCJK ("cjkCHT", cjkCHT, 0x9FB0); SerializeCJK ("cjkJA", cjkJA, 0x9FB0); SerializeCJK ("cjkKO", cjkKO, 0x9FB0); SerializeCJK ("cjkKOlv2", cjkKOlv2, 0x9FB0); } void SerializeCJK (string name, ushort [] cjk, int max) { int offset = 0;//char.MaxValue - cjk.Length; Result.WriteLine ("static ushort [] {0} = new ushort [] {{", name); #if Binary MemoryStream ms = new MemoryStream (); BinaryWriter binary = new BinaryWriter (ms); binary.Write (cjk.Length); #endif for (int i = 0; i < cjk.Length; i++) { if (i + offset == max) break; ushort value = cjk [i]; if (value < 10) Result.Write ("{0},", value); else Result.Write ("0x{0:X04},", value); #if Binary binary.Write (value); #endif if ((i & 0xF) == 0xF) Result.WriteLine ("// {0:X04}", i - 0xF + offset); } Result.WriteLine ("};"); Result.WriteLine (); #if Binary using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) { byte [] array = ms.ToArray (); fs.Write (array, 0, array.Length); } #endif } void SerializeCJK (string name, byte [] cjk, int max) { int offset = 0;//char.MaxValue - cjk.Length; Result.WriteLine ("static byte [] {0} = new byte [] {{", name); #if Binary MemoryStream ms = new MemoryStream (); BinaryWriter binary = new BinaryWriter (ms); #endif for (int i = 0; i < cjk.Length; i++) { if (i + offset == max) break; byte value = cjk [i]; if (value < 10) Result.Write ("{0},", value); else Result.Write ("0x{0:X02},", value); #if Binary binary.Write (value); #endif if ((i & 0xF) == 0xF) Result.WriteLine ("// {0:X04}", i - 0xF + offset); } Result.WriteLine ("};"); Result.WriteLine (); #if Binary using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) { byte [] array = ms.ToArray (); fs.Write (array, 0, array.Length); } #endif } void SerializeTailorings () { Hashtable indexes = new Hashtable (); Hashtable counts = new Hashtable (); Result.WriteLine ("static char [] tailorings = new char [] {"); int count = 0; #if Binary MemoryStream ms = new MemoryStream (); BinaryWriter binary = new BinaryWriter (ms); #endif foreach (Tailoring t in tailorings) { if (t.Alias != 0) continue; Result.Write ("/*{0}*/", t.LCID); indexes.Add (t.LCID, count); char [] values = t.ItemToCharArray (); counts.Add (t.LCID, values.Length); foreach (char c in values) { Result.Write ("'\\x{0:X}', ", (int) c); if (++count % 16 == 0) Result.WriteLine (" // {0:X04}", count - 16); #if Binary binary.Write ((ushort) c); #endif } } Result.WriteLine ("};"); Result.WriteLine ("static TailoringInfo [] tailoringInfos = new TailoringInfo [] {"); #if Binary byte [] rawdata = ms.ToArray (); ms = new MemoryStream (); binary = new BinaryWriter (ms); binary.Write (tailorings.Count); #endif foreach (Tailoring t in tailorings) { int target = t.Alias != 0 ? t.Alias : t.LCID; if (!indexes.ContainsKey (target)) { throw new Exception (String.Format ("WARNING: no corresponding definition for tailoring alias. From {0} to {1}", t.LCID, t.Alias)); continue; } int idx = (int) indexes [target]; int cnt = (int) counts [target]; bool french = t.FrenchSort; if (t.Alias != 0) foreach (Tailoring t2 in tailorings) if (t2.LCID == t.LCID) french = t2.FrenchSort; Result.WriteLine ("new TailoringInfo ({0}, 0x{1:X}, {2}, {3}), ", t.LCID, idx, cnt, french ? "true" : "false"); #if Binary binary.Write (t.LCID); binary.Write (idx); binary.Write (cnt); binary.Write (french); #endif } Result.WriteLine ("};"); #if Binary binary.Write ((byte) 0xFF); binary.Write ((byte) 0xFF); binary.Write (rawdata.Length / 2); binary.Write (rawdata, 0, rawdata.Length); using (FileStream fs = File.Create ("../collation.tailoring.bin")) { byte [] array = ms.ToArray (); fs.Write (array, 0, array.Length); } #endif } #region Parse void ParseSources (string dirname) { string unidata = dirname + "/UnicodeData.txt"; string derivedCoreProps = dirname + "/DerivedCoreProperties.txt"; string scripts = dirname + "/Scripts.txt"; string cp932 = dirname + "/CP932.TXT"; string derivedAge = dirname + "/DerivedAge.txt"; string chXML = dirname + "/common/collation/zh.xml"; string jaXML = dirname + "/common/collation/ja.xml"; string koXML = dirname + "/common/collation/ko.xml"; ParseDerivedAge (derivedAge); FillIgnorables (); ParseJISOrder (cp932); // in prior to ParseUnidata() ParseUnidata (unidata); ModifyUnidata (); ParseDerivedCoreProperties (derivedCoreProps); ParseScripts (scripts); ParseCJK (chXML, jaXML, koXML); ParseTailorings ("mono-tailoring-source.txt"); } void ParseTailorings (string filename) { Tailoring t = null; int line = 0; using (StreamReader sr = new StreamReader (filename)) { try { while (sr.Peek () >= 0) { line++; ProcessTailoringLine (ref t, sr.ReadLine ().Trim ()); } } catch (Exception) { Console.Error.WriteLine ("ERROR at line {0}", line); throw; } } } // For now this is enough. string ParseTailoringSourceValue (string s) { StringBuilder sb = new StringBuilder (); for (int i = 0; i < s.Length; i++) { if (s.StartsWith ("\\u")) { sb.Append ((char) int.Parse ( s.Substring (2, 4), NumberStyles.HexNumber), 1); i += 5; } else sb.Append (s [i]); } return sb.ToString (); } void ProcessTailoringLine (ref Tailoring t, string s) { int idx = s.IndexOf ('#'); if (idx > 0) s = s.Substring (0, idx).Trim (); if (s.Length == 0 || s [0] == '#') return; if (s [0] == '@') { idx = s.IndexOf ('='); if (idx > 0) t = new Tailoring ( int.Parse (s.Substring (1, idx - 1)), int.Parse (s.Substring (idx + 1))); else t = new Tailoring (int.Parse (s.Substring (1))); tailorings.Add (t); return; } if (s.StartsWith ("*FrenchSort")) { t.FrenchSort = true; return; } string d = "*Diacritical"; if (s.StartsWith (d)) { idx = s.IndexOf ("->"); t.AddDiacriticalMap ( byte.Parse (s.Substring (d.Length, idx - d.Length).Trim (), NumberStyles.HexNumber), byte.Parse (s.Substring (idx + 2).Trim (), NumberStyles.HexNumber)); return; } idx = s.IndexOf (':'); if (idx > 0) { string source = s.Substring (0, idx).Trim (); string [] l = s.Substring (idx + 1).Trim ().Split (' '); byte [] b = new byte [4]; for (int i = 0; i < 4; i++) { if (l [i] == "*") b [i] = 0; else b [i] = byte.Parse (l [i], NumberStyles.HexNumber); } t.AddSortKeyMap (ParseTailoringSourceValue (source), b); } idx = s.IndexOf ('='); if (idx > 0) t.AddReplacementMap ( ParseTailoringSourceValue ( s.Substring (0, idx).Trim ()), ParseTailoringSourceValue ( s.Substring (idx + 1).Trim ())); } void ParseDerivedAge (string filename) { using (StreamReader file = new StreamReader (filename)) { while (file.Peek () >= 0) { string s = file.ReadLine (); int idx = s.IndexOf ('#'); if (idx >= 0) s = s.Substring (0, idx); idx = s.IndexOf (';'); if (idx < 0) continue; string cpspec = s.Substring (0, idx); idx = cpspec.IndexOf (".."); NumberStyles nf = NumberStyles.HexNumber | NumberStyles.AllowTrailingWhite; int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf); int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf); string value = s.Substring (cpspec.Length + 1).Trim (); // FIXME: use index if (cp > char.MaxValue) continue; double v = double.Parse (value); for (int i = cp; i <= cpEnd; i++) unicodeAge [i] = v; } } unicodeAge [0] = double.MaxValue; // never be supported } void ParseUnidata (string filename) { ArrayList decompValues = new ArrayList (); using (StreamReader unidata = new StreamReader (filename)) { for (int line = 1; unidata.Peek () >= 0; line++) { try { ProcessUnidataLine (unidata.ReadLine (), decompValues); } catch (Exception) { Console.Error.WriteLine ("**** At line " + line); throw; } } } this.decompValues = (int []) decompValues.ToArray (typeof (int)); } char previousLatinTarget = char.MinValue; byte [] diacriticalOffset = new byte ['Z' - 'A' + 1]; void ProcessUnidataLine (string s, ArrayList decompValues) { int idx = s.IndexOf ('#'); if (idx >= 0) s = s.Substring (0, idx); idx = s.IndexOf (';'); if (idx < 0) return; int cp = int.Parse (s.Substring (0, idx), NumberStyles.HexNumber); string [] values = s.Substring (idx + 1).Split (';'); // FIXME: use index if (cp > char.MaxValue) return; if (IsIgnorable (cp)) return; string name = values [0]; // SPECIAL CASE: rename some characters for diacritical // remapping. FIXME: why are they different? // FIXME: it's still not working. if (cp == 0x018B || cp == 0x018C) name = name.Replace ("TOPBAR", "STROKE"); // isSmallCapital if (s.IndexOf ("SMALL CAPITAL") > 0) isSmallCapital [cp] = true; // latin mapping by character name if (s.IndexOf ("LATIN") >= 0) { int lidx = s.IndexOf ("LETTER DOTLESS "); int offset = lidx + 15; if (lidx < 0) { lidx = s.IndexOf ("LETTER TURNED "); offset = lidx + 14; } if (lidx < 0) { lidx = s.IndexOf ("LETTER CAPITAL "); offset = lidx + 15; } if (lidx < 0) { lidx = s.IndexOf ("LETTER SCRIPT "); offset = lidx + 14; } if (lidx < 0) { lidx = s.IndexOf ("LETTER "); offset = lidx + 7; } char c = lidx > 0 ? s [offset] : char.MinValue; char n = s [offset + 1]; char target = char.MinValue; if ('A' <= c && c <= 'Z' && (n == ' ') || n == ';') { target = c; // FIXME: After 'Z', I cannot reset this state. previousLatinTarget = c == 'Z' ? char.MinValue : c; } if (s.Substring (offset).StartsWith ("ALPHA")) target = 'A'; else if (s.Substring (offset).StartsWith ("TONE SIX")) target = 'B'; else if (s.Substring (offset).StartsWith ("OPEN O")) target = 'C'; else if (s.Substring (offset).StartsWith ("SCHWA")) target = 'E'; else if (s.Substring (offset).StartsWith ("ENG")) target = 'N'; else if (s.Substring (offset).StartsWith ("OI;")) // 01A2,01A3 target = 'O'; else if (s.Substring (offset).StartsWith ("YR;")) // 01A2,01A3 target = 'R'; else if (s.Substring (offset).StartsWith ("TONE TWO")) target = 'S'; else if (s.Substring (offset).StartsWith ("ESH")) target = 'S'; // For remaining IPA chars, direct mapping is // much faster. switch (cp) { case 0x0299: target = 'B'; break; case 0x029A: target = 'E'; break; case 0x029B: target = 'G'; break; case 0x029C: target = 'H'; break; case 0x029D: target = 'J'; break; case 0x029E: target = 'K'; break; case 0x029F: target = 'L'; break; case 0x02A0: target = 'Q'; break; case 0x02A7: target = 'T'; break; case 0x02A8: target = 'T'; break; } if (target == char.MinValue) target = previousLatinTarget; if (target != char.MinValue) { ArrayList entry = (ArrayList) latinMap [target]; if (entry == null) { entry = new ArrayList (); latinMap [target] = entry; } entry.Add (cp); // FIXME: This secondary weight is hack. // They are here because they must not // be identical to the corresponding // ASCII latins. if (c != target && diacritical [cp] == 0) { diacriticalOffset [c - 'A']++; diacritical [cp] = (byte) (diacriticalOffset [c - 'A'] + 0x7C); } } } // Arrow names if (0x2000 <= cp && cp < 0x3000) { int value = 0; // SPECIAL CASES. FIXME: why? switch (cp) { case 0x21C5: value = -1; break; // E2 case 0x261D: value = 1; break; case 0x27A6: value = 3; break; case 0x21B0: value = 7; break; case 0x21B1: value = 3; break; case 0x21B2: value = 7; break; case 0x21B4: value = 5; break; case 0x21B5: value = 7; break; case 0x21B9: value = -1; break; // E1 case 0x21CF: value = 7; break; case 0x21D0: value = 3; break; } string [] arrowTargets = new string [] { "", "UPWARDS", "NORTH EAST", "RIGHTWARDS", "SOUTH EAST", "DOWNWARDS", "SOUTH WEST", "LEFTWARDS", "NORTH WEST", "LEFT RIGHT", "UP DOWN", }; if (s.IndexOf ("RIGHTWARDS") >= 0 && s.IndexOf ("LEFTWARDS") >= 0) value = 0xE1 - 0xD8; else if (s.IndexOf ("UPWARDS") >= 0 && s.IndexOf ("DOWNWARDS") >= 0) value = 0xE2 - 0xD8; else if (s.IndexOf ("ARROW") >= 0 && s.IndexOf ("COMBINING") < 0 && s.IndexOf ("CLOCKWISE") >= 0) value = s.IndexOf ("ANTICLOCKWISE") >= 0 ? 0xE4 - 0xD8 : 0xE3 - 0xD8; if (value == 0) for (int i = 1; value == 0 && i < arrowTargets.Length; i++) if (s.IndexOf (arrowTargets [i]) > 0 && s.IndexOf ("BARB " + arrowTargets [i]) < 0 && s.IndexOf (" OVER") < 0 ) value = i; if (value > 0) arrowValues.Add (new DictionaryEntry ( cp, value)); } // Box names if (0x2500 <= cp && cp < 0x2600) { int value = int.MinValue; // flags: // up:1 down:2 right:4 left:8 vert:16 horiz:32 // [h,rl] [r] [l] // [v,ud] [u] [d] // [dr] [dl] [ur] [ul] // [vr,udr] [vl,vdl] // [hd,rld] [hu,rlu] // [hv,udrl,rlv,udh] ArrayList flags = new ArrayList (new int [] { 32, 8 + 4, 8, 4, 16, 1 + 2, 1, 2, 4 + 2, 8 + 2, 4 + 1, 8 + 1, 16 + 4, 1 + 2 + 4, 16 + 8, 1 + 2 + 8, 32 + 2, 4 + 8 + 2, 32 + 1, 4 + 8 + 1, 16 + 32, 1 + 2 + 4 + 8, 4 + 8 + 16, 1 + 2 + 32 }); byte [] offsets = new byte [] { 0, 0, 1, 2, 3, 3, 4, 5, 6, 7, 8, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 14, 14}; if (s.IndexOf ("BOX DRAWINGS ") >= 0) { int flag = 0; if (s.IndexOf (" UP") >= 0) flag |= 1; if (s.IndexOf (" DOWN") >= 0) flag |= 2; if (s.IndexOf (" RIGHT") >= 0) flag |= 4; if (s.IndexOf (" LEFT") >= 0) flag |= 8; if (s.IndexOf (" VERTICAL") >= 0) flag |= 16; if (s.IndexOf (" HORIZONTAL") >= 0) flag |= 32; int fidx = flags.IndexOf (flag); if (fidx >= 0) value = offsets [fidx]; } else if (s.IndexOf ("BLOCK") >= 0) { if (s.IndexOf ("ONE EIGHTH") >= 0) value = 0x12; else if (s.IndexOf ("ONE QUARTER") >= 0) value = 0x13; else if (s.IndexOf ("THREE EIGHTHS") >= 0) value = 0x14; else if (s.IndexOf ("HALF") >= 0) value = 0x15; else if (s.IndexOf ("FIVE EIGHTHS") >= 0) value = 0x16; else if (s.IndexOf ("THREE QUARTERS") >= 0) value = 0x17; else if (s.IndexOf ("SEVEN EIGHTHS") >= 0) value = 0x18; else value = 0x19; } else if (s.IndexOf ("SHADE") >= 0) value = 0x19; else if (s.IndexOf ("SQUARE") >= 0) value = 0xBC - 0xE5; else if (s.IndexOf ("VERTICAL RECTANGLE") >= 0) value = 0xBE - 0xE5; else if (s.IndexOf ("RECTANGLE") >= 0) value = 0xBD - 0xE5; else if (s.IndexOf ("PARALLELOGRAM") >= 0) value = 0xBF - 0xE5; else if (s.IndexOf ("TRIANGLE") >= 0) { if (s.IndexOf ("UP-POINTING") >= 0) value = 0xC0 - 0xE5; else if (s.IndexOf ("RIGHT-POINTING") >= 0) value = 0xC1 - 0xE5; else if (s.IndexOf ("DOWN-POINTING") >= 0) value = 0xC2 - 0xE5; else if (s.IndexOf ("LEFT-POINTING") >= 0) value = 0xC3 - 0xE5; } else if (s.IndexOf ("POINTER") >= 0) { if (s.IndexOf ("RIGHT-POINTING") >= 0) value = 0xC4 - 0xE5; else if (s.IndexOf ("LEFT-POINTING") >= 0) value = 0xC5 - 0xE5; } else if (s.IndexOf ("DIAMOND") >= 0) value = 0xC6 - 0xE5; else if (s.IndexOf ("FISHEYE") >= 0) value = 0xC7 - 0xE5; else if (s.IndexOf ("LOZENGE") >= 0) value = 0xC8 - 0xE5; else if (s.IndexOf ("BULLSEYE") >= 0) value = 0xC9 - 0xE5; else if (s.IndexOf ("CIRCLE") >= 0) { if (cp == 0x25D6) // it could be IndexOf ("LEFT HALF BLACK CIRCLE") value = 0xCA - 0xE5; else if (cp == 0x25D7) // it could be IndexOf ("RIGHT HALF BLACK CIRCLE") value = 0xCB - 0xE5; else value = 0xC9 - 0xE5; } else if (s.IndexOf ("BULLET") >= 0) value = 0xCC - 0xE5; if (0x25DA <= cp && cp <= 0x25E5) value = 0xCD + cp - 0x25DA - 0xE5; // SPECIAL CASE: BOX DRAWING DIAGONAL patterns switch (cp) { case 0x2571: value = 0xF; break; case 0x2572: value = 0x10; break; case 0x2573: value = 0x11; break; } if (value != int.MinValue) boxValues.Add (new DictionaryEntry ( cp, value)); } // For some characters store the name and sort later // to determine sorting. if (0x2100 <= cp && cp <= 0x213F && Char.IsSymbol ((char) cp)) sortableCharNames.Add ( new DictionaryEntry (cp, name)); else if (0x3380 <= cp && cp <= 0x33DD) sortableCharNames.Add (new DictionaryEntry ( cp, name.Substring (7))); if (Char.GetUnicodeCategory ((char) cp) == UnicodeCategory.MathSymbol) { if (name.StartsWith ("CIRCLED ")) diacritical [cp] = 0xEE; if (name.StartsWith ("SQUARED ")) diacritical [cp] = 0xEF; } // diacritical weights by character name if (diacritics.Length != diacriticWeights.Length) throw new Exception (String.Format ("Should not happen. weights are {0} while labels are {1}", diacriticWeights.Length, diacritics.Length)); for (int d = 0; d < diacritics.Length; d++) { if (s.IndexOf (diacritics [d]) > 0) { diacritical [cp] += diacriticWeights [d]; if (s.IndexOf ("COMBINING") >= 0) diacritical [cp] -= (byte) 2; continue; } // also process "COMBINING blah" here // For now it is limited to cp < 0x0370 // if (cp < 0x0300 || cp >= 0x0370) // continue; string tmp = diacritics [d].TrimEnd (';'); if (tmp.IndexOf ("WITH ") == 0) tmp = tmp.Substring (4); tmp = String.Concat ("COMBINING", (tmp [0] != ' ' ? " " : ""), tmp); if (name == tmp) { diacritical [cp] = (byte) (diacriticWeights [d] - 2); break; } //if (name == tmp) //Console.Error.WriteLine ("======= {2:X04} : '{0}' / '{1}'", name, tmp, cp); } // Two-step grep required for it. if (s.IndexOf ("FULL STOP") > 0 && (s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0)) diacritical [cp] |= 0xF4; if (s.StartsWith ("SCRIPT") || s.IndexOf (" SCRIPT ") > 0) diacritical [cp] = (byte) (s.IndexOf ("SMALL") > 0 ? 3 : s.IndexOf ("CAPITAL") > 0 ? 5 : 4); // Arabic letter name if (0x0621 <= cp && cp <= 0x064A && Char.GetUnicodeCategory ((char) cp) == UnicodeCategory.OtherLetter) { byte value = (byte) (arabicNameMap.Count * 4 + 0x0B); switch (cp) { case 0x0621: case 0x0624: case 0x0626: // hamza, waw, yeh ... special cases. value = 0x07; break; case 0x0649: case 0x064A: value = 0x77; // special cases. break; default: // Get primary letter name i.e. // XXX part of ARABIC LETTER XXX yyy // e.g. that of "TEH MARBUTA" is "TEH". string letterName = (cp == 0x0640) ? // 0x0640 is special: it does // not start with ARABIC LETTER name : name.Substring (14); int tmpIdx = letterName.IndexOf (' '); letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx); //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName); if (arabicNameMap.ContainsKey (letterName)) value = (byte) arabicLetterPrimaryValues [arabicNameMap [letterName]]; else arabicNameMap [letterName] = cp; break; } arabicLetterPrimaryValues [cp] = value; } // Japanese square letter if (0x3300 <= cp && cp <= 0x3357) if (!ExistsJIS (cp)) nonJisJapanese.Add (new NonJISCharacter (cp, name)); // normalizationType string decomp = values [4]; idx = decomp.IndexOf ('<'); if (idx >= 0) { switch (decomp.Substring (idx + 1, decomp.IndexOf ('>') - 1)) { case "full": decompType [cp] = DecompositionFull; break; case "sub": decompType [cp] = DecompositionSub; break; case "super": decompType [cp] = DecompositionSuper; break; case "small": decompType [cp] = DecompositionSmall; break; case "isolated": decompType [cp] = DecompositionIsolated; break; case "initial": decompType [cp] = DecompositionInitial; break; case "final": decompType [cp] = DecompositionFinal; break; case "medial": decompType [cp] = DecompositionMedial; break; case "noBreak": decompType [cp] = DecompositionNoBreak; break; case "compat": decompType [cp] = DecompositionCompat; break; case "fraction": decompType [cp] = DecompositionFraction; break; case "font": decompType [cp] = DecompositionFont; break; case "circle": decompType [cp] = DecompositionCircle; break; case "square": decompType [cp] = DecompositionSquare; break; case "wide": decompType [cp] = DecompositionWide; break; case "narrow": decompType [cp] = DecompositionNarrow; break; case "vertical": decompType [cp] = DecompositionVertical; break; default: throw new Exception ("Support NFKD type : " + decomp); } } else decompType [cp] = DecompositionCanonical; decomp = idx < 0 ? decomp : decomp.Substring (decomp.IndexOf ('>') + 2); if (decomp.Length > 0) { string [] velems = decomp.Split (' '); int didx = decompValues.Count; decompIndex [cp] = didx; foreach (string v in velems) decompValues.Add (int.Parse (v, NumberStyles.HexNumber)); decompLength [cp] = velems.Length; // [decmpType] -> this_cp int targetCP = (int) decompValues [didx]; // for "(x)" it specially maps to 'x' . // FIXME: check if it is sane if (velems.Length == 3 && (int) decompValues [didx] == '(' && (int) decompValues [didx + 2] == ')') targetCP = (int) decompValues [didx + 1]; // special: 0x215F "1/" else if (cp == 0x215F) targetCP = '1'; else if (velems.Length > 1 && (targetCP < 0x4C00 || 0x9FBB < targetCP)) // skip them, except for CJK ideograph compat targetCP = 0; if (targetCP != 0) { Hashtable entry = (Hashtable) nfkdMap [targetCP]; if (entry == null) { entry = new Hashtable (); nfkdMap [targetCP] = entry; } entry [(byte) decompType [cp]] = cp; } } // numeric values if (values [5].Length > 0) decimalValue [cp] = decimal.Parse (values [5]); else if (values [6].Length > 0) decimalValue [cp] = decimal.Parse (values [6]); else if (values [7].Length > 0) { string decstr = values [7]; idx = decstr.IndexOf ('/'); if (cp == 0x215F) // special. "1/" decimalValue [cp] = 0x1; else if (idx > 0) // m/n decimalValue [cp] = decimal.Parse (decstr.Substring (0, idx)) / decimal.Parse (decstr.Substring (idx + 1)); else if (decstr [0] == '(' && decstr [decstr.Length - 1] == ')') // (n) decimalValue [cp] = decimal.Parse (decstr.Substring (1, decstr.Length - 2)); else if (decstr [decstr.Length - 1] == '.') // n. decimalValue [cp] = decimal.Parse (decstr.Substring (0, decstr.Length - 1)); else decimalValue [cp] = decimal.Parse (decstr); } } void ParseDerivedCoreProperties (string filename) { // IsUppercase using (StreamReader file = new StreamReader (filename)) { for (int line = 1; file.Peek () >= 0; line++) { try { ProcessDerivedCorePropLine (file.ReadLine ()); } catch (Exception) { Console.Error.WriteLine ("**** At line " + line); throw; } } } } void ProcessDerivedCorePropLine (string s) { int idx = s.IndexOf ('#'); if (idx >= 0) s = s.Substring (0, idx); idx = s.IndexOf (';'); if (idx < 0) return; string cpspec = s.Substring (0, idx); idx = cpspec.IndexOf (".."); NumberStyles nf = NumberStyles.HexNumber | NumberStyles.AllowTrailingWhite; int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf); int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf); string value = s.Substring (cpspec.Length + 1).Trim (); // FIXME: use index if (cp > char.MaxValue) return; switch (value) { case "Uppercase": for (int x = cp; x <= cpEnd; x++) isUppercase [x] = true; break; } } void ParseScripts (string filename) { ArrayList gurmukhi = new ArrayList (); ArrayList gujarati = new ArrayList (); ArrayList georgian = new ArrayList (); ArrayList thaana = new ArrayList (); using (StreamReader file = new StreamReader (filename)) { while (file.Peek () >= 0) { string s = file.ReadLine (); int idx = s.IndexOf ('#'); if (idx >= 0) s = s.Substring (0, idx); idx = s.IndexOf (';'); if (idx < 0) continue; string cpspec = s.Substring (0, idx); idx = cpspec.IndexOf (".."); NumberStyles nf = NumberStyles.HexNumber | NumberStyles.AllowTrailingWhite; int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf); int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf); string value = s.Substring (cpspec.Length + 1).Trim (); // FIXME: use index if (cp > char.MaxValue) continue; switch (value) { case "Gurmukhi": for (int x = cp; x <= cpEnd; x++) if (!IsIgnorable (x)) gurmukhi.Add ((char) x); break; case "Gujarati": for (int x = cp; x <= cpEnd; x++) if (!IsIgnorable (x)) gujarati.Add ((char) x); break; case "Georgian": for (int x = cp; x <= cpEnd; x++) if (!IsIgnorable (x)) georgian.Add ((char) x); break; case "Thaana": for (int x = cp; x <= cpEnd; x++) if (!IsIgnorable (x)) thaana.Add ((char) x); break; } } } gurmukhi.Sort (UCAComparer.Instance); gujarati.Sort (UCAComparer.Instance); georgian.Sort (UCAComparer.Instance); thaana.Sort (UCAComparer.Instance); orderedGurmukhi = (char []) gurmukhi.ToArray (typeof (char)); orderedGujarati = (char []) gujarati.ToArray (typeof (char)); orderedGeorgian = (char []) georgian.ToArray (typeof (char)); orderedThaana = (char []) thaana.ToArray (typeof (char)); } void ParseJISOrder (string filename) { int line = 1; try { using (StreamReader file = new StreamReader (filename)) { for (;file.Peek () >= 0; line++) ProcessJISOrderLine (file.ReadLine ()); } } catch (Exception) { Console.Error.WriteLine ("---- line {0}", line); throw; } } char [] ws = new char [] {'\t', ' '}; void ProcessJISOrderLine (string s) { int idx = s.IndexOf ('#'); if (idx >= 0) s = s.Substring (0, idx).Trim (); if (s.Length == 0) return; idx = s.IndexOfAny (ws); if (idx < 0) return; // They start with "0x" so cut them out. int jis = int.Parse (s.Substring (2, idx - 2), NumberStyles.HexNumber); int cp = int.Parse (s.Substring (idx).Trim ().Substring (2), NumberStyles.HexNumber); jisJapanese.Add (new JISCharacter (cp, jis)); } void ParseCJK (string zhXML, string jaXML, string koXML) { XmlDocument doc = new XmlDocument (); doc.XmlResolver = null; int v; string s; string category; int offset; ushort [] arr; // Chinese Simplified category = "chs"; arr = cjkCHS; offset = 0;//char.MaxValue - arr.Length; doc.Load (zhXML); s = doc.SelectSingleNode ("/ldml/collations/collation[@type='pinyin']/rules/pc").InnerText; v = 0x8008; foreach (char c in s) { if (c < '\u3100') Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v); else { arr [(int) c - offset] = (ushort) v++; if (v % 256 == 0) v += 2; } } // Chinese Traditional category = "cht"; arr = cjkCHT; offset = 0;//char.MaxValue - arr.Length; s = doc.SelectSingleNode ("/ldml/collations/collation[@type='stroke']/rules/pc").InnerText; v = 0x8002; foreach (char c in s) { if (c < '\u4E00') Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v); else { arr [(int) c - offset] = (ushort) v++; if (v % 256 == 0) v += 2; } } // Japanese category = "ja"; arr = cjkJA; offset = 0;//char.MaxValue - arr.Length; // SPECIAL CASES arr [0x4EDD] = 0x8002; // Chinese repetition mark? arr [0x337B] = 0x8004; // Those 4 characters are Gengou arr [0x337E] = 0x8005; arr [0x337D] = 0x8006; arr [0x337C] = 0x8007; v = 0x8008; foreach (JISCharacter jc in jisJapanese) { if (jc.JIS < 0x8800) continue; char c = (char) jc.CP; if (c < '\u4E00') // Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v); continue; else { arr [(int) c - offset] = (ushort) v++; if (v % 256 == 0) v += 2; // SPECIAL CASES: if (c == '\u662D') // U+337C continue; if (c == '\u5927') // U+337D continue; if (c == '\u5E73') // U+337B continue; if (c == '\u660E') // U+337E continue; if (c == '\u9686') // U+F9DC continue; // FIXME: there are still remaining // characters after U+FA0C. // for (int k = 0; k < char.MaxValue; k++) { for (int k = 0; k < '\uFA0D'; k++) { if (decompIndex [k] == 0 || IsIgnorable (k)) continue; if (decompValues [decompIndex [k]] == c /*&& decompLength [k] == 1*/ || decompLength [k] == 3 && decompValues [decompIndex [k] + 1] == c) { arr [k - offset] = (ushort) v++; if (v % 256 == 0) v += 2; } } } } // Korean // Korean weight is somewhat complex. It first shifts // Hangul category from 52-x to 80-x (they are anyways // computed). CJK ideographs are placed at secondary // weight, like XX YY 01 zz 01, where XX and YY are // corresponding "reset" value and zz is 41,43,45... // // Unlike chs,cht and ja, Korean value is a combined // ushort which is computed as category // category = "ko"; arr = cjkKO; offset = 0;//char.MaxValue - arr.Length; doc.Load (koXML); foreach (XmlElement reset in doc.SelectNodes ("/ldml/collations/collation/rules/reset")) { XmlElement sc = (XmlElement) reset.NextSibling; // compute "category" and "level 1" for the // target "reset" Hangle syllable char rc = reset.InnerText [0]; int ri = ((int) rc - 0xAC00) + 1; ushort p = (ushort) ((ri / 254) * 256 + (ri % 254) + 2); // Place the characters after the target. s = sc.InnerText; v = 0x41; foreach (char c in s) { arr [(int) c - offset] = p; cjkKOlv2 [(int) c - offset] = (byte) v; v += 2; } } } #endregion #region Generation void FillIgnorables () { for (int i = 0; i <= char.MaxValue; i++) { if (Char.GetUnicodeCategory ((char) i) == UnicodeCategory.OtherNotAssigned) continue; if (IsIgnorable (i)) ignorableFlags [i] |= 1; if (IsIgnorableSymbol (i)) ignorableFlags [i] |= 2; if (IsIgnorableNonSpacing (i)) ignorableFlags [i] |= 4; } } void ModifyUnidata () { // Modify some decomposition equivalence for (int i = 0xFE31; i <= 0xFE34; i++) { decompType [i] = 0; decompIndex [i] = 0; decompLength [i] = 0; } decompType [0x037E] = 0; decompIndex [0x037E] = 0; decompLength [0x037E] = 0; // Hangzhou numbers for (int i = 0x3021; i <= 0x3029; i++) diacritical [i] = 0x4E; // Korean parens numbers for (int i = 0x3200; i <= 0x321C; i++) diacritical [i] = 0xA; for (int i = 0x3260; i <= 0x327B; i++) diacritical [i] = 0xC; // LAMESPEC: these remapping should not be done. // Windows have incorrect CJK compat mappings. decompValues [decompIndex [0x32A9]] = 0x91AB; decompLength [0x323B] = 1; decompValues [decompIndex [0x323B]] = 0x5B78; decompValues [decompIndex [0x32AB]] = 0x5B78; decompValues [decompIndex [0x32A2]] = 0x5BEB; decompLength [0x3238] = 1; decompValues [decompIndex [0x3238]] = 0x52DE; decompValues [decompIndex [0x3298]] = 0x52DE; // LAMESPEC: custom remapping (which is not bugs but not fine, non-standard compliant things) decompIndex [0xFA0C] = decompIndex [0xF929]; // borrow U+F929 room (being empty) decompValues [decompIndex [0xFA0C]] = 0x5140; decompLength [0xFA0C] = 1; decompIndex [0xF929] = decompLength [0xF929] = 0; decompValues [decompIndex [0xF92C]] = 0x90DE; } void ModifyParsedValues () { // some cyrillic diacritical weight. They seem to be // based on old character names, so it's quicker to // set them directly here. diacritical [0x0496] = diacritical [0x0497] = 7; diacritical [0x0498] = diacritical [0x0499] = 0x1A; diacritical [0x049A] = diacritical [0x049B] = 0x17; diacritical [0x049C] = diacritical [0x049D] = 9; diacritical [0x049E] = diacritical [0x049F] = 4; diacritical [0x04A0] = diacritical [0x04A1] = 0xA; diacritical [0x04A2] = diacritical [0x04A3] = 7; diacritical [0x04A4] = diacritical [0x04A5] = 8; // number, secondary weights byte weight = 0x38; int [] numarr = numberSecondaryWeightBounds; for (int i = 0; i < numarr.Length; i += 2, weight++) for (int cp = numarr [i]; cp < numarr [i + 1]; cp++) if (Char.IsNumber ((char) cp)) diacritical [cp] = weight; // Update name part of named characters for (int i = 0; i < sortableCharNames.Count; i++) { DictionaryEntry de = (DictionaryEntry) sortableCharNames [i]; int cp = (int) de.Key; string renamed = null; switch (cp) { case 0x2101: renamed = "A_1"; break; case 0x33C3: renamed = "A_2"; break; case 0x2105: renamed = "C_1"; break; case 0x2106: renamed = "C_2"; break; case 0x211E: renamed = "R1"; break; case 0x211F: renamed = "R2"; break; // Remove some of them! case 0x2103: case 0x2109: case 0x2116: case 0x2117: case 0x2118: case 0x2125: case 0x2127: case 0x2129: case 0x212E: case 0x2132: sortableCharNames.RemoveAt (i); i--; continue; } if (renamed != null) sortableCharNames [i] = new DictionaryEntry (cp, renamed); } } void GenerateCore () { UnicodeCategory uc; #region Specially ignored // 01 // This will raise "Defined" flag up. // FIXME: Check If it is really fine. Actually for // Japanese voice marks this code does remapping. foreach (char c in specialIgnore) map [(int) c] = new CharMapEntry (0, 0, 0); #endregion #region Extenders (FF FF) fillIndex [0xFF] = 0xFF; char [] specialBiggest = new char [] { '\u3005', '\u3031', '\u3032', '\u309D', '\u309E', '\u30FC', '\u30FD', '\u30FE', '\uFE7C', '\uFE7D', '\uFF70'}; foreach (char c in specialBiggest) AddCharMap (c, 0xFF, 0); #endregion #region Variable weights // Controls : 06 03 - 06 3D fillIndex [0x6] = 3; for (int i = 0; i < 65536; i++) { if (IsIgnorable (i)) continue; char c = (char) i; uc = Char.GetUnicodeCategory (c); // NEL is whitespace but not ignored here. if (uc == UnicodeCategory.Control && !Char.IsWhiteSpace (c) || c == '\u0085') AddCharMap (c, 6, 1); } // Apostrophe 06 80 fillIndex [0x6] = 0x80; AddCharMap ('\'', 6, 0); AddCharMap ('\uFF07', 6, 1); AddCharMap ('\uFE63', 6, 1); // SPECIAL CASE: fill FE32 here in prior to be added // at 2013. Windows does not always respect NFKD. map [0xFE32] = new CharMapEntry (6, 0x90, 0); // Hyphen/Dash : 06 81 - 06 90 for (int i = 0; i < char.MaxValue; i++) { if (!IsIgnorable (i) && Char.GetUnicodeCategory ((char) i) == UnicodeCategory.DashPunctuation) { AddCharMapGroup2 ((char) i, 6, 1, 0); if (i == 0x2011) { // SPECIAL: add 2027 and 2043 // Maybe they are regarded the // same hyphens in "central" // position. AddCharMap ('\u2027', 6, 1); AddCharMap ('\u2043', 6, 1); } } } // They are regarded as primarily equivalent to '-' map [0x208B] = new CharMapEntry (6, 0x82, 0); map [0x207B] = new CharMapEntry (6, 0x82, 0); map [0xFF0D] = new CharMapEntry (6, 0x82, 0); // Arabic variable weight chars 06 A0 - fillIndex [6] = 0xA0; // vowels for (int i = 0x64B; i <= 0x650; i++) AddArabicCharMap ((char) i); // sukun AddCharMapGroup ('\u0652', 6, 1, 0); // shadda AddCharMapGroup ('\u0651', 6, 1, 0); #endregion #region Nonspacing marks // 01 // FIXME: 01 03 - 01 B6 ... annoyance :( // Combining diacritical marks: 01 DC - fillIndex [0x1] = 0x41; for (int i = 0x030E; i <= 0x0326; i++) if (!IsIgnorable (i)) AddCharMap ((char) i, 0x1, 1); for (int i = 0x0329; i <= 0x0334; i++) if (!IsIgnorable (i)) AddCharMap ((char) i, 0x1, 1); fillIndex [0x1]++; for (int i = 0x0339; i <= 0x0341; i++) if (!IsIgnorable (i)) AddCharMap ((char) i, 0x1, 1); fillIndex [0x1] = 0x74; for (int i = 0x0346; i <= 0x0348; i++) if (!IsIgnorable (i)) AddCharMap ((char) i, 0x1, 1); for (int i = 0x02BE; i <= 0x02BF; i++) if (!IsIgnorable (i)) AddCharMap ((char) i, 0x1, 1); for (int i = 0x02C1; i <= 0x02C5; i++) if (!IsIgnorable (i)) AddCharMap ((char) i, 0x1, 1); for (int i = 0x02CE; i <= 0x02CF; i++) if (!IsIgnorable (i)) AddCharMap ((char) i, 0x1, 1); fillIndex [0x1]++; for (int i = 0x02D1; i <= 0x02D3; i++) if (!IsIgnorable (i)) AddCharMap ((char) i, 0x1, 1); AddCharMap ('\u02DE', 0x1, 1); for (int i = 0x02E4; i <= 0x02E9; i++) if (!IsIgnorable (i)) AddCharMap ((char) i, 0x1, 1); // FIXME: needs more love here (it should eliminate // all the hacky code above). for (int i = 0x0300; i < 0x0370; i++) if (!IsIgnorable (i) && diacritical [i] != 0 /* especiall here*/ && !map [i].Defined) map [i] = new CharMapEntry ( 0x1, 0x1, diacritical [i]); // Cyrillic and Armenian nonspacing mark fillIndex [0x1] = 0x94; for (int i = 0x400; i < 0x580; i++) if (!IsIgnorable (i) && Char.GetUnicodeCategory ((char) i) == UnicodeCategory.NonSpacingMark) AddCharMap ((char) i, 1, 1); fillIndex [0x1] = 0x8D; // syriac dotted nonspacing marks (1) AddCharMap ('\u0740', 0x1, 1); AddCharMap ('\u0741', 0x1, 1); AddCharMap ('\u0742', 0x1, 1); // syriac oblique nonspacing marks AddCharMap ('\u0747', 0x1, 1); AddCharMap ('\u0748', 0x1, 1); // syriac dotted nonspacing marks (2) fillIndex [0x1] = 0x94; // this reset is mandatory AddCharMap ('\u0732', 0x1, 1); AddCharMap ('\u0735', 0x1, 1); AddCharMap ('\u0738', 0x1, 1); AddCharMap ('\u0739', 0x1, 1); AddCharMap ('\u073C', 0x1, 1); // SPECIAL CASES: superscripts AddCharMap ('\u073F', 0x1, 1); AddCharMap ('\u0711', 0x1, 1); // syriac "DOTS" for (int i = 0x0743; i <= 0x0746; i++) AddCharMap ((char) i, 0x1, 1); for (int i = 0x0730; i <= 0x0780; i++) if (!map [i].Defined && Char.GetUnicodeCategory ((char) i) == UnicodeCategory.NonSpacingMark) AddCharMap ((char) i, 0x1, 1); // LAMESPEC: It should not stop at '\u20E1'. There are // a few more characters (that however results in // overflow of level 2 unless we start before 0xDD). fillIndex [0x1] = 0xDD; for (int i = 0x20D0; i <= 0x20DC; i++) AddCharMap ((char) i, 0x1, 1); fillIndex [0x1] = 0xEC; for (int i = 0x20DD; i <= 0x20E1; i++) AddCharMap ((char) i, 0x1, 1); fillIndex [0x1] = 0x7; for (int i = 0x302A; i <= 0x302D; i++) AddCharMap ((char) i, 0x1, 1); fillIndex [0x1] = 0x50; // I wonder how they are sorted for (int i = 0x02D4; i <= 0x02D7; i++) AddCharMap ((char) i, 0x1, 1); // They are not part of Nonspacing marks, but have // only diacritical weight. for (int i = 0x3099; i <= 0x309C; i++) map [i] = new CharMapEntry (1, 1, 1); map [0xFF9E] = new CharMapEntry (1, 1, 1); map [0xFF9F] = new CharMapEntry (1, 1, 2); map [0x309D] = new CharMapEntry (0xFF, 0xFF, 1); map [0x309E] = new CharMapEntry (0xFF, 0xFF, 1); for (int i = 0x30FC; i <= 0x30FE; i++) map [i] = new CharMapEntry (0xFF, 0xFF, 1); fillIndex [0x1] = 0xA; for (int i = 0x0951; i <= 0x0954; i++) AddCharMap ((char) i, 0x1, 2); #endregion #region Whitespaces // 07 03 - fillIndex [0x7] = 0x2; AddCharMap (' ', 0x7, 2); AddCharMap ('\u00A0', 0x7, 1); for (int i = 9; i <= 0xD; i++) AddCharMap ((char) i, 0x7, 1); for (int i = 0x2000; i <= 0x200B; i++) AddCharMap ((char) i, 0x7, 1); fillIndex [0x7] = 0x17; AddCharMapGroup ('\u2028', 0x7, 1, 0); AddCharMapGroup ('\u2029', 0x7, 1, 0); // Characters which used to represent layout control. // LAMESPEC: Windows developers seem to have thought // that those characters are kind of whitespaces, // while they aren't. AddCharMap ('\u2422', 0x7, 1, 0); // blank symbol AddCharMap ('\u2423', 0x7, 1, 0); // open box #endregion // category 09 - continued symbols from 08 fillIndex [0x9] = 2; // misc tech mark for (int cp = 0x2300; cp <= 0x237A; cp++) AddCharMap ((char) cp, 0x9, 1, 0); // arrows byte [] arrowLv2 = new byte [] {0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}; foreach (DictionaryEntry de in arrowValues) { int idx = (int) de.Value; int cp = (int) de.Key; if (map [cp].Defined) continue; fillIndex [0x9] = (byte) (0xD8 + idx); AddCharMapGroup ((char) cp, 0x9, 0, arrowLv2 [idx]); arrowLv2 [idx]++; } // boxes byte [] boxLv2 = new byte [128]; // 0-63 will be used for those offsets are positive, // and 64-127 are for negative ones. for (int i = 0; i < boxLv2.Length; i++) boxLv2 [i] = 3; foreach (DictionaryEntry de in boxValues) { int cp = (int) de.Key; int off = (int) de.Value; if (map [cp].Defined) continue; if (off < 0) { fillIndex [0x9] = (byte) (0xE5 + off); AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [128 + off]++); } else { fillIndex [0x9] = (byte) (0xE5 + off); AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [off]++); } } // Some special characters (slanted) fillIndex [0x9] = 0xF4; AddCharMap ('\u2571', 0x9, 3); AddCharMap ('\u2572', 0x9, 3); AddCharMap ('\u2573', 0x9, 3); // FIXME: implement 0A #region Symbols fillIndex [0xA] = 2; // byte currency symbols for (int cp = 0; cp < 0x100; cp++) { uc = Char.GetUnicodeCategory ((char) cp); if (!IsIgnorable (cp) && uc == UnicodeCategory.CurrencySymbol && cp != '$') AddCharMapGroup ((char) cp, 0xA, 1, 0); } // byte other symbols for (int cp = 0; cp < 0x100; cp++) { if (cp == 0xA6) continue; // SPECIAL: skip FIXME: why? uc = Char.GetUnicodeCategory ((char) cp); if (!IsIgnorable (cp) && uc == UnicodeCategory.OtherSymbol || cp == '\u00AC' || cp == '\u00B5' || cp == '\u00B7') AddCharMapGroup ((char) cp, 0xA, 1, 0); } // U+30FB here AddCharMapGroup ('\u30FB', 0xA, 1, 0); for (int cp = 0x2020; cp <= 0x2031; cp++) if (Char.IsPunctuation ((char) cp)) AddCharMap ((char) cp, 0xA, 1, 0); // SPECIAL CASES: why? AddCharMap ('\u203B', 0xA, 1, 0); AddCharMap ('\u2040', 0xA, 1, 0); AddCharMap ('\u2041', 0xA, 1, 0); AddCharMap ('\u2042', 0xA, 1, 0); for (int cp = 0x20A0; cp <= 0x20AB; cp++) AddCharMap ((char) cp, 0xA, 1, 0); // 3004 is skipped at first... for (int cp = 0x3010; cp <= 0x3040; cp++) if (Char.IsSymbol ((char) cp)) AddCharMap ((char) cp, 0xA, 1, 0); // SPECIAL CASES: added here AddCharMap ('\u3004', 0xA, 1, 0); AddCharMap ('\u327F', 0xA, 1, 0); for (int cp = 0x2600; cp <= 0x2613; cp++) AddCharMap ((char) cp, 0xA, 1, 0); // Dingbats for (int cp = 0x2620; cp <= 0x2770; cp++) if (Char.IsSymbol ((char) cp)) AddCharMap ((char) cp, 0xA, 1, 0); // OCR for (int i = 0x2440; i < 0x2460; i++) AddCharMap ((char) i, 0xA, 1, 0); // SPECIAL CASES: why? AddCharMap ('\u0E3F', 0xA, 1, 0); AddCharMap ('\u2117', 0xA, 1, 0); AddCharMap ('\u20AC', 0xA, 1, 0); #endregion #region Numbers // 0C 02 - 0C E1 fillIndex [0xC] = 2; // 9F8 : Bengali "one less than the denominator" AddCharMap ('\u09F8', 0xC, 1, 0x3C); ArrayList numbers = new ArrayList (); for (int i = 0; i < 65536; i++) if (!IsIgnorable (i) && Char.IsNumber ((char) i) && (i < 0x3190 || 0x32C0 < i)) // they are CJK characters numbers.Add (i); ArrayList numberValues = new ArrayList (); foreach (int i in numbers) numberValues.Add (new DictionaryEntry (i, decimalValue [(char) i])); // SPECIAL CASE: Cyrillic Thousand sign numberValues.Add (new DictionaryEntry (0x0482, 1000m)); numberValues.Sort (DecimalDictionaryValueComparer.Instance); //foreach (DictionaryEntry de in numberValues) //Console.Error.WriteLine ("****** number {0:X04} : {1} {2}", de.Key, de.Value, decompType [(int) de.Key]); // FIXME: fillIndex adjustment lines are too // complicated. It must be simpler. decimal prevValue = -1; foreach (DictionaryEntry de in numberValues) { int cp = (int) de.Key; decimal currValue = (decimal) de.Value; bool addnew = false; if (prevValue < currValue && prevValue - (int) prevValue == 0 && prevValue >= 1) { addnew = true; // Process Hangzhou and Roman numbers // There are some SPECIAL cases. if (currValue != 4) // no increment for 4 fillIndex [0xC]++; int xcp; if (currValue <= 13) { if (currValue == 4) fillIndex [0xC]++; // SPECIAL CASE if (currValue == 11) AddCharMap ('\u0BF0', 0xC, 1); xcp = (int) prevValue + 0x2160 - 1; AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); xcp = (int) prevValue + 0x2170 - 1; AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); fillIndex [0xC]++; } if (currValue < 12) fillIndex [0xC]++; if (currValue <= 10) { xcp = (int) prevValue + 0x3021 - 1; AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); fillIndex [0xC]++; } } if (prevValue < currValue) prevValue = currValue; if (map [cp].Defined) continue; // HangZhou and Roman are add later // (code is above) if (0x3021 <= cp && cp < 0x302A || 0x2160 <= cp && cp < 0x216C || 0x2170 <= cp && cp < 0x217C) continue; if (cp == 0x215B) // FIXME: why? fillIndex [0xC] += 2; else if (cp == 0x3021) // FIXME: why? fillIndex [0xC]++; if (addnew || cp <= '9') { int mod = (int) currValue - 1; int xcp; if (1 <= currValue && currValue <= 11) { xcp = mod + 0x2776; AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); xcp = mod + 0x2780; AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); xcp = mod + 0x278A; AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); } if (1 <= currValue && currValue <= 20) { xcp = mod + 0x2460; AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); xcp = mod + 0x2474; AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); xcp = mod + 0x2488; AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); } } if (addnew && currValue >= 10 && currValue < 13 || cp == 0x09F9) fillIndex [0xC]++; AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp], true); switch (cp) { // Maybe Bengali digit numbers do not increase // indexes, but 0x09E6 does. case 0x09E7: case 0x09E8: case 0x09E9: case 0x09EA: // SPECIAL CASES case 0x0BF0: case 0x2180: case 0x2181: break; // SPECIAL CASE case 0x0BF1: fillIndex [0xC]++; break; default: if (currValue < 11 || currValue == 1000) fillIndex [0xC]++; break; } // Add special cases that are not regarded as // numbers in UnicodeCategory speak. if (cp == '5') { // TONE FIVE AddCharMapGroup ('\u01BD', 0xC, 0, 0); AddCharMapGroup ('\u01BC', 0xC, 1, 0); } else if (cp == '2' || cp == '6') // FIXME: why? fillIndex [0xC]++; } // 221E: infinity fillIndex [0xC] = 0xFF; AddCharMap ('\u221E', 0xC, 1); #endregion #region Letters and NonSpacing Marks (general) // ASCII Latin alphabets for (int i = 0; i < alphabets.Length; i++) AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]); // non-ASCII Latin alphabets // FIXME: there is no such characters that are placed // *after* "alphabets" array items. This is nothing // more than a hack that creates dummy weight for // primary characters. for (int i = 0x0080; i < 0x0300; i++) { if (!Char.IsLetter ((char) i)) continue; // For those Latin Letters which has NFKD are // not added as independent primary character. if (decompIndex [i] != 0) continue; // SPECIAL CASES: // 1.some alphabets have primarily // equivalent ASCII alphabets. // 2.some have independent primary weights, // but inside a-to-z range. // 3.there are some expanded characters that // are not part of Unicode Standard NFKD. // 4. some characters are letter in IsLetter // but not in sortkeys (maybe unicode version // difference caused it). switch (i) { // 1. skipping them does not make sense // case 0xD0: case 0xF0: case 0x131: case 0x138: // case 0x184: case 0x185: case 0x186: case 0x189: // case 0x18D: case 0x18E: case 0x18F: case 0x190: // case 0x194: case 0x195: case 0x196: case 0x19A: // case 0x19B: case 0x19C: // 2. skipping them does not make sense // case 0x14A: // Ng // case 0x14B: // ng // 3. case 0xC6: // AE case 0xE6: // ae case 0xDE: // Icelandic Thorn case 0xFE: // Icelandic Thorn case 0xDF: // German ss case 0xFF: // German ss // 4. case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3: // not classified yet // case 0x1A6: case 0x1A7: case 0x1A8: case 0x1A9: // case 0x1AA: case 0x1B1: case 0x1B7: case 0x1B8: // case 0x1B9: case 0x1BA: case 0x1BB: case 0x1BF: // case 0x1DD: continue; } AddCharMapGroup ((char) i, 0xE, 1, 0); } // Greek and Coptic fillIndex [0xF] = 02; for (int i = 0x0380; i < 0x0390; i++) if (Char.IsLetter ((char) i)) AddLetterMap ((char) i, 0xF, 1); fillIndex [0xF] = 02; for (int i = 0x0391; i < 0x03CF; i++) if (Char.IsLetter ((char) i)) AddLetterMap ((char) i, 0xF, 1); fillIndex [0xF] = 0x40; for (int i = 0x03D0; i < 0x0400; i++) if (Char.IsLetter ((char) i)) AddLetterMap ((char) i, 0xF, 1); // Cyrillic. // Cyrillic letters are sorted like Latin letters i.e. // containing culture-specific letters between the // standard Cyrillic sequence. // // We can't use UCA here; it has different sorting. char [] orderedCyrillic = new char [] { '\u0430', '\u0431', '\u0432', '\u0433', '\u0434', '\u0452', // DJE for Serbocroatian '\u0435', '\u0454', // IE for Ukrainian '\u0436', '\u0437', '\u0455', // DZE '\u0438', '\u0456', // Byelorussian-Ukrainian I '\u0457', // YI '\u0439', '\u0458', // JE '\u043A', '\u043B', '\u0459', // LJE '\u043C', '\u043D', '\u045A', // NJE '\u043E', // 4E9 goes here. '\u043F', '\u0440', '\u0441', '\u0442', '\u045B', // TSHE for Serbocroatian '\u0443', '\u045E', // Short U for Byelorussian '\u04B1', // Straight U w/ stroke (diacritical!) '\u0444', '\u0445', '\u0446', '\u0447', '\u045F', // DZHE '\u0448', '\u0449', '\u044A', '\u044B', '\u044C', '\u044D', '\u044E', '\u044F'}; // For some characters here is a map to basic cyrillic // letters. See UnicodeData.txt character names for // the sources. Here I simply declare an equiv. array. // The content characters are map from U+490(,491), // skipping small letters. char [] cymap_src = new char [] { '\u0433', '\u0433', '\u0433', '\u0436', '\u0437', '\u043A', '\u043A', '\u043A', '\u043A', '\u043D', '\u043D', '\u043F', '\u0445', '\u0441', '\u0442', '\u0443', '\u0443', '\u0445', '\u0446', '\u0447', '\u0447', '\u0432', '\u0435', '\u0435', '\u0406', '\u0436', '\u043A', '\u043D', '\u0447', '\u0435'}; fillIndex [0x10] = 0x8D; for (int i = 0x0460; i < 0x0481; i++) { if (Char.IsLetter ((char) i)) { if (i == 0x0476) // U+476/477 have the same // primary weight as U+474/475. fillIndex [0x10] -= 3; AddLetterMap ((char) i, 0x10, 3); } } fillIndex [0x10] = 0x6; for (int i = 0; i < orderedCyrillic.Length; i++) { char c = Char.ToUpper (orderedCyrillic [i], CultureInfo.InvariantCulture); if (!IsIgnorable ((int) c) && Char.IsLetter (c) && !map [c].Defined) { AddLetterMap (c, 0x10, 0); fillIndex [0x10] += 3; } } for (int i = 0; i < cymap_src.Length; i++) { char c = cymap_src [i]; fillIndex [0x10] = map [c].Level1; int c2 = 0x0490 + i * 2; AddLetterMapCore ((char) c2, 0x10, 0, diacritical [c2], false); } // Armenian fillIndex [0x11] = 0x3; fillIndex [0x1] = 0x98; for (int i = 0x0531; i < 0x0586; i++) { if (i == 0x0559 || i == 0x55A) AddCharMap ((char) i, 1, 1); if (Char.IsLetter ((char) i)) AddLetterMap ((char) i, 0x11, 1); } // Hebrew // -Letters fillIndex [0x12] = 0x2; for (int i = 0x05D0; i < 0x05FF; i++) if (Char.IsLetter ((char) i)) AddLetterMap ((char) i, 0x12, 1); // -Accents fillIndex [0x1] = 0x3; for (int i = 0x0591; i <= 0x05C2; i++) { if (i == 0x05A3 || i == 0x05BB) fillIndex [0x1]++; if (i != 0x05BE) AddCharMap ((char) i, 0x1, 1); } // Arabic fillIndex [0x1] = 0x8E; fillIndex [0x13] = 0x3; for (int i = 0x0621; i <= 0x064A; i++) { // Abjad if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.OtherLetter) { // FIXME: arabic nonspacing marks are // in different order. AddCharMap ((char) i, 0x1, 1); continue; } // map [i] = new CharMapEntry (0x13, // (byte) arabicLetterPrimaryValues [i], 1); fillIndex [0x13] = (byte) arabicLetterPrimaryValues [i]; byte formDiacritical = 8; // default // SPECIAL CASES: switch (i) { case 0x0622: formDiacritical = 9; break; case 0x0623: formDiacritical = 0xA; break; case 0x0624: formDiacritical = 5; break; case 0x0625: formDiacritical = 0xB; break; case 0x0626: formDiacritical = 7; break; case 0x0649: formDiacritical = 5; break; case 0x064A: formDiacritical = 7; break; } AddLetterMapCore ((char) i, 0x13, 1, formDiacritical, false); } for (int i = 0x0670; i < 0x0673; i++) map [i] = new CharMapEntry (0x13, 0xB, (byte) (0xC + i - 0x670)); fillIndex [0x13] = 0x84; for (int i = 0x0674; i < 0x06D6; i++) if (Char.IsLetter ((char) i)) AddLetterMapCore ((char) i, 0x13, 1, 0, false); // Devanagari // FIXME: this could be fixed in more decent way for (int i = 0x0958; i <= 0x095F; i++) diacritical [i] = 8; // FIXME: it does seem straight codepoint mapping. fillIndex [0x14] = 04; for (int i = 0x0901; i < 0x0905; i++) if (!IsIgnorable (i)) AddLetterMap ((char) i, 0x14, 2); fillIndex [0x14] = 0xB; for (int i = 0x0905; i < 0x093A; i++) { if (i == 0x0928) AddCharMap ('\u0929', 0x14, 0, 8); if (i == 0x0930) AddCharMap ('\u0931', 0x14, 0, 8); if (i == 0x0933) AddCharMap ('\u0934', 0x14, 0, 8); if (Char.IsLetter ((char) i)) AddLetterMap ((char) i, 0x14, 4); if (i == 0x090B) AddCharMap ('\u0960', 0x14, 4); if (i == 0x090C) AddCharMap ('\u0961', 0x14, 4); } fillIndex [0x14] = 0xDA; for (int i = 0x093E; i < 0x0945; i++) if (!IsIgnorable (i)) AddLetterMap ((char) i, 0x14, 2); fillIndex [0x14] = 0xEC; for (int i = 0x0945; i < 0x094F; i++) if (!IsIgnorable (i)) AddLetterMap ((char) i, 0x14, 2); // Bengali // -Letters fillIndex [0x15] = 02; for (int i = 0x0980; i < 0x9FF; i++) { if (IsIgnorable (i)) continue; if (i == 0x09E0) fillIndex [0x15] = 0x3B; switch (Char.GetUnicodeCategory ((char) i)) { case UnicodeCategory.NonSpacingMark: case UnicodeCategory.DecimalDigitNumber: case UnicodeCategory.OtherNumber: continue; } AddLetterMap ((char) i, 0x15, 1); } // -Signs fillIndex [0x1] = 0x3; for (int i = 0x0981; i < 0x0A00; i++) if (Char.GetUnicodeCategory ((char) i) == UnicodeCategory.NonSpacingMark) AddCharMap ((char) i, 0x1, 1); // Gurmukhi. orderedGurmukhi is from UCA // FIXME: it does not look equivalent to UCA. fillIndex [0x16] = 04; fillIndex [0x1] = 3; for (int i = 0; i < orderedGurmukhi.Length; i++) { char c = orderedGurmukhi [i]; if (IsIgnorable ((int) c)) continue; if (IsIgnorableNonSpacing (c)) { AddLetterMap (c, 0x1, 1); continue; } if (c == '\u0A3C' || c == '\u0A4D' || '\u0A66' <= c && c <= '\u0A71') continue; // SPECIAL CASES byte shift = 4; switch (c) { case '\u0A33': case '\u0A36': case '\u0A16': case '\u0A17': case '\u0A5B': case '\u0A5E': shift = 0; break; } if (c == '\u0A3E') // Skip fillIndex [0x16] = 0xC0; AddLetterMap (c, 0x16, shift); } // Gujarati. orderedGujarati is from UCA fillIndex [0x17] = 0x4; // nonspacing marks map [0x0A4D] = new CharMapEntry (1, 0, 0x3); map [0x0ABD] = new CharMapEntry (1, 0, 0x3); map [0x0A3C] = new CharMapEntry (1, 0, 0x4); map [0x0A71] = new CharMapEntry (1, 0, 0x6); map [0x0ABC] = new CharMapEntry (1, 0, 0xB); map [0x0A70] = new CharMapEntry (1, 0, 0xE); // letters go first. for (int i = 0; i < orderedGujarati.Length; i++) { // SPECIAL CASE char c = orderedGujarati [i]; if (Char.IsLetter (c)) { // SPECIAL CASES if (c == '\u0AB3' || c == '\u0A32') continue; if (c == '\u0A33') { AddCharMap ('\u0A32', 0x17, 0); AddCharMap ('\u0A33', 0x17, 4, 4); continue; } if (c == '\u0A8B') AddCharMap ('\u0AE0', 0x17, 0, 5); AddCharMap (c, 0x17, 4); if (c == '\u0AB9') AddCharMap ('\u0AB3', 0x17, 6); } } // non-letters byte gujaratiShift = 4; fillIndex [0x17] = 0xC0; for (int i = 0; i < orderedGujarati.Length; i++) { char c = orderedGujarati [i]; if (fillIndex [0x17] == 0xCC) gujaratiShift = 3; if (!Char.IsLetter (c)) { // SPECIAL CASES if (c == '\u0A82') AddCharMap ('\u0A81', 0x17, 2); if (c == '\u0AC2') fillIndex [0x17]++; AddLetterMap (c, 0x17, gujaratiShift); } } // Oriya fillIndex [0x1] = 03; fillIndex [0x18] = 02; for (int i = 0x0B00; i < 0x0B7F; i++) { switch (Char.GetUnicodeCategory ((char) i)) { case UnicodeCategory.NonSpacingMark: case UnicodeCategory.DecimalDigitNumber: AddLetterMap ((char) i, 0x1, 1); continue; } AddLetterMap ((char) i, 0x18, 1); } // Tamil fillIndex [0x19] = 2; AddCharMap ('\u0BD7', 0x19, 0); fillIndex [0x19] = 0xA; // vowels for (int i = 0x0B82; i <= 0x0B94; i++) if (!IsIgnorable ((char) i)) AddCharMap ((char) i, 0x19, 2); // special vowel fillIndex [0x19] = 0x28; // The array for Tamil consonants is a constant. // Windows have almost similar sequence to TAM from // tamilnet but a bit different in Grantha. for (int i = 0; i < orderedTamilConsonants.Length; i++) AddLetterMap (orderedTamilConsonants [i], 0x19, 4); // combining marks fillIndex [0x19] = 0x82; for (int i = 0x0BBE; i < 0x0BCD; i++) if (Char.GetUnicodeCategory ((char) i) == UnicodeCategory.SpacingCombiningMark || i == 0x0BC0) AddLetterMap ((char) i, 0x19, 2); // Telugu fillIndex [0x1A] = 0x4; for (int i = 0x0C00; i < 0x0C62; i++) { if (i == 0x0C55 || i == 0x0C56) continue; // skip AddCharMap ((char) i, 0x1A, 3); char supp = (i == 0x0C0B) ? '\u0C60': i == 0x0C0C ? '\u0C61' : char.MinValue; if (supp == char.MinValue) continue; AddCharMap (supp, 0x1A, 3); } // Kannada fillIndex [0x1B] = 4; for (int i = 0x0C80; i < 0x0CE5; i++) { if (i == 0x0CD5 || i == 0x0CD6) continue; // ignore if (i == 0x0CB1 || i == 0x0CB3 || i == 0x0CDE) continue; // shift after 0xCB9 AddCharMap ((char) i, 0x1B, 3); if (i == 0x0CB9) { // SPECIAL CASES: but why? AddCharMap ('\u0CB1', 0x1B, 3); // RRA AddCharMap ('\u0CB3', 0x1B, 3); // LLA AddCharMap ('\u0CDE', 0x1B, 3); // FA } if (i == 0x0CB2) AddCharMap ('\u0CE1', 0x1B, 3); // vocalic LL } // Malayalam fillIndex [0x1C] = 2; fillIndex [0x1] = 3; for (int i = 0x0D02; i < 0x0D61; i++) { // FIXME: I avoided MSCompatUnicodeTable usage // here (it results in recursion). So check if // using NonSpacingMark makes sense or not. if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark) // if (!MSCompatUnicodeTable.IsIgnorable ((char) i)) AddCharMap ((char) i, 0x1C, 1); else if (!IsIgnorable ((char) i)) AddCharMap ((char) i, 1, 1); } // Thai ... note that it breaks 0x1E wall after E2B! // Also, all Thai characters have level 2 value 3. fillIndex [0x1E] = 2; fillIndex [0x1] = 3; for (int i = 0xE40; i <= 0xE44; i++) AddCharMap ((char) i, 0x1E, 1, 3); for (int i = 0xE01; i < 0xE2B; i++) AddCharMap ((char) i, 0x1E, 6, 3); fillIndex [0x1F] = 5; for (int i = 0xE2B; i < 0xE30; i++) AddCharMap ((char) i, 0x1F, 6, 3); fillIndex [0x1F] = 0x1E; for (int i = 0xE30; i < 0xE3B; i++) AddCharMap ((char) i, 0x1F, 1, 3); // some Thai characters remains. char [] specialThai = new char [] {'\u0E45', '\u0E46', '\u0E4E', '\u0E4F', '\u0E5A', '\u0E5B'}; foreach (char c in specialThai) AddCharMap (c, 0x1F, 1, 3); for (int i = 0xE00; i < 0xE80; i++) if (Char.GetUnicodeCategory ((char) i) == UnicodeCategory.NonSpacingMark) AddCharMap ((char) i, 1, 1); // Lao fillIndex [0x1F] = 2; fillIndex [0x1] = 3; for (int i = 0xE80; i < 0xEDF; i++) { if (IsIgnorable ((char) i)) continue; else if (Char.IsLetter ((char) i)) AddCharMap ((char) i, 0x1F, 1); else if (Char.GetUnicodeCategory ((char) i) == UnicodeCategory.NonSpacingMark) AddCharMap ((char) i, 1, 1); } // Georgian. orderedGeorgian is from UCA DUCET. fillIndex [0x21] = 5; for (int i = 0; i < orderedGeorgian.Length; i++) { char c = orderedGeorgian [i]; if (map [(int) c].Defined) continue; AddCharMap (c, 0x21, 0); if (c < '\u10F6') AddCharMap ((char) (c - 0x30), 0x21, 0); fillIndex [0x21] += 5; } // Japanese Kana. fillIndex [0x22] = 2; int kanaOffset = 0x3041; byte [] kanaLines = new byte [] {2, 2, 2, 2, 1, 3, 1, 2, 1}; for (int gyo = 0; gyo < 9; gyo++) { for (int dan = 0; dan < 5; dan++) { if (gyo == 7 && dan % 2 == 1) { // 'ya'-gyo fillIndex [0x22]++; kanaOffset -= 2; // There is no space for yi and ye. continue; } int cp = kanaOffset + dan * kanaLines [gyo]; // small lines (a-gyo, ya-gyo) if (gyo == 0 || gyo == 7) { AddKanaMap (cp, 1); // small AddKanaMap (cp + 1, 1); } else AddKanaMap (cp, kanaLines [gyo]); fillIndex [0x22]++; if (cp == 0x30AB) { // add small 'ka' (before normal one) AddKanaMap (0x30F5, 1); kanaOffset++; } if (cp == 0x30B1) { // add small 'ke' (before normal one) AddKanaMap (0x30F6, 1); kanaOffset++; } if (cp == 0x3061) { // add small 'Tsu' (before normal one) AddKanaMap (0x3063, 1); kanaOffset++; } } fillIndex [0x22] += 3; kanaOffset += 5 * kanaLines [gyo]; } // Wa-gyo is almost special, so I just manually add. AddLetterMap ((char) 0x308E, 0x22, 0); AddLetterMap ((char) (0x308E + 0x60), 0x22, 0); AddLetterMap ((char) 0x308F, 0x22, 0); AddLetterMap ((char) (0x308F + 0x60), 0x22, 0); fillIndex [0x22]++; AddLetterMap ((char) 0x3090, 0x22, 0); AddLetterMap ((char) (0x3090 + 0x60), 0x22, 0); fillIndex [0x22] += 2; // no "Wu" in Japanese. AddLetterMap ((char) 0x3091, 0x22, 0); AddLetterMap ((char) (0x3091 + 0x60), 0x22, 0); fillIndex [0x22]++; AddLetterMap ((char) 0x3092, 0x22, 0); AddLetterMap ((char) (0x3092 + 0x60), 0x22, 0); // Nn fillIndex [0x22] = 0x80; AddLetterMap ((char) 0x3093, 0x22, 0); AddLetterMap ((char) (0x3093 + 0x60), 0x22, 0); map [0x3094] = new CharMapEntry (map [0x30A6].Category, map [0x30A6].Level1, 3);// voiced hiragana U map [0x30F4] = new CharMapEntry (map [0x30A6].Category, map [0x30A6].Level1, 3);// voiced katakana U map [0x30F5] = new CharMapEntry (map [0x30AB].Category, map [0x30AB].Level1, 0);// small katakana Ka map [0x30F6] = new CharMapEntry (map [0x30B1].Category, map [0x30B1].Level1, 0);// small katakana Ke // voiced Wa lines for (int i = 0x30F7; i < 0x30FB; i++) map [i] = new CharMapEntry (map [i - 8].Category, map [i - 8].Level1, 3); // JIS Japanese square chars. fillIndex [0x22] = 0x97; jisJapanese.Sort (JISComparer.Instance); foreach (JISCharacter j in jisJapanese) if (0x3300 <= j.CP && j.CP <= 0x3357) AddCharMap ((char) j.CP, 0x22, 1); // non-JIS Japanese square chars. nonJisJapanese.Sort (NonJISComparer.Instance); foreach (NonJISCharacter j in nonJisJapanese) AddCharMap ((char) j.CP, 0x22, 1); // Bopomofo fillIndex [0x23] = 0x02; for (int i = 0x3105; i <= 0x312C; i++) AddCharMap ((char) i, 0x23, 1); // Estrangela: ancient Syriac fillIndex [0x24] = 0x0B; // FIXME: is 0x71E really alternative form? ArrayList syriacAlternatives = new ArrayList ( new int [] {0x714, 0x716, 0x71C, 0x71E, 0x724, 0x727}); for (int i = 0x0710; i <= 0x072C; i++) { if (i == 0x0711) // NonSpacingMark continue; if (syriacAlternatives.Contains (i)) continue; AddCharMap ((char) i, 0x24, 4); // FIXME: why? if (i == 0x721) fillIndex [0x24]++; } foreach (int cp in syriacAlternatives) map [cp] = new CharMapEntry (0x24, (byte) (map [cp - 1].Level1 + 2), 0); // FIXME: Syriac NonSpacingMark should go here. // Thaana // FIXME: it turned out that it does not look like UCA fillIndex [0x24] = 0x6E; fillIndex [0x1] = 0xAC; for (int i = 0; i < orderedThaana.Length; i++) { char c = orderedThaana [i]; if (IsIgnorableNonSpacing ((int) c)) AddCharMap (c, 1, 1); AddCharMap (c, 0x24, 2); if (c == '\u0782') // SPECIAL CASE: why? fillIndex [0x24] += 2; } #endregion // FIXME: Add more culture-specific letters (that are // not supported in Windows collation) here. // Surrogate ... they are computed. #region Hangul // Hangul. // // Unlike UCA Windows Hangul sequence mixes Jongseong // with Choseong sequence as well as Jungseong, // adjusted to have the same primary weight for the // same base character. So it is impossible to compute // those sort keys. // // Here I introduce an ordered sequence of mixed // 'commands' and 'characters' that is similar to // LDML text: // - ',' increases primary weight. // - [A B] means a range, increasing index // - {A B} means a range, without increasing index // - '=' is no operation (it means the characters // of both sides have the same weight). // - '>' inserts a Hangul Syllable block that // contains 0x251 characters. // - '<' decreases the index // - '0'-'9' means skip count // - whitespaces are ignored // string hangulSequence = + "\u1100=\u11A8 > \u1101=\u11A9 >" + "\u11C3, \u11AA, \u11C4, \u1102=\u11AB >" + "<{\u1113 \u1116}, \u3165," + "\u11C5, \u11C6=\u3166,, \u11C7, \u11C8," + "\u11AC, \u11C9, \u11AD, \u1103=\u11AE >" + "<\u1117, \u11CA, \u1104, \u11CB > \u1105=\u11AF >" + "<{\u1118 \u111B}, \u11B0, [\u11CC \u11D0], \u11B1," + "[\u11D1 \u11D2], \u11B2," + "[\u11D3 \u11D5], \u11B3," + "[\u11D6 \u11D7], \u11B4, \u11B5," + "\u11B6=\u11D8, \u3140,, \u11D9, \u1106=\u11B7 >" + "<{\u111C \u111D}, [\u11DA \u11E2], \u1107=\u11B8 >" + "<{\u111E \u1120}, \u3172,, \u3173, \u11E3, \u1108 >" + "<{\u1121 \u112C}, \u3144 \u11B9, \u3174, \u3175,,,, " + "\u3176,, \u3177, [\u11E4 \u11E6] \u3178," + "\u3179, \u1109=\u11BA,,, \u3214=\u3274 <>" + "<{\u112D \u1133}, \u11E7 \u317A, \u317B, \u317C " + "[\u11E8 \u11E9],, \u11EA \u317D,, \u110A=\u11BB,,, >" + "<{\u1134 \u1140}, \u317E,,,,,,, \u11EB," + "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >" + "<{\u1141 \u114C}, \u3180=\u11EE, \u11EC, \u11ED,,,,, " + "\u11F1,, \u11F2,,," + "\u11EF,,, \u3181=\u11F0, \u110C=\u11BD,, >" + "<\u114D, \u110D,, >" + "<{\u114E \u1151},, \u110E=\u11BE,, >" + "<{\u1152 \u1155},,, \u110F=\u11BF >" + "\u1110=\u11C0 > \u1111=\u11C1 >" + "<\u1156=\u1157, \u11F3, \u11F4, \u1112=\u11C2 >" + "<\u1158=\u1159=\u115F, \u3185, \u11F9," + "[\u11F5 \u11F8]" ; byte hangulCat = 0x52; fillIndex [hangulCat] = 0x2; int syllableBlock = 0; for (int n = 0; n < hangulSequence.Length; n++) { char c = hangulSequence [n]; int start, end; if (Char.IsWhiteSpace (c)) continue; switch (c) { case '=': break; // NOP case ',': IncrementSequentialIndex (ref hangulCat); break; case '<': if (fillIndex [hangulCat] == 2) throw new Exception ("FIXME: handle it correctly (yes it is hacky, it is really unfortunate)."); fillIndex [hangulCat]--; break; case '>': IncrementSequentialIndex (ref hangulCat); for (int l = 0; l < 0x15; l++) for (int v = 0; v < 0x1C; v++) { AddCharMap ( (char) (0xAC00 + syllableBlock * 0x1C * 0x15 + l * 0x1C + v), hangulCat, 0); IncrementSequentialIndex (ref hangulCat); } syllableBlock++; break; case '[': start = hangulSequence [n + 1]; end = hangulSequence [n + 3]; for (int i = start; i <= end; i++) { AddCharMap ((char) i, hangulCat, 0); if (end > i) IncrementSequentialIndex (ref hangulCat); } n += 4; // consumes 5 characters for this operation break; case '{': start = hangulSequence [n + 1]; end = hangulSequence [n + 3]; for (int i = start; i <= end; i++) AddCharMap ((char) i, hangulCat, 0); n += 4; // consumes 5 characters for this operation break; default: AddCharMap (c, hangulCat, 0); break; } } // Some Jamo NFKD. for (int i = 0x3200; i < 0x3300; i++) { if (IsIgnorable (i) || map [i].Defined) continue; int ch = 0; // w/ bracket if (decompLength [i] == 4 && decompValues [decompIndex [i]] == '(') ch = decompIndex [i] + 1; // circled else if (decompLength [i] == 2 && decompValues [decompIndex [i] + 1] == '\u1161') ch = decompIndex [i]; else if (decompLength [i] == 1) ch = decompIndex [i]; else continue; ch = decompValues [ch]; if (ch < 0x1100 || 0x1200 < ch && ch < 0xAC00 || 0xD800 < ch) continue; // SPECIAL CASE ? int offset = i < 0x3260 ? 1 : 0; if (0x326E <= i && i <= 0x3273) offset = 1; map [i] = new CharMapEntry (map [ch].Category, (byte) (map [ch].Level1 + offset), map [ch].Level2); // Console.Error.WriteLine ("Jamo {0:X04} -> {1:X04}", i, decompValues [decompIndex [i] + 1]); } #endregion // Letterlike characters and CJK compatibility square sortableCharNames.Sort (StringDictionaryValueComparer.Instance); int [] counts = new int ['Z' - 'A' + 1]; char [] namedChars = new char [sortableCharNames.Count]; int nCharNames = 0; foreach (DictionaryEntry de in sortableCharNames) { counts [((string) de.Value) [0] - 'A']++; namedChars [nCharNames++] = (char) ((int) de.Key); } nCharNames = 0; // reset for (int a = 0; a < counts.Length; a++) { fillIndex [0xE] = (byte) (alphaWeights [a + 1] - counts [a]); for (int i = 0; i < counts [a]; i++) //Console.Error.WriteLine ("---- {0:X04} : {1:x02} / {2} {3}", (int) namedChars [nCharNames], fillIndex [0xE], ((DictionaryEntry) sortableCharNames [nCharNames]).Value, Char.GetUnicodeCategory (namedChars [nCharNames])); AddCharMap (namedChars [nCharNames++], 0xE, 1); } // CJK unified ideograph. byte cjkCat = 0x9E; fillIndex [cjkCat] = 0x2; for (int cp = 0x4E00; cp <= 0x9FBB; cp++) if (!IsIgnorable (cp)) AddCharMapGroupCJK ((char) cp, ref cjkCat); // CJK Extensions goes here. // LAMESPEC: With this Windows style CJK layout, it is // impossible to add more CJK ideograph i.e. 0x9FA6- // 0x9FBB can never be added w/o breaking compat. for (int cp = 0xF900; cp <= 0xFA2D; cp++) if (!IsIgnorable (cp)) AddCharMapGroupCJK ((char) cp, ref cjkCat); // PrivateUse ... computed. // remaining Surrogate ... computed. #region 07 - ASCII non-alphanumeric + 3001, 3002 // 07 // non-alphanumeric ASCII except for: + - < = > ' for (int i = 0x21; i < 0x7F; i++) { // SPECIAL CASE: 02C6 looks regarded as // equivalent to '^', which does not conform // to Unicode standard character database. if (i == 0x005B) AddCharMap ('\u2045', 0x7, 0, 0x1C); if (i == 0x005D) AddCharMap ('\u2046', 0x7, 0, 0x1C); if (i == 0x005E) AddCharMap ('\u02C6', 0x7, 0, 3); if (i == 0x0060) AddCharMap ('\u02CB', 0x7, 0, 3); if (Char.IsLetterOrDigit ((char) i) || "+-<=>'".IndexOf ((char) i) >= 0) continue; // they are not added here. AddCharMapGroup2 ((char) i, 0x7, 1, 0); // Insert 3001 after ',' and 3002 after '.' if (i == 0x2C) AddCharMapGroup2 ('\u3001', 0x7, 1, 0); else if (i == 0x2E) AddCharMapGroup2 ('\u3002', 0x7, 1, 0); else if (i == 0x3A) AddCharMap ('\uFE30', 0x7, 1, 0); } #endregion #region 07 - Punctuations and something else for (int i = 0xA0; i < char.MaxValue; i++) { if (IsIgnorable (i)) continue; // FIXME: actually those reset should not be // done but here I put for easy goal. if (i == 0x05C3) fillIndex [0x7]++; if (i == 0x0700) fillIndex [0x7] = 0xE2; if (i == 0x2016) fillIndex [0x7] = 0x77; if (i == 0x3008) fillIndex [0x7] = 0x93; if (0x02C8 <= i && i <= 0x02CD) continue; // nonspacing marks // SPECIAL CASE: maybe they could be allocated // dummy NFKD mapping and no special processing // would be required here. if (i == 0x00AF) AddCharMap ('\u02C9', 0x7, 0, 3); if (i == 0x00B4) AddCharMap ('\u02CA', 0x7, 0, 3); if (i == 0x02C7) AddCharMap ('\u02D8', 0x7, 0, 3); // SPECIAL CASES: switch (i) { case 0xAB: // 08 case 0xB7: // 0A case 0xBB: // 08 case 0x02B9: // 01 case 0x02BA: // 01 case 0x2329: // 09 case 0x232A: // 09 continue; } switch (Char.GetUnicodeCategory ((char) i)) { case UnicodeCategory.OtherPunctuation: case UnicodeCategory.ClosePunctuation: case UnicodeCategory.OpenPunctuation: case UnicodeCategory.ConnectorPunctuation: case UnicodeCategory.InitialQuotePunctuation: case UnicodeCategory.FinalQuotePunctuation: case UnicodeCategory.ModifierSymbol: // SPECIAL CASES: // 0xA if (0x2020 <= i && i <= 0x2031) continue; if (i == 0x3003) // added later continue; AddCharMapGroup2 ((char) i, 0x7, 1, 0); break; default: if (i == 0xA6 || i == 0x1C3 || i == 0x037A) // SPECIAL CASE. FIXME: why? goto case UnicodeCategory.OtherPunctuation; break; } } // Control pictures // FIXME: it should not need to reset level 1, but // it's for easy goal. fillIndex [0x7] = 0xB6; for (int i = 0x2400; i <= 0x2424; i++) AddCharMap ((char) i, 0x7, 1, 0); // FIXME: what are they? AddCharMap ('\u3003', 0x7, 1); AddCharMap ('\u3006', 0x7, 1); AddCharMap ('\u02D0', 0x7, 1); AddCharMap ('\u10FB', 0x7, 1); AddCharMap ('\u0950', 0x7, 1); AddCharMap ('\u093D', 0x7, 1); AddCharMap ('\u0964', 0x7, 1); AddCharMap ('\u0965', 0x7, 1); AddCharMap ('\u0970', 0x7, 1); #endregion #region category 08 - symbols fillIndex [0x8] = 2; // Here Windows mapping is not straightforward. It is // not based on computation but seems manual sorting. AddCharMapGroup ('+', 0x8, 1, 0); // plus AddCharMapGroup ('\u2212', 0x8, 1, 0); // minus AddCharMapGroup ('\u229D', 0x8, 1, 0); // minus AddCharMapGroup ('\u2297', 0x8, 1, 0); // mul AddCharMapGroup ('\u2044', 0x8, 1, 0); // div AddCharMapGroup ('\u2215', 0x8, 1, 0); // div AddCharMapGroup ('\u2217', 0x8, 1, 0); // mul AddCharMapGroup ('\u2218', 0x8, 1, 0); // ring AddCharMapGroup ('\u2219', 0x8, 1, 0); // bullet AddCharMapGroup ('\u2213', 0x8, 1, 0); // minus-or-plus AddCharMapGroup ('\u003C', 0x8, 1, 0); // < AddCharMapGroup ('\u227A', 0x8, 1, 0); // precedes relation AddCharMapGroup ('\u22B0', 0x8, 1, 0); // precedes under relation for (int cp = 0; cp < 0x2300; cp++) { if (cp == 0xAC) // SPECIAL CASE: skip continue; if (cp == 0x200) { cp = 0x2200; // skip to 2200 fillIndex [0x8] = 0x21; } if (cp == 0x2295) fillIndex [0x8] = 0x3; if (cp == 0x22A2) fillIndex [0x8] = 0xAB; if (cp == 0x22B2) fillIndex [0x8] = 0xB9; if (!map [cp].Defined && // Char.GetUnicodeCategory ((char) cp) == // UnicodeCategory.MathSymbol) Char.IsSymbol ((char) cp)) AddCharMapGroup ((char) cp, 0x8, 1, diacritical [cp]); // SPECIAL CASES: no idea why Windows sorts as such switch (cp) { case 0x3E: AddCharMap ('\u227B', 0x8, 1, 0); AddCharMap ('\u22B1', 0x8, 1, 0); break; case 0xB1: AddCharMapGroup ('\u00AB', 0x8, 1, 0); AddCharMapGroup ('\u226A', 0x8, 1, 0); AddCharMapGroup ('\u00BB', 0x8, 1, 0); AddCharMapGroup ('\u226B', 0x8, 1, 0); break; case 0xF7: AddCharMap ('\u01C0', 0x8, 1, 0); AddCharMap ('\u01C1', 0x8, 1, 0); AddCharMap ('\u01C2', 0x8, 1, 0); break; } } #endregion #region Hack! // Characters w/ diacritical marks (NFKD) for (int i = 0; i <= char.MaxValue; i++) { if (map [i].Defined || IsIgnorable (i)) continue; if (decompIndex [i] == 0) continue; int start = decompIndex [i]; int primaryChar = decompValues [start]; int secondary = diacritical [i]; bool skip = false; int length = decompLength [i]; // special processing for parenthesized ones. if (length == 3 && decompValues [start] == '(' && decompValues [start + 2] == ')') { primaryChar = decompValues [start + 1]; length = 1; } if (map [primaryChar].Level1 == 0) continue; for (int l = 1; l < length; l++) { int c = decompValues [start + l]; if (map [c].Level1 != 0) skip = true; secondary += diacritical [c]; } if (skip) continue; map [i] = new CharMapEntry ( map [primaryChar].Category, map [primaryChar].Level1, (byte) secondary); } // Diacritical weight adjustment // Arabic Hamzah diacritical [0x624] = 0x5; diacritical [0x626] = 0x7; diacritical [0x622] = 0x9; diacritical [0x623] = 0xA; diacritical [0x625] = 0xB; diacritical [0x649] = 0x5; // 'alif maqs.uurah diacritical [0x64A] = 0x7; // Yaa' for (int i = 0; i < char.MaxValue; i++) { byte mod = 0; byte cat = map [i].Category; switch (cat) { case 0xE: // Latin diacritics case 0x22: // Japanese: circled characters mod = diacritical [i]; break; case 0x13: // Arabic if (diacritical [i] == 0 && i >= 0xFE8D) mod = 0x8; // default for arabic break; } if (0x52 <= cat && cat <= 0x7F) // Hangul mod = diacritical [i]; if (mod > 0) map [i] = new CharMapEntry ( cat, map [i].Level1, mod); } // FIXME: this is halfly hack but those NonSpacingMark // characters and still undefined are likely to // be nonspacing. for (int i = 0; i < char.MaxValue; i++) { if (map [i].Defined || IsIgnorable (i)) continue; switch (i) { // SPECIAL CASES. case 0x02B9: case 0x02BA: break; default: if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark) continue; break; } if (diacritical [i] != 0) map [i] = new CharMapEntry (1, 1, diacritical [i]); else AddCharMap ((char) i, 1, 1); } #endregion } private void IncrementSequentialIndex (ref byte hangulCat) { fillIndex [hangulCat]++; if (fillIndex [hangulCat] == 0) { // overflown hangulCat++; fillIndex [hangulCat] = 0x2; } } // Reset fillIndex to fixed value and call AddLetterMap(). private void AddAlphaMap (char c, byte category, byte alphaWeight) { fillIndex [category] = alphaWeight; AddLetterMap (c, category, 0); ArrayList al = latinMap [c] as ArrayList; if (al == null) return; foreach (int cp in al) AddLetterMap ((char) cp, category, 0); } private void AddKanaMap (int i, byte voices) { for (byte b = 0; b < voices; b++) { char c = (char) (i + b); byte arg = (byte) (b > 0 ? b + 2 : 0); // Hiragana AddLetterMapCore (c, 0x22, 0, arg, false); // Katakana AddLetterMapCore ((char) (c + 0x60), 0x22, 0, arg, false); } } private void AddLetterMap (char c, byte category, byte updateCount) { AddLetterMapCore (c, category, updateCount, 0, true); } private void AddLetterMapCore (char c, byte category, byte updateCount, byte level2, bool deferLevel2) { char c2; // updates index c2 = ToSmallForm (c); if (c2 != c) AddCharMapGroup (c2, category, updateCount, level2, deferLevel2); c2 = Char.ToLower (c, CultureInfo.InvariantCulture); if (c2 != c && !map [(int) c2].Defined) AddLetterMapCore (c2, category, 0, level2, deferLevel2); bool doUpdate = true; if (IsIgnorable ((int) c) || map [(int) c].Defined) doUpdate = false; else AddCharMapGroup (c, category, 0, level2, deferLevel2); if (doUpdate) fillIndex [category] += updateCount; } private bool AddCharMap (char c, byte category, byte increment) { return AddCharMap (c, category, increment, 0); } private bool AddCharMap (char c, byte category, byte increment, byte alt) { if (IsIgnorable ((int) c) || map [(int) c].Defined) return false; // do nothing map [(int) c] = new CharMapEntry (category, category == 1 ? alt : fillIndex [category], category == 1 ? fillIndex [category] : alt); fillIndex [category] += increment; return true; } // // Adds characters to table in the order below // (+ increases weight): // ( +) // itself // // | | // | (| ) // + // (vertical +) // // level2 is fixed (does not increase). int [] sameWeightItems = new int [] { DecompositionFraction, DecompositionFull, DecompositionSuper, DecompositionSub, DecompositionCircle, DecompositionWide, DecompositionNarrow, }; private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2) { AddCharMapGroup (c, category, updateCount, level2, false); } private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2, bool deferLevel2) { if (map [(int) c].Defined) return; if (deferLevel2) level2 = diacritical [(int) c]; char small = char.MinValue; char vertical = char.MinValue; Hashtable nfkd = (Hashtable) nfkdMap [(int) c]; if (nfkd != null) { object smv = nfkd [(byte) DecompositionSmall]; if (smv != null) small = (char) ((int) smv); object vv = nfkd [(byte) DecompositionVertical]; if (vv != null) vertical = (char) ((int) vv); } // updates index if (small != char.MinValue) { if (level2 == 0 && deferLevel2) level2 = diacritical [small]; AddCharMap (small, category, updateCount, level2); } // itself AddCharMap (c, category, 0, level2); if (nfkd != null) { foreach (int weight in sameWeightItems) { object wv = nfkd [(byte) weight]; if (wv != null) { if (deferLevel2) level2 = diacritical [(int) wv]; AddCharMap ((char) ((int) wv), category, 0, level2); } } } // update index here. fillIndex [category] += updateCount; if (vertical != char.MinValue) { if (level2 == 0 && deferLevel2) level2 = diacritical [vertical]; AddCharMap (vertical, category, updateCount, level2); } } private void AddCharMapCJK (char c, ref byte category) { AddCharMap (c, category, 0, 0); IncrementSequentialIndex (ref category); // Special. I wonder why but Windows skips 9E F9. if (category == 0x9E && fillIndex [category] == 0xF9) IncrementSequentialIndex (ref category); } private void AddCharMapGroupCJK (char c, ref byte category) { AddCharMapCJK (c, ref category); // LAMESPEC: see below. if (c == '\u5B78') { AddCharMapCJK ('\u32AB', ref category); AddCharMapCJK ('\u323B', ref category); } if (c == '\u52DE') { AddCharMapCJK ('\u3298', ref category); AddCharMapCJK ('\u3238', ref category); } if (c == '\u5BEB') AddCharMapCJK ('\u32A2', ref category); if (c == '\u91AB') // Especially this mapping order totally does // not make sense to me. AddCharMapCJK ('\u32A9', ref category); Hashtable nfkd = (Hashtable) nfkdMap [(int) c]; if (nfkd == null) return; for (byte weight = 0; weight <= 0x12; weight++) { object wv = nfkd [weight]; if (wv == null) continue; int w = (int) wv; // Special: they are ignored in this area. // FIXME: check if it is sane if (0xF900 <= w && w <= 0xFAD9) continue; // LAMESPEC: on Windows some of CJK characters // in 3200-32B0 are incorrectly mapped. They // mix Chinise and Japanese Kanji when // ordering those characters. switch (w) { case 0x32A2: case 0x3298: case 0x3238: case 0x32A9: case 0x323B: case 0x32AB: continue; } AddCharMapCJK ((char) w, ref category); } } // For now it is only for 0x7 category. private void AddCharMapGroup2 (char c, byte category, byte updateCount, byte level2) { if (map [(int) c].Defined) return; bool updateWeight = false; // Process in advance (lower primary weight) for (int c2 = 0; c2 < char.MaxValue; c2++) { if (!map [c2].Defined && decompLength [c2] == 1 && (int) (decompValues [decompIndex [c2]]) == (int) c) { switch (decompType [c2]) { case DecompositionSmall: updateWeight = true; AddCharMap ((char) c2, category, 0, level2); break; } } } if (updateWeight) fillIndex [category] = (byte) (fillIndex [category] + updateCount); // Identical weight for (int c2 = 0; c2 < char.MaxValue; c2++) { if (!map [c2].Defined && decompLength [c2] == 1 && (int) (decompValues [decompIndex [c2]]) == (int) c) { switch (decompType [c2]) { case DecompositionSub: case DecompositionSuper: case DecompositionWide: case DecompositionNarrow: AddCharMap ((char) c2, category, 0, level2); break; } } } // itself AddCharMap (c, category, updateCount, level2); // Since nfkdMap is problematic to have two or more // NFKD to an identical character, here I iterate all. for (int c2 = 0; c2 < char.MaxValue; c2++) { if (!map [c2].Defined && decompLength [c2] == 1 && (int) (decompValues [decompIndex [c2]]) == (int) c) { switch (decompType [c2]) { case DecompositionWide: case DecompositionNarrow: case DecompositionSmall: case DecompositionSub: case DecompositionSuper: continue; default: AddCharMap ((char) c2, category, updateCount, level2); break; } } } } private void AddArabicCharMap (char c) { byte category = 6; byte updateCount = 1; byte level2 = 0; // itself AddCharMap (c, category, 0, level2); // Since nfkdMap is problematic to have two or more // NFKD to an identical character, here I iterate all. for (int c2 = 0; c2 < char.MaxValue; c2++) { if (decompLength [c2] == 0) continue; int idx = decompIndex [c2] + decompLength [c2] - 1; if ((int) (decompValues [idx]) == (int) c) AddCharMap ((char) c2, category, 0, level2); } fillIndex [category] += updateCount; } char ToSmallForm (char c) { return ToDecomposed (c, DecompositionSmall, false); } char ToDecomposed (char c, byte d, bool tail) { if (decompType [(int) c] != d) return c; int idx = decompIndex [(int) c]; if (tail) idx += decompLength [(int) c] - 1; return (char) decompValues [idx]; } bool ExistsJIS (int cp) { foreach (JISCharacter j in jisJapanese) if (j.CP == cp) return true; return false; } #endregion #region Level 3 properties (Case/Width) private byte ComputeLevel3Weight (char c) { byte b = ComputeLevel3WeightRaw (c); return b > 0 ? (byte) (b + 2) : b; } private byte ComputeLevel3WeightRaw (char c) // add 2 for sortkey value { // CJK compat if ('\u3192' <= c && c <= '\u319F') return 0; // They have NFKD mapping, and on Windows // those narrow characters are regarded as "normal", // thus those characters themselves are regarded as // "wide". grep "" and you can pick them up // (ignoring Kana, Hangul etc.) switch (c) { case '\u3002': case '\u300C': case '\u300D': case '\u3001': case '\u30FB': case '\u2502': case '\u2190': case '\u2191': case '\u2192': case '\u2193': case '\u25A0': case '\u25CB': return 1; } // Korean if ('\u11A8' <= c && c <= '\u11F9') return 2; if ('\uFFA0' <= c && c <= '\uFFDC') return 4; if ('\u3130' <= c && c <= '\u3164') return 5; if ('\u3165' <= c && c <= '\u318E') return 4; // Georgian Capital letters if ('\u10A0' <= c && c <= '\u10C5') return 0x10; // numbers if ('\u2776' <= c && c <= '\u277F') return 4; if ('\u2780' <= c && c <= '\u2789') return 8; if ('\u2776' <= c && c <= '\u2793') return 0xC; if ('\u2160' <= c && c <= '\u216F') return 0x10; if ('\u2181' <= c && c <= '\u2182') return 0x10; // Arabic if ('\u2135' <= c && c <= '\u2138') return 4; byte [] arabicTmp = new byte [] {0x18, 0, 0x8, 0x10}; if ('\uFEB5' <= c && c < '\uFEED' || '\uFEF1' <= c && c < '\uFEF5') return arabicTmp [c % 4]; if ('\uFE80' <= c && c < '\uFF00') { // 2(Isolated)/8(Final)/0x18(Medial) switch (decompType [(int) c]) { case DecompositionIsolated: return 2; case DecompositionFinal: return 8; case DecompositionMedial: return 0x18; } } // actually I dunno the reason why they have weights. switch (c) { case '\u01BC': return 0x10; case '\u06A9': return 0x20; case '\u06AA': return 0x28; // Gurmukhi case '\u0A39': case '\u0A59': case '\u0A5A': case '\u0A5B': case '\u0A5E': return 0x10; } byte ret = 0; switch (c) { case '\u03C2': case '\u2104': case '\u212B': ret = 8; break; case '\uFE42': ret = 0xA; break; } // misc switch (decompType [(int) c]) { case DecompositionWide: // case DecompositionSub: // case DecompositionSuper: // ret |= decompType [(int) c]; break; } if (isSmallCapital [(int) c]) // grep "SMALL CAPITAL" ret |= 8; if (isUppercase [(int) c]) // DerivedCoreProperties ret |= 0x10; return ret; } #endregion #region IsIgnorable /* static bool IsIgnorable (int i) { if (unicodeAge [i] >= 3.1) return true; switch (char.GetUnicodeCategory ((char) i)) { case UnicodeCategory.OtherNotAssigned: case UnicodeCategory.Format: return true; } return false; } */ // FIXME: In the future use DerivedAge.txt to examine character // versions and set those ones that have higher version than // 1.0 as ignorable. static bool IsIgnorable (int i) { switch (i) { case 0: // I guess, those characters are added between // Unicode 1.0 (LCMapString) and Unicode 3.1 // (UnicodeCategory), so they used to be // something like OtherNotAssigned as of Unicode 1.1. case 0x2df: case 0x387: case 0x3d7: case 0x3d8: case 0x3d9: case 0x3f3: case 0x3f4: case 0x3f5: case 0x3f6: case 0x400: case 0x40d: case 0x450: case 0x45d: case 0x587: case 0x58a: case 0x5c4: case 0x640: case 0x653: case 0x654: case 0x655: case 0x66d: case 0xb56: case 0x1e9b: case 0x202f: case 0x20ad: case 0x20ae: case 0x20af: case 0x20e2: case 0x20e3: case 0x2139: case 0x213a: case 0x2183: case 0x2425: case 0x2426: case 0x2619: case 0x2670: case 0x2671: case 0x3007: case 0x3190: case 0x3191: case 0xfffc: case 0xfffd: return true; // exceptional characters filtered by the // following conditions. Originally those exceptional // ranges are incorrect (they should not be ignored) // and most of those characters are unfortunately in // those ranges. case 0x4d8: case 0x4d9: case 0x4e8: case 0x4e9: case 0x70F: case 0x3036: case 0x303f: case 0x337b: case 0xfb1e: return false; } if ( // The whole Sinhala characters. 0x0D82 <= i && i <= 0x0DF4 // The whole Tibetan characters. || 0x0F00 <= i && i <= 0x0FD1 // The whole Myanmar characters. || 0x1000 <= i && i <= 0x1059 // The whole Etiopic, Cherokee, // Canadian Syllablic, Ogham, Runic, // Tagalog, Hanunoo, Philippine, // Buhid, Tagbanwa, Khmer and Mongorian // characters. || 0x1200 <= i && i <= 0x1DFF // Greek extension characters. || 0x1F00 <= i && i <= 0x1FFF // The whole Braille characters. || 0x2800 <= i && i <= 0x28FF // CJK radical characters. || 0x2E80 <= i && i <= 0x2EF3 // Kangxi radical characters. || 0x2F00 <= i && i <= 0x2FD5 // Ideographic description characters. || 0x2FF0 <= i && i <= 0x2FFB // Bopomofo letter and final || 0x31A0 <= i && i <= 0x31B7 // White square with quadrant characters. || 0x25F0 <= i && i <= 0x25F7 // Ideographic telegraph symbols. || 0x32C0 <= i && i <= 0x32CB || 0x3358 <= i && i <= 0x3370 || 0x33E0 <= i && i <= 0x33FF // The whole YI characters. || 0xA000 <= i && i <= 0xA48C || 0xA490 <= i && i <= 0xA4C6 // American small ligatures || 0xFB13 <= i && i <= 0xFB17 // hebrew, arabic, variation selector. || 0xFB1D <= i && i <= 0xFE2F // Arabic ligatures. || 0xFEF5 <= i && i <= 0xFEFC // FIXME: why are they excluded? || 0x01F6 <= i && i <= 0x01F9 || 0x0218 <= i && i <= 0x0233 || 0x02A9 <= i && i <= 0x02AD || 0x02EA <= i && i <= 0x02EE || 0x0349 <= i && i <= 0x036F || 0x0488 <= i && i <= 0x048F || 0x04D0 <= i && i <= 0x04FF || 0x0500 <= i && i <= 0x050F // actually it matters only for 2.0 || 0x06D6 <= i && i <= 0x06ED || 0x06FA <= i && i <= 0x06FE || 0x2048 <= i && i <= 0x204D || 0x20e4 <= i && i <= 0x20ea || 0x213C <= i && i <= 0x214B || 0x21EB <= i && i <= 0x21FF || 0x22F2 <= i && i <= 0x22FF || 0x237B <= i && i <= 0x239A || 0x239B <= i && i <= 0x23CF || 0x24EB <= i && i <= 0x24FF || 0x2596 <= i && i <= 0x259F || 0x25F8 <= i && i <= 0x25FF || 0x2672 <= i && i <= 0x2689 || 0x2768 <= i && i <= 0x2775 || 0x27d0 <= i && i <= 0x27ff || 0x2900 <= i && i <= 0x2aff || 0x3033 <= i && i <= 0x303F || 0x31F0 <= i && i <= 0x31FF || 0x3250 <= i && i <= 0x325F || 0x32B1 <= i && i <= 0x32BF || 0x3371 <= i && i <= 0x337B || 0xFA30 <= i && i <= 0xFA6A ) return true; UnicodeCategory uc = Char.GetUnicodeCategory ((char) i); switch (uc) { case UnicodeCategory.PrivateUse: case UnicodeCategory.Surrogate: return false; // ignored by nature case UnicodeCategory.Format: case UnicodeCategory.OtherNotAssigned: return true; default: return false; } } // To check IsIgnorable sanity, try the driver below under MS.NET. /* public static void Main () { for (int i = 0; i <= char.MaxValue; i++) Dump (i, IsIgnorable (i)); } static void Dump (int i, bool ignore) { switch (Char.GetUnicodeCategory ((char) i)) { case UnicodeCategory.PrivateUse: case UnicodeCategory.Surrogate: return; // check nothing } string s1 = ""; string s2 = new string ((char) i, 10); int ret = CultureInfo.InvariantCulture.CompareInfo.Compare (s1, s2, CompareOptions.IgnoreCase); if ((ret == 0) == ignore) return; Console.WriteLine ("{0} : {1:x} {2}", ignore ? "o" : "x", i, Char.GetUnicodeCategory ((char) i)); } */ #endregion // IsIgnorable #region IsIgnorableSymbol static bool IsIgnorableSymbol (int i) { if (IsIgnorable (i)) return true; switch (i) { // *Letter case 0x00b5: case 0x01C0: case 0x01C1: case 0x01C2: case 0x01C3: case 0x01F6: case 0x01F7: case 0x01F8: case 0x01F9: case 0x02D0: case 0x02EE: case 0x037A: case 0x03D7: case 0x03F3: case 0x0400: case 0x040d: case 0x0450: case 0x045d: case 0x048C: case 0x048D: case 0x048E: case 0x048F: case 0x0587: case 0x0640: case 0x06E5: case 0x06E6: case 0x06FA: case 0x06FB: case 0x06FC: case 0x093D: case 0x0950: case 0x1E9B: case 0x2139: case 0x3006: case 0x3033: case 0x3034: case 0x3035: case 0xFE7E: case 0xFE7F: // OtherNumber case 0x16EE: case 0x16EF: case 0x16F0: // LetterNumber case 0x2183: // ROMAN NUMERAL REVERSED ONE HUNDRED case 0x3007: // IDEOGRAPHIC NUMBER ZERO case 0x3038: // HANGZHOU NUMERAL TEN case 0x3039: // HANGZHOU NUMERAL TWENTY case 0x303a: // HANGZHOU NUMERAL THIRTY // OtherSymbol case 0x2117: case 0x327F: return true; // ModifierSymbol case 0x02B9: case 0x02BA: case 0x02C2: case 0x02C3: case 0x02C4: case 0x02C5: case 0x02C8: case 0x02CC: case 0x02CD: case 0x02CE: case 0x02CF: case 0x02D2: case 0x02D3: case 0x02D4: case 0x02D5: case 0x02D6: case 0x02D7: case 0x02DE: case 0x02E5: case 0x02E6: case 0x02E7: case 0x02E8: case 0x02E9: case 0x309B: case 0x309C: // OtherPunctuation case 0x055A: // American Apos case 0x05C0: // Hebrew Punct case 0x0E4F: // Thai FONGMAN case 0x0E5A: // Thai ANGKHANKHU case 0x0E5B: // Thai KHOMUT // CurencySymbol case 0x09F2: // Bengali Rupee Mark case 0x09F3: // Bengali Rupee Sign // MathSymbol case 0x221e: // INF. // OtherSymbol case 0x0482: case 0x09FA: case 0x0B70: return false; } // *Letter if (0xFE70 <= i && i < 0xFE7C // ARABIC LIGATURES B #if NET_2_0 || 0x0501 <= i && i <= 0x0510 // CYRILLIC KOMI || 0xFA30 <= i && i < 0xFA70 // CJK COMPAT #endif ) return true; UnicodeCategory uc = Char.GetUnicodeCategory ((char) i); switch (uc) { case UnicodeCategory.Surrogate: return false; // inconsistent case UnicodeCategory.SpacingCombiningMark: case UnicodeCategory.EnclosingMark: case UnicodeCategory.NonSpacingMark: case UnicodeCategory.PrivateUse: // NonSpacingMark if (0x064B <= i && i <= 0x0652) // Arabic return true; return false; case UnicodeCategory.Format: case UnicodeCategory.OtherNotAssigned: return true; default: bool use = false; // OtherSymbols if ( // latin in a circle 0x249A <= i && i <= 0x24E9 || 0x2100 <= i && i <= 0x2132 // Japanese || 0x3196 <= i && i <= 0x31A0 // Korean || 0x3200 <= i && i <= 0x321C // Chinese/Japanese || 0x322A <= i && i <= 0x3243 // CJK || 0x3260 <= i && i <= 0x32B0 || 0x32D0 <= i && i <= 0x3357 || 0x337B <= i && i <= 0x33DD ) use = !Char.IsLetterOrDigit ((char) i); if (use) return false; // This "Digit" rule is mystery. // It filters some symbols out. if (Char.IsLetterOrDigit ((char) i)) return false; if (Char.IsNumber ((char) i)) return false; if (Char.IsControl ((char) i) || Char.IsSeparator ((char) i) || Char.IsPunctuation ((char) i)) return true; if (Char.IsSymbol ((char) i)) return true; // FIXME: should check more return false; } } // To check IsIgnorableSymbol sanity, try the driver below under MS.NET. /* public static void Main () { CompareInfo ci = CultureInfo.InvariantCulture.CompareInfo; for (int i = 0; i <= char.MaxValue; i++) { UnicodeCategory uc = Char.GetUnicodeCategory ((char) i); if (uc == UnicodeCategory.Surrogate) continue; bool ret = IsIgnorableSymbol (i); string s1 = "TEST "; string s2 = "TEST " + (char) i; int result = ci.Compare (s1, s2, CompareOptions.IgnoreSymbols); if (ret != (result == 0)) Console.WriteLine ("{0} : {1:x}[{2}]({3})", ret ? "should not ignore" : "should ignore", i,(char) i, uc); } } */ #endregion #region NonSpacing static bool IsIgnorableNonSpacing (int i) { if (IsIgnorable (i)) return true; switch (i) { case 0x02C8: case 0x02DE: case 0x0559: case 0x055A: case 0x05C0: case 0x0ABD: case 0x0CD5: case 0x0CD6: case 0x309B: case 0x309C: case 0xFF9E: case 0xFF9F: return true; case 0x02D0: case 0x0670: case 0x0901: case 0x0902: case 0x094D: case 0x0962: case 0x0963: case 0x0A41: case 0x0A42: case 0x0A47: case 0x0A48: case 0x0A4B: case 0x0A4C: case 0x0A81: case 0x0A82: case 0x0B82: case 0x0BC0: case 0x0CBF: case 0x0CC6: case 0x0CCC: case 0x0CCD: case 0x0E4E: return false; } if (0x02b9 <= i && i <= 0x02c5 || 0x02cc <= i && i <= 0x02d7 || 0x02e4 <= i && i <= 0x02ef || 0x20DD <= i && i <= 0x20E0 ) return true; if (0x064B <= i && i <= 0x00652 || 0x0941 <= i && i <= 0x0948 || 0x0AC1 <= i && i <= 0x0ACD || 0x0C3E <= i && i <= 0x0C4F || 0x0E31 <= i && i <= 0x0E3F ) return false; return Char.GetUnicodeCategory ((char) i) == UnicodeCategory.NonSpacingMark; } // We can reuse IsIgnorableSymbol testcode // for IsIgnorableNonSpacing. #endregion } struct CharMapEntry { public byte Category; public byte Level1; public byte Level2; // It is always single byte. public bool Defined; public CharMapEntry (byte category, byte level1, byte level2) { Category = category; Level1 = level1; Level2 = level2; Defined = true; } } class JISCharacter { public readonly int CP; public readonly int JIS; public JISCharacter (int cp, int cpJIS) { CP = cp; JIS = cpJIS; } } class JISComparer : IComparer { public static readonly JISComparer Instance = new JISComparer (); public int Compare (object o1, object o2) { JISCharacter j1 = (JISCharacter) o1; JISCharacter j2 = (JISCharacter) o2; return j1.JIS - j2.JIS; } } class NonJISCharacter { public readonly int CP; public readonly string Name; public NonJISCharacter (int cp, string name) { CP = cp; Name = name; } } class NonJISComparer : IComparer { public static readonly NonJISComparer Instance = new NonJISComparer (); public int Compare (object o1, object o2) { NonJISCharacter j1 = (NonJISCharacter) o1; NonJISCharacter j2 = (NonJISCharacter) o2; return string.CompareOrdinal (j1.Name, j2.Name); } } class DecimalDictionaryValueComparer : IComparer { public static readonly DecimalDictionaryValueComparer Instance = new DecimalDictionaryValueComparer (); private DecimalDictionaryValueComparer () { } public int Compare (object o1, object o2) { DictionaryEntry e1 = (DictionaryEntry) o1; DictionaryEntry e2 = (DictionaryEntry) o2; // FIXME: in case of 0, compare decomposition categories int ret = Decimal.Compare ((decimal) e1.Value, (decimal) e2.Value); if (ret != 0) return ret; int i1 = (int) e1.Key; int i2 = (int) e2.Key; return i1 - i2; } } class StringDictionaryValueComparer : IComparer { public static readonly StringDictionaryValueComparer Instance = new StringDictionaryValueComparer (); private StringDictionaryValueComparer () { } public int Compare (object o1, object o2) { DictionaryEntry e1 = (DictionaryEntry) o1; DictionaryEntry e2 = (DictionaryEntry) o2; int ret = String.Compare ((string) e1.Value, (string) e2.Value); if (ret != 0) return ret; int i1 = (int) e1.Key; int i2 = (int) e2.Key; return i1 - i2; } } class UCAComparer : IComparer { public static readonly UCAComparer Instance = new UCAComparer (); private UCAComparer () { } public int Compare (object o1, object o2) { char i1 = (char) o1; char i2 = (char) o2; int l1 = CollationElementTable.GetSortKeyCount (i1); int l2 = CollationElementTable.GetSortKeyCount (i2); int l = l1 > l2 ? l2 : l1; for (int i = 0; i < l; i++) { SortKeyValue k1 = CollationElementTable.GetSortKey (i1, i); SortKeyValue k2 = CollationElementTable.GetSortKey (i2, i); int v = k1.Primary - k2.Primary; if (v != 0) return v; v = k1.Secondary - k2.Secondary; if (v != 0) return v; v = k1.Thirtiary - k2.Thirtiary; if (v != 0) return v; v = k1.Quarternary - k2.Quarternary; if (v != 0) return v; } return l1 - l2; } } class Tailoring { int lcid; int alias; bool frenchSort; ArrayList items = new ArrayList (); public Tailoring (int lcid) : this (lcid, 0) { } public Tailoring (int lcid, int alias) { this.lcid = lcid; this.alias = alias; } public int LCID { get { return lcid; } } public int Alias { get { return alias; } } public bool FrenchSort { get { return frenchSort; } set { frenchSort = value; } } public void AddDiacriticalMap (byte target, byte replace) { items.Add (new DiacriticalMap (target, replace)); } public void AddSortKeyMap (string source, byte [] sortkey) { items.Add (new SortKeyMap (source, sortkey)); } public void AddReplacementMap (string source, string replace) { items.Add (new ReplacementMap (source, replace)); } public char [] ItemToCharArray () { ArrayList al = new ArrayList (); foreach (ITailoringMap m in items) al.AddRange (m.ToCharArray ()); return al.ToArray (typeof (char)) as char []; } interface ITailoringMap { char [] ToCharArray (); } class DiacriticalMap : ITailoringMap { public readonly byte Target; public readonly byte Replace; public DiacriticalMap (byte target, byte replace) { Target = target; Replace = replace; } public char [] ToCharArray () { char [] ret = new char [3]; ret [0] = (char) 02; // kind:DiacriticalMap ret [1] = (char) Target; ret [2] = (char) Replace; return ret; } } class SortKeyMap : ITailoringMap { public readonly string Source; public readonly byte [] SortKey; public SortKeyMap (string source, byte [] sortkey) { Source = source; SortKey = sortkey; } public char [] ToCharArray () { char [] ret = new char [Source.Length + 7]; ret [0] = (char) 01; // kind:SortKeyMap for (int i = 0; i < Source.Length; i++) ret [i + 1] = Source [i]; // null terminate for (int i = 0; i < 4; i++) ret [i + Source.Length + 2] = (char) SortKey [i]; return ret; } } class ReplacementMap : ITailoringMap { public readonly string Source; public readonly string Replace; public ReplacementMap (string source, string replace) { Source = source; Replace = replace; } public char [] ToCharArray () { char [] ret = new char [Source.Length + Replace.Length + 3]; ret [0] = (char) 03; // kind:ReplaceMap int pos = 1; for (int i = 0; i < Source.Length; i++) ret [pos++] = Source [i]; // null terminate pos++; for (int i = 0; i < Replace.Length; i++) ret [pos++] = Replace [i]; // null terminate return ret; } } } }