// // // There are two kind of sort keys : which are computed and which are laid out // as an indexed array. Computed sort keys are: // // - Surrogate // - PrivateUse // // Also, for composite characters it should prepare different index table. // // Though it is possible to "compute" level 3 weights, they are still dumped // to an array to avoid execution cost. // // // * sortkey getter signature // // int GetSortKey (string s, int index, SortKeyBuffer buf) // Stores sort key for corresponding character element into buf and // returns the length of the consumed _source_ character element in s. // // * character length to consume // // If there are characters whose primary weight is 0, they are consumed // and considered as a part of the character element. // using System; using System.IO; using System.Collections; using System.Globalization; using System.Xml; namespace Mono.Globalization.Unicode { internal class MSCompatSortKeyTableGenerator { public static void Main (string [] args) { new MSCompatSortKeyTableGenerator ().Run (args); } const int DecompositionWide = 1; // fixed const int DecompositionSub = 2; // fixed const int DecompositionSmall = 3; const int DecompositionIsolated = 4; const int DecompositionInitial = 5; const int DecompositionFinal = 6; const int DecompositionMedial = 7; const int DecompositionNoBreak = 8; const int DecompositionVertical = 9; const int DecompositionFraction = 0xA; const int DecompositionFont = 0xB; const int DecompositionSuper = 0xC; // fixed const int DecompositionFull = 0xE; const int DecompositionNarrow = 0xD; const int DecompositionCircle = 0xF; const int DecompositionSquare = 0x10; const int DecompositionCompat = 0x11; const int DecompositionCanonical = 0x12; TextWriter Result = Console.Out; byte [] fillIndex = new byte [256]; // by category CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1]; char [] specialIgnore = new char [] { '\u3099', '\u309A', '\u309B', '\u309C', '\u0BCD', '\u0E47', '\u0E4C', '\uFF9E', '\uFF9F' }; // FIXME: need more love (as always) char [] alphabets = new char [] {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '\u0292', '\u01BE', '\u0298'}; byte [] alphaWeights = new byte [] { 2, 9, 0xA, 0x1A, 0x21, 0x23, 0x25, 0x2C, 0x32, 0x35, 0x36, 0x48, 0x51, 0x70, 0x7C, 0x7E, 0x89, 0x8A, 0x91, 0x99, 0x9F, 0xA2, 0xA4, 0xA6, 0xA7, 0xA9, 0xAA, 0xB3, 0xB4}; bool [] isSmallCapital = new bool [char.MaxValue + 1]; bool [] isUppercase = new bool [char.MaxValue + 1]; byte [] decompType = new byte [char.MaxValue + 1]; int [] decompIndex = new int [char.MaxValue + 1]; int [] decompLength = new int [char.MaxValue + 1]; int [] decompValues; decimal [] decimalValue = new decimal [char.MaxValue + 1]; byte [] diacritical = new byte [char.MaxValue + 1]; string [] diacritics = new string [] { // LATIN "WITH ACUTE;", "WITH GRAVE;", " DOT ABOVE;", " MIDDLE DOT;", "WITH CIRCUMFLEX;", "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;", " DIALYTIKA AND TONOS;", "WITH MACRON;", "WITH TILDE;", " RING ABOVE;", " OGONEK;", " CEDILLA;", " DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;", " STROKE;", " CIRCUMFLEX AND ACUTE;", " DIAERESIS AND ACUTE;", "WITH CIRCUMFLEX AND GRAVE;", " L SLASH;", " DIAERESIS AND GRAVE;", " BREVE AND ACUTE;", " CARON AND DOT ABOVE;", " BREVE AND GRAVE;", " MACRON AND ACUTE;", " MACRON AND GRAVE;", " DIAERESIS AND CARON", " DOT ABOVE AND MACRON", " TILDE AND ACUTE", " RING ABOVE AND ACUTE", " DIAERESIS AND MACRON", " CEDILLA AND ACUTE", " MACRON AND DIAERESIS", " CIRCUMFLEX AND TILDE", " TILDE AND DIAERESIS", " STROKE AND ACUTE", " BREVE AND TILDE", " CEDILLA AND BREVE", " OGONEK AND MACRON", " HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;", " DOUBLE GRAVE;", " INVERTED BREVE", " PRECEDED BY APOSTROPHE", " HORN;", " LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE", " PALATAL HOOK", " DOT BELOW;", " RETROFLEX;", "DIAERESIS BELOW", " RING BELOW", " CIRCUMFLEX BELOW", "HORN AND ACUTE", " BREVE BELOW;", " HORN AND GRAVE", " TILDE BELOW", " DOT BELOW AND DOT ABOVE", " RIGHT HALF RING", " HORN AND TILDE", " CIRCUMFLEX AND DOT BELOW", " BREVE AND DOT BELOW", " DOT BELOW AND MACRON", " HORN AND HOOK ABOVE", " HORN AND DOT", // CIRCLED, PARENTHESIZED and so on "CIRCLED DIGIT", "CIRCLED NUMBER", "CIRCLED LATIN", "CIRCLED KATAKANA", "PARENTHESIZED DIGIT", "PARENTHESIZED NUMBER", "PARENTHESIZED LATIN", }; byte [] diacriticWeights = new byte [] { // LATIN. 0xE, 0xF, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1D, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F, 0x20, 0x21, 0x22, 0x22, 0x23, 0x24, 0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30, 0x43, 0x43, 0x43, 0x44, 0x46, 0x48, 0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x5A, 0x60, 0x60, 0x61, 0x61, 0x63, 0x68, 0x69, 0x69, 0x6A, 0x6D, 0x6E, 0x95, 0xAA, // CIRCLED, PARENTHESIZED and so on. 0xEE, 0xEE, 0xEE, 0xEE, 0xF3, 0xF3, 0xF3, 0xF3 }; int [] numberSecondaryWeightBounds = new int [] { 0x660, 0x680, 0x6F0, 0x700, 0x960, 0x970, 0x9E0, 0x9F0, 0x9F4, 0xA00, 0xA60, 0xA70, 0xAE0, 0xAF0, 0xB60, 0xB70, 0xBE0, 0xC00, 0xC60, 0xC70, 0xCE0, 0xCF0, 0xD60, 0xD70, 0xE50, 0xE60, 0xED0, 0xEE0 }; char [] orderedCyrillic; char [] orderedGurmukhi; char [] orderedGujarati; char [] orderedGeorgian; char [] orderedThaana; static readonly char [] orderedTamilConsonants = new char [] { // based on traditional Tamil consonants, except for // Grantha (where Microsoft breaks traditionalism). // http://www.angelfire.com/empire/thamizh/padanGaL '\u0B99', '\u0B9A', '\u0B9E', '\u0B9F', '\u0BA3', '\u0BA4', '\u0BA8', '\u0BAA', '\u0BAE', '\u0BAF', '\u0BB0', '\u0BB2', '\u0BB5', '\u0BB4', '\u0BB3', '\u0BB1', '\u0BA9', '\u0B9C', '\u0BB8', '\u0BB7', '\u0BB9'}; // cp -> character name (only for some characters) ArrayList sortableCharNames = new ArrayList (); // cp -> arrow value (int) ArrayList arrowValues = new ArrayList (); // cp -> box value (int) ArrayList boxValues = new ArrayList (); // cp -> level1 value Hashtable arabicLetterPrimaryValues = new Hashtable (); // letterName -> cp Hashtable arabicNameMap = new Hashtable (); // cp -> Hashtable [decompType] -> cp Hashtable nfkdMap = new Hashtable (); // Latin letter -> ArrayList [int] Hashtable latinMap = new Hashtable (); ArrayList jisJapanese = new ArrayList (); ArrayList nonJisJapanese = new ArrayList (); ushort [] cjkJA = new ushort [char.MaxValue - 0x4E00]; ushort [] cjkCHS = new ushort [char.MaxValue - 0x3100]; ushort [] cjkCHT = new ushort [char.MaxValue - 0x4E00]; ushort [] cjkKO = new ushort [char.MaxValue - 0x4E00]; byte [] cjkKOlv2 = new byte [char.MaxValue - 0x4E00]; byte [] ignorableFlags = new byte [char.MaxValue + 1]; double [] unicodeAge = new double [char.MaxValue + 1]; void Run (string [] args) { string dirname = args.Length == 0 ? "downloaded" : args [0]; FillIgnorables (); ParseSources (dirname); Console.Error.WriteLine ("parse done."); ModifyParsedValues (); GenerateCore (); Console.Error.WriteLine ("generation done."); Serialize (); Console.Error.WriteLine ("serialization done."); } void Serialize () { // Ignorables Result.WriteLine ("static byte [] ignorableFlags = new byte [] {"); for (int i = 0; i <= char.MaxValue; i++) { byte value = ignorableFlags [i]; if (value < 10) Result.Write ("{0},", value); else Result.Write ("0x{0:X02},", value); if ((i & 0xF) == 0xF) Result.WriteLine ("// {0:X04}", i - 0xF); } Result.WriteLine ("};"); Result.WriteLine (); // Primary category Result.WriteLine ("static byte [] categories = new byte [] {"); for (int i = 0; i < map.Length; i++) { byte value = map [i].Category; if (value < 10) Result.Write ("{0},", value); else Result.Write ("0x{0:X02},", value); if ((i & 0xF) == 0xF) Result.WriteLine ("// {0:X04}", i - 0xF); } Result.WriteLine ("};"); Result.WriteLine (); // Primary weight value Result.WriteLine ("static byte [] level1 = new byte [] {"); for (int i = 0; i < map.Length; i++) { byte value = map [i].Level1; if (value < 10) Result.Write ("{0},", value); else Result.Write ("0x{0:X02},", value); if ((i & 0xF) == 0xF) Result.WriteLine ("// {0:X04}", i - 0xF); } Result.WriteLine ("};"); Result.WriteLine (); // Secondary weight Result.WriteLine ("static byte [] level2 = new byte [] {"); for (int i = 0; i < map.Length; i++) { int value = map [i].Level2; if (value < 10) Result.Write ("{0},", value); else Result.Write ("0x{0:X02},", value); if ((i & 0xF) == 0xF) Result.WriteLine ("// {0:X04}", i - 0xF); } Result.WriteLine ("};"); Result.WriteLine (); // Thirtiary weight Result.WriteLine ("static byte [] level3 = new byte [] {"); for (int i = 0; i < map.Length; i++) { byte value = ComputeLevel3Weight ((char) i); if (value < 10) Result.Write ("{0},", value); else Result.Write ("0x{0:X02},", value); if ((i & 0xF) == 0xF) Result.WriteLine ("// {0:X04}", i - 0xF); } Result.WriteLine ("};"); Result.WriteLine (); // Width insensitivity mappings // (for now it is more lightweight than dumping the // entire NFKD table). Result.WriteLine ("static int [] widthCompat = new int [] {"); for (int i = 0; i < char.MaxValue; i++) { int value = 0; switch (decompType [i]) { case DecompositionNarrow: case DecompositionWide: case DecompositionSuper: case DecompositionSub: // they are always 1 char value = decompValues [decompIndex [i]]; break; } if (value < 10) Result.Write ("{0},", value); else Result.Write ("0x{0:X04},", value); if ((i & 0xF) == 0xF) Result.WriteLine ("// {0:X04}", i - 0xF); } Result.WriteLine ("};"); Result.WriteLine (); // CJK SerializeCJK ("cjkCHS", cjkCHS, char.MaxValue); SerializeCJK ("cjkCHT", cjkCHT, 0x9FB0); SerializeCJK ("cjkJA", cjkJA, 0x9FB0); SerializeCJK ("cjkKO", cjkKO, 0x9FB0); SerializeCJK ("cjkKOlv2", cjkKOlv2, 0x9FB0); } void SerializeCJK (string name, ushort [] cjk, int max) { int offset = char.MaxValue - cjk.Length; Result.WriteLine ("static ushort [] {0} = new ushort [] {{", name); for (int i = 0; i < cjk.Length; i++) { if (i + offset == max) break; ushort value = cjk [i]; if (value < 10) Result.Write ("{0},", value); else Result.Write ("0x{0:X04},", value); if ((i & 0xF) == 0xF) Result.WriteLine ("// {0:X04}", i - 0xF + offset); } Result.WriteLine ("};"); Result.WriteLine (); } void SerializeCJK (string name, byte [] cjk, int max) { int offset = char.MaxValue - cjk.Length; Result.WriteLine ("static byte [] {0} = new byte [] {{", name); for (int i = 0; i < cjk.Length; i++) { if (i + offset == max) break; byte value = cjk [i]; if (value < 10) Result.Write ("{0},", value); else Result.Write ("0x{0:X02},", value); if ((i & 0xF) == 0xF) Result.WriteLine ("// {0:X04}", i - 0xF + offset); } Result.WriteLine ("};"); Result.WriteLine (); } #region Parse void ParseSources (string dirname) { string unidata = dirname + "/UnicodeData.txt"; string derivedCoreProps = dirname + "/DerivedCoreProperties.txt"; string scripts = dirname + "/Scripts.txt"; string cp932 = dirname + "/CP932.TXT"; string derivedAge = dirname + "/DerivedAge.txt"; string chXML = dirname + "/common/collation/zh.xml"; string jaXML = dirname + "/common/collation/ja.xml"; string koXML = dirname + "/common/collation/ko.xml"; ParseDerivedAge (derivedAge); ParseJISOrder (cp932); // in prior to ParseUnidata() ParseUnidata (unidata); ParseDerivedCoreProperties (derivedCoreProps); ParseScripts (scripts); ParseCJK (chXML, jaXML, koXML); } void ParseDerivedAge (string filename) { using (StreamReader file = new StreamReader (filename)) { while (file.Peek () >= 0) { string s = file.ReadLine (); int idx = s.IndexOf ('#'); if (idx >= 0) s = s.Substring (0, idx); idx = s.IndexOf (';'); if (idx < 0) continue; string cpspec = s.Substring (0, idx); idx = cpspec.IndexOf (".."); NumberStyles nf = NumberStyles.HexNumber | NumberStyles.AllowTrailingWhite; int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf); int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf); string value = s.Substring (cpspec.Length + 1).Trim (); // FIXME: use index if (cp > char.MaxValue) continue; for (int i = cp; i <= cpEnd; i++) unicodeAge [i] = double.Parse (value); } } } void ParseUnidata (string filename) { ArrayList decompValues = new ArrayList (); using (StreamReader unidata = new StreamReader (filename)) { for (int line = 1; unidata.Peek () >= 0; line++) { try { ProcessUnidataLine (unidata.ReadLine (), decompValues); } catch (Exception) { Console.Error.WriteLine ("**** At line " + line); throw; } } } this.decompValues = (int []) decompValues.ToArray (typeof (int)); } void ProcessUnidataLine (string s, ArrayList decompValues) { int idx = s.IndexOf ('#'); if (idx >= 0) s = s.Substring (0, idx); idx = s.IndexOf (';'); if (idx < 0) return; int cp = int.Parse (s.Substring (0, idx), NumberStyles.HexNumber); string [] values = s.Substring (idx + 1).Split (';'); // FIXME: use index if (cp > char.MaxValue) return; if (IsIgnorable (cp)) return; string name = values [0]; // isSmallCapital if (s.IndexOf ("SMALL CAPITAL") > 0) isSmallCapital [cp] = true; // latin mapping by character name if (s.IndexOf ("LATIN") > 0) { int lidx = s.IndexOf ("LETTER DOTLESS "); int offset = lidx + 15; if (lidx < 0) { lidx = s.IndexOf ("LETTER TURNED "); offset = lidx + 14; } if (lidx < 0) { lidx = s.IndexOf ("LETTER "); offset = lidx + 7; } char c = lidx > 0 ? s [offset] : char.MinValue; if ('A' <= c && c <= 'Z' && (s.Length == offset + 1 || s [offset + 1] == ' ')) { ArrayList entry = (ArrayList) latinMap [c]; if (entry == null) { entry = new ArrayList (); latinMap [c] = entry; } entry.Add (cp); } } // Arrow names if (0x2000 <= cp && cp < 0x3000) { int value = 0; // SPECIAL CASES. FIXME: why? switch (cp) { case 0x21C5: value = -1; break; // E2 case 0x261D: value = 1; break; case 0x27A6: value = 3; break; case 0x21B0: value = 7; break; case 0x21B1: value = 3; break; case 0x21B2: value = 7; break; case 0x21B4: value = 5; break; case 0x21B5: value = 7; break; case 0x21B9: value = -1; break; // E1 case 0x21CF: value = 7; break; case 0x21D0: value = 3; break; } string [] arrowTargets = new string [] { "", "UPWARDS", "NORTH EAST", "RIGHTWARDS", "SOUTH EAST", "DOWNWARDS", "SOUTH WEST", "LEFTWARDS", "NORTH WEST", }; if (value == 0) for (int i = 1; value == 0 && i < arrowTargets.Length; i++) if (s.IndexOf (arrowTargets [i]) > 0 && s.IndexOf ("BARB " + arrowTargets [i]) < 0 && s.IndexOf (" OVER") < 0 ) value = i; if (value > 0) arrowValues.Add (new DictionaryEntry ( cp, value)); } // Box names if (0x2500 <= cp && cp < 0x25B0) { int value = 0; // flags: // up:1 down:2 right:4 left:8 vert:16 horiz:32 // [h,rl] [r] [l] // [v,ud] [u] [d] // [dr] [dl] [ur] [ul] // [vr,udr] [vl,vdl] // [hd,rld] [hu,rlu] // [hv,udrl,rlv,udh] ArrayList flags = new ArrayList (new int [] { 32, 8 + 4, 8, 4, 16, 1 + 2, 1, 2, 4 + 2, 8 + 2, 4 + 1, 8 + 1, 16 + 4, 1 + 2 + 4, 16 + 8, 1 + 2 + 8, 32 + 2, 4 + 8 + 2, 32 + 1, 4 + 8 + 1, 16 + 32, 1 + 2 + 4 + 8, 4 + 8 + 16, 1 + 2 + 32 }); byte [] offsets = new byte [] { 0, 0, 1, 2, 3, 3, 4, 5, 6, 7, 8, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 14, 14}; if (s.IndexOf ("BOX DRAWINGS ") > 0) { int flag = 0; if (s.IndexOf (" UP") > 0) flag |= 1; if (s.IndexOf (" DOWN") > 0) flag |= 2; if (s.IndexOf (" RIGHT") > 0) flag |= 4; if (s.IndexOf (" LEFT") > 0) flag |= 8; if (s.IndexOf (" VERTICAL") > 0) flag |= 16; if (s.IndexOf (" HORIZONTAL") > 0) flag |= 32; int fidx = flags.IndexOf (flag); value = fidx < 0 ? fidx : offsets [fidx]; } else if (s.IndexOf ("BLOCK") > 0) { if (s.IndexOf ("ONE EIGHTH") > 0) value = 0x12; else if (s.IndexOf ("ONE QUARTER") > 0) value = 0x13; else if (s.IndexOf ("THREE EIGHTHS") > 0) value = 0x14; else if (s.IndexOf ("HALF") > 0) value = 0x15; else if (s.IndexOf ("FIVE EIGHTHS") > 0) value = 0x16; else if (s.IndexOf ("THREE QUARTERS") > 0) value = 0x17; else if (s.IndexOf ("SEVEN EIGHTHS") > 0) value = 0x18; else value = 0x19; } if (value >= 0) boxValues.Add (new DictionaryEntry ( cp, value)); } // For some characters store the name and sort later // to determine sorting. if (0x2100 <= cp && cp <= 0x213F && Char.IsSymbol ((char) cp)) sortableCharNames.Add ( new DictionaryEntry (cp, values [0])); else if (0x3380 <= cp && cp <= 0x33DD) sortableCharNames.Add (new DictionaryEntry ( cp, values [0].Substring (7))); // diacritical weights by character name for (int d = 0; d < diacritics.Length; d++) if (s.IndexOf (diacritics [d]) > 0) diacritical [cp] |= diacriticWeights [d]; // Two-step grep required for it. if (s.IndexOf ("FULL STOP") > 0 && (s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0)) diacritical [cp] |= 0xF4; // Arabic letter name if (0x0621 <= cp && cp <= 0x064A && Char.GetUnicodeCategory ((char) cp) == UnicodeCategory.OtherLetter) { byte value = (byte) (arabicNameMap.Count * 4 + 0x0B); switch (cp) { case 0x0621: case 0x0624: case 0x0626: // hamza, waw, yeh ... special cases. value = 0x07; break; case 0x0649: case 0x064A: value = 0x77; // special cases. break; default: // Get primary letter name i.e. // XXX part of ARABIC LETTER XXX yyy // e.g. that of "TEH MARBUTA" is "TEH". string letterName = (cp == 0x0640) ? // 0x0640 is special: it does // not start with ARABIC LETTER values [0] : values [0].Substring (14); int tmpIdx = letterName.IndexOf (' '); letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx); //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName); if (arabicNameMap.ContainsKey (letterName)) value = (byte) arabicLetterPrimaryValues [arabicNameMap [letterName]]; else arabicNameMap [letterName] = cp; break; } arabicLetterPrimaryValues [cp] = value; } // Japanese square letter if (0x3300 <= cp && cp <= 0x3357) if (!ExistsJIS (cp)) nonJisJapanese.Add (new NonJISCharacter (cp, values [0])); // normalizationType string decomp = values [4]; idx = decomp.IndexOf ('<'); if (idx >= 0) { switch (decomp.Substring (idx + 1, decomp.IndexOf ('>') - 1)) { case "full": decompType [cp] = DecompositionFull; break; case "sub": decompType [cp] = DecompositionSub; break; case "super": decompType [cp] = DecompositionSuper; break; case "small": decompType [cp] = DecompositionSmall; break; case "isolated": decompType [cp] = DecompositionIsolated; break; case "initial": decompType [cp] = DecompositionInitial; break; case "final": decompType [cp] = DecompositionFinal; break; case "medial": decompType [cp] = DecompositionMedial; break; case "noBreak": decompType [cp] = DecompositionNoBreak; break; case "compat": decompType [cp] = DecompositionCompat; break; case "fraction": decompType [cp] = DecompositionFraction; break; case "font": decompType [cp] = DecompositionFont; break; case "circle": decompType [cp] = DecompositionCircle; break; case "square": decompType [cp] = DecompositionSquare; break; case "wide": decompType [cp] = DecompositionWide; break; case "narrow": decompType [cp] = DecompositionNarrow; break; case "vertical": decompType [cp] = DecompositionVertical; break; default: throw new Exception ("Support NFKD type : " + decomp); } } else decompType [cp] = DecompositionCanonical; decomp = idx < 0 ? decomp : decomp.Substring (decomp.IndexOf ('>') + 2); if (decomp.Length > 0) { string [] velems = decomp.Split (' '); int didx = decompValues.Count; decompIndex [cp] = didx; foreach (string v in velems) decompValues.Add (int.Parse (v, NumberStyles.HexNumber)); decompLength [cp] = velems.Length; // [decmpType] -> this_cp int targetCP = (int) decompValues [didx]; // for "(x)" it specially maps to 'x' . // FIXME: check if it is sane if (velems.Length == 3 && (int) decompValues [didx] == '(' && (int) decompValues [didx + 2] == ')') targetCP = (int) decompValues [didx + 1]; // special: 0x215F "1/" else if (cp == 0x215F) targetCP = '1'; else if (velems.Length > 1 && (targetCP < 0x4C00 || 0x9FBB < targetCP)) // skip them, except for CJK ideograph compat targetCP = 0; if (targetCP != 0) { Hashtable entry = (Hashtable) nfkdMap [targetCP]; if (entry == null) { entry = new Hashtable (); nfkdMap [targetCP] = entry; } entry [(byte) decompType [cp]] = cp; } } // numeric values if (values [5].Length > 0) decimalValue [cp] = decimal.Parse (values [5]); else if (values [6].Length > 0) decimalValue [cp] = decimal.Parse (values [6]); else if (values [7].Length > 0) { string decstr = values [7]; idx = decstr.IndexOf ('/'); if (cp == 0x215F) // special. "1/" decimalValue [cp] = 0x1; else if (idx > 0) // m/n decimalValue [cp] = decimal.Parse (decstr.Substring (0, idx)) / decimal.Parse (decstr.Substring (idx + 1)); else if (decstr [0] == '(' && decstr [decstr.Length - 1] == ')') // (n) decimalValue [cp] = decimal.Parse (decstr.Substring (1, decstr.Length - 2)); else if (decstr [decstr.Length - 1] == '.') // n. decimalValue [cp] = decimal.Parse (decstr.Substring (0, decstr.Length - 1)); else decimalValue [cp] = decimal.Parse (decstr); } } void ParseDerivedCoreProperties (string filename) { // IsUppercase using (StreamReader file = new StreamReader (filename)) { for (int line = 1; file.Peek () >= 0; line++) { try { ProcessDerivedCorePropLine (file.ReadLine ()); } catch (Exception) { Console.Error.WriteLine ("**** At line " + line); throw; } } } } void ProcessDerivedCorePropLine (string s) { int idx = s.IndexOf ('#'); if (idx >= 0) s = s.Substring (0, idx); idx = s.IndexOf (';'); if (idx < 0) return; string cpspec = s.Substring (0, idx); idx = cpspec.IndexOf (".."); NumberStyles nf = NumberStyles.HexNumber | NumberStyles.AllowTrailingWhite; int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf); int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf); string value = s.Substring (cpspec.Length + 1).Trim (); // FIXME: use index if (cp > char.MaxValue) return; switch (value) { case "Uppercase": for (int x = cp; x <= cpEnd; x++) isUppercase [x] = true; break; } } void ParseScripts (string filename) { ArrayList cyrillic = new ArrayList (); ArrayList gurmukhi = new ArrayList (); ArrayList gujarati = new ArrayList (); ArrayList georgian = new ArrayList (); ArrayList thaana = new ArrayList (); using (StreamReader file = new StreamReader (filename)) { while (file.Peek () >= 0) { string s = file.ReadLine (); int idx = s.IndexOf ('#'); if (idx >= 0) s = s.Substring (0, idx); idx = s.IndexOf (';'); if (idx < 0) continue; string cpspec = s.Substring (0, idx); idx = cpspec.IndexOf (".."); NumberStyles nf = NumberStyles.HexNumber | NumberStyles.AllowTrailingWhite; int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf); int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf); string value = s.Substring (cpspec.Length + 1).Trim (); // FIXME: use index if (cp > char.MaxValue) continue; switch (value) { case "Cyrillic": for (int x = cp; x <= cpEnd; x++) if (!IsIgnorable (x)) cyrillic.Add ((char) x); break; case "Gurmukhi": for (int x = cp; x <= cpEnd; x++) if (!IsIgnorable (x)) gurmukhi.Add ((char) x); break; case "Gujarati": for (int x = cp; x <= cpEnd; x++) if (!IsIgnorable (x)) gujarati.Add ((char) x); break; case "Georgian": for (int x = cp; x <= cpEnd; x++) if (!IsIgnorable (x)) georgian.Add ((char) x); break; case "Thaana": for (int x = cp; x <= cpEnd; x++) if (!IsIgnorable (x)) thaana.Add ((char) x); break; } } } cyrillic.Sort (UCAComparer.Instance); gurmukhi.Sort (UCAComparer.Instance); gujarati.Sort (UCAComparer.Instance); georgian.Sort (UCAComparer.Instance); thaana.Sort (UCAComparer.Instance); orderedCyrillic = (char []) cyrillic.ToArray (typeof (char)); orderedGurmukhi = (char []) gurmukhi.ToArray (typeof (char)); orderedGujarati = (char []) gujarati.ToArray (typeof (char)); orderedGeorgian = (char []) georgian.ToArray (typeof (char)); orderedThaana = (char []) thaana.ToArray (typeof (char)); } void ParseJISOrder (string filename) { using (StreamReader file = new StreamReader (filename)) { while (file.Peek () >= 0) { string s = file.ReadLine (); int idx = s.IndexOf ('#'); if (idx >= 0) s = s.Substring (0, idx).Trim (); if (s.Length == 0) continue; idx = s.IndexOf (' '); if (idx < 0) continue; // They start with "0x" so cut them out. int jis = int.Parse (s.Substring (2, idx), NumberStyles.HexNumber); int cp = int.Parse (s.Substring (idx + 3).Trim (), NumberStyles.HexNumber); jisJapanese.Add (new JISCharacter (cp, jis)); } } } void ParseCJK (string zhXML, string jaXML, string koXML) { XmlDocument doc = new XmlDocument (); doc.XmlResolver = null; int v; string s; string category; int offset; ushort [] arr; // Chinese Simplified category = "chs"; arr = cjkCHS; offset = char.MaxValue - arr.Length; doc.Load (zhXML); s = doc.SelectSingleNode ("/ldml/collations/collation[@type='pinyin']/rules/pc").InnerText; v = 0x8008; foreach (char c in s) { if (c < '\u3100') Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v); else { arr [(int) c - offset] = (ushort) v++; if (v % 256 == 0) v += 2; } } // Chinese Traditional category = "cht"; arr = cjkCHT; offset = char.MaxValue - arr.Length; s = doc.SelectSingleNode ("/ldml/collations/collation[@type='stroke']/rules/pc").InnerText; v = 0x8002; foreach (char c in s) { if (c < '\u4E00') Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v); else { arr [(int) c - offset] = (ushort) v++; if (v % 256 == 0) v += 2; } } // Japanese category = "ja"; arr = cjkJA; offset = char.MaxValue - arr.Length; doc.Load (jaXML); s = doc.SelectSingleNode ("/ldml/collations/collation/rules/pc").InnerText; v = 0x8008; foreach (char c in s) { if (c < '\u4E00') Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v); else { arr [(int) c - offset] = (ushort) v++; if (v % 256 == 0) v += 2; } } // Korean // Korean weight is somewhat complex. It first shifts // Hangul category from 52-x to 80-x (they are anyways // computed). CJK ideographs are placed at secondary // weight, like XX YY 01 zz 01, where XX and YY are // corresponding "reset" value and zz is 41,43,45... // // Unlike chs,cht and ja, Korean value is a combined // ushort which is computed as category // category = "ko"; arr = cjkKO; offset = char.MaxValue - arr.Length; doc.Load (koXML); foreach (XmlElement reset in doc.SelectNodes ("/ldml/collations/collation/rules/reset")) { XmlElement sc = (XmlElement) reset.NextSibling; // compute "category" and "level 1" for the // target "reset" Hangle syllable char rc = reset.InnerText [0]; int ri = ((int) rc - 0xAC00) + 1; ushort p = (ushort) ((ri / 254) * 256 + (ri % 254) + 2); // Place the characters after the target. s = sc.InnerText; v = 0x41; foreach (char c in s) { arr [(int) c - offset] = p; cjkKOlv2 [(int) c - offset] = (byte) v; v += 2; } } } #endregion #region Generation void FillIgnorables () { for (int i = 0; i <= char.MaxValue; i++) { if (Char.GetUnicodeCategory ((char) i) == UnicodeCategory.OtherNotAssigned) continue; if (IsIgnorable (i)) ignorableFlags [i] |= 1; if (IsIgnorableSymbol (i)) ignorableFlags [i] |= 2; if (IsIgnorableNonSpacing (i)) ignorableFlags [i] |= 4; } } void ModifyParsedValues () { // number, secondary weights byte weight = 0x38; int [] numarr = numberSecondaryWeightBounds; for (int i = 0; i < numarr.Length; i += 2, weight++) for (int cp = numarr [i]; cp < numarr [i + 1]; cp++) if (Char.IsNumber ((char) cp)) diacritical [cp] = weight; // Korean parens numbers for (int i = 0x3200; i <= 0x321C; i++) diacritical [i] = 0xA; for (int i = 0x3260; i <= 0x327B; i++) diacritical [i] = 0xC; // Update name part of named characters for (int i = 0; i < sortableCharNames.Count; i++) { DictionaryEntry de = (DictionaryEntry) sortableCharNames [i]; int cp = (int) de.Key; string renamed = null; switch (cp) { case 0x2101: renamed = "A_1"; break; case 0x33C3: renamed = "A_2"; break; case 0x2105: renamed = "C_1"; break; case 0x2106: renamed = "C_2"; break; case 0x211E: renamed = "R1"; break; case 0x211F: renamed = "R2"; break; // Remove some of them! case 0x2103: case 0x2109: case 0x2116: case 0x2117: case 0x2118: case 0x2125: case 0x2127: case 0x2129: case 0x212E: case 0x2132: sortableCharNames.RemoveAt (i); i--; continue; } if (renamed != null) sortableCharNames [i] = new DictionaryEntry (cp, renamed); } } void GenerateCore () { UnicodeCategory uc; #region Specially ignored // 01 // This will raise "Defined" flag up. foreach (char c in specialIgnore) map [(int) c] = new CharMapEntry (0, 0, 0); #endregion #region Variable weights // Controls : 06 03 - 06 3D fillIndex [6] = 3; for (int i = 0; i < 65536; i++) { if (IsIgnorable (i)) continue; char c = (char) i; uc = Char.GetUnicodeCategory (c); // NEL is whitespace but not ignored here. if (uc == UnicodeCategory.Control && !Char.IsWhiteSpace (c) || c == '\u0085') AddCharMap (c, 6, 1); } // Apostrophe 06 80 fillIndex [6] = 0x80; AddCharMapGroup ('\'', 6, 1, 0); AddCharMap ('\uFE63', 6, 1); // Hyphen/Dash : 06 81 - 06 90 for (int i = 0; i < char.MaxValue; i++) { if (Char.GetUnicodeCategory ((char) i) == UnicodeCategory.DashPunctuation) AddCharMapGroupTail ((char) i, 6, 1); } // Arabic variable weight chars 06 A0 - fillIndex [6] = 0xA0; // vowels for (int i = 0x64B; i <= 0x650; i++) AddCharMapGroupTail ((char) i, 6, 1); // sukun AddCharMapGroup ('\u0652', 6, 1, 0); // shadda AddCharMapGroup ('\u0651', 6, 1, 0); #endregion #region Nonspacing marks // 01 // FIXME: 01 03 - 01 B6 ... annoyance :( // Combining diacritical marks: 01 DC - fillIndex [0x1] = 0x41; for (int i = 0x030E; i <= 0x0326; i++) if (!IsIgnorable (i)) AddCharMap ((char) i, 0x1, 1); for (int i = 0x0329; i <= 0x0334; i++) if (!IsIgnorable (i)) AddCharMap ((char) i, 0x1, 1); for (int i = 0x0339; i <= 0x0341; i++) if (!IsIgnorable (i)) AddCharMap ((char) i, 0x1, 1); fillIndex [0x1] = 0x72; for (int i = 0x0346; i <= 0x0348; i++) if (!IsIgnorable (i)) AddCharMap ((char) i, 0x1, 1); for (int i = 0x02BE; i <= 0x02BF; i++) if (!IsIgnorable (i)) AddCharMap ((char) i, 0x1, 1); for (int i = 0x02C1; i <= 0x02C5; i++) if (!IsIgnorable (i)) AddCharMap ((char) i, 0x1, 1); for (int i = 0x02CE; i <= 0x02CF; i++) if (!IsIgnorable (i)) AddCharMap ((char) i, 0x1, 1); for (int i = 0x02D1; i <= 0x02D3; i++) if (!IsIgnorable (i)) AddCharMap ((char) i, 0x1, 1); AddCharMap ('\u02DE', 0x1, 1); for (int i = 0x02E4; i <= 0x02E9; i++) if (!IsIgnorable (i)) AddCharMap ((char) i, 0x1, 1); // LAMESPEC: It should not stop at '\u20E1'. There are // a few more characters (that however results in // overflow of level 2 unless we start before 0xDD). fillIndex [0x1] = 0xDC; for (int i = 0x20d0; i <= 0x20e1; i++) AddCharMap ((char) i, 0x1, 1); #endregion #region Whitespaces // 07 03 - fillIndex [0x7] = 0x2; AddCharMap (' ', 0x7, 2); AddCharMap ('\u00A0', 0x7, 1); for (int i = 9; i <= 0xD; i++) AddCharMap ((char) i, 0x7, 1); for (int i = 0x2000; i <= 0x200B; i++) AddCharMap ((char) i, 0x7, 1); fillIndex [0x7] = 0x17; AddCharMapGroup ('\u2028', 0x7, 1, 0); AddCharMapGroup ('\u2029', 0x7, 1, 0); // Characters which used to represent layout control. // LAMESPEC: Windows developers seem to have thought // that those characters are kind of whitespaces, // while they aren't. AddCharMap ('\u2422', 0x7, 1, 0); // blank symbol AddCharMap ('\u2423', 0x7, 1, 0); // open box #endregion // FIXME: 09 should be more complete. fillIndex [0x9] = 2; // misc tech mark for (int cp = 0x2300; cp <= 0x237A; cp++) AddCharMap ((char) cp, 0x9, 1, 0); // arrows byte [] arrowLv2 = new byte [] {0, 3, 3, 3, 3, 3, 3, 3, 3}; foreach (DictionaryEntry de in arrowValues) { int idx = (int) de.Value; int cp = (int) de.Key; if (map [cp].Defined) continue; fillIndex [0x9] = (byte) (0xD8 + idx); AddCharMapGroup ((char) cp, 0x9, 0, arrowLv2 [idx]); arrowLv2 [idx]++; } // boxes byte [] boxLv2 = new byte [128]; for (int i = 0; i < boxLv2.Length; i++) boxLv2 [i] = 3; foreach (DictionaryEntry de in boxValues) { int cp = (int) de.Key; int idx = (int) de.Value; if (map [cp].Defined) continue; fillIndex [0x9] = (byte) (0xE5 + idx); AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [idx]); boxLv2 [idx]++; } // Some special characters (slanted) fillIndex [0x9] = 0xF4; AddCharMap ('\u2571', 0x9, 3); AddCharMap ('\u2572', 0x9, 3); AddCharMap ('\u2573', 0x9, 3); // FIXME: implement 0A #region Symbols fillIndex [0xA] = 2; // byte currency symbols for (int cp = 0; cp < 0x100; cp++) { uc = Char.GetUnicodeCategory ((char) cp); if (!IsIgnorable (cp) && uc == UnicodeCategory.CurrencySymbol && cp != '$') AddCharMapGroup ((char) cp, 0xA, 1, 0); } // byte other symbols for (int cp = 0; cp < 0x100; cp++) { if (cp == 0xA6) continue; // SPECIAL: skip FIXME: why? uc = Char.GetUnicodeCategory ((char) cp); if (!IsIgnorable (cp) && uc == UnicodeCategory.OtherSymbol) AddCharMapGroup ((char) cp, 0xA, 1, 0); } fillIndex [0xA] = 0x2F; // FIXME: it won't be needed for (int cp = 0x2600; cp <= 0x2613; cp++) AddCharMap ((char) cp, 0xA, 1, 0); // Dingbats for (int cp = 0x2620; cp <= 0x2770; cp++) if (Char.IsSymbol ((char) cp)) AddCharMap ((char) cp, 0xA, 1, 0); // OCR for (int i = 0x2440; i < 0x2460; i++) AddCharMap ((char) i, 0xA, 1, 0); #endregion #region Numbers // 0C 02 - 0C E1 fillIndex [0xC] = 2; // 9F8 : Bengali "one less than the denominator" AddCharMap ('\u09F8', 0xC, 1); ArrayList numbers = new ArrayList (); for (int i = 0; i < 65536; i++) if (!IsIgnorable (i) && Char.IsNumber ((char) i) && (i < 0x3190 || 0x32C0 < i)) // they are CJK characters numbers.Add (i); ArrayList numberValues = new ArrayList (); foreach (int i in numbers) numberValues.Add (new DictionaryEntry (i, decimalValue [(char) i])); numberValues.Sort (DecimalDictionaryValueComparer.Instance); //foreach (DictionaryEntry de in numberValues) //Console.Error.WriteLine ("****** number {0:X04} : {1} {2}", de.Key, de.Value, decompType [(int) de.Key]); decimal prevValue = -1; foreach (DictionaryEntry de in numberValues) { int cp = (int) de.Key; decimal currValue = (decimal) de.Value; bool addnew = false; if (prevValue < currValue && prevValue - (int) prevValue == 0 && prevValue >= 1) { addnew = true; // Process Hangzhou and Roman numbers // There are some SPECIAL cases. if (currValue != 4) // no increment for 4 fillIndex [0xC]++; int xcp; xcp = (int) prevValue + 0x2170 - 1; AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); xcp = (int) prevValue + 0x2160 - 1; AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); fillIndex [0xC] += 2; xcp = (int) prevValue + 0x3021 - 1; AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); fillIndex [0xC]++; } if (prevValue < currValue) prevValue = currValue; if (map [cp].Defined) continue; // HangZhou and Roman are add later // (code is above) else if (0x3021 <= cp && cp < 0x302A || 0x2160 <= cp && cp < 0x216A || 0x2170 <= cp && cp < 0x217A) continue; if (cp == 0x215B) // FIXME: why? fillIndex [0xC] += 2; else if (cp == 0x3021) // FIXME: why? fillIndex [0xC]++; AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp]); if (addnew || cp <= '9') { int xcp; if (1 <= currValue && currValue <= 10) { xcp = cp - 0x31 + 0x2776; AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); xcp = cp - 0x31 + 0x2780; AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); xcp = cp - 0x31 + 0x278A; AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); } if (1 <= currValue && currValue <= 20) { xcp = cp - 0x31 + 0x2460; AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); xcp = cp - 0x31 + 0x2474; AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); xcp = cp - 0x31 + 0x2488; AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); } } if (cp != 0x09E7 && cp != 0x09EA) fillIndex [0xC]++; // Add special cases that are not regarded as // numbers in UnicodeCategory speak. if (cp == '5') { // TONE FIVE AddCharMapGroup ('\u01BD', 0xC, 0, 0); AddCharMapGroup ('\u01BC', 0xC, 1, 0); } else if (cp == '6') // FIXME: why? fillIndex [0xC]++; } // 221E: infinity fillIndex [0xC] = 0xFF; AddCharMap ('\u221E', 0xC, 1); #endregion #region Letters and NonSpacing Marks (general) // ASCII Latin alphabets for (int i = 0; i < alphabets.Length; i++) AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]); // non-ASCII Latin alphabets // FIXME: there is no such characters that are placed // *after* "alphabets" array items. This is nothing // more than a hack that creates dummy weight for // primary characters. for (int i = 0x0080; i < 0x0300; i++) { if (!Char.IsLetter ((char) i)) continue; // For those Latin Letters which has NFKD are // not added as independent primary character. if (decompIndex [i] != 0) continue; // SPECIAL CASES: // 1.some alphabets have primarily // equivalent ASCII alphabets. // 2.some have independent primary weights, // but inside a-to-z range. // 3.there are some expanded characters that // are not part of Unicode Standard NFKD. switch (i) { // 1. skipping them does not make sense // case 0xD0: case 0xF0: case 0x131: case 0x138: // case 0x184: case 0x185: case 0x186: case 0x189: // case 0x18D: case 0x18E: case 0x18F: case 0x190: // case 0x194: case 0x195: case 0x196: case 0x19A: // case 0x19B: case 0x19C: // 2. skipping them does not make sense // case 0x14A: // Ng // case 0x14B: // ng // 3. case 0xC6: // AE case 0xE6: // ae case 0xDE: // Icelandic Thorn case 0xFE: // Icelandic Thorn case 0xDF: // German ss case 0xFF: // German ss // not classified yet // case 0x1A6: case 0x1A7: case 0x1A8: case 0x1A9: // case 0x1AA: case 0x1B1: case 0x1B7: case 0x1B8: // case 0x1B9: case 0x1BA: case 0x1BB: case 0x1BF: // case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3: // case 0x1DD: continue; } AddCharMapGroup ((char) i, 0xE, 1, 0); } // Greek and Coptic fillIndex [0xF] = 02; for (int i = 0x0380; i < 0x0390; i++) if (Char.IsLetter ((char) i)) AddLetterMap ((char) i, 0xF, 1); fillIndex [0xF] = 02; for (int i = 0x0391; i < 0x03CF; i++) if (Char.IsLetter ((char) i)) AddLetterMap ((char) i, 0xF, 1); fillIndex [0xF] = 0x40; for (int i = 0x03D0; i < 0x0400; i++) if (Char.IsLetter ((char) i)) AddLetterMap ((char) i, 0xF, 1); // Cyrillic - UCA order w/ some modification fillIndex [0x10] = 0x3; // table which is moslty from UCA DUCET. for (int i = 0; i < orderedCyrillic.Length; i++) { char c = orderedCyrillic [i]; if (Char.IsLetter (c)) AddLetterMap (c, 0x10, 3); } for (int i = 0x0460; i < 0x0481; i++) { if (Char.IsLetter ((char) i)) AddLetterMap ((char) i, 0x10, 3); } // Armenian fillIndex [0x11] = 0x3; for (int i = 0x0531; i < 0x0586; i++) if (Char.IsLetter ((char) i)) AddLetterMap ((char) i, 0x11, 1); // Hebrew // -Letters fillIndex [0x12] = 0x3; for (int i = 0x05D0; i < 0x05FF; i++) if (Char.IsLetter ((char) i)) AddLetterMap ((char) i, 0x12, 1); // -Accents fillIndex [0x1] = 0x3; for (int i = 0x0591; i <= 0x05C2; i++) if (i != 0x05BE) AddCharMap ((char) i, 0x1, 1); // Arabic fillIndex [0x1] = 0x8E; fillIndex [0x13] = 0x3; for (int i = 0x0621; i <= 0x064A; i++) { // Abjad if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.OtherLetter) { // FIXME: arabic nonspacing marks are // in different order. AddCharMap ((char) i, 0x1, 1); continue; } // map [i] = new CharMapEntry (0x13, // (byte) arabicLetterPrimaryValues [i], 1); fillIndex [0x13] = (byte) arabicLetterPrimaryValues [i]; AddLetterMap ((char) i, 0x13, 0); } fillIndex [0x13] = 0x84; for (int i = 0x0674; i < 0x06D6; i++) if (Char.IsLetter ((char) i)) AddLetterMap ((char) i, 0x13, 1); // Devanagari // FIXME: it does seem straight codepoint mapping. fillIndex [0x14] = 04; for (int i = 0x0901; i < 0x0905; i++) if (!IsIgnorable (i)) AddLetterMap ((char) i, 0x14, 2); fillIndex [0x14] = 0xB; for (int i = 0x0905; i < 0x093A; i++) if (Char.IsLetter ((char) i)) AddLetterMap ((char) i, 0x14, 4); for (int i = 0x093E; i < 0x094F; i++) if (!IsIgnorable (i)) AddLetterMap ((char) i, 0x14, 2); // Bengali // -Letters fillIndex [0x15] = 02; for (int i = 0x0980; i < 0x9FF; i++) { if (IsIgnorable (i)) continue; if (i == 0x09E0) fillIndex [0x15] = 0x3B; switch (Char.GetUnicodeCategory ((char) i)) { case UnicodeCategory.NonSpacingMark: case UnicodeCategory.DecimalDigitNumber: case UnicodeCategory.OtherNumber: continue; } AddLetterMap ((char) i, 0x15, 1); } // -Signs fillIndex [0x1] = 0x3; for (int i = 0x0981; i < 0x0A00; i++) if (Char.GetUnicodeCategory ((char) i) == UnicodeCategory.NonSpacingMark) AddCharMap ((char) i, 0x1, 1); // Gurmukhi. orderedGurmukhi is from UCA // FIXME: it does not look equivalent to UCA. fillIndex [0x1] = 03; fillIndex [0x16] = 02; for (int i = 0; i < orderedGurmukhi.Length; i++) { char c = orderedGurmukhi [i]; if (IsIgnorable ((int) c)) continue; if (!Char.IsLetter (c)) { AddLetterMap (c, 0x1, 1); continue; } if (c == '\u0A3C' || c == '\u0A4D' || '\u0A66' <= c && c <= '\u0A71') continue; AddLetterMap (c, 0x16, 4); } // Gujarati. orderedGujarati is from UCA fillIndex [0x17] = 02; for (int i = 0; i < orderedGujarati.Length; i++) AddLetterMap (orderedGujarati [i], 0x17, 4); // Oriya fillIndex [0x18] = 02; for (int i = 0x0B00; i < 0x0B7F; i++) { switch (Char.GetUnicodeCategory ((char) i)) { case UnicodeCategory.NonSpacingMark: case UnicodeCategory.DecimalDigitNumber: continue; } AddLetterMap ((char) i, 0x18, 1); } // Tamil fillIndex [0x19] = 2; AddCharMap ('\u0BD7', 0x19, 0); fillIndex [0x19] = 0xA; // vowels for (int i = 0x0BD7; i < 0x0B94; i++) if (Char.IsLetter ((char) i)) AddCharMap ((char) i, 0x19, 2); // special vowel fillIndex [0x19] = 0x24; AddCharMap ('\u0B94', 0x19, 0); fillIndex [0x19] = 0x26; // The array for Tamil consonants is a constant. // Windows have almost similar sequence to TAM from // tamilnet but a bit different in Grantha. for (int i = 0; i < orderedTamilConsonants.Length; i++) AddLetterMap (orderedTamilConsonants [i], 0x19, 4); // combining marks fillIndex [0x19] = 0x82; for (int i = 0x0BBE; i < 0x0BCD; i++) if (Char.GetUnicodeCategory ((char) i) == UnicodeCategory.SpacingCombiningMark || i == 0x0BC0) AddLetterMap ((char) i, 0x19, 2); // Telugu fillIndex [0x1A] = 0x4; for (int i = 0x0C00; i < 0x0C62; i++) { if (i == 0x0C55 || i == 0x0C56) continue; // skip AddCharMap ((char) i, 0x1A, 3); char supp = (i == 0x0C0B) ? '\u0C60': i == 0x0C0C ? '\u0C61' : char.MinValue; if (supp == char.MinValue) continue; AddCharMap (supp, 0x1A, 3); } // Kannada fillIndex [0x1B] = 4; for (int i = 0x0C80; i < 0x0CE5; i++) { if (i == 0x0CD5 || i == 0x0CD6) continue; // ignore AddCharMap ((char) i, 0x1B, 3); } // Malayalam fillIndex [0x1C] = 2; for (int i = 0x0D02; i < 0x0D61; i++) // FIXME: I avoided MSCompatUnicodeTable usage // here (it results in recursion). So check if // using NonSpacingMark makes sense or not. if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark) // if (!MSCompatUnicodeTable.IsIgnorable ((char) i)) AddCharMap ((char) i, 0x1C, 1); // Thai ... note that it breaks 0x1E wall after E2B! // Also, all Thai characters have level 2 value 3. fillIndex [0x1E] = 2; for (int i = 0xE44; i < 0xE48; i++) AddCharMap ((char) i, 0x1E, 1, 3); for (int i = 0xE01; i < 0xE2B; i++) AddCharMap ((char) i, 0x1E, 6, 0); fillIndex [0x1F] = 5; for (int i = 0xE2B; i < 0xE30; i++) AddCharMap ((char) i, 0x1F, 6, 0); for (int i = 0xE30; i < 0xE3B; i++) AddCharMap ((char) i, 0x1F, 1, 3); // some Thai characters remains. char [] specialThai = new char [] {'\u0E45', '\u0E46', '\u0E4E', '\u0E4F', '\u0E5A', '\u0E5B'}; foreach (char c in specialThai) AddCharMap (c, 0x1F, 1); // Lao fillIndex [0x1F] = 2; for (int i = 0xE80; i < 0xEDF; i++) if (Char.IsLetter ((char) i)) AddCharMap ((char) i, 0x1F, 1); // Georgian. orderedGeorgian is from UCA DUCET. fillIndex [0x21] = 5; for (int i = 0; i < orderedGeorgian.Length; i++) AddLetterMap (orderedGeorgian [i], 0x21, 5); // Japanese Kana. fillIndex [0x22] = 2; int kanaOffset = 0x3041; byte [] kanaLines = new byte [] {2, 2, 2, 2, 1, 3, 1, 2, 1}; for (int gyo = 0; gyo < 9; gyo++) { for (int dan = 0; dan < 5; dan++) { if (gyo == 7 && dan % 2 == 1) { // 'ya'-gyo fillIndex [0x22]++; kanaOffset -= 2; // There is no space for yi and ye. continue; } int cp = kanaOffset + dan * kanaLines [gyo]; // small lines (a-gyo, ya-gyo) if (gyo == 0 || gyo == 7) { AddKanaMap (cp, 1); // small AddKanaMap (cp + 1, 1); } else AddKanaMap (cp, kanaLines [gyo]); fillIndex [0x22]++; if (cp == 0x3061) { // add small 'Tsu' (before normal one) AddKanaMap (0x3063, 1); kanaOffset++; } } fillIndex [0x22] += 3; kanaOffset += 5 * kanaLines [gyo]; } // Wa-gyo is almost special, so I just manually add. AddLetterMap ((char) 0x308E, 0x22, 0); AddLetterMap ((char) (0x308E + 0x60), 0x22, 0); AddLetterMap ((char) 0x308F, 0x22, 0); AddLetterMap ((char) (0x308F + 0x60), 0x22, 0); fillIndex [0x22]++; AddLetterMap ((char) 0x3090, 0x22, 0); AddLetterMap ((char) (0x3090 + 0x60), 0x22, 0); fillIndex [0x22] += 2; // no "Wu" in Japanese. AddLetterMap ((char) 0x3091, 0x22, 0); AddLetterMap ((char) (0x3091 + 0x60), 0x22, 0); fillIndex [0x22]++; AddLetterMap ((char) 0x3092, 0x22, 0); AddLetterMap ((char) (0x3092 + 0x60), 0x22, 0); // Nn fillIndex [0x22] = 0x80; AddLetterMap ((char) 0x3093, 0x22, 0); AddLetterMap ((char) (0x3093 + 0x60), 0x22, 0); // JIS Japanese square chars. fillIndex [0x22] = 0x97; jisJapanese.Sort (JISComparer.Instance); foreach (JISCharacter j in jisJapanese) AddCharMap ((char) j.CP, 0x22, 1); // non-JIS Japanese square chars. nonJisJapanese.Sort (NonJISComparer.Instance); foreach (NonJISCharacter j in nonJisJapanese) AddCharMap ((char) j.CP, 0x22, 1); // Bopomofo fillIndex [0x23] = 0x02; for (int i = 0x3105; i <= 0x312C; i++) AddCharMap ((char) i, 0x23, 1); // Estrangela: ancient Syriac fillIndex [0x24] = 0x0B; // FIXME: is 0x71E really alternative form? ArrayList syriacAlternatives = new ArrayList ( new int [] {0x714, 0x716, 0x71C, 0x71E, 0x724, 0x727}); for (int i = 0x0710; i <= 0x072C; i++) { if (i == 0x0711) // NonSpacingMark continue; if (syriacAlternatives.Contains (i)) continue; AddCharMap ((char) i, 0x24, 4); // FIXME: why? if (i == 0x721) fillIndex [0x24]++; } foreach (int cp in syriacAlternatives) map [cp] = new CharMapEntry (0x24, (byte) (map [cp - 1].Level1 + 2), 0); // Thaana // FIXME: it turned out that it does not look like UCA fillIndex [0x24] = 0x6E; for (int i = 0; i < orderedThaana.Length; i++) { if (IsIgnorableNonSpacing (i)) continue; AddCharMap (orderedThaana [i], 0x24, 2); } #endregion // FIXME: Add more culture-specific letters (that are // not supported in Windows collation) here. // Surrogate ... they are computed. #region Hangul // Hangul. // // Unlike UCA Windows Hangul sequence mixes Jongseong // with Choseong sequence as well as Jungseong, // adjusted to have the same primary weight for the // same base character. So it is impossible to compute // those sort keys. // // Here I introduce an ordered sequence of mixed // 'commands' and 'characters' that is similar to // LDML text: // - ',' increases primary weight. // - [A B] means a range, increasing index // - {A B} means a range, without increasing index // - '=' is no operation (it means the characters // of both sides have the same weight). // - '>' inserts a Hangul Syllable block that // contains 0x251 characters. // - '<' decreases the index // - '0'-'9' means skip count // - whitespaces are ignored // string hangulSequence = + "\u1100=\u11A8 > \u1101=\u11A9 >" + "\u11C3, \u11AA, \u11C4, \u1102=\u11AB >" + "<{\u1113 \u1116}, \u3165," + "\u11C5, \u11C6=\u3166,, \u11C7, \u11C8," + "\u11AC, \u11C9, \u11AD, \u1103=\u11AE >" + "\u11CA, \u1104, \u11CB > \u1105 >" + "\u11B0, [\u11CC \u11D0], \u11B1, [\u11D1 \u11D2]," + "\u11B2, [\u11D3 \u11D5], \u11B3," + "[\u11D6 \u11D7], \u11B4, \u11B5," + "\u11B6=\u11D8, \u3140,, \u11D9, \u1106=\u11B7 >" + "[\u11DA \u11E2], \u1107=\u11B8 >" + "<{\u111E \u1120}, \u3172,, \u3173, " + "\u11E3, \u1108 >" + "\u11B9,,,,,,,,, [\u11E4 \u11E6],, \u1109=\u11BA,,," + "\u3214=\u3274 <>" + "<{\u112D \u1133}, \u11E7,, [\u11E8 \u11E9],," + "\u11EA,, \u110A=\u11BB,,, >" + "{\u1134 \u1140}, \u317E,,,,,, \u11EB," + "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >" + "\u11EE, \u11EC, \u11ED,,,,, \u11F1,, \u11F2,,," + "\u11EF,,, \u11F0, \u110C=\u11BD,, >" + "\u110D,, >" + "<{\u114E \u1151},, \u110E=\u11BE,, >" + "<{\u1152 \u1155},,, \u110F=\u11BF >" + "\u1110=\u11C0 > \u1111=\u11C1 >" + "\u11F3, \u11F4, \u1112=\u11C2 >" + "\u11F9, [\u11F5 \u11F8]" ; byte hangulCat = 0x52; fillIndex [hangulCat] = 0x2; int syllableBlock = 0; for (int n = 0; n < hangulSequence.Length; n++) { char c = hangulSequence [n]; int start, end; if (Char.IsWhiteSpace (c)) continue; switch (c) { case '=': break; // NOP case ',': IncrementSequentialIndex (ref hangulCat); break; case '<': if (fillIndex [hangulCat] == 2) throw new Exception ("FIXME: handle it correctly (yes it is hacky, it is really unfortunate)."); fillIndex [hangulCat]--; break; case '>': IncrementSequentialIndex (ref hangulCat); for (int l = 0; l < 0x15; l++) for (int v = 0; v < 0x1C; v++) { AddCharMap ( (char) (0xAC00 + syllableBlock * 0x1C * 0x15 + l * 0x1C + v), hangulCat, 0); IncrementSequentialIndex (ref hangulCat); } syllableBlock++; break; case '[': start = hangulSequence [n + 1]; end = hangulSequence [n + 3]; for (int i = start; i <= end; i++) { AddCharMap ((char) i, hangulCat, 0); if (end > i) IncrementSequentialIndex (ref hangulCat); } n += 4; // consumes 5 characters for this operation break; case '{': start = hangulSequence [n + 1]; end = hangulSequence [n + 3]; for (int i = start; i <= end; i++) AddCharMap ((char) i, hangulCat, 0); n += 4; // consumes 5 characters for this operation break; default: AddCharMap (c, hangulCat, 0); break; } } #endregion // Letterlike characters and CJK compatibility square sortableCharNames.Sort (StringDictionaryValueComparer.Instance); int [] counts = new int ['Z' - 'A' + 1]; char [] namedChars = new char [sortableCharNames.Count]; int nCharNames = 0; foreach (DictionaryEntry de in sortableCharNames) { counts [((string) de.Value) [0] - 'A']++; namedChars [nCharNames++] = (char) ((int) de.Key); } nCharNames = 0; // reset for (int a = 0; a < counts.Length; a++) { fillIndex [0xE] = (byte) (alphaWeights [a + 1] - counts [a]); for (int i = 0; i < counts [a]; i++) //Console.Error.WriteLine ("---- {0:X04} : {1:x02} / {2} {3}", (int) namedChars [nCharNames], fillIndex [0xE], ((DictionaryEntry) sortableCharNames [nCharNames]).Value, Char.GetUnicodeCategory (namedChars [nCharNames])); AddCharMap (namedChars [nCharNames++], 0xE, 1); } // CJK unified ideograph. byte cjkCat = 0x9E; fillIndex [cjkCat] = 0x2; for (int cp = 0x4E00; cp <= 0x9FBB; cp++) if (!IsIgnorable (cp)) AddCharMapGroupCJK ((char) cp, ref cjkCat); // CJK Extensions goes here. // LAMESPEC: With this Windows style CJK layout, it is // impossible to add more CJK ideograph i.e. 0x9FA6- // 0x9FBB can never be added w/o breaking compat. for (int cp = 0xF900; cp <= 0xFA2D; cp++) if (!IsIgnorable (cp)) AddCharMapGroupCJK ((char) cp, ref cjkCat); // PrivateUse ... computed. // remaining Surrogate ... computed. #region Special "biggest" area (FF FF) fillIndex [0xFF] = 0xFF; char [] specialBiggest = new char [] { '\u3005', '\u3031', '\u3032', '\u309D', '\u309E', '\u30FC', '\u30FD', '\u30FE', '\uFE7C', '\uFE7D', '\uFF70'}; foreach (char c in specialBiggest) AddCharMap (c, 0xFF, 0); #endregion #region 07 - ASCII non-alphanumeric + 3001, 3002 // 07 // non-alphanumeric ASCII except for: + - < = > ' for (int i = 0x21; i < 0x7F; i++) { if (Char.IsLetterOrDigit ((char) i) || "+-<=>'".IndexOf ((char) i) >= 0) continue; // they are not added here. AddCharMapGroup2 ((char) i, 0x7, 1, 0); // Insert 3001 after ',' and 3002 after '.' if (i == 0x2C) AddCharMapGroup2 ('\u3001', 0x7, 1, 0); else if (i == 0x2E) { fillIndex [0x7]--; AddCharMapGroup2 ('\u3002', 0x7, 1, 0); } else if (i == 0x3A) AddCharMap ('\uFE30', 0x7, 1, 0); } #endregion #region 07 - Punctuations and something else for (int i = 0xA0; i < char.MaxValue; i++) { if (IsIgnorable (i)) continue; // SPECIAL CASES: switch (i) { case 0xAB: // 08 case 0xB7: // 0A case 0x2329: // 09 case 0x232A: // 09 continue; } switch (Char.GetUnicodeCategory ((char) i)) { case UnicodeCategory.OtherPunctuation: case UnicodeCategory.ClosePunctuation: case UnicodeCategory.OpenPunctuation: case UnicodeCategory.InitialQuotePunctuation: case UnicodeCategory.FinalQuotePunctuation: case UnicodeCategory.ModifierSymbol: // SPECIAL CASES: // 0xA if (0x2020 <= i && i <= 0x2042) continue; AddCharMapGroup ((char) i, 0x7, 1, 0); break; default: if (i == 0xA6) // SPECIAL CASE. FIXME: why? goto case UnicodeCategory.OtherPunctuation; break; } } // Control pictures for (int i = 0x2400; i <= 0x2421; i++) AddCharMap ((char) i, 0x7, 1, 0); #endregion // FIXME: for 07 xx we need more love. // FIXME: 08 should be more complete. fillIndex [0x8] = 2; for (int cp = 0; cp < char.MaxValue; cp++) if (!map [cp].Defined && Char.GetUnicodeCategory ((char) cp) == UnicodeCategory.MathSymbol) AddCharMapGroup ((char) cp, 0x8, 1, 0); // Characters w/ diacritical marks (NFKD) for (int i = 0; i <= char.MaxValue; i++) { if (map [i].Defined || IsIgnorable (i)) continue; if (decompIndex [i] == 0) continue; int start = decompIndex [i]; int primaryChar = decompValues [start]; int secondary = 0; bool skip = false; int length = decompLength [i]; // special processing for parenthesized ones. if (length == 3 && decompValues [start] == '(' && decompValues [start + 2] == ')') { primaryChar = decompValues [start + 1]; length = 1; } if (map [primaryChar].Level1 == 0) continue; for (int l = 1; l < length; l++) { int c = decompValues [start + l]; if (map [c].Level1 != 0) skip = true; secondary += diacritical [c]; } if (skip) continue; map [i] = new CharMapEntry ( map [primaryChar].Category, map [primaryChar].Level1, (byte) secondary); } #region Level2 adjustment // Arabic Hamzah diacritical [0x624] = 0x5; diacritical [0x626] = 0x7; diacritical [0x622] = 0x9; diacritical [0x623] = 0xA; diacritical [0x625] = 0xB; diacritical [0x649] = 0x5; // 'alif maqs.uurah diacritical [0x64A] = 0x7; // Yaa' for (int i = 0; i < char.MaxValue; i++) { byte mod = 0; byte cat = map [i].Category; switch (cat) { case 0xE: // Latin diacritics case 0x22: // Japanese: circled characters mod = diacritical [i]; break; case 0x13: // Arabic if (diacritical [i] == 0) mod = 0x8; // default for arabic break; } if (0x52 <= cat && cat <= 0x7F) // Hangul mod = diacritical [i]; if (mod > 0) map [i] = new CharMapEntry ( cat, map [i].Level1, mod); } #endregion // FIXME: this is hack but those which are // NonSpacingMark characters and still undefined // are likely to be nonspacing. for (int i = 0; i < char.MaxValue; i++) if (!map [i].Defined && !IsIgnorable (i) && Char.GetUnicodeCategory ((char) i) == UnicodeCategory.NonSpacingMark) AddCharMap ((char) i, 1, 1); } private void IncrementSequentialIndex (ref byte hangulCat) { fillIndex [hangulCat]++; if (fillIndex [hangulCat] == 0) { // overflown hangulCat++; fillIndex [hangulCat] = 0x2; } } // Reset fillIndex to fixed value and call AddLetterMap(). private void AddAlphaMap (char c, byte category, byte alphaWeight) { fillIndex [category] = alphaWeight; AddLetterMap (c, category, 0); ArrayList al = latinMap [c] as ArrayList; if (al == null) return; foreach (int cp in al) AddLetterMap ((char) cp, category, 0); } private void AddKanaMap (int i, byte voices) { for (byte b = 0; b < voices; b++) { char c = (char) (i + b); byte arg = (byte) (b > 0 ? b + 2 : 0); // Hiragana AddLetterMapCore (c, 0x22, 0, arg); // Katakana AddLetterMapCore ((char) (c + 0x60), 0x22, 0, arg); } } private void AddLetterMap (char c, byte category, byte updateCount) { AddLetterMapCore (c, category, updateCount, 0); } private void AddLetterMapCore (char c, byte category, byte updateCount, byte level2) { char c2; // updates index c2 = ToSmallForm (c); if (c2 != c) AddCharMapGroup (c2, category, updateCount, level2); c2 = Char.ToLower (c, CultureInfo.InvariantCulture); if (c2 != c && !map [(int) c2].Defined) AddLetterMapCore (c2, category, 0, level2); bool doUpdate = true; if (IsIgnorable ((int) c) || map [(int) c].Defined) doUpdate = false; else AddCharMapGroup (c, category, 0, level2); if (doUpdate) fillIndex [category] += updateCount; } private bool AddCharMap (char c, byte category, byte increment) { return AddCharMap (c, category, increment, 0); } private bool AddCharMap (char c, byte category, byte increment, byte alt) { if (IsIgnorable ((int) c) || map [(int) c].Defined) return false; // do nothing map [(int) c] = new CharMapEntry (category, category == 1 ? alt : fillIndex [category], category == 1 ? fillIndex [category] : alt); fillIndex [category] += increment; return true; } private void AddCharMapGroupTail (char c, byte category, byte updateCount) { char c2 = ToSmallFormTail (c); if (c2 != c) AddCharMap (c2, category, updateCount, 0); // itself AddCharMap (c, category, updateCount, 0); // c2 = ToFullWidthTail (c); if (c2 != c) AddCharMapGroupTail (c2, category, updateCount); } // // Adds characters to table in the order below // (+ increases weight): // ( +) // itself // // | | // | (| ) // + // (vertical +) // // level2 is fixed (does not increase). int [] sameWeightItems = new int [] { DecompositionFraction, DecompositionFull, DecompositionSuper, DecompositionSub, DecompositionCircle, DecompositionWide, DecompositionNarrow, }; private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2) { if (map [(int) c].Defined) return; char small = char.MinValue; char vertical = char.MinValue; Hashtable nfkd = (Hashtable) nfkdMap [(int) c]; if (nfkd != null) { object smv = nfkd [(byte) DecompositionSmall]; if (smv != null) small = (char) ((int) smv); object vv = nfkd [(byte) DecompositionVertical]; if (vv != null) vertical = (char) ((int) vv); } // updates index if (small != char.MinValue) AddCharMap (small, category, updateCount); // itself AddCharMap (c, category, 0, level2); if (nfkd != null) { foreach (int weight in sameWeightItems) { object wv = nfkd [(byte) weight]; if (wv != null) AddCharMap ((char) ((int) wv), category, 0, level2); } } // update index here. fillIndex [category] += updateCount; if (vertical != char.MinValue) AddCharMap (vertical, category, updateCount, level2); } private void AddCharMapCJK (char c, ref byte category) { AddCharMap (c, category, 0, 0); IncrementSequentialIndex (ref category); // Special. I wonder why but Windows skips 9E F9. if (category == 0x9E && fillIndex [category] == 0xF9) IncrementSequentialIndex (ref category); } private void AddCharMapGroupCJK (char c, ref byte category) { AddCharMapCJK (c, ref category); // LAMESPEC: see below. if (c == '\u52DE') { AddCharMapCJK ('\u3298', ref category); AddCharMapCJK ('\u3238', ref category); } if (c == '\u5BEB') AddCharMapCJK ('\u32A2', ref category); if (c == '\u91AB') // Especially this mapping order totally does // not make sense to me. AddCharMapCJK ('\u32A9', ref category); Hashtable nfkd = (Hashtable) nfkdMap [(int) c]; if (nfkd == null) return; for (byte weight = 0; weight <= 0x12; weight++) { object wv = nfkd [weight]; if (wv == null) continue; int w = (int) wv; // Special: they are ignored in this area. // FIXME: check if it is sane if (0xF900 <= w && w <= 0xFAD9) continue; // LAMESPEC: on Windows some of CJK characters // in 3200-32B0 are incorrectly mapped. They // mix Chinise and Japanese Kanji when // ordering those characters. switch (w) { case 0x32A2: case 0x3298: case 0x3238: case 0x32A9: continue; } AddCharMapCJK ((char) w, ref category); } } // For now it is only for 0x7 category. private void AddCharMapGroup2 (char c, byte category, byte updateCount, byte level2) { char small = char.MinValue; char vertical = char.MinValue; Hashtable nfkd = (Hashtable) nfkdMap [(int) c]; if (nfkd != null) { object smv = nfkd [(byte) DecompositionSmall]; if (smv != null) small = (char) ((int) smv); object vv = nfkd [(byte) DecompositionVertical]; if (vv != null) vertical = (char) ((int) vv); } // updates index if (small != char.MinValue) // SPECIAL CASE excluded (FIXME: why?) if (small != '\u2024') AddCharMap (small, category, updateCount); // itself AddCharMap (c, category, updateCount, level2); // Since nfkdMap is problematic to have two or more // NFKD to an identical character, here I iterate all. for (int c2 = 0; c2 < char.MaxValue; c2++) { if (decompLength [c2] == 1 && (int) (decompValues [decompIndex [c2]]) == (int) c) { switch (decompType [c2]) { case DecompositionCompat: AddCharMap ((char) c2, category, updateCount, level2); break; } } } if (vertical != char.MinValue) // SPECIAL CASE excluded (FIXME: why?) if (vertical != '\uFE33' && vertical != '\uFE34') AddCharMap (vertical, category, updateCount, level2); } char ToFullWidth (char c) { return ToDecomposed (c, DecompositionFull, false); } char ToFullWidthTail (char c) { return ToDecomposed (c, DecompositionFull, true); } char ToSmallForm (char c) { return ToDecomposed (c, DecompositionSmall, false); } char ToSmallFormTail (char c) { return ToDecomposed (c, DecompositionSmall, true); } char ToDecomposed (char c, byte d, bool tail) { if (decompType [(int) c] != d) return c; int idx = decompIndex [(int) c]; if (tail) idx += decompLength [(int) c] - 1; return (char) decompValues [idx]; } bool ExistsJIS (int cp) { foreach (JISCharacter j in jisJapanese) if (j.CP == cp) return true; return false; } #endregion #region Level 3 properties (Case/Width) private byte ComputeLevel3Weight (char c) { byte b = ComputeLevel3WeightRaw (c); return b > 0 ? (byte) (b + 2) : b; } private byte ComputeLevel3WeightRaw (char c) // add 2 for sortkey value { // Korean if ('\u11A8' <= c && c <= '\u11F9') return 2; if ('\uFFA0' <= c && c <= '\uFFDC') return 4; if ('\u3130' <= c && c <= '\u3164') return 5; // numbers if ('\u2776' <= c && c <= '\u277F') return 4; if ('\u2780' <= c && c <= '\u2789') return 8; if ('\u2776' <= c && c <= '\u2793') return 0xC; if ('\u2160' <= c && c <= '\u216F') return 0x18; if ('\u2181' <= c && c <= '\u2182') return 0x18; // Arabic if ('\u2135' <= c && c <= '\u2138') return 4; if ('\uFE80' <= c && c < '\uFE8E') { // 2(Isolated)/8(Final)/0x18(Medial) switch (decompType [(int) c]) { case DecompositionIsolated: return 2; case DecompositionFinal: return 8; case DecompositionMedial: return 0x18; } } // actually I dunno the reason why they have weights. switch (c) { case '\u01BC': return 0x10; case '\u06A9': return 0x20; case '\u06AA': return 0x28; } byte ret = 0; switch (c) { case '\u03C2': case '\u2104': case '\u212B': ret |= 8; break; case '\uFE42': ret |= 0xC; break; } // misc switch (decompType [(int) c]) { case DecompositionWide: // case DecompositionSub: // case DecompositionSuper: // ret |= decompType [(int) c]; break; } if (isSmallCapital [(int) c]) // grep "SMALL CAPITAL" ret |= 8; if (isUppercase [(int) c]) // DerivedCoreProperties ret |= 0x10; return ret; } #endregion #region IsIgnorable // FIXME: In the future use DerivedAge.txt to examine character // versions and set those ones that have higher version than // 1.0 as ignorable. static bool IsIgnorable (int i) { switch (i) { case 0: // I guess, those characters are added between // Unicode 1.0 (LCMapString) and Unicode 3.1 // (UnicodeCategory), so they used to be // something like OtherNotAssigned as of Unicode 1.1. case 0x2df: case 0x387: case 0x3d7: case 0x3d8: case 0x3d9: case 0x3f3: case 0x3f4: case 0x3f5: case 0x3f6: case 0x400: case 0x40d: case 0x450: case 0x45d: case 0x587: case 0x58a: case 0x5c4: case 0x640: case 0x653: case 0x654: case 0x655: case 0x66d: case 0xb56: case 0x1e9b: case 0x202f: case 0x20ad: case 0x20ae: case 0x20af: case 0x20e2: case 0x20e3: case 0x2139: case 0x213a: case 0x2183: case 0x2425: case 0x2426: case 0x2619: case 0x2670: case 0x2671: case 0x3007: case 0x3190: case 0x3191: case 0xfffc: case 0xfffd: return true; // exceptional characters filtered by the // following conditions. Originally those exceptional // ranges are incorrect (they should not be ignored) // and most of those characters are unfortunately in // those ranges. case 0x4d8: case 0x4d9: case 0x4e8: case 0x4e9: case 0x3036: case 0x303f: case 0x337b: case 0xfb1e: return false; } if ( // The whole Sinhala characters. 0x0D82 <= i && i <= 0x0DF4 // The whole Tibetan characters. || 0x0F00 <= i && i <= 0x0FD1 // The whole Myanmar characters. || 0x1000 <= i && i <= 0x1059 // The whole Etiopic, Cherokee, // Canadian Syllablic, Ogham, Runic, // Tagalog, Hanunoo, Philippine, // Buhid, Tagbanwa, Khmer and Mongorian // characters. || 0x1200 <= i && i <= 0x1DFF // Greek extension characters. || 0x1F00 <= i && i <= 0x1FFF // The whole Braille characters. || 0x2800 <= i && i <= 0x28FF // CJK radical characters. || 0x2E80 <= i && i <= 0x2EF3 // Kangxi radical characters. || 0x2F00 <= i && i <= 0x2FD5 // Ideographic description characters. || 0x2FF0 <= i && i <= 0x2FFB // Bopomofo letter and final || 0x31A0 <= i && i <= 0x31B7 // White square with quadrant characters. || 0x25F0 <= i && i <= 0x25F7 // Ideographic telegraph symbols. || 0x32C0 <= i && i <= 0x32CB || 0x3358 <= i && i <= 0x3370 || 0x33E0 <= i && i <= 0x33FF // The whole YI characters. || 0xA000 <= i && i <= 0xA48C || 0xA490 <= i && i <= 0xA4C6 // American small ligatures || 0xFB13 <= i && i <= 0xFB17 // hebrew, arabic, variation selector. || 0xFB1D <= i && i <= 0xFE2F // Arabic ligatures. || 0xFEF5 <= i && i <= 0xFEFC // FIXME: why are they excluded? || 0x01F6 <= i && i <= 0x01F9 || 0x0218 <= i && i <= 0x0233 || 0x02A9 <= i && i <= 0x02AD || 0x02EA <= i && i <= 0x02EE || 0x0349 <= i && i <= 0x036F || 0x0488 <= i && i <= 0x048F || 0x04D0 <= i && i <= 0x04FF || 0x0500 <= i && i <= 0x050F // actually it matters only for 2.0 || 0x06D6 <= i && i <= 0x06ED || 0x06FA <= i && i <= 0x06FE || 0x2048 <= i && i <= 0x204D || 0x20e4 <= i && i <= 0x20ea || 0x213C <= i && i <= 0x214B || 0x21EB <= i && i <= 0x21FF || 0x22F2 <= i && i <= 0x22FF || 0x237B <= i && i <= 0x239A || 0x239B <= i && i <= 0x23CF || 0x24EB <= i && i <= 0x24FF || 0x2596 <= i && i <= 0x259F || 0x25F8 <= i && i <= 0x25FF || 0x2672 <= i && i <= 0x2689 || 0x2768 <= i && i <= 0x2775 || 0x27d0 <= i && i <= 0x27ff || 0x2900 <= i && i <= 0x2aff || 0x3033 <= i && i <= 0x303F || 0x31F0 <= i && i <= 0x31FF || 0x3250 <= i && i <= 0x325F || 0x32B1 <= i && i <= 0x32BF || 0x3371 <= i && i <= 0x337B || 0xFA30 <= i && i <= 0xFA6A ) return true; UnicodeCategory uc = Char.GetUnicodeCategory ((char) i); switch (uc) { case UnicodeCategory.PrivateUse: case UnicodeCategory.Surrogate: return false; // ignored by nature case UnicodeCategory.Format: case UnicodeCategory.OtherNotAssigned: return true; default: return false; } } // To check IsIgnorable sanity, try the driver below under MS.NET. /* public static void Main () { for (int i = 0; i <= char.MaxValue; i++) Dump (i, IsIgnorable (i)); } static void Dump (int i, bool ignore) { switch (Char.GetUnicodeCategory ((char) i)) { case UnicodeCategory.PrivateUse: case UnicodeCategory.Surrogate: return; // check nothing } string s1 = ""; string s2 = new string ((char) i, 10); int ret = CultureInfo.InvariantCulture.CompareInfo.Compare (s1, s2, CompareOptions.IgnoreCase); if ((ret == 0) == ignore) return; Console.WriteLine ("{0} : {1:x} {2}", ignore ? "o" : "x", i, Char.GetUnicodeCategory ((char) i)); } */ #endregion // IsIgnorable #region IsIgnorableSymbol static bool IsIgnorableSymbol (int i) { if (IsIgnorable (i)) return true; switch (i) { // *Letter case 0x00b5: case 0x01C0: case 0x01C1: case 0x01C2: case 0x01C3: case 0x01F6: case 0x01F7: case 0x01F8: case 0x01F9: case 0x02D0: case 0x02EE: case 0x037A: case 0x03D7: case 0x03F3: case 0x0400: case 0x040d: case 0x0450: case 0x045d: case 0x048C: case 0x048D: case 0x048E: case 0x048F: case 0x0587: case 0x0640: case 0x06E5: case 0x06E6: case 0x06FA: case 0x06FB: case 0x06FC: case 0x093D: case 0x0950: case 0x1E9B: case 0x2139: case 0x3006: case 0x3033: case 0x3034: case 0x3035: case 0xFE7E: case 0xFE7F: // OtherNumber case 0x16EE: case 0x16EF: case 0x16F0: // LetterNumber case 0x2183: // ROMAN NUMERAL REVERSED ONE HUNDRED case 0x3007: // IDEOGRAPHIC NUMBER ZERO case 0x3038: // HANGZHOU NUMERAL TEN case 0x3039: // HANGZHOU NUMERAL TWENTY case 0x303a: // HANGZHOU NUMERAL THIRTY // OtherSymbol case 0x2117: case 0x327F: return true; // ModifierSymbol case 0x02B9: case 0x02BA: case 0x02C2: case 0x02C3: case 0x02C4: case 0x02C5: case 0x02C8: case 0x02CC: case 0x02CD: case 0x02CE: case 0x02CF: case 0x02D2: case 0x02D3: case 0x02D4: case 0x02D5: case 0x02D6: case 0x02D7: case 0x02DE: case 0x02E5: case 0x02E6: case 0x02E7: case 0x02E8: case 0x02E9: case 0x309B: case 0x309C: // OtherPunctuation case 0x055A: // American Apos case 0x05C0: // Hebrew Punct case 0x0E4F: // Thai FONGMAN case 0x0E5A: // Thai ANGKHANKHU case 0x0E5B: // Thai KHOMUT // CurencySymbol case 0x09F2: // Bengali Rupee Mark case 0x09F3: // Bengali Rupee Sign // MathSymbol case 0x221e: // INF. // OtherSymbol case 0x0482: case 0x09FA: case 0x0B70: return false; } // *Letter if (0xFE70 <= i && i < 0xFE7C // ARABIC LIGATURES B #if NET_2_0 || 0x0501 <= i && i <= 0x0510 // CYRILLIC KOMI || 0xFA30 <= i && i < 0xFA70 // CJK COMPAT #endif ) return true; UnicodeCategory uc = Char.GetUnicodeCategory ((char) i); switch (uc) { case UnicodeCategory.Surrogate: return false; // inconsistent case UnicodeCategory.SpacingCombiningMark: case UnicodeCategory.EnclosingMark: case UnicodeCategory.NonSpacingMark: case UnicodeCategory.PrivateUse: // NonSpacingMark if (0x064B <= i && i <= 0x0652) // Arabic return true; return false; case UnicodeCategory.Format: case UnicodeCategory.OtherNotAssigned: return true; default: bool use = false; // OtherSymbols if ( // latin in a circle 0x249A <= i && i <= 0x24E9 || 0x2100 <= i && i <= 0x2132 // Japanese || 0x3196 <= i && i <= 0x31A0 // Korean || 0x3200 <= i && i <= 0x321C // Chinese/Japanese || 0x322A <= i && i <= 0x3243 // CJK || 0x3260 <= i && i <= 0x32B0 || 0x32D0 <= i && i <= 0x3357 || 0x337B <= i && i <= 0x33DD ) use = !Char.IsLetterOrDigit ((char) i); if (use) return false; // This "Digit" rule is mystery. // It filters some symbols out. if (Char.IsLetterOrDigit ((char) i)) return false; if (Char.IsNumber ((char) i)) return false; if (Char.IsControl ((char) i) || Char.IsSeparator ((char) i) || Char.IsPunctuation ((char) i)) return true; if (Char.IsSymbol ((char) i)) return true; // FIXME: should check more return false; } } // To check IsIgnorableSymbol sanity, try the driver below under MS.NET. /* public static void Main () { CompareInfo ci = CultureInfo.InvariantCulture.CompareInfo; for (int i = 0; i <= char.MaxValue; i++) { UnicodeCategory uc = Char.GetUnicodeCategory ((char) i); if (uc == UnicodeCategory.Surrogate) continue; bool ret = IsIgnorableSymbol (i); string s1 = "TEST "; string s2 = "TEST " + (char) i; int result = ci.Compare (s1, s2, CompareOptions.IgnoreSymbols); if (ret != (result == 0)) Console.WriteLine ("{0} : {1:x}[{2}]({3})", ret ? "should not ignore" : "should ignore", i,(char) i, uc); } } */ #endregion #region NonSpacing static bool IsIgnorableNonSpacing (int i) { if (IsIgnorable (i)) return true; switch (i) { case 0x02C8: case 0x02DE: case 0x0559: case 0x055A: case 0x05C0: case 0x0ABD: case 0x0CD5: case 0x0CD6: case 0x309B: case 0x309C: case 0xFF9E: case 0xFF9F: return true; case 0x02D0: case 0x0670: case 0x0901: case 0x0902: case 0x094D: case 0x0962: case 0x0963: case 0x0A41: case 0x0A42: case 0x0A47: case 0x0A48: case 0x0A4B: case 0x0A4C: case 0x0A81: case 0x0A82: case 0x0B82: case 0x0BC0: case 0x0CBF: case 0x0CC6: case 0x0CCC: case 0x0CCD: case 0x0E4E: return false; } if (0x02b9 <= i && i <= 0x02c5 || 0x02cc <= i && i <= 0x02d7 || 0x02e4 <= i && i <= 0x02ef || 0x20DD <= i && i <= 0x20E0 ) return true; if (0x064B <= i && i <= 0x00652 || 0x0941 <= i && i <= 0x0948 || 0x0AC1 <= i && i <= 0x0ACD || 0x0C3E <= i && i <= 0x0C4F || 0x0E31 <= i && i <= 0x0E3F ) return false; return Char.GetUnicodeCategory ((char) i) == UnicodeCategory.NonSpacingMark; } // We can reuse IsIgnorableSymbol testcode // for IsIgnorableNonSpacing. #endregion } struct CharMapEntry { public byte Category; public byte Level1; public byte Level2; // It is always single byte. public bool Defined; public CharMapEntry (byte category, byte level1, byte level2) { Category = category; Level1 = level1; Level2 = level2; Defined = true; } } class JISCharacter { public readonly int CP; public readonly int JIS; public JISCharacter (int cp, int cpJIS) { CP = cp; JIS = cpJIS; } } class JISComparer : IComparer { public static readonly JISComparer Instance = new JISComparer (); public int Compare (object o1, object o2) { JISCharacter j1 = (JISCharacter) o1; JISCharacter j2 = (JISCharacter) o2; return j2.JIS - j1.JIS; } } class NonJISCharacter { public readonly int CP; public readonly string Name; public NonJISCharacter (int cp, string name) { CP = cp; Name = name; } } class NonJISComparer : IComparer { public static readonly NonJISComparer Instance = new NonJISComparer (); public int Compare (object o1, object o2) { NonJISCharacter j1 = (NonJISCharacter) o1; NonJISCharacter j2 = (NonJISCharacter) o2; return string.CompareOrdinal (j1.Name, j2.Name); } } class DecimalDictionaryValueComparer : IComparer { public static readonly DecimalDictionaryValueComparer Instance = new DecimalDictionaryValueComparer (); private DecimalDictionaryValueComparer () { } public int Compare (object o1, object o2) { DictionaryEntry e1 = (DictionaryEntry) o1; DictionaryEntry e2 = (DictionaryEntry) o2; // FIXME: in case of 0, compare decomposition categories int ret = Decimal.Compare ((decimal) e1.Value, (decimal) e2.Value); if (ret != 0) return ret; int i1 = (int) e1.Key; int i2 = (int) e2.Key; return i1 - i2; } } class StringDictionaryValueComparer : IComparer { public static readonly StringDictionaryValueComparer Instance = new StringDictionaryValueComparer (); private StringDictionaryValueComparer () { } public int Compare (object o1, object o2) { DictionaryEntry e1 = (DictionaryEntry) o1; DictionaryEntry e2 = (DictionaryEntry) o2; int ret = String.Compare ((string) e1.Value, (string) e2.Value); if (ret != 0) return ret; int i1 = (int) e1.Key; int i2 = (int) e2.Key; return i1 - i2; } } class UCAComparer : IComparer { public static readonly UCAComparer Instance = new UCAComparer (); private UCAComparer () { } public int Compare (object o1, object o2) { char i1 = (char) o1; char i2 = (char) o2; int l1 = CollationElementTable.GetSortKeyCount (i1); int l2 = CollationElementTable.GetSortKeyCount (i2); int l = l1 > l2 ? l2 : l1; for (int i = 0; i < l; i++) { SortKeyValue k1 = CollationElementTable.GetSortKey (i1, i); SortKeyValue k2 = CollationElementTable.GetSortKey (i2, i); int v = k1.Primary - k2.Primary; if (v != 0) return v; v = k1.Secondary - k2.Secondary; if (v != 0) return v; v = k1.Thirtiary - k2.Thirtiary; if (v != 0) return v; v = k1.Quarternary - k2.Quarternary; if (v != 0) return v; } return l1 - l2; } } }