2005-07-13 Atsushi Enomoto <atsushi@ximian.com>
[mono.git] / mcs / class / corlib / Mono.Globalization.Unicode / create-mscompat-collation-table.cs
index 66de6bea712614c470f4922b0a22233747ea7bf2..a510f6822eb19b9efc5be0facb44a92c9b4dabf4 100644 (file)
@@ -96,8 +96,9 @@ namespace Mono.Globalization.Unicode
                byte [] diacritical = new byte [char.MaxValue + 1];
 
                string [] diacritics = new string [] {
-                       // LATIN
-                       "WITH VERTICAL LINE ABOVE;",
+                       // LATIN, CYRILLIC etc.
+                       "UPTURN", "DOUBLE-STRUCK",
+                       "MIDDLE HOOK", "WITH VERTICAL LINE ABOVE;",
                        "WITH GRAVE ACCENT;", "WITH ACUTE ACCENT;", "WITH CIRCUMFLEX ACCENT;",
                        "WITH ACUTE;", "WITH GRAVE;", "WITH DOT ABOVE;", " MIDDLE DOT;",
                        "WITH CIRCUMFLEX;", "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;",
@@ -126,7 +127,7 @@ namespace Mono.Globalization.Unicode
                        //
                        "WITH OVERLINE",
                        "WITH HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;",
-                       " DOUBLE GRAVE;",
+                       " DOUBLE GRAVE",
                        " INVERTED BREVE",
                        "ROMAN NUMERAL",
                        " PRECEDED BY APOSTROPHE",
@@ -146,6 +147,7 @@ namespace Mono.Globalization.Unicode
                        " CIRCUMFLEX AND DOT BELOW",
                        " BREVE AND DOT BELOW",
                        " DOT BELOW AND MACRON",
+                       " TONE TWO",
                        " HORN AND HOOK ABOVE",
                        " HORN AND DOT",
                        // CIRCLED, PARENTHESIZED and so on
@@ -155,7 +157,7 @@ namespace Mono.Globalization.Unicode
                        };
                byte [] diacriticWeights = new byte [] {
                        // LATIN.
-                       5,
+                       3, 3, 5, 5,
                        0xF, 0xE, 0x12,
                        0xE, 0xF, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
                        0x17, 0x19, 0x1A, 0x1B, 0x1C,
@@ -171,7 +173,7 @@ namespace Mono.Globalization.Unicode
                        //
                        0x60, 0x60, 0x61, 0x61, 0x63, 0x68, 0x68,
                        0x69, 0x69, 0x6A, 0x6D, 0x6E,
-                       0x95, 0xAA,
+                       0x87, 0x95, 0xAA,
                        // CIRCLED, PARENTHESIZED and so on.
                        0xEE, 0xEE, 0xEE, 0xEE, 0xEE,
                        0xF3, 0xF3, 0xF3
@@ -185,7 +187,6 @@ namespace Mono.Globalization.Unicode
                        0xE50, 0xE60, 0xED0, 0xEE0
                        };
 
-               char [] orderedCyrillic;
                char [] orderedGurmukhi;
                char [] orderedGujarati;
                char [] orderedGeorgian;
@@ -212,11 +213,9 @@ namespace Mono.Globalization.Unicode
 
                // cp -> level1 value
                Hashtable arabicLetterPrimaryValues = new Hashtable ();
-               Hashtable cyrillicLetterPrimaryValues = new Hashtable ();
 
                // letterName -> cp
                Hashtable arabicNameMap = new Hashtable ();
-               Hashtable cyrillicNameMap = new Hashtable ();
 
                // cp -> Hashtable [decompType] -> cp
                Hashtable nfkdMap = new Hashtable ();
@@ -475,6 +474,7 @@ sw.Close ();
 #if Binary
                        MemoryStream ms = new MemoryStream ();
                        BinaryWriter binary = new BinaryWriter (ms);
+                       binary.Write (cjk.Length);
 #endif
                        for (int i = 0; i < cjk.Length; i++) {
                                if (i + offset == max)
@@ -627,6 +627,7 @@ sw.Close ();
 
                        ParseJISOrder (cp932); // in prior to ParseUnidata()
                        ParseUnidata (unidata);
+                       ModifyUnidata ();
                        ParseDerivedCoreProperties (derivedCoreProps);
                        ParseScripts (scripts);
                        ParseCJK (chXML, jaXML, koXML);
@@ -919,7 +920,7 @@ sw.Close ();
                        }
 
                        // Box names
-                       if (0x2500 <= cp && cp < 0x25B0) {
+                       if (0x2500 <= cp && cp < 0x2600) {
                                int value = 0;
                                // flags:
                                // up:1 down:2 right:4 left:8 vert:16 horiz:32
@@ -944,49 +945,93 @@ sw.Close ();
                                        10, 10, 11, 11,
                                        12, 12, 13, 13,
                                        14, 14, 14, 14};
-                               if (s.IndexOf ("BOX DRAWINGS ") > 0) {
+                               if (s.IndexOf ("BOX DRAWINGS ") >= 0) {
                                        int flag = 0;
-                                       if (s.IndexOf (" UP") > 0)
+                                       if (s.IndexOf (" UP") >= 0)
                                                flag |= 1;
-                                       if (s.IndexOf (" DOWN") > 0)
+                                       if (s.IndexOf (" DOWN") >= 0)
                                                flag |= 2;
-                                       if (s.IndexOf (" RIGHT") > 0)
+                                       if (s.IndexOf (" RIGHT") >= 0)
                                                flag |= 4;
-                                       if (s.IndexOf (" LEFT") > 0)
+                                       if (s.IndexOf (" LEFT") >= 0)
                                                flag |= 8;
-                                       if (s.IndexOf (" VERTICAL") > 0)
+                                       if (s.IndexOf (" VERTICAL") >= 0)
                                                flag |= 16;
-                                       if (s.IndexOf (" HORIZONTAL") > 0)
+                                       if (s.IndexOf (" HORIZONTAL") >= 0)
                                                flag |= 32;
 
                                        int fidx = flags.IndexOf (flag);
                                        value = fidx < 0 ? fidx : offsets [fidx];
-                               } else if (s.IndexOf ("BLOCK") > 0) {
-                                       if (s.IndexOf ("ONE EIGHTH") > 0)
+                               } else if (s.IndexOf ("BLOCK") >= 0) {
+                                       if (s.IndexOf ("ONE EIGHTH") >= 0)
                                                value = 0x12;
-                                       else if (s.IndexOf ("ONE QUARTER") > 0)
+                                       else if (s.IndexOf ("ONE QUARTER") >= 0)
                                                value = 0x13;
-                                       else if (s.IndexOf ("THREE EIGHTHS") > 0)
+                                       else if (s.IndexOf ("THREE EIGHTHS") >= 0)
                                                value = 0x14;
-                                       else if (s.IndexOf ("HALF") > 0)
+                                       else if (s.IndexOf ("HALF") >= 0)
                                                value = 0x15;
-                                       else if (s.IndexOf ("FIVE EIGHTHS") > 0)
+                                       else if (s.IndexOf ("FIVE EIGHTHS") >= 0)
                                                value = 0x16;
-                                       else if (s.IndexOf ("THREE QUARTERS") > 0)
+                                       else if (s.IndexOf ("THREE QUARTERS") >= 0)
                                                value = 0x17;
-                                       else if (s.IndexOf ("SEVEN EIGHTHS") > 0)
+                                       else if (s.IndexOf ("SEVEN EIGHTHS") >= 0)
                                                value = 0x18;
                                        else
                                                value = 0x19;
-                               } else if (s.IndexOf ("SHADE") > 0)
+                               }
+                               else if (s.IndexOf ("SHADE") >= 0)
                                        value = 0x19;
+                               else if (s.IndexOf ("SQUARE") >= 0)
+                                       value = 0xBC - 0xE5;
+                               else if (s.IndexOf ("VERTICAL RECTANGLE") >= 0)
+                                       value = 0xBE - 0xE5;
+                               else if (s.IndexOf ("RECTANGLE") >= 0)
+                                       value = 0xBD - 0xE5;
+                               else if (s.IndexOf ("PARALLELOGRAM") >= 0)
+                                       value = 0xBF - 0xE5;
+                               else if (s.IndexOf ("TRIANGLE") >= 0) {
+                                       if (s.IndexOf ("UP-POINTING") >= 0)
+                                               value = 0xC0 - 0xE5;
+                                       else if (s.IndexOf ("RIGHT-POINTING") >= 0)
+                                               value = 0xC1 - 0xE5;
+                                       else if (s.IndexOf ("DOWN-POINTING") >= 0)
+                                               value = 0xC2 - 0xE5;
+                                       else if (s.IndexOf ("LEFT-POINTING") >= 0)
+                                               value = 0xC3 - 0xE5;
+                               }
+                               else if (s.IndexOf ("POINTER") >= 0) {
+                                       if (s.IndexOf ("RIGHT-POINTING") >= 0)
+                                               value = 0xC4 - 0xE5;
+                                       else if (s.IndexOf ("LEFT-POINTING") >= 0)
+                                               value = 0xC5 - 0xE5;
+                               }
+                               else if (s.IndexOf ("DIAMOND") >= 0)
+                                       value = 0xC6 - 0xE5;
+                               else if (s.IndexOf ("FISHEYE") >= 0)
+                                       value = 0xC7 - 0xE5;
+                               else if (s.IndexOf ("LOZENGE") >= 0)
+                                       value = 0xC8 - 0xE5;
+                               else if (s.IndexOf ("BULLSEYE") >= 0)
+                                       value = 0xC9 - 0xE5;
+                               else if (s.IndexOf ("CIRCLE") >= 0) {
+                                       if (cp == 0x25D6) // it could be IndexOf ("LEFT HALF BLACK CIRCLE")
+                                               value = 0xCA - 0xE5;
+                                       else if (cp == 0x25D7) // it could be IndexOf ("RIGHT HALF BLACK CIRCLE")
+                                               value = 0xCB - 0xE5;
+                                       else
+                                               value = 0xC9 - 0xE5;
+                               }
+                               if (0x25DA <= cp && cp <= 0x25E5)
+                                       value = 0xCD + cp - 0x25DA - 0xE5;
+
                                // SPECIAL CASE: BOX DRAWING DIAGONAL patterns
                                switch (cp) {
                                case 0x2571: value = 0xF; break;
                                case 0x2572: value = 0x10; break;
                                case 0x2573: value = 0x11; break;
                                }
-                               if (value >= 0)
+                               if (value != 0)
                                        boxValues.Add (new DictionaryEntry (
                                                cp, value));
                        }
@@ -1001,6 +1046,14 @@ sw.Close ();
                                sortableCharNames.Add (new DictionaryEntry (
                                        cp, name.Substring (7)));
 
+                       if (Char.GetUnicodeCategory ((char) cp) ==
+                               UnicodeCategory.MathSymbol) {
+                               if (name.StartsWith ("CIRCLED "))
+                                       diacritical [cp] = 0xEE;
+                               if (name.StartsWith ("SQUARED "))
+                                       diacritical [cp] = 0xEF;
+                       }
+
                        // diacritical weights by character name
 if (diacritics.Length != diacriticWeights.Length)
 throw new Exception (String.Format ("Should not happen. weights are {0} while labels are {1}", diacriticWeights.Length, diacritics.Length));
@@ -1029,26 +1082,6 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la
                                (s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0))
                                diacritical [cp] |= 0xF4;
 
-                       // Cyrillic letter name
-                       if (0x0430 <= cp && cp <= 0x0486 &&
-                               Char.IsLetter ((char) cp)) {
-                               byte value = (byte) (cyrillicNameMap.Count * 3 + 0x06);
-                               // Get primary letter name i.e.
-                               // XXX part of CYRILLIC LETTER XXX yyy
-                               // e.g. "IZHITSA" for "IZHITSA DOUBLE GRAVE".
-                               string letterName =
-                                       name.Substring (name.IndexOf ("LETTER ") + 7);
-                               int tmpIdx = letterName.IndexOf (' ');
-                               letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
-//Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
-                               if (cyrillicNameMap.ContainsKey (letterName))
-                                       value = (byte) cyrillicLetterPrimaryValues [cyrillicNameMap [letterName]];
-                               else
-                                       cyrillicNameMap [letterName] = cp;
-
-                               cyrillicLetterPrimaryValues [cp] = value;
-                       }
-
                        // Arabic letter name
                        if (0x0621 <= cp && cp <= 0x064A &&
                                Char.GetUnicodeCategory ((char) cp)
@@ -1264,7 +1297,6 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la
 
                void ParseScripts (string filename)
                {
-                       ArrayList cyrillic = new ArrayList ();
                        ArrayList gurmukhi = new ArrayList ();
                        ArrayList gujarati = new ArrayList ();
                        ArrayList georgian = new ArrayList ();
@@ -1294,11 +1326,6 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la
                                                continue;
 
                                        switch (value) {
-                                       case "Cyrillic":
-                                               for (int x = cp; x <= cpEnd; x++)
-                                                       if (!IsIgnorable (x))
-                                                               cyrillic.Add ((char) x);
-                                               break;
                                        case "Gurmukhi":
                                                for (int x = cp; x <= cpEnd; x++)
                                                        if (!IsIgnorable (x))
@@ -1322,12 +1349,10 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la
                                        }
                                }
                        }
-                       cyrillic.Sort (UCAComparer.Instance);
                        gurmukhi.Sort (UCAComparer.Instance);
                        gujarati.Sort (UCAComparer.Instance);
                        georgian.Sort (UCAComparer.Instance);
                        thaana.Sort (UCAComparer.Instance);
-                       orderedCyrillic = (char []) cyrillic.ToArray (typeof (char));
                        orderedGurmukhi = (char []) gurmukhi.ToArray (typeof (char));
                        orderedGujarati = (char []) gujarati.ToArray (typeof (char));
                        orderedGeorgian = (char []) georgian.ToArray (typeof (char));
@@ -1414,16 +1439,54 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la
                        category = "ja";
                        arr = cjkJA;
                        offset = 0;//char.MaxValue - arr.Length;
-                       doc.Load (jaXML);
-                       s = doc.SelectSingleNode ("/ldml/collations/collation/rules/pc").InnerText;
+
+                       // SPECIAL CASES
+                       arr [0x4EDD] = 0x8002; // Chinese repetition mark?
+                       arr [0x337B] = 0x8004; // Those 4 characters are Gengou
+                       arr [0x337E] = 0x8005;
+                       arr [0x337D] = 0x8006;
+                       arr [0x337C] = 0x8007;
+
                        v = 0x8008;
-                       foreach (char c in s) {
+                       foreach (JISCharacter jc in jisJapanese) {
+                               if (jc.JIS < 0x8800)
+                                       continue;
+                               char c = (char) jc.CP;
+
                                if (c < '\u4E00')
                                        Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
                                else {
                                        arr [(int) c - offset] = (ushort) v++;
                                        if (v % 256 == 0)
                                                v += 2;
+
+                                       // SPECIAL CASES:
+                                       if (c == '\u662D') // U+337C
+                                               continue;
+                                       if (c == '\u5927') // U+337D
+                                               continue;
+                                       if (c == '\u5E73') // U+337B
+                                               continue;
+                                       if (c == '\u660E') // U+337E
+                                               continue;
+                                       if (c == '\u9686') // U+F9DC
+                                               continue;
+
+                                       // FIXME: there are still remaining
+                                       // characters after U+FA0C.
+//                                     for (int k = 0; k < char.MaxValue; k++) {
+                                       for (int k = 0; k < '\uFA0D'; k++) {
+                                               if (decompIndex [k] == 0 || IsIgnorable (k))
+                                                       continue;
+                                               if (decompValues [decompIndex [k]] == c /*&&
+                                                       decompLength [k] == 1*/ ||
+                                                       decompLength [k] == 3 &&
+                                                       decompValues [decompIndex [k] + 1] == c) {
+                                                       arr [k - offset] = (ushort) v++;
+                                                       if (v % 256 == 0)
+                                                               v += 2;
+                                               }
+                                       }
                                }
                        }
 
@@ -1479,16 +1542,8 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la
                        }
                }
 
-               void ModifyParsedValues ()
+               void ModifyUnidata ()
                {
-                       // number, secondary weights
-                       byte weight = 0x38;
-                       int [] numarr = numberSecondaryWeightBounds;
-                       for (int i = 0; i < numarr.Length; i += 2, weight++)
-                               for (int cp = numarr [i]; cp < numarr [i + 1]; cp++)
-                                       if (Char.IsNumber ((char) cp))
-                                               diacritical [cp] = weight;
-
                        // Modify some decomposition equivalence
                        decompType [0xFE31] = 0;
                        decompIndex [0xFE31] = 0;
@@ -1503,6 +1558,36 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la
                        for (int i = 0x3260; i <= 0x327B; i++)
                                diacritical [i] = 0xC;
 
+                       // LAMESPEC: these remappings should not be done.
+                       // Windows has incorrect CJK compat mappings.
+                       decompValues [decompIndex [0x32A9]] = 0x91AB;
+                       decompLength [0x323B] = 1;
+                       decompValues [decompIndex [0x323B]] = 0x5B78;
+                       decompValues [decompIndex [0x32AB]] = 0x5B78;
+                       decompValues [decompIndex [0x32A2]] = 0x5BEB;
+                       decompLength [0x3238] = 1;
+                       decompValues [decompIndex [0x3238]] = 0x52DE;
+                       decompValues [decompIndex [0x3298]] = 0x52DE;
+
+                       // LAMESPEC: custom remappings (not bugs as such, but non-standard-compliant behavior)
+                       decompIndex [0xFA0C] = decompIndex [0xF929]; // borrow U+F929 room (being empty)
+                       decompValues [decompIndex [0xFA0C]] = 0x5140;
+                       decompLength [0xFA0C] = 1;
+                       decompIndex [0xF929] = decompLength [0xF929] = 0;
+
+                       decompValues [decompIndex [0xF92C]] = 0x90DE;
+               }
+
+               void ModifyParsedValues ()
+               {
+                       // number, secondary weights
+                       byte weight = 0x38;
+                       int [] numarr = numberSecondaryWeightBounds;
+                       for (int i = 0; i < numarr.Length; i += 2, weight++)
+                               for (int cp = numarr [i]; cp < numarr [i + 1]; cp++)
+                                       if (Char.IsNumber ((char) cp))
+                                               diacritical [cp] = weight;
+
                        // Update name part of named characters
                        for (int i = 0; i < sortableCharNames.Count; i++) {
                                DictionaryEntry de =
@@ -1640,8 +1725,20 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la
                                        map [i] = new CharMapEntry (
                                                0x1, 0x1, diacritical [i]);
 
-                       fillIndex [0x1] = 0xAC;
-                       for (int i = 0x07A6; i <= 0x07B0; i++)
+                       fillIndex [0x1] = 0x94;
+                       // syriac dotted nonspacing marks
+                       AddCharMap ('\u0732', 0x1, 1);
+                       AddCharMap ('\u0735', 0x1, 1);
+                       AddCharMap ('\u0738', 0x1, 1);
+                       AddCharMap ('\u0739', 0x1, 1);
+                       AddCharMap ('\u073C', 0x1, 1);
+                       fillIndex [0x1] = 0x9F;
+                       for (int i = 0x0730; i <= 0x07B0; i++)
+                               if (!IsIgnorable (i) && !map [i].Defined)
+                                       AddCharMap ((char) i, 0x1, 1);
+
+                       fillIndex [0x1] = 0x0C;
+                       for (int i = 0x0EC8; i <= 0x0ECD; i++)
                                if (!IsIgnorable (i))
                                        AddCharMap ((char) i, 0x1, 1);
 
@@ -1651,6 +1748,16 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la
                        fillIndex [0x1] = 0xDC;
                        for (int i = 0x20d0; i <= 0x20e1; i++)
                                AddCharMap ((char) i, 0x1, 1);
+
+                       // They are not part of Nonspacing marks, but have
+                       // only diacritical weight.
+                       for (int i = 0x3099; i <= 0x309C; i++)
+                               map [i] = new CharMapEntry (1, 1, 1);
+                       map [0x309D] = new CharMapEntry (0xFF, 0xFF, 1);
+                       map [0x309E] = new CharMapEntry (0xFF, 0xFF, 1);
+                       for (int i = 0x30FC; i <= 0x30FE; i++)
+                               map [i] = new CharMapEntry (0xFF, 0xFF, 1);
+
                        #endregion
 
 
@@ -1698,12 +1805,17 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la
                                boxLv2 [i] = 3;
                        foreach (DictionaryEntry de in boxValues) {
                                int cp = (int) de.Key;
-                               int idx = (int) de.Value;
+                               int off = (int) de.Value;
                                if (map [cp].Defined)
                                        continue;
-                               fillIndex [0x9] = (byte) (0xE5 + idx);
-                               AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [idx]);
-                               boxLv2 [idx]++;
+                               if (off < 0) {
+                                       fillIndex [0x9] = (byte) (0xE5 + off);
+                                       AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [-off]++);
+                               }
+                               else {
+                                       fillIndex [0x9] = (byte) (0xE5 + off);
+                                       AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [off]++);
+                               }
                        }
                        // Some special characters (slanted)
                        fillIndex [0x9] = 0xF4;
@@ -1719,7 +1831,8 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la
                                uc = Char.GetUnicodeCategory ((char) cp);
                                if (!IsIgnorable (cp) &&
                                        uc == UnicodeCategory.CurrencySymbol &&
-                                       cp != '$')
+                                       cp != '$' ||
+                                       cp == 0xAC)
                                        AddCharMapGroup ((char) cp, 0xA, 1, 0);
                        }
                        // byte other symbols
@@ -1728,10 +1841,23 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la
                                        continue; // SPECIAL: skip FIXME: why?
                                uc = Char.GetUnicodeCategory ((char) cp);
                                if (!IsIgnorable (cp) &&
-                                       uc == UnicodeCategory.OtherSymbol)
+                                       uc == UnicodeCategory.OtherSymbol ||
+                                       cp == '\u00B5' || cp == '\u00B7')
                                        AddCharMapGroup ((char) cp, 0xA, 1, 0);
                        }
 
+                       fillIndex [0xA] = 0x0F; // FIXME: it won't be needed
+                       for (int cp = 0x2020; cp <= 0x2031; cp++)
+                               if (Char.IsPunctuation ((char) cp))
+                                       AddCharMap ((char) cp, 0xA, 1, 0);
+                       // SPECIAL CASES: why?
+                       AddCharMap ('\u203B', 0xA, 1, 0);
+                       AddCharMap ('\u2040', 0xA, 1, 0);
+                       AddCharMap ('\u2041', 0xA, 1, 0);
+                       AddCharMap ('\u2042', 0xA, 1, 0);
+
+                       for (int cp = 0x20A0; cp <= 0x20AB; cp++)
+                               AddCharMap ((char) cp, 0xA, 1, 0);
                        fillIndex [0xA] = 0x2F; // FIXME: it won't be needed
                        for (int cp = 0x2600; cp <= 0x2613; cp++)
                                AddCharMap ((char) cp, 0xA, 1, 0);
@@ -1924,45 +2050,83 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la
                                if (Char.IsLetter ((char) i))
                                        AddLetterMap ((char) i, 0xF, 1);
 
-                       // Cyrillic - character name order
-                       fillIndex [0x10] = 0x6;
-//*
-for (int i = 0; i < orderedCyrillic.Length; i++)
-Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]);
+                       // Cyrillic.
+                       // Cyrillic letters are sorted like Latin letters, i.e.
+                       // with culture-specific letters interleaved within the
+                       // standard Cyrillic sequence.
+                       //
+                       // We can't use UCA here; it has different sorting.
+                       char [] orderedCyrillic = new char [] {
+                               '\u0430', '\u0431', '\u0432', '\u0433', '\u0434',
+                               '\u0452', // DJE for Serbocroatian
+                               '\u0435',
+                               '\u0454', // IE for Ukrainian
+                               '\u0436', '\u0437',
+                               '\u0455', // DZE
+                               '\u0438',
+                               '\u0456', // Byelorussian-Ukrainian I
+                               '\u0457', // YI
+                               '\u0439',
+                               '\u0458', // JE
+                               '\u043A', '\u043B',
+                               '\u0459', // LJE
+                               '\u043C', '\u043D',
+                               '\u045A', // NJE
+                               '\u043E',
+                               // 4E9 goes here.
+                               '\u043F', '\u0440', '\u0441', '\u0442',
+                               '\u045B', // TSHE for Serbocroatian
+                               '\u0443',
+                               '\u045E', // Short U for Byelorussian
+                               '\u04B1', // Straight U w/ stroke (diacritical!)
+                               '\u0444', '\u0445', '\u0446', '\u0447',
+                               '\u045F', // DZHE
+                               '\u0448', '\u0449', '\u044A', '\u044B', '\u044C',
+                               '\u044D', '\u044E', '\u044F'};
+
+                       // Some characters map to basic Cyrillic letters; see
+                       // the UnicodeData.txt character names for the sources.
+                       // Here I simply declare an equivalence array. Entry i
+                       // is the basic letter for U+0490 + 2 * i, i.e. every
+                       // other code point from U+0490 (skipping small letters).
+                       char [] cymap_src = new char [] {
+                               '\u0433', '\u0433', '\u0433', '\u0436',
+                               '\u0437', '\u043A', '\u043A', '\u043A',
+                               '\u043A', '\u043D', '\u043D', '\u043F',
+                               '\u0445', '\u0441', '\u0442', '\u0443',
+                               '\u0443', '\u0445', '\u0446', '\u0447',
+                               '\u0447', '\u0432', '\u0435', '\u0435',
+                               '\u0406', '\u0436', '\u043A', '\u043D',
+                               '\u0447', '\u0435'};
+
+                       fillIndex [0x10] = 0x8D;
+                       for (int i = 0x0460; i < 0x0481; i++) {
+                               if (Char.IsLetter ((char) i)) {
+                                       if (i == 0x0476)
+                                               // U+476/477 have the same
+                                               // primary weight as U+474/475.
+                                               fillIndex [0x10] -= 3;
+                                       AddLetterMap ((char) i, 0x10, 3);
+                               }
+                       }
 
-                       // table which is moslty from UCA DUCET.
+                       fillIndex [0x10] = 0x6;
                        for (int i = 0; i < orderedCyrillic.Length; i++) {
                                char c = Char.ToUpper (orderedCyrillic [i], CultureInfo.InvariantCulture);
                                if (!IsIgnorable ((int) c) &&
-                                       c <= '\u045C' &&
-                                       Char.IsLetter (c)) {
+                                       Char.IsLetter (c) &&
+                                       !map [c].Defined) {
                                        AddLetterMap (c, 0x10, 0);
                                        fillIndex [0x10] += 3;
                                }
                        }
-                       /*
-                       for (int i = 0x0460; i < 0x0481; i++) {
-                               if (Char.IsLetter ((char) i)) {
-                                       AddLetterMap ((char) i, 0x10, 0);
-                                       fillIndex [0x10] += 3;
-                               }
-                       }
-                       */
-/*
-                       for (int i = 0x0400; i <= 0x0486; i++) {
-                               if (!Char.IsLetter ((char) i)) {
-//                                     AddCharMap ((char) i, 0x1, 1);
-                                       continue;
-                               }
-                               if (!cyrillicLetterPrimaryValues.ContainsKey (i)) {
-                                       Console.Error.WriteLine ("no value for {0:x04}", i);
-                                       continue;
-                               }
-                               fillIndex [0x10] = 
-                                       (byte) cyrillicLetterPrimaryValues [i];
-                               AddLetterMap ((char) i, 0x10, 0);
+
+                       for (int i = 0; i < cymap_src.Length; i++) {
+                               char c = cymap_src [i];
+                               fillIndex [0x10] = map [c].Level1;
+                               AddLetterMap ((char) (0x0490 + i * 2),
+                                       0x10, 0);
                        }
-*/
 
                        // Armenian
                        fillIndex [0x11] = 0x3;
@@ -1972,15 +2136,18 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]);
 
                        // Hebrew
                        // -Letters
-                       fillIndex [0x12] = 0x3;
+                       fillIndex [0x12] = 0x2;
                        for (int i = 0x05D0; i < 0x05FF; i++)
                                if (Char.IsLetter ((char) i))
                                        AddLetterMap ((char) i, 0x12, 1);
                        // -Accents
                        fillIndex [0x1] = 0x3;
-                       for (int i = 0x0591; i <= 0x05C2; i++)
+                       for (int i = 0x0591; i <= 0x05C2; i++) {
+                               if (i == 0x05A3 || i == 0x05BB)
+                                       fillIndex [0x1]++;
                                if (i != 0x05BE)
                                        AddCharMap ((char) i, 0x1, 1);
+                       }
 
                        // Arabic
                        fillIndex [0x1] = 0x8E;
@@ -2559,10 +2726,8 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]);
                                // Insert 3001 after ',' and 3002 after '.'
                                if (i == 0x2C)
                                        AddCharMapGroup2 ('\u3001', 0x7, 1, 0);
-                               else if (i == 0x2E) {
-                                       fillIndex [0x7]--;
+                               else if (i == 0x2E)
                                        AddCharMapGroup2 ('\u3002', 0x7, 1, 0);
-                               }
                                else if (i == 0x3A)
                                        AddCharMap ('\uFE30', 0x7, 1, 0);
                        }
@@ -2598,7 +2763,7 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]);
                                case UnicodeCategory.FinalQuotePunctuation:
                                case UnicodeCategory.ModifierSymbol:
                                        // SPECIAL CASES: // 0xA
-                                       if (0x2020 <= i && i <= 0x2042)
+                                       if (0x2020 <= i && i <= 0x2031)
                                                continue;
                                        AddCharMapGroup ((char) i, 0x7, 1, 0);
                                        break;
@@ -2675,15 +2840,21 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]);
                        AddCharMapGroup ('\u22B0', 0x8, 1, 0); // precedes under relation
 
                        for (int cp = 0; cp < 0x2300; cp++) {
-                               if (cp == 0x200)
-                                       cp = 0x2200; // skip to 2200
                                if (cp == 0xAC) // SPECIAL CASE: skip
                                        continue;
+                               if (cp == 0x200) {
+                                       cp = 0x2200; // skip to 2200
+                                       fillIndex [0x8] = 0x21;
+                               }
+                               if (cp == 0x2295)
+                                       fillIndex [0x8] = 0x3;
+                               if (cp == 0x22B2)
+                                       fillIndex [0x8] = 0xB9;
                                if (!map [cp].Defined &&
 //                                     Char.GetUnicodeCategory ((char) cp) ==
 //                                     UnicodeCategory.MathSymbol)
                                        Char.IsSymbol ((char) cp))
-                                       AddCharMapGroup ((char) cp, 0x8, 1, 0);
+                                       AddCharMapGroup ((char) cp, 0x8, 1, diacritical [cp]);
                                // SPECIAL CASES: no idea why Windows sorts as such
                                switch (cp) {
                                case 0x3E: