2005-07-13 Atsushi Enomoto <atsushi@ximian.com>
[mono.git] / mcs / class / corlib / Mono.Globalization.Unicode / create-mscompat-collation-table.cs
index a3fedcb84ddaedc7014229f39c4a20a0ed2acc6b..a510f6822eb19b9efc5be0facb44a92c9b4dabf4 100644 (file)
@@ -96,14 +96,18 @@ namespace Mono.Globalization.Unicode
                byte [] diacritical = new byte [char.MaxValue + 1];
 
                string [] diacritics = new string [] {
-                       // LATIN
-                       "WITH ACUTE;", "WITH GRAVE;", " DOT ABOVE;", " MIDDLE DOT;",
+                       // LATIN, CYRILLIC etc.
+                       "UPTURN", "DOUBLE-STRUCK",
+                       "MIDDLE HOOK", "WITH VERTICAL LINE ABOVE;",
+                       "WITH GRAVE ACCENT;", "WITH ACUTE ACCENT;", "WITH CIRCUMFLEX ACCENT;",
+                       "WITH ACUTE;", "WITH GRAVE;", "WITH DOT ABOVE;", " MIDDLE DOT;",
                        "WITH CIRCUMFLEX;", "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;",
-                       " DIALYTIKA AND TONOS;", "WITH MACRON;", "WITH TILDE;", " RING ABOVE;",
-                       " OGONEK;", " CEDILLA;",
+                       " DIALYTIKA AND TONOS;", "WITH MACRON;", "WITH TILDE;", "WITH RING ABOVE;",
+                       "WITH OGONEK;", "WITH CEDILLA;",
                        //
                        " DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;",
-                       " STROKE;", " CIRCUMFLEX AND ACUTE;",
+                       "WITH STROKE;", " CIRCUMFLEX AND ACUTE;",
+                       "STROKE OVERLAY",
                        " DIAERESIS AND ACUTE;", "WITH CIRCUMFLEX AND GRAVE;", " L SLASH;",
                        " DIAERESIS AND GRAVE;",
                        " BREVE AND ACUTE;",
@@ -121,24 +125,29 @@ namespace Mono.Globalization.Unicode
                        " CEDILLA AND BREVE",
                        " OGONEK AND MACRON",
                        //
-                       " HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;",
-                       " DOUBLE GRAVE;",
+                       "WITH OVERLINE",
+                       "WITH HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;",
+                       " DOUBLE GRAVE",
                        " INVERTED BREVE",
+                       "ROMAN NUMERAL",
                        " PRECEDED BY APOSTROPHE",
-                       " HORN;",
+                       "WITH HORN;",
                        " LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE",
                        " PALATAL HOOK",
                        " DOT BELOW;",
                        " RETROFLEX;", "DIAERESIS BELOW",
                        " RING BELOW",
+                       //
                        " CIRCUMFLEX BELOW", "HORN AND ACUTE",
                        " BREVE BELOW;", " HORN AND GRAVE",
                        " TILDE BELOW",
+                       " TOPBAR",
                        " DOT BELOW AND DOT ABOVE",
                        " RIGHT HALF RING", " HORN AND TILDE",
                        " CIRCUMFLEX AND DOT BELOW",
                        " BREVE AND DOT BELOW",
                        " DOT BELOW AND MACRON",
+                       " TONE TWO",
                        " HORN AND HOOK ABOVE",
                        " HORN AND DOT",
                        // CIRCLED, PARENTHESIZED and so on
@@ -148,20 +157,23 @@ namespace Mono.Globalization.Unicode
                        };
                byte [] diacriticWeights = new byte [] {
                        // LATIN.
+                       3, 3, 5, 5,
+                       0xF, 0xE, 0x12,
                        0xE, 0xF, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
                        0x17, 0x19, 0x1A, 0x1B, 0x1C,
                        //
-                       0x1D, 0x1D, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F,
+                       0x1D, 0x1D, 0x1E, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F,
                        0x20, 0x21, 0x22, 0x22, 0x23, 0x24,
                        //
                        0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28,
                        0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30,
                        //
-                       0x43, 0x43, 0x43, 0x44, 0x46, 0x48,
+                       0x40, 0x43, 0x43, 0x43, 0x44, 0x46, 0x47, 0x48,
                        0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x5A,
-                       0x60, 0x60, 0x61, 0x61, 0x63, 0x68, 
+                       //
+                       0x60, 0x60, 0x61, 0x61, 0x63, 0x68, 0x68,
                        0x69, 0x69, 0x6A, 0x6D, 0x6E,
-                       0x95, 0xAA,
+                       0x87, 0x95, 0xAA,
                        // CIRCLED, PARENTHESIZED and so on.
                        0xEE, 0xEE, 0xEE, 0xEE, 0xEE,
                        0xF3, 0xF3, 0xF3
@@ -175,7 +187,6 @@ namespace Mono.Globalization.Unicode
                        0xE50, 0xE60, 0xED0, 0xEE0
                        };
 
-               char [] orderedCyrillic;
                char [] orderedGurmukhi;
                char [] orderedGujarati;
                char [] orderedGeorgian;
@@ -202,11 +213,9 @@ namespace Mono.Globalization.Unicode
 
                // cp -> level1 value
                Hashtable arabicLetterPrimaryValues = new Hashtable ();
-               Hashtable cyrillicLetterPrimaryValues = new Hashtable ();
 
                // letterName -> cp
                Hashtable arabicNameMap = new Hashtable ();
-               Hashtable cyrillicNameMap = new Hashtable ();
 
                // cp -> Hashtable [decompType] -> cp
                Hashtable nfkdMap = new Hashtable ();
@@ -465,6 +474,7 @@ sw.Close ();
 #if Binary
                        MemoryStream ms = new MemoryStream ();
                        BinaryWriter binary = new BinaryWriter (ms);
+                       binary.Write (cjk.Length);
 #endif
                        for (int i = 0; i < cjk.Length; i++) {
                                if (i + offset == max)
@@ -617,6 +627,7 @@ sw.Close ();
 
                        ParseJISOrder (cp932); // in prior to ParseUnidata()
                        ParseUnidata (unidata);
+                       ModifyUnidata ();
                        ParseDerivedCoreProperties (derivedCoreProps);
                        ParseScripts (scripts);
                        ParseCJK (chXML, jaXML, koXML);
@@ -765,7 +776,10 @@ sw.Close ();
                        this.decompValues = (int [])
                                decompValues.ToArray (typeof (int));
                }
-               
+
+               char previousLatinTarget = char.MinValue;
+               byte [] diacriticalOffset = new byte ['Z' - 'A' + 1];
+
                void ProcessUnidataLine (string s, ArrayList decompValues)
                {
                        int idx = s.IndexOf ('#');
@@ -785,18 +799,32 @@ sw.Close ();
 
                        string name = values [0];
 
+                       // SPECIAL CASE: rename some characters for diacritical
+                       // remapping. FIXME: why are they different?
+                       // FIXME: it's still not working.
+                       if (cp == 0x018B || cp == 0x018C)
+                               name = name.Replace ("TOPBAR", "STROKE");
+
                        // isSmallCapital
                        if (s.IndexOf ("SMALL CAPITAL") > 0)
                                isSmallCapital [cp] = true;
 
                        // latin mapping by character name
-                       if (s.IndexOf ("LATIN") > 0) {
+                       if (s.IndexOf ("LATIN") >= 0) {
                                int lidx = s.IndexOf ("LETTER DOTLESS ");
                                int offset = lidx + 15;
                                if (lidx < 0) {
                                        lidx = s.IndexOf ("LETTER TURNED ");
                                        offset = lidx + 14;
                                }
+                               if (lidx < 0) {
+                                       lidx = s.IndexOf ("LETTER CAPITAL ");
+                                       offset = lidx + 15;
+                               }
+                               if (lidx < 0) {
+                                       lidx = s.IndexOf ("LETTER SCRIPT ");
+                                       offset = lidx + 14;
+                               }
                                if (lidx < 0) {
                                        lidx = s.IndexOf ("LETTER ");
                                        offset = lidx + 7;
@@ -807,18 +835,47 @@ sw.Close ();
                                if ('A' <= c && c <= 'Z' &&
                                        (n == ' ') || n == ';') {
                                        target = c;
-                               // FIXME: they are still not working fine.
-                               if (s.Substring (offset).StartsWith ("OI;")) // 01A2,01A3
-                                       target = 'O';
+                                       // FIXME: After 'Z', I cannot reset this state.
+                                       previousLatinTarget = c == 'Z' ? char.MinValue : c;
+                               }
+
                                if (s.Substring (offset).StartsWith ("ALPHA"))
                                        target = 'A';
-                               if (target != char.MinValue);
-                                       ArrayList entry = (ArrayList) latinMap [c];
+                               else if (s.Substring (offset).StartsWith ("TONE SIX"))
+                                       target = 'B';
+                               else if (s.Substring (offset).StartsWith ("OPEN O"))
+                                       target = 'C';
+                               else if (s.Substring (offset).StartsWith ("SCHWA"))
+                                       target = 'E';
+                               else if (s.Substring (offset).StartsWith ("ENG"))
+                                       target = 'N';
+                               else if (s.Substring (offset).StartsWith ("OI;")) // 01A2,01A3
+                                       target = 'O';
+                               else if (s.Substring (offset).StartsWith ("YR;")) // 01A6
+                                       target = 'R';
+                               else if (s.Substring (offset).StartsWith ("TONE TWO"))
+                                       target = 'S';
+                               else if (s.Substring (offset).StartsWith ("ESH"))
+                                       target = 'S';
+
+                               if (target == char.MinValue)
+                                       target = previousLatinTarget;
+
+                               if (target != char.MinValue) {
+                                       ArrayList entry = (ArrayList) latinMap [target];
                                        if (entry == null) {
                                                entry = new ArrayList ();
-                                               latinMap [c] = entry;
+                                               latinMap [target] = entry;
                                        }
                                        entry.Add (cp);
+                                       // FIXME: This secondary weight is a hack.
+                                       // It is assigned so that these characters
+                                       // do not compare as identical to the
+                                       // corresponding ASCII Latin letters.
+                                       if (c != target && diacritical [cp] == 0) {
+                                               diacriticalOffset [c - 'A']++;
+                                               diacritical [cp] = (byte) (diacriticalOffset [c - 'A'] + 0x7C);
+                                       }
                                }
                        }
 
@@ -863,7 +920,7 @@ sw.Close ();
                        }
 
                        // Box names
-                       if (0x2500 <= cp && cp < 0x25B0) {
+                       if (0x2500 <= cp && cp < 0x2600) {
                                int value = 0;
                                // flags:
                                // up:1 down:2 right:4 left:8 vert:16 horiz:32
@@ -888,42 +945,93 @@ sw.Close ();
                                        10, 10, 11, 11,
                                        12, 12, 13, 13,
                                        14, 14, 14, 14};
-                               if (s.IndexOf ("BOX DRAWINGS ") > 0) {
+                               if (s.IndexOf ("BOX DRAWINGS ") >= 0) {
                                        int flag = 0;
-                                       if (s.IndexOf (" UP") > 0)
+                                       if (s.IndexOf (" UP") >= 0)
                                                flag |= 1;
-                                       if (s.IndexOf (" DOWN") > 0)
+                                       if (s.IndexOf (" DOWN") >= 0)
                                                flag |= 2;
-                                       if (s.IndexOf (" RIGHT") > 0)
+                                       if (s.IndexOf (" RIGHT") >= 0)
                                                flag |= 4;
-                                       if (s.IndexOf (" LEFT") > 0)
+                                       if (s.IndexOf (" LEFT") >= 0)
                                                flag |= 8;
-                                       if (s.IndexOf (" VERTICAL") > 0)
+                                       if (s.IndexOf (" VERTICAL") >= 0)
                                                flag |= 16;
-                                       if (s.IndexOf (" HORIZONTAL") > 0)
+                                       if (s.IndexOf (" HORIZONTAL") >= 0)
                                                flag |= 32;
 
                                        int fidx = flags.IndexOf (flag);
                                        value = fidx < 0 ? fidx : offsets [fidx];
-                               } else if (s.IndexOf ("BLOCK") > 0) {
-                                       if (s.IndexOf ("ONE EIGHTH") > 0)
+                               } else if (s.IndexOf ("BLOCK") >= 0) {
+                                       if (s.IndexOf ("ONE EIGHTH") >= 0)
                                                value = 0x12;
-                                       else if (s.IndexOf ("ONE QUARTER") > 0)
+                                       else if (s.IndexOf ("ONE QUARTER") >= 0)
                                                value = 0x13;
-                                       else if (s.IndexOf ("THREE EIGHTHS") > 0)
+                                       else if (s.IndexOf ("THREE EIGHTHS") >= 0)
                                                value = 0x14;
-                                       else if (s.IndexOf ("HALF") > 0)
+                                       else if (s.IndexOf ("HALF") >= 0)
                                                value = 0x15;
-                                       else if (s.IndexOf ("FIVE EIGHTHS") > 0)
+                                       else if (s.IndexOf ("FIVE EIGHTHS") >= 0)
                                                value = 0x16;
-                                       else if (s.IndexOf ("THREE QUARTERS") > 0)
+                                       else if (s.IndexOf ("THREE QUARTERS") >= 0)
                                                value = 0x17;
-                                       else if (s.IndexOf ("SEVEN EIGHTHS") > 0)
+                                       else if (s.IndexOf ("SEVEN EIGHTHS") >= 0)
                                                value = 0x18;
                                        else
                                                value = 0x19;
                                }
-                               if (value >= 0)
+                               else if (s.IndexOf ("SHADE") >= 0)
+                                       value = 0x19;
+                               else if (s.IndexOf ("SQUARE") >= 0)
+                                       value = 0xBC - 0xE5;
+                               else if (s.IndexOf ("VERTICAL RECTANGLE") >= 0)
+                                       value = 0xBE - 0xE5;
+                               else if (s.IndexOf ("RECTANGLE") >= 0)
+                                       value = 0xBD - 0xE5;
+                               else if (s.IndexOf ("PARALLELOGRAM") >= 0)
+                                       value = 0xBF - 0xE5;
+                               else if (s.IndexOf ("TRIANGLE") >= 0) {
+                                       if (s.IndexOf ("UP-POINTING") >= 0)
+                                               value = 0xC0 - 0xE5;
+                                       else if (s.IndexOf ("RIGHT-POINTING") >= 0)
+                                               value = 0xC1 - 0xE5;
+                                       else if (s.IndexOf ("DOWN-POINTING") >= 0)
+                                               value = 0xC2 - 0xE5;
+                                       else if (s.IndexOf ("LEFT-POINTING") >= 0)
+                                               value = 0xC3 - 0xE5;
+                               }
+                               else if (s.IndexOf ("POINTER") >= 0) {
+                                       if (s.IndexOf ("RIGHT-POINTING") >= 0)
+                                               value = 0xC4 - 0xE5;
+                                       else if (s.IndexOf ("LEFT-POINTING") >= 0)
+                                               value = 0xC5 - 0xE5;
+                               }
+                               else if (s.IndexOf ("DIAMOND") >= 0)
+                                       value = 0xC6 - 0xE5;
+                               else if (s.IndexOf ("FISHEYE") >= 0)
+                                       value = 0xC7 - 0xE5;
+                               else if (s.IndexOf ("LOZENGE") >= 0)
+                                       value = 0xC8 - 0xE5;
+                               else if (s.IndexOf ("BULLSEYE") >= 0)
+                                       value = 0xC9 - 0xE5;
+                               else if (s.IndexOf ("CIRCLE") >= 0) {
+                                       if (cp == 0x25D6) // it could be IndexOf ("LEFT HALF BLACK CIRCLE")
+                                               value = 0xCA - 0xE5;
+                                       else if (cp == 0x25D7) // it could be IndexOf ("RIGHT HALF BLACK CIRCLE")
+                                               value = 0xCB - 0xE5;
+                                       else
+                                               value = 0xC9 - 0xE5;
+                               }
+                               if (0x25DA <= cp && cp <= 0x25E5)
+                                       value = 0xCD + cp - 0x25DA - 0xE5;
+
+                               // SPECIAL CASE: BOX DRAWING DIAGONAL patterns
+                               switch (cp) {
+                               case 0x2571: value = 0xF; break;
+                               case 0x2572: value = 0x10; break;
+                               case 0x2573: value = 0x11; break;
+                               }
+                               if (value != 0)
                                        boxValues.Add (new DictionaryEntry (
                                                cp, value));
                        }
@@ -933,17 +1041,27 @@ sw.Close ();
                        if (0x2100 <= cp && cp <= 0x213F &&
                                Char.IsSymbol ((char) cp))
                                sortableCharNames.Add (
-                                       new DictionaryEntry (cp, values [0]));
+                                       new DictionaryEntry (cp, name));
                        else if (0x3380 <= cp && cp <= 0x33DD)
                                sortableCharNames.Add (new DictionaryEntry (
-                                       cp, values [0].Substring (7)));
+                                       cp, name.Substring (7)));
+
+                       if (Char.GetUnicodeCategory ((char) cp) ==
+                               UnicodeCategory.MathSymbol) {
+                               if (name.StartsWith ("CIRCLED "))
+                                       diacritical [cp] = 0xEE;
+                               if (name.StartsWith ("SQUARED "))
+                                       diacritical [cp] = 0xEF;
+                       }
 
                        // diacritical weights by character name
 if (diacritics.Length != diacriticWeights.Length)
 throw new Exception (String.Format ("Should not happen. weights are {0} while labels are {1}", diacriticWeights.Length, diacritics.Length));
                        for (int d = 0; d < diacritics.Length; d++) {
                                if (s.IndexOf (diacritics [d]) > 0) {
-                                       diacritical [cp] |= diacriticWeights [d];
+                                       diacritical [cp] += diacriticWeights [d];
+                                       if (s.IndexOf ("COMBINING") >= 0)
+                                               diacritical [cp] -= (byte) 2;
                                        continue;
                                }
                                // also process "COMBINING blah" here
@@ -954,35 +1072,16 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la
                                if (tmp.IndexOf ("WITH ") == 0)
                                        tmp = tmp.Substring (4);
                                tmp = String.Concat ("COMBINING", (tmp [0] != ' ' ? " " : ""), tmp);
-                               if (values [0] == tmp)
+                               if (name == tmp)
                                        diacritical [cp] = (byte) (diacriticWeights [d] - 2);
-if (values [0] == tmp) Console.Error.WriteLine ("======= {2:X04} : '{0}' / '{1}'", values [0], tmp, cp);
+//if (name == tmp)
+//Console.Error.WriteLine ("======= {2:X04} : '{0}' / '{1}'", name, tmp, cp);
                        }
                        // Two-step grep required for it.
                        if (s.IndexOf ("FULL STOP") > 0 &&
                                (s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0))
                                diacritical [cp] |= 0xF4;
 
-                       // Cyrillic letter name
-                       if (0x0430 <= cp && cp <= 0x0486 &&
-                               Char.IsLetter ((char) cp)) {
-                               byte value = (byte) (cyrillicNameMap.Count * 3 + 0x06);
-                               // Get primary letter name i.e.
-                               // XXX part of CYRILLIC LETTER XXX yyy
-                               // e.g. "IZHITSA" for "IZHITSA DOUBLE GRAVE".
-                               string letterName =
-                                       values [0].Substring (values [0].IndexOf ("LETTER ") + 7);
-                               int tmpIdx = letterName.IndexOf (' ');
-                               letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
-//Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
-                               if (cyrillicNameMap.ContainsKey (letterName))
-                                       value = (byte) cyrillicLetterPrimaryValues [cyrillicNameMap [letterName]];
-                               else
-                                       cyrillicNameMap [letterName] = cp;
-
-                               cyrillicLetterPrimaryValues [cp] = value;
-                       }
-
                        // Arabic letter name
                        if (0x0621 <= cp && cp <= 0x064A &&
                                Char.GetUnicodeCategory ((char) cp)
@@ -1007,8 +1106,8 @@ if (values [0] == tmp) Console.Error.WriteLine ("======= {2:X04} : '{0}' / '{1}'
                                                (cp == 0x0640) ?
                                                // 0x0640 is special: it does
                                                // not start with ARABIC LETTER
-                                               values [0] :
-                                               values [0].Substring (14);
+                                               name :
+                                               name.Substring (14);
                                        int tmpIdx = letterName.IndexOf (' ');
                                        letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
 //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
@@ -1024,7 +1123,7 @@ if (values [0] == tmp) Console.Error.WriteLine ("======= {2:X04} : '{0}' / '{1}'
                        // Japanese square letter
                        if (0x3300 <= cp && cp <= 0x3357)
                                if (!ExistsJIS (cp))
-                                       nonJisJapanese.Add (new NonJISCharacter (cp, values [0]));
+                                       nonJisJapanese.Add (new NonJISCharacter (cp, name));
 
                        // normalizationType
                        string decomp = values [4];
@@ -1198,7 +1297,6 @@ if (values [0] == tmp) Console.Error.WriteLine ("======= {2:X04} : '{0}' / '{1}'
 
                void ParseScripts (string filename)
                {
-                       ArrayList cyrillic = new ArrayList ();
                        ArrayList gurmukhi = new ArrayList ();
                        ArrayList gujarati = new ArrayList ();
                        ArrayList georgian = new ArrayList ();
@@ -1228,11 +1326,6 @@ if (values [0] == tmp) Console.Error.WriteLine ("======= {2:X04} : '{0}' / '{1}'
                                                continue;
 
                                        switch (value) {
-                                       case "Cyrillic":
-                                               for (int x = cp; x <= cpEnd; x++)
-                                                       if (!IsIgnorable (x))
-                                                               cyrillic.Add ((char) x);
-                                               break;
                                        case "Gurmukhi":
                                                for (int x = cp; x <= cpEnd; x++)
                                                        if (!IsIgnorable (x))
@@ -1256,12 +1349,10 @@ if (values [0] == tmp) Console.Error.WriteLine ("======= {2:X04} : '{0}' / '{1}'
                                        }
                                }
                        }
-                       cyrillic.Sort (UCAComparer.Instance);
                        gurmukhi.Sort (UCAComparer.Instance);
                        gujarati.Sort (UCAComparer.Instance);
                        georgian.Sort (UCAComparer.Instance);
                        thaana.Sort (UCAComparer.Instance);
-                       orderedCyrillic = (char []) cyrillic.ToArray (typeof (char));
                        orderedGurmukhi = (char []) gurmukhi.ToArray (typeof (char));
                        orderedGujarati = (char []) gujarati.ToArray (typeof (char));
                        orderedGeorgian = (char []) georgian.ToArray (typeof (char));
@@ -1348,16 +1439,54 @@ if (values [0] == tmp) Console.Error.WriteLine ("======= {2:X04} : '{0}' / '{1}'
                        category = "ja";
                        arr = cjkJA;
                        offset = 0;//char.MaxValue - arr.Length;
-                       doc.Load (jaXML);
-                       s = doc.SelectSingleNode ("/ldml/collations/collation/rules/pc").InnerText;
+
+                       // SPECIAL CASES
+                       arr [0x4EDD] = 0x8002; // Chinese repetition mark?
+                       arr [0x337B] = 0x8004; // These 4 characters are Gengou (Japanese era names)
+                       arr [0x337E] = 0x8005;
+                       arr [0x337D] = 0x8006;
+                       arr [0x337C] = 0x8007;
+
                        v = 0x8008;
-                       foreach (char c in s) {
+                       foreach (JISCharacter jc in jisJapanese) {
+                               if (jc.JIS < 0x8800)
+                                       continue;
+                               char c = (char) jc.CP;
+
                                if (c < '\u4E00')
                                        Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
                                else {
                                        arr [(int) c - offset] = (ushort) v++;
                                        if (v % 256 == 0)
                                                v += 2;
+
+                                       // SPECIAL CASES:
+                                       if (c == '\u662D') // U+337C
+                                               continue;
+                                       if (c == '\u5927') // U+337D
+                                               continue;
+                                       if (c == '\u5E73') // U+337B
+                                               continue;
+                                       if (c == '\u660E') // U+337E
+                                               continue;
+                                       if (c == '\u9686') // U+F9DC
+                                               continue;
+
+                                       // FIXME: there are still remaining
+                                       // characters after U+FA0C.
+//                                     for (int k = 0; k < char.MaxValue; k++) {
+                                       for (int k = 0; k < '\uFA0D'; k++) {
+                                               if (decompIndex [k] == 0 || IsIgnorable (k))
+                                                       continue;
+                                               if (decompValues [decompIndex [k]] == c /*&&
+                                                       decompLength [k] == 1*/ ||
+                                                       decompLength [k] == 3 &&
+                                                       decompValues [decompIndex [k] + 1] == c) {
+                                                       arr [k - offset] = (ushort) v++;
+                                                       if (v % 256 == 0)
+                                                               v += 2;
+                                               }
+                                       }
                                }
                        }
 
@@ -1413,16 +1542,8 @@ if (values [0] == tmp) Console.Error.WriteLine ("======= {2:X04} : '{0}' / '{1}'
                        }
                }
 
-               void ModifyParsedValues ()
+               void ModifyUnidata ()
                {
-                       // number, secondary weights
-                       byte weight = 0x38;
-                       int [] numarr = numberSecondaryWeightBounds;
-                       for (int i = 0; i < numarr.Length; i += 2, weight++)
-                               for (int cp = numarr [i]; cp < numarr [i + 1]; cp++)
-                                       if (Char.IsNumber ((char) cp))
-                                               diacritical [cp] = weight;
-
                        // Modify some decomposition equivalence
                        decompType [0xFE31] = 0;
                        decompIndex [0xFE31] = 0;
@@ -1437,6 +1558,36 @@ if (values [0] == tmp) Console.Error.WriteLine ("======= {2:X04} : '{0}' / '{1}'
                        for (int i = 0x3260; i <= 0x327B; i++)
                                diacritical [i] = 0xC;
 
+                       // LAMESPEC: these remappings should not be done.
+                       // Windows has incorrect CJK compat mappings.
+                       decompValues [decompIndex [0x32A9]] = 0x91AB;
+                       decompLength [0x323B] = 1;
+                       decompValues [decompIndex [0x323B]] = 0x5B78;
+                       decompValues [decompIndex [0x32AB]] = 0x5B78;
+                       decompValues [decompIndex [0x32A2]] = 0x5BEB;
+                       decompLength [0x3238] = 1;
+                       decompValues [decompIndex [0x3238]] = 0x52DE;
+                       decompValues [decompIndex [0x3298]] = 0x52DE;
+
+                       // LAMESPEC: custom remappings (these are not bugs as such, but they are still undesirable, non-standard-compliant behavior)
+                       decompIndex [0xFA0C] = decompIndex [0xF929]; // borrow U+F929 room (being empty)
+                       decompValues [decompIndex [0xFA0C]] = 0x5140;
+                       decompLength [0xFA0C] = 1;
+                       decompIndex [0xF929] = decompLength [0xF929] = 0;
+
+                       decompValues [decompIndex [0xF92C]] = 0x90DE;
+               }
+
+               void ModifyParsedValues ()
+               {
+                       // number, secondary weights
+                       byte weight = 0x38;
+                       int [] numarr = numberSecondaryWeightBounds;
+                       for (int i = 0; i < numarr.Length; i += 2, weight++)
+                               for (int cp = numarr [i]; cp < numarr [i + 1]; cp++)
+                                       if (Char.IsNumber ((char) cp))
+                                               diacritical [cp] = weight;
+
                        // Update name part of named characters
                        for (int i = 0; i < sortableCharNames.Count; i++) {
                                DictionaryEntry de =
@@ -1574,12 +1725,39 @@ if (values [0] == tmp) Console.Error.WriteLine ("======= {2:X04} : '{0}' / '{1}'
                                        map [i] = new CharMapEntry (
                                                0x1, 0x1, diacritical [i]);
 
+                       fillIndex [0x1] = 0x94;
+                       // syriac dotted nonspacing marks
+                       AddCharMap ('\u0732', 0x1, 1);
+                       AddCharMap ('\u0735', 0x1, 1);
+                       AddCharMap ('\u0738', 0x1, 1);
+                       AddCharMap ('\u0739', 0x1, 1);
+                       AddCharMap ('\u073C', 0x1, 1);
+                       fillIndex [0x1] = 0x9F;
+                       for (int i = 0x0730; i <= 0x07B0; i++)
+                               if (!IsIgnorable (i) && !map [i].Defined)
+                                       AddCharMap ((char) i, 0x1, 1);
+
+                       fillIndex [0x1] = 0x0C;
+                       for (int i = 0x0EC8; i <= 0x0ECD; i++)
+                               if (!IsIgnorable (i))
+                                       AddCharMap ((char) i, 0x1, 1);
+
                        // LAMESPEC: It should not stop at '\u20E1'. There are
                        // a few more characters (that however results in 
                        // overflow of level 2 unless we start before 0xDD).
                        fillIndex [0x1] = 0xDC;
                        for (int i = 0x20d0; i <= 0x20e1; i++)
                                AddCharMap ((char) i, 0x1, 1);
+
+                       // They are not part of Nonspacing marks, but have
+                       // only diacritical weight.
+                       for (int i = 0x3099; i <= 0x309C; i++)
+                               map [i] = new CharMapEntry (1, 1, 1);
+                       map [0x309D] = new CharMapEntry (0xFF, 0xFF, 1);
+                       map [0x309E] = new CharMapEntry (0xFF, 0xFF, 1);
+                       for (int i = 0x30FC; i <= 0x30FE; i++)
+                               map [i] = new CharMapEntry (0xFF, 0xFF, 1);
+
                        #endregion
 
 
@@ -1627,12 +1805,17 @@ if (values [0] == tmp) Console.Error.WriteLine ("======= {2:X04} : '{0}' / '{1}'
                                boxLv2 [i] = 3;
                        foreach (DictionaryEntry de in boxValues) {
                                int cp = (int) de.Key;
-                               int idx = (int) de.Value;
+                               int off = (int) de.Value;
                                if (map [cp].Defined)
                                        continue;
-                               fillIndex [0x9] = (byte) (0xE5 + idx);
-                               AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [idx]);
-                               boxLv2 [idx]++;
+                               if (off < 0) {
+                                       fillIndex [0x9] = (byte) (0xE5 + off);
+                                       AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [-off]++);
+                               }
+                               else {
+                                       fillIndex [0x9] = (byte) (0xE5 + off);
+                                       AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [off]++);
+                               }
                        }
                        // Some special characters (slanted)
                        fillIndex [0x9] = 0xF4;
@@ -1648,7 +1831,8 @@ if (values [0] == tmp) Console.Error.WriteLine ("======= {2:X04} : '{0}' / '{1}'
                                uc = Char.GetUnicodeCategory ((char) cp);
                                if (!IsIgnorable (cp) &&
                                        uc == UnicodeCategory.CurrencySymbol &&
-                                       cp != '$')
+                                       cp != '$' ||
+                                       cp == 0xAC)
                                        AddCharMapGroup ((char) cp, 0xA, 1, 0);
                        }
                        // byte other symbols
@@ -1657,10 +1841,23 @@ if (values [0] == tmp) Console.Error.WriteLine ("======= {2:X04} : '{0}' / '{1}'
                                        continue; // SPECIAL: skip FIXME: why?
                                uc = Char.GetUnicodeCategory ((char) cp);
                                if (!IsIgnorable (cp) &&
-                                       uc == UnicodeCategory.OtherSymbol)
+                                       uc == UnicodeCategory.OtherSymbol ||
+                                       cp == '\u00B5' || cp == '\u00B7')
                                        AddCharMapGroup ((char) cp, 0xA, 1, 0);
                        }
 
+                       fillIndex [0xA] = 0x0F; // FIXME: it won't be needed
+                       for (int cp = 0x2020; cp <= 0x2031; cp++)
+                               if (Char.IsPunctuation ((char) cp))
+                                       AddCharMap ((char) cp, 0xA, 1, 0);
+                       // SPECIAL CASES: why?
+                       AddCharMap ('\u203B', 0xA, 1, 0);
+                       AddCharMap ('\u2040', 0xA, 1, 0);
+                       AddCharMap ('\u2041', 0xA, 1, 0);
+                       AddCharMap ('\u2042', 0xA, 1, 0);
+
+                       for (int cp = 0x20A0; cp <= 0x20AB; cp++)
+                               AddCharMap ((char) cp, 0xA, 1, 0);
                        fillIndex [0xA] = 0x2F; // FIXME: it won't be needed
                        for (int cp = 0x2600; cp <= 0x2613; cp++)
                                AddCharMap ((char) cp, 0xA, 1, 0);
@@ -1853,45 +2050,83 @@ if (values [0] == tmp) Console.Error.WriteLine ("======= {2:X04} : '{0}' / '{1}'
                                if (Char.IsLetter ((char) i))
                                        AddLetterMap ((char) i, 0xF, 1);
 
-                       // Cyrillic - character name order
-                       fillIndex [0x10] = 0x6;
-//*
-for (int i = 0; i < orderedCyrillic.Length; i++)
-Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]);
+                       // Cyrillic.
+                       // Cyrillic letters are sorted like Latin letters, i.e.
+                       // with culture-specific letters interleaved within the
+                       // standard Cyrillic sequence.
+                       //
+                       // We can't use UCA here; it has different sorting.
+                       char [] orderedCyrillic = new char [] {
+                               '\u0430', '\u0431', '\u0432', '\u0433', '\u0434',
+                               '\u0452', // DJE for Serbocroatian
+                               '\u0435',
+                               '\u0454', // IE for Ukrainian
+                               '\u0436', '\u0437',
+                               '\u0455', // DZE
+                               '\u0438',
+                               '\u0456', // Byelorussian-Ukrainian I
+                               '\u0457', // YI
+                               '\u0439',
+                               '\u0458', // JE
+                               '\u043A', '\u043B',
+                               '\u0459', // LJE
+                               '\u043C', '\u043D',
+                               '\u045A', // NJE
+                               '\u043E',
+                               // 4E9 goes here.
+                               '\u043F', '\u0440', '\u0441', '\u0442',
+                               '\u045B', // TSHE for Serbocroatian
+                               '\u0443',
+                               '\u045E', // Short U for Byelorussian
+                               '\u04B1', // Straight U w/ stroke (diacritical!)
+                               '\u0444', '\u0445', '\u0446', '\u0447',
+                               '\u045F', // DZHE
+                               '\u0448', '\u0449', '\u044A', '\u044B', '\u044C',
+                               '\u044D', '\u044E', '\u044F'};
+
+                       // Some characters are mapped to basic Cyrillic
+                       // letters. See the UnicodeData.txt character names
+                       // for the sources. Here I simply declare an array of
+                       // equivalents. Its entries map from U+490(,491)
+                       // onward, skipping small letters.
+                       char [] cymap_src = new char [] {
+                               '\u0433', '\u0433', '\u0433', '\u0436',
+                               '\u0437', '\u043A', '\u043A', '\u043A',
+                               '\u043A', '\u043D', '\u043D', '\u043F',
+                               '\u0445', '\u0441', '\u0442', '\u0443',
+                               '\u0443', '\u0445', '\u0446', '\u0447',
+                               '\u0447', '\u0432', '\u0435', '\u0435',
+                               '\u0406', '\u0436', '\u043A', '\u043D',
+                               '\u0447', '\u0435'};
+
+                       fillIndex [0x10] = 0x8D;
+                       for (int i = 0x0460; i < 0x0481; i++) {
+                               if (Char.IsLetter ((char) i)) {
+                                       if (i == 0x0476)
+                                               // U+476/477 have the same
+                                               // primary weight as U+474/475.
+                                               fillIndex [0x10] -= 3;
+                                       AddLetterMap ((char) i, 0x10, 3);
+                               }
+                       }
 
-                       // table which is moslty from UCA DUCET.
+                       fillIndex [0x10] = 0x6;
                        for (int i = 0; i < orderedCyrillic.Length; i++) {
                                char c = Char.ToUpper (orderedCyrillic [i], CultureInfo.InvariantCulture);
                                if (!IsIgnorable ((int) c) &&
-                                       c <= '\u045C' &&
-                                       Char.IsLetter (c)) {
+                                       Char.IsLetter (c) &&
+                                       !map [c].Defined) {
                                        AddLetterMap (c, 0x10, 0);
                                        fillIndex [0x10] += 3;
                                }
                        }
-                       /*
-                       for (int i = 0x0460; i < 0x0481; i++) {
-                               if (Char.IsLetter ((char) i)) {
-                                       AddLetterMap ((char) i, 0x10, 0);
-                                       fillIndex [0x10] += 3;
-                               }
-                       }
-                       */
-/*
-                       for (int i = 0x0400; i <= 0x0486; i++) {
-                               if (!Char.IsLetter ((char) i)) {
-//                                     AddCharMap ((char) i, 0x1, 1);
-                                       continue;
-                               }
-                               if (!cyrillicLetterPrimaryValues.ContainsKey (i)) {
-                                       Console.Error.WriteLine ("no value for {0:x04}", i);
-                                       continue;
-                               }
-                               fillIndex [0x10] = 
-                                       (byte) cyrillicLetterPrimaryValues [i];
-                               AddLetterMap ((char) i, 0x10, 0);
+
+                       for (int i = 0; i < cymap_src.Length; i++) {
+                               char c = cymap_src [i];
+                               fillIndex [0x10] = map [c].Level1;
+                               AddLetterMap ((char) (0x0490 + i * 2),
+                                       0x10, 0);
                        }
-*/
 
                        // Armenian
                        fillIndex [0x11] = 0x3;
@@ -1901,15 +2136,18 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]);
 
                        // Hebrew
                        // -Letters
-                       fillIndex [0x12] = 0x3;
+                       fillIndex [0x12] = 0x2;
                        for (int i = 0x05D0; i < 0x05FF; i++)
                                if (Char.IsLetter ((char) i))
                                        AddLetterMap ((char) i, 0x12, 1);
                        // -Accents
                        fillIndex [0x1] = 0x3;
-                       for (int i = 0x0591; i <= 0x05C2; i++)
+                       for (int i = 0x0591; i <= 0x05C2; i++) {
+                               if (i == 0x05A3 || i == 0x05BB)
+                                       fillIndex [0x1]++;
                                if (i != 0x05BE)
                                        AddCharMap ((char) i, 0x1, 1);
+                       }
 
                        // Arabic
                        fillIndex [0x1] = 0x8E;
@@ -2166,7 +2404,7 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]);
                                        continue;
                                AddCharMap (c, 0x21, 0);
                                if (c < '\u10F6')
-                                       AddCharMap ((char) (c - 0x30), 0x21, 0, 0x12);
+                                       AddCharMap ((char) (c - 0x30), 0x21, 0);
                                fillIndex [0x21] += 5;
                        }
 
@@ -2193,6 +2431,16 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]);
                                                AddKanaMap (cp, kanaLines [gyo]);
                                        fillIndex [0x22]++;
 
+                                       if (cp == 0x30AB) {
+                                               // add small 'ka' (before normal one)
+                                               AddKanaMap (0x30F5, 1);
+                                               kanaOffset++;
+                                       }
+                                       if (cp == 0x30B1) {
+                                               // add small 'ke' (before normal one)
+                                               AddKanaMap (0x30F6, 1);
+                                               kanaOffset++;
+                                       }
                                        if (cp == 0x3061) {
                                                // add small 'Tsu' (before normal one)
                                                AddKanaMap (0x3063, 1);
@@ -2308,7 +2556,7 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]);
                        + "<{\u1113 \u1116}, \u3165,"
                                + "\u11C5, \u11C6=\u3166,, \u11C7, \u11C8,"
                                + "\u11AC, \u11C9, \u11AD, \u1103=\u11AE  >"
-                       + "<\u1117, \u11CA, \u1104, \u11CB > \u1105 >"
+                       + "<\u1117, \u11CA, \u1104, \u11CB > \u1105=\u11AF >"
                        + "<{\u1118 \u111B}, \u11B0, [\u11CC \u11D0], \u11B1,"
                                + "[\u11D1 \u11D2], \u11B2,"
                                + "[\u11D3 \u11D5], \u11B3,"
@@ -2478,10 +2726,8 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]);
                                // Insert 3001 after ',' and 3002 after '.'
                                if (i == 0x2C)
                                        AddCharMapGroup2 ('\u3001', 0x7, 1, 0);
-                               else if (i == 0x2E) {
-                                       fillIndex [0x7]--;
+                               else if (i == 0x2E)
                                        AddCharMapGroup2 ('\u3002', 0x7, 1, 0);
-                               }
                                else if (i == 0x3A)
                                        AddCharMap ('\uFE30', 0x7, 1, 0);
                        }
@@ -2492,10 +2738,12 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]);
                                if (IsIgnorable (i))
                                        continue;
 
-                               // FIXME: actually this reset should not be done
-                               // but here I put for easy goal.
+                               // FIXME: actually these resets should not be
+                               // done; they are here only as a shortcut.
                                if (i == 0x0700)
                                        fillIndex [0x7] = 0xE2;
+                               if (i == 0x2016)
+                                       fillIndex [0x7] = 0x77;
 
                                // SPECIAL CASES:
                                switch (i) {
@@ -2515,7 +2763,7 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]);
                                case UnicodeCategory.FinalQuotePunctuation:
                                case UnicodeCategory.ModifierSymbol:
                                        // SPECIAL CASES: // 0xA
-                                       if (0x2020 <= i && i <= 0x2042)
+                                       if (0x2020 <= i && i <= 0x2031)
                                                continue;
                                        AddCharMapGroup ((char) i, 0x7, 1, 0);
                                        break;
@@ -2526,6 +2774,9 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]);
                                }
                        }
                        // Control pictures
+                       // FIXME: resetting level 1 should not be needed;
+                       // it is done here only as a shortcut.
+                       fillIndex [0x7] = 0xB6;
                        for (int i = 0x2400; i <= 0x2421; i++)
                                AddCharMap ((char) i, 0x7, 1, 0);
                        #endregion
@@ -2589,15 +2840,21 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]);
                        AddCharMapGroup ('\u22B0', 0x8, 1, 0); // precedes under relation
 
                        for (int cp = 0; cp < 0x2300; cp++) {
-                               if (cp == 0x200)
-                                       cp = 0x2200; // skip to 2200
                                if (cp == 0xAC) // SPECIAL CASE: skip
                                        continue;
+                               if (cp == 0x200) {
+                                       cp = 0x2200; // skip to 2200
+                                       fillIndex [0x8] = 0x21;
+                               }
+                               if (cp == 0x2295)
+                                       fillIndex [0x8] = 0x3;
+                               if (cp == 0x22B2)
+                                       fillIndex [0x8] = 0xB9;
                                if (!map [cp].Defined &&
 //                                     Char.GetUnicodeCategory ((char) cp) ==
 //                                     UnicodeCategory.MathSymbol)
                                        Char.IsSymbol ((char) cp))
-                                       AddCharMapGroup ((char) cp, 0x8, 1, 0);
+                                       AddCharMapGroup ((char) cp, 0x8, 1, diacritical [cp]);
                                // SPECIAL CASES: no idea why Windows sorts as such
                                switch (cp) {
                                case 0x3E:
@@ -2637,7 +2894,7 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]);
                                        mod = diacritical [i];
                                        break;
                                case 0x13: // Arabic
-                                       if (diacritical [i] == 0)
+                                       if (diacritical [i] == 0 && i >= 0xFE8D)
                                                mod = 0x8; // default for arabic
                                        break;
                                }
@@ -2991,6 +3248,9 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]);
                        // CJK compat
                        if ('\u3192' <= c && c <= '\u319F')
                                return 0;
+                       // Japanese reading marks
+                       if (c == '\u3001' || c == '\u3002')
+                               return 2;
                        // Korean
                        if ('\u11A8' <= c && c <= '\u11F9')
                                return 2;
@@ -3000,6 +3260,9 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]);
                                return 5;
                        if ('\u3165' <= c && c <= '\u318E')
                                return 4;
+                       // Georgian Capital letters
+                       if ('\u10A0' <= c && c <= '\u10C5')
+                               return 0x10;
                        // numbers
                        if ('\u2776' <= c && c <= '\u277F')
                                return 4;
@@ -3008,13 +3271,13 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]);
                        if ('\u2776' <= c && c <= '\u2793')
                                return 0xC;
                        if ('\u2160' <= c && c <= '\u216F')
-                               return 0x18;
+                               return 0x10;
                        if ('\u2181' <= c && c <= '\u2182')
                                return 0x18;
                        // Arabic
                        if ('\u2135' <= c && c <= '\u2138')
                                return 4;
-                       if ('\uFE80' <= c && c < '\uFE8E') {
+                       if ('\uFE80' <= c && c < '\uFF00') {
                                // 2(Isolated)/8(Final)/0x18(Medial)
                                switch (decompType [(int) c]) {
                                case DecompositionIsolated: