3 // There are two kinds of sort keys: those which are computed and those which are laid out
4 // as an indexed array. Computed sort keys are:
9 // Also, for composite characters it should prepare a different index table.
11 // Though it is possible to "compute" level 3 weights, they are still dumped
12 // to an array to avoid execution cost.
16 // * sortkey getter signature
18 // int GetSortKey (string s, int index, SortKeyBuffer buf)
19 // Stores sort key for corresponding character element into buf and
20 // returns the length of the consumed _source_ character element in s.
22 // * character length to consume
24 // If there are characters whose primary weight is 0, they are consumed
25 // and considered as a part of the character element.
30 using System.Collections;
31 using System.Globalization;
34 namespace Mono.Globalization.Unicode
36 internal class MSCompatSortKeyTableGenerator
// Program entry point: creates a generator instance and forwards the
// command-line arguments to Run.
38 public static void Main (string [] args)
40 new MSCompatSortKeyTableGenerator ().Run (args);
// Numeric codes for the kinds of decomposition. ProcessUnidataLine stores
// one of them per code point in decompType and also uses it as the key of
// the per-target table kept in nfkdMap. The widthCompat dump only acts on
// Narrow, Wide, Super and Sub.
// NOTE(review): the "fixed" markers are from the original author; presumably
// those values must not be renumbered — confirm before changing them.
43 const int DecompositionWide = 1; // fixed
44 const int DecompositionSub = 2; // fixed
45 const int DecompositionSmall = 3;
46 const int DecompositionIsolated = 4;
47 const int DecompositionInitial = 5;
48 const int DecompositionFinal = 6;
49 const int DecompositionMedial = 7;
50 const int DecompositionNoBreak = 8;
51 const int DecompositionVertical = 9;
52 const int DecompositionFraction = 0xA;
53 const int DecompositionFont = 0xB;
54 const int DecompositionSuper = 0xC; // fixed
// Full (0xE) and Narrow (0xD) are declared out of numeric order; the values
// themselves do not collide.
55 const int DecompositionFull = 0xE;
56 const int DecompositionNarrow = 0xD;
57 const int DecompositionCircle = 0xF;
58 const int DecompositionSquare = 0x10;
59 const int DecompositionCompat = 0x11;
// Destination of the generated table source; standard output by default.
61 TextWriter Result = Console.Out;
// One slot per category byte. The generation code assigns a start value to a
// slot before each run of AddCharMap-style calls; those helpers are not
// visible in this view — presumably they read and advance the slot.
63 byte [] fillIndex = new byte [256]; // by category
// One entry per UTF-16 code unit; Category, Level1 and Level2 of every entry
// are dumped by the serialization code.
64 CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1];
// Characters that the generation step maps to CharMapEntry (0, 0, 0).
66 char [] specialIgnore = new char [] {
67 '\u3099', '\u309A', '\u309B', '\u309C', '\u0BCD',
68 '\u0E47', '\u0E4C', '\uFF9E', '\uFF9F'
// alphabets and alphaWeights are parallel arrays (29 entries each); entry i
// of both is passed to AddAlphaMap together with category 0xE.
71 // FIXME: need more love (as always)
72 char [] alphabets = new char [] {'A', 'B', 'C', 'D', 'E', 'F',
73 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
74 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
75 '\u0292', '\u01BE', '\u0298'};
76 byte [] alphaWeights = new byte [] {
77 2, 9, 0xA, 0x1A, 0x21,
78 0x23, 0x25, 0x2C, 0x32, 0x35,
79 0x36, 0x48, 0x51, 0x70, 0x7C,
80 0x7E, 0x89, 0x8A, 0x91, 0x99,
81 0x9F, 0xA2, 0xA4, 0xA6, 0xA7,
82 0xA9, 0xAA, 0xB3, 0xB4};
// Per-code-point flags. isSmallCapital is set by ProcessUnidataLine when the
// line contains "SMALL CAPITAL"; isUppercase is set by
// ProcessDerivedCorePropLine.
84 bool [] isSmallCapital = new bool [char.MaxValue + 1];
85 bool [] isUppercase = new bool [char.MaxValue + 1];
// Decomposition data per code point: the Decomposition* code, the index of
// the first decomposed value in decompValues, and the number of values.
87 byte [] decompType = new byte [char.MaxValue + 1];
88 int [] decompIndex = new int [char.MaxValue + 1];
89 int [] decompLength = new int [char.MaxValue + 1];
// Numeric value per code point, parsed from UnicodeData fields by
// ProcessUnidataLine.
91 decimal [] decimalValue = new decimal [char.MaxValue + 1];
// Diacritic weight bits per code point; ProcessUnidataLine ORs entries of
// diacriticWeights into it and FillSecondaryValues overwrites it for numbers.
93 byte [] diacritical = new byte [char.MaxValue + 1];
// Character-name fragments searched for (as substrings) in each UnicodeData
// line. When fragment d is found, diacriticWeights [d] is OR-ed into
// diacritical [cp], so the two arrays must stay index-aligned.
// NOTE(review): some lines of both initializers are not visible in this
// view, so their lengths cannot be checked against each other here.
// NOTE(review): fragments without a trailing ';' also match in the middle of
// a longer name — confirm that is intended for each of them.
95 string [] diacritics = new string [] {
97 "WITH ACUTE;", "WITH GRAVE;", " DOT ABOVE;", " MIDDLE DOT;",
98 " CIRCUMFLEX;", " DIAERESIS;", " CARON;", "WITH BREVE;",
99 " DIALYTIKA AND TONOS;", "WITH MACRON;", " TILDE;", " RING ABOVE;",
100 " OGONEK;", " CEDILLA;",
101 " DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;",
102 " STROKE;", " CIRCUMFLEX AND ACUTE;",
103 " DIAERESIS AND ACUTE;", "WITH CIRCUMFLEX AND GRAVE;", " L SLASH;",
104 " DIAERESIS AND GRAVE;",
106 " CARON AND DOT ABOVE;", " BREVE AND GRAVE;",
107 " MACRON AND ACUTE;",
108 " MACRON AND GRAVE;",
109 " DIAERESIS AND CARON", " DOT ABOVE AND MACRON", " TILDE AND ACUTE",
110 " RING ABOVE AND ACUTE",
111 " DIAERESIS AND MACRON", " CEDILLA AND ACUTE", " MACRON AND DIAERESIS",
112 " CIRCUMFLEX AND TILDE",
113 " TILDE AND DIAERESIS",
116 " CEDILLA AND BREVE",
117 " OGONEK AND MACRON",
118 " HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;",
121 " PRECEDED BY APOSTROPHE",
123 " LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE",
126 " RETROFLEX;", "DIAERESIS BELOW",
128 " CIRCUMFLEX BELOW", "HORN AND ACUTE",
129 " BREVE BELOW;", " HORN AND GRAVE",
131 " DOT BELOW AND DOT ABOVE",
132 " RIGHT HALF RING", " HORN AND TILDE",
133 " CIRCUMFLEX AND DOT BELOW",
134 " BREVE AND DOT BELOW",
135 " DOT BELOW AND MACRON",
136 " HORN AND HOOK ABOVE",
138 // CIRCLED, PARENTHESIZED and so on
139 "CIRCLED DIGIT", "CIRCLED NUMBER", "CIRCLED LATIN", "CIRCLED KATAKANA",
140 "PARENTHESIZED DIGIT", "PARENTHESIZED NUMBER", "PARENTHESIZED LATIN",
// Weight for the fragment at the same position in diacritics.
142 byte [] diacriticWeights = new byte [] {
144 0xE, 0xF, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
145 0x17, 0x19, 0x1A, 0x1B, 0x1C,
146 0x1D, 0x1D, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F,
147 0x20, 0x21, 0x22, 0x22, 0x23, 0x24,
148 0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28,
149 0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30,
150 0x43, 0x43, 0x43, 0x44, 0x46, 0x48,
151 0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x5A,
152 0x60, 0x60, 0x61, 0x61, 0x63, 0x68,
153 0x69, 0x69, 0x6A, 0x6D, 0x6E,
155 // CIRCLED, PARENTHESIZED and so on.
156 0xEE, 0xEE, 0xEE, 0xEE, 0xF3, 0xF3, 0xF3, 0xF3
// Consecutive pairs of [start, end) code point bounds (14 pairs).
// FillSecondaryValues walks the pairs and gives every Char.IsNumber code
// point inside a pair the same diacritical weight, advancing the weight by
// one per pair.
159 int [] numberSecondaryWeightBounds = new int [] {
160 0x660, 0x680, 0x6F0, 0x700, 0x960, 0x970,
161 0x9E0, 0x9F0, 0x9F4, 0xA00, 0xA60, 0xA70,
162 0xAE0, 0xAF0, 0xB60, 0xB70, 0xBE0, 0xC00,
163 0xC60, 0xC70, 0xCE0, 0xCF0, 0xD60, 0xD70,
164 0xE50, 0xE60, 0xED0, 0xEE0
// Per-script character lists; ParseScripts fills each one after sorting the
// collected characters with UCAComparer.Instance.
167 char [] orderedCyrillic;
168 char [] orderedGurmukhi;
169 char [] orderedGujarati;
170 char [] orderedGeorgian;
171 char [] orderedThaana;
173 static readonly char [] orderedTamilConsonants = new char [] {
174 // based on traditional Tamil consonants, except for
175 // Grantha (where Microsoft breaks traditionalism).
176 // http://www.angelfire.com/empire/thamizh/padanGaL
177 '\u0B99', '\u0B9A', '\u0B9E', '\u0B9F', '\u0BA3',
178 '\u0BA4', '\u0BA8', '\u0BAA', '\u0BAE', '\u0BAF',
179 '\u0BB0', '\u0BB2', '\u0BB5', '\u0BB4', '\u0BB3',
180 '\u0BB1', '\u0BA9', '\u0B9C', '\u0BB8', '\u0BB7',
183 // cp -> level1 value
184 Hashtable arabicLetterPrimaryValues = new Hashtable ();
// Primary Arabic letter name -> the code point stored for that name by
// ProcessUnidataLine.
187 Hashtable arabicNameMap = new Hashtable ();
189 // cp -> Hashtable [decompType] -> cp
190 Hashtable nfkdMap = new Hashtable ();
192 // Latin letter -> ArrayList [int]
193 Hashtable latinMap = new Hashtable ();
// jisJapanese receives one JISCharacter per mapping read by ParseJISOrder;
// nonJisJapanese receives NonJISCharacter entries from ProcessUnidataLine.
195 ArrayList jisJapanese = new ArrayList ();
196 ArrayList nonJisJapanese = new ArrayList ();
// CJK weight tables. Element 0 corresponds to the code point
// (char.MaxValue - Length), i.e. the constant subtracted in the size
// expression; SerializeCJK and ParseCJK both derive that offset from Length.
198 ushort [] cjkJA = new ushort [char.MaxValue - 0x4E00];
199 ushort [] cjkCHS = new ushort [char.MaxValue - 0x3100];
200 ushort [] cjkCHT = new ushort [char.MaxValue - 0x4E00];
201 ushort [] cjkKO = new ushort [char.MaxValue - 0x4E00];
202 byte [] cjkKOlv2 = new byte [char.MaxValue - 0x4E00];
// Bit flags per code point, OR-ed in by FillIgnorables (1, 2 and 4).
204 byte [] ignorableFlags = new byte [char.MaxValue + 1];
// Value parsed from the derived-age file for each code point (ParseDerivedAge).
206 double [] unicodeAge = new double [char.MaxValue + 1];
// Drives the generator: parses the source data, fills secondary values and
// reports progress on standard error.
//   args - optional; args [0] is the directory holding the source data
//          files, defaulting to "downloaded".
// NOTE(review): several statements of this method are not visible in this
// view (presumably the calls that match the "generation done." and
// "serialization done." messages).
208 void Run (string [] args)
210 string dirname = args.Length == 0 ? "downloaded" : args [0];
213 ParseSources (dirname);
214 Console.Error.WriteLine ("parse done.");
216 FillSecondaryValues ();
218 Console.Error.WriteLine ("generation done.");
220 Console.Error.WriteLine ("serialization done.");
// Writes the generated tables to Result as C# array initializers, one value
// per code point, with a "// XXXX" marker (first code point of the row)
// after every 16th value.
// NOTE(review): the signature line of the enclosing method and the lines
// that choose between the decimal and the hexadecimal Write call are not
// visible in this view.
226 Result.WriteLine ("static byte [] ignorableFlags = new byte [] {");
227 for (int i = 0; i <= char.MaxValue; i++) {
228 byte value = ignorableFlags [i];
230 Result.Write ("{0},", value);
232 Result.Write ("0x{0:X02},", value);
233 if ((i & 0xF) == 0xF)
234 Result.WriteLine ("// {0:X04}", i - 0xF);
236 Result.WriteLine ("};");
// Category byte of every map entry.
240 Result.WriteLine ("static byte [] categories = new byte [] {");
241 for (int i = 0; i < map.Length; i++) {
242 byte value = map [i].Category;
244 Result.Write ("{0},", value);
246 Result.Write ("0x{0:X02},", value);
247 if ((i & 0xF) == 0xF)
248 Result.WriteLine ("// {0:X04}", i - 0xF);
250 Result.WriteLine ("};");
253 // Primary weight value
254 Result.WriteLine ("static byte [] level1 = new byte [] {");
255 for (int i = 0; i < map.Length; i++) {
256 byte value = map [i].Level1;
258 Result.Write ("{0},", value);
260 Result.Write ("0x{0:X02},", value);
261 if ((i & 0xF) == 0xF)
262 Result.WriteLine ("// {0:X04}", i - 0xF);
264 Result.WriteLine ("};");
// Level2 of every map entry; read into an int here but emitted into a byte
// array declaration.
268 Result.WriteLine ("static byte [] level2 = new byte [] {");
269 for (int i = 0; i < map.Length; i++) {
270 int value = map [i].Level2;
272 Result.Write ("{0},", value);
274 Result.Write ("0x{0:X02},", value);
275 if ((i & 0xF) == 0xF)
276 Result.WriteLine ("// {0:X04}", i - 0xF);
278 Result.WriteLine ("};");
// Level 3 weights are computed per code point at generation time and dumped.
282 Result.WriteLine ("static byte [] level3 = new byte [] {");
283 for (int i = 0; i < map.Length; i++) {
284 byte value = ComputeLevel3Weight ((char) i);
286 Result.Write ("{0},", value);
288 Result.Write ("0x{0:X02},", value);
289 if ((i & 0xF) == 0xF)
290 Result.WriteLine ("// {0:X04}", i - 0xF);
292 Result.WriteLine ("};");
295 // Width insensitivity mappings
296 // (for now it is more lightweight than dumping the
297 // entire NFKD table).
298 Result.WriteLine ("static int [] widthCompat = new int [] {");
// NOTE(review): this loop uses "<" where the loops above cover
// char.MaxValue itself, so this table gets one element fewer (no entry for
// 0xFFFF) — confirm whether that is intended.
299 for (int i = 0; i < char.MaxValue; i++) {
301 switch (decompType [i]) {
302 case DecompositionNarrow:
303 case DecompositionWide:
304 case DecompositionSuper:
305 case DecompositionSub:
306 // they are always 1 char
307 value = decompValues [decompIndex [i]];
311 Result.Write ("{0},", value);
313 Result.Write ("0x{0:X04},", value);
314 if ((i & 0xF) == 0xF)
315 Result.WriteLine ("// {0:X04}", i - 0xF);
317 Result.WriteLine ("};");
// CJK tables; the last argument is the code point at which SerializeCJK
// compares (i + offset == max).
321 SerializeCJK ("cjkCHS", cjkCHS, char.MaxValue);
322 SerializeCJK ("cjkCHT", cjkCHT, 0x9FB0);
323 SerializeCJK ("cjkJA", cjkJA, 0x9FB0);
324 SerializeCJK ("cjkKO", cjkKO, 0x9FB0);
325 SerializeCJK ("cjkKOlv2", cjkKOlv2, 0x9FB0);
// Writes a ushort CJK table to Result as a C# array initializer.
//   name - identifier used for the generated array.
//   cjk  - table whose element 0 corresponds to code point
//          (char.MaxValue - cjk.Length).
//   max  - code point compared against (i + offset) in the loop.
// NOTE(review): the statement guarded by that comparison and the line that
// chooses between the two Write calls are not visible in this view.
328 void SerializeCJK (string name, ushort [] cjk, int max)
330 int offset = char.MaxValue - cjk.Length;
// "{{" is the escaped form of a literal '{' in a composite format string.
331 Result.WriteLine ("static ushort [] {0} = new ushort [] {{", name);
332 for (int i = 0; i < cjk.Length; i++) {
333 if (i + offset == max)
335 ushort value = cjk [i];
337 Result.Write ("{0},", value);
339 Result.Write ("0x{0:X04},", value);
// After every 16th value, emit the code point of the first value in the row.
340 if ((i & 0xF) == 0xF)
341 Result.WriteLine ("// {0:X04}", i - 0xF + offset);
343 Result.WriteLine ("};");
// Byte-table overload of SerializeCJK: same layout as the ushort overload,
// but values are written with two hexadecimal digits.
//   name - identifier used for the generated array.
//   cjk  - table whose element 0 corresponds to code point
//          (char.MaxValue - cjk.Length).
//   max  - code point compared against (i + offset) in the loop.
// NOTE(review): the statement guarded by that comparison and the line that
// chooses between the two Write calls are not visible in this view.
347 void SerializeCJK (string name, byte [] cjk, int max)
349 int offset = char.MaxValue - cjk.Length;
// "{{" is the escaped form of a literal '{' in a composite format string.
350 Result.WriteLine ("static byte [] {0} = new byte [] {{", name);
351 for (int i = 0; i < cjk.Length; i++) {
352 if (i + offset == max)
354 byte value = cjk [i];
356 Result.Write ("{0},", value);
358 Result.Write ("0x{0:X02},", value);
// After every 16th value, emit the code point of the first value in the row.
359 if ((i & 0xF) == 0xF)
360 Result.WriteLine ("// {0:X04}", i - 0xF + offset);
362 Result.WriteLine ("};");
// Builds the paths of all input files under dirname and runs the individual
// parsers in a fixed order.
//   dirname - directory that contains the data files and the
//             common/collation XML files.
// NOTE(review): the declarations that receive several of the path
// expressions below are not visible in this view.
368 void ParseSources (string dirname)
371 dirname + "/UnicodeData.txt";
372 string derivedCoreProps =
373 dirname + "/DerivedCoreProperties.txt";
375 dirname + "/Scripts.txt";
377 dirname + "/CP932.TXT";
379 dirname + "/DerivedAge.txt";
380 string chXML = dirname + "/common/collation/zh.xml";
381 string jaXML = dirname + "/common/collation/ja.xml";
382 string koXML = dirname + "/common/collation/ko.xml";
384 ParseDerivedAge (derivedAge);
385 ParseJISOrder (cp932); // must run prior to ParseUnidata()
386 ParseUnidata (unidata);
387 ParseDerivedCoreProperties (derivedCoreProps);
388 ParseScripts (scripts);
389 ParseCJK (chXML, jaXML, koXML);
// Reads the derived-age file line by line and stores the parsed value for
// every code point of each "cp" or "cp..cpEnd" entry in unicodeAge.
//   filename - path of the file to read.
// NOTE(review): the guards after the IndexOf calls and the statement under
// "if (cp > char.MaxValue)" are not visible in this view.
392 void ParseDerivedAge (string filename)
394 using (StreamReader file =
395 new StreamReader (filename)) {
396 while (file.Peek () >= 0) {
397 string s = file.ReadLine ();
// Drop a trailing '#' comment, then split "codepoints ; value".
398 int idx = s.IndexOf ('#');
400 s = s.Substring (0, idx);
401 idx = s.IndexOf (';');
405 string cpspec = s.Substring (0, idx);
// A ".." inside the code point field marks an inclusive range.
406 idx = cpspec.IndexOf ("..");
407 NumberStyles nf = NumberStyles.HexNumber |
408 NumberStyles.AllowTrailingWhite;
409 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
410 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
411 string value = s.Substring (cpspec.Length + 1).Trim ();
// NOTE(review): only cp is compared with char.MaxValue; a range that starts
// inside the table and ends above it would index past unicodeAge — confirm
// the input never contains such a range.
414 if (cp > char.MaxValue)
// NOTE(review): double.Parse without an IFormatProvider uses the current
// culture; consider CultureInfo.InvariantCulture if the file always uses
// '.' as the decimal separator.
417 for (int i = cp; i <= cpEnd; i++)
418 unicodeAge [i] = double.Parse (value);
// Feeds every line of the given file to ProcessUnidataLine, collecting the
// decomposition values in a local list that is finally copied into the
// decompValues field as an int array.
//   filename - path of the file to read.
// On an exception the 1-based line number is written to standard error.
// NOTE(review): the "try" line and the rest of the catch body are not
// visible in this view, so whether the exception is rethrown cannot be told
// from here.
423 void ParseUnidata (string filename)
425 ArrayList decompValues = new ArrayList ();
426 using (StreamReader unidata =
427 new StreamReader (filename)) {
428 for (int line = 1; unidata.Peek () >= 0; line++) {
430 ProcessUnidataLine (unidata.ReadLine (), decompValues);
431 } catch (Exception) {
432 Console.Error.WriteLine ("**** At line " + line);
437 this.decompValues = (int [])
438 decompValues.ToArray (typeof (int));
// Processes one semicolon-separated line: the first field is the hexadecimal
// code point, values [] holds the fields after it. Depending on the line it
// updates isSmallCapital, latinMap, diacritical, the Arabic name tables,
// nonJisJapanese, the decomposition tables (decompType / decompIndex /
// decompLength / nfkdMap) and decimalValue.
//   s            - the raw line.
//   decompValues - list that receives the decomposed code points; indices
//                  into it are recorded in decompIndex.
// NOTE(review): many guard, case-label and brace lines of this method are
// not visible in this view; comments below describe only the visible lines.
441 void ProcessUnidataLine (string s, ArrayList decompValues)
443 int idx = s.IndexOf ('#');
445 s = s.Substring (0, idx);
446 idx = s.IndexOf (';');
449 int cp = int.Parse (s.Substring (0, idx), NumberStyles.HexNumber);
450 string [] values = s.Substring (idx + 1).Split (';');
453 if (cp > char.MaxValue)
455 if (IsIgnorable (cp))
// The name searches below use "> 0" rather than ">= 0"; a match at index 0
// cannot occur because the line starts with the code point field.
459 if (s.IndexOf ("SMALL CAPITAL") > 0)
460 isSmallCapital [cp] = true;
462 // latin mapping by character name
463 if (s.IndexOf ("LATIN") > 0) {
// 15 is the length of "LETTER DOTLESS "; offset points at the letter itself.
464 int lidx = s.IndexOf ("LETTER DOTLESS ");
465 int offset = lidx + 15;
467 lidx = s.IndexOf ("LETTER TURNED ");
471 lidx = s.IndexOf ("LETTER ");
// Accept only a single A-Z letter that ends the line or is followed by a
// space.
474 char c = lidx > 0 ? s [offset] : char.MinValue;
475 if ('A' <= c && c <= 'Z' &&
476 (s.Length == offset + 1 || s [offset + 1] == ' ')) {
477 ArrayList entry = (ArrayList) latinMap [c];
479 entry = new ArrayList ();
480 latinMap [c] = entry;
486 // diacritical weights by character name
// Weights are OR-ed in, so a name matching several fragments combines bits.
487 for (int d = 0; d < diacritics.Length; d++)
488 if (s.IndexOf (diacritics [d]) > 0)
489 diacritical [cp] |= diacriticWeights [d];
490 // Two-step grep required for it.
491 if (s.IndexOf ("FULL STOP") > 0 &&
492 (s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0))
493 diacritical [cp] |= 0xF4;
495 // Arabic letter name
496 if (0x0621 <= cp && cp <= 0x064A &&
497 Char.GetUnicodeCategory ((char) cp)
498 == UnicodeCategory.OtherLetter) {
// Default primary value: grows by 4 for every distinct letter name seen.
499 byte value = (byte) (arabicNameMap.Count * 4 + 0x0B);
504 // hamza, waw, yeh ... special cases.
509 value = 0x77; // special cases.
512 // Get primary letter name i.e.
513 // XXX part of ARABIC LETTER XXX yyy
514 // e.g. that of "TEH MARBUTA" is "TEH".
517 // 0x0640 is special: it does
518 // not start with ARABIC LETTER
// 14 is the length of "ARABIC LETTER ".
520 values [0].Substring (14);
521 int tmpIdx = letterName.IndexOf (' ');
522 letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
523 //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
// A letter whose primary name was seen before reuses that letter's value.
524 if (arabicNameMap.ContainsKey (letterName))
525 value = (byte) arabicLetterPrimaryValues [arabicNameMap [letterName]];
527 arabicNameMap [letterName] = cp;
530 arabicLetterPrimaryValues [cp] = value;
533 // Japanese square letter
534 if (0x3300 <= cp && cp <= 0x3357)
536 nonJisJapanese.Add (new NonJISCharacter (cp, values [0]));
// values [4] is the decomposition field; a "<tag>" selects the
// Decomposition* code stored in decompType.
539 string decomp = values [4];
540 idx = decomp.IndexOf ('<');
// NOTE(review): the second Substring argument is a length; the expression
// yields the tag name only when '<' is at index 0.
542 switch (decomp.Substring (idx + 1, decomp.IndexOf ('>') - 1)) {
544 decompType [cp] = DecompositionFull;
547 decompType [cp] = DecompositionSub;
550 decompType [cp] = DecompositionSuper;
553 decompType [cp] = DecompositionSmall;
556 decompType [cp] = DecompositionIsolated;
559 decompType [cp] = DecompositionInitial;
562 decompType [cp] = DecompositionFinal;
565 decompType [cp] = DecompositionMedial;
568 decompType [cp] = DecompositionNoBreak;
571 decompType [cp] = DecompositionCompat;
574 decompType [cp] = DecompositionFraction;
577 decompType [cp] = DecompositionFont;
580 decompType [cp] = DecompositionCircle;
583 decompType [cp] = DecompositionSquare;
586 decompType [cp] = DecompositionWide;
589 decompType [cp] = DecompositionNarrow;
592 decompType [cp] = DecompositionVertical;
595 throw new Exception ("Support NFKD type : " + decomp);
// Skip past "> " so that only the space-separated hex values remain.
598 decomp = idx < 0 ? decomp : decomp.Substring (decomp.IndexOf ('>') + 2);
599 if (decomp.Length > 0) {
601 string [] velems = decomp.Split (' ');
602 int didx = decompValues.Count;
603 decompIndex [cp] = didx;
604 foreach (string v in velems)
605 decompValues.Add (int.Parse (v, NumberStyles.HexNumber));
606 decompLength [cp] = velems.Length;
608 // [decompType] -> this_cp
609 int targetCP = (int) decompValues [didx];
610 // for "(x)" it specially maps to 'x' .
611 // FIXME: check if it is sane
612 if (velems.Length == 3 &&
613 (int) decompValues [didx] == '(' &&
614 (int) decompValues [didx + 2] == ')')
615 targetCP = (int) decompValues [didx + 1];
616 // special: 0x215F "1/"
617 else if (cp == 0x215F)
// NOTE(review): the lower bound here is 0x4C00 while the CJK tables in this
// class start at 0x4E00 — confirm the value is intentional.
619 else if (velems.Length > 1 &&
620 (targetCP < 0x4C00 || 0x9FBB < targetCP))
621 // skip them, except for CJK ideograph compat
625 Hashtable entry = (Hashtable) nfkdMap [targetCP];
627 entry = new Hashtable ();
628 nfkdMap [targetCP] = entry;
630 entry [(byte) decompType [cp]] = cp;
// Numeric value: the first non-empty field among values [5], [6] and [7].
// NOTE(review): decimal.Parse without an IFormatProvider uses the current
// culture.
634 if (values [5].Length > 0)
635 decimalValue [cp] = decimal.Parse (values [5]);
636 else if (values [6].Length > 0)
637 decimalValue [cp] = decimal.Parse (values [6]);
638 else if (values [7].Length > 0) {
639 string decstr = values [7];
640 idx = decstr.IndexOf ('/');
641 if (cp == 0x215F) // special. "1/"
642 decimalValue [cp] = 0x1;
// "a/b" form: numerator divided by denominator.
646 decimal.Parse (decstr.Substring (0, idx))
647 / decimal.Parse (decstr.Substring (idx + 1));
// "(n)" form: the text between the parentheses.
648 else if (decstr [0] == '(' &&
649 decstr [decstr.Length - 1] == ')')
652 decimal.Parse (decstr.Substring (1, decstr.Length - 2));
// "n." form: the text before the trailing dot.
653 else if (decstr [decstr.Length - 1] == '.')
656 decimal.Parse (decstr.Substring (0, decstr.Length - 1));
658 decimalValue [cp] = decimal.Parse (decstr);
// Feeds every line of the given file to ProcessDerivedCorePropLine.
//   filename - path of the file to read.
// On an exception the 1-based line number is written to standard error.
// NOTE(review): the "try" line and the rest of the catch body are not
// visible in this view.
662 void ParseDerivedCoreProperties (string filename)
665 using (StreamReader file =
666 new StreamReader (filename)) {
667 for (int line = 1; file.Peek () >= 0; line++) {
669 ProcessDerivedCorePropLine (file.ReadLine ());
670 } catch (Exception) {
671 Console.Error.WriteLine ("**** At line " + line);
// Processes one "cp ; value" or "cp..cpEnd ; value" line and marks the code
// points of the entry in isUppercase.
//   s - the raw line; anything from '#' on is discarded.
// NOTE(review): the guards after the IndexOf calls, the statement under
// "if (cp > char.MaxValue)" and the test on value that selects which entries
// set isUppercase are not visible in this view.
678 void ProcessDerivedCorePropLine (string s)
680 int idx = s.IndexOf ('#');
682 s = s.Substring (0, idx);
683 idx = s.IndexOf (';');
686 string cpspec = s.Substring (0, idx);
// A ".." inside the code point field marks an inclusive range.
687 idx = cpspec.IndexOf ("..");
688 NumberStyles nf = NumberStyles.HexNumber |
689 NumberStyles.AllowTrailingWhite;
690 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
691 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
692 string value = s.Substring (cpspec.Length + 1).Trim ();
695 if (cp > char.MaxValue)
700 for (int x = cp; x <= cpEnd; x++)
701 isUppercase [x] = true;
// Reads the scripts file, collects the non-ignorable characters of five
// scripts into separate lists, sorts each list with UCAComparer.Instance and
// stores the results in the ordered* fields.
//   filename - path of the file to read.
// NOTE(review): the guards after the IndexOf calls, the statement under
// "if (cp > char.MaxValue)" and the switch/case lines that route an entry to
// one of the five loops are not visible in this view.
706 void ParseScripts (string filename)
708 ArrayList cyrillic = new ArrayList ();
709 ArrayList gurmukhi = new ArrayList ();
710 ArrayList gujarati = new ArrayList ();
711 ArrayList georgian = new ArrayList ();
712 ArrayList thaana = new ArrayList ();
714 using (StreamReader file =
715 new StreamReader (filename)) {
716 while (file.Peek () >= 0) {
717 string s = file.ReadLine ();
718 int idx = s.IndexOf ('#');
720 s = s.Substring (0, idx);
721 idx = s.IndexOf (';');
725 string cpspec = s.Substring (0, idx);
// A ".." inside the code point field marks an inclusive range.
726 idx = cpspec.IndexOf ("..");
727 NumberStyles nf = NumberStyles.HexNumber |
728 NumberStyles.AllowTrailingWhite;
729 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
730 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
731 string value = s.Substring (cpspec.Length + 1).Trim ();
734 if (cp > char.MaxValue)
// Each loop below adds the non-ignorable code points of the range to the
// list of one script.
739 for (int x = cp; x <= cpEnd; x++)
740 if (!IsIgnorable (x))
741 cyrillic.Add ((char) x);
744 for (int x = cp; x <= cpEnd; x++)
745 if (!IsIgnorable (x))
746 gurmukhi.Add ((char) x);
749 for (int x = cp; x <= cpEnd; x++)
750 if (!IsIgnorable (x))
751 gujarati.Add ((char) x);
754 for (int x = cp; x <= cpEnd; x++)
755 if (!IsIgnorable (x))
756 georgian.Add ((char) x);
759 for (int x = cp; x <= cpEnd; x++)
760 if (!IsIgnorable (x))
761 thaana.Add ((char) x);
// Sort each script and publish it as a char array.
766 cyrillic.Sort (UCAComparer.Instance);
767 gurmukhi.Sort (UCAComparer.Instance);
768 gujarati.Sort (UCAComparer.Instance);
769 georgian.Sort (UCAComparer.Instance);
770 thaana.Sort (UCAComparer.Instance);
771 orderedCyrillic = (char []) cyrillic.ToArray (typeof (char));
772 orderedGurmukhi = (char []) gurmukhi.ToArray (typeof (char));
773 orderedGujarati = (char []) gujarati.ToArray (typeof (char));
774 orderedGeorgian = (char []) georgian.ToArray (typeof (char));
775 orderedThaana = (char []) thaana.ToArray (typeof (char));
// Reads the code page mapping file and appends one JISCharacter (cp, jis)
// per mapping line to jisJapanese.
//   filename - path of the file to read.
// NOTE(review): the guards after the IndexOf calls are not visible in this
// view.
778 void ParseJISOrder (string filename)
780 using (StreamReader file =
781 new StreamReader (filename)) {
782 while (file.Peek () >= 0) {
783 string s = file.ReadLine ();
784 int idx = s.IndexOf ('#');
786 s = s.Substring (0, idx).Trim ();
789 idx = s.IndexOf (' ');
792 // They start with "0x" so cut them out.
// NOTE(review): the second argument of Substring is a length, not an end
// index, so this takes idx characters starting at position 2 — confirm this
// matches the column layout of the input file.
793 int jis = int.Parse (s.Substring (2, idx), NumberStyles.HexNumber);
794 int cp = int.Parse (s.Substring (idx + 3).Trim (), NumberStyles.HexNumber);
795 jisJapanese.Add (new JISCharacter (cp, jis));
// Fills the CJK weight tables from collation XML documents: for each
// language the characters of a rule text are visited in order and receive
// consecutive values; the Korean part additionally fills cjkKOlv2.
//   zhXML, jaXML, koXML - paths of the Chinese, Japanese and Korean
//                         collation files.
// NOTE(review): the local declarations, the document Load calls, the lines
// that select arr / category / v for each section and the conditions in
// front of the warnings are not visible in this view.
800 void ParseCJK (string zhXML, string jaXML, string koXML)
802 XmlDocument doc = new XmlDocument ();
// No resolver: external resources referenced by the documents are not
// fetched.
803 doc.XmlResolver = null;
810 // Chinese Simplified
// Element 0 of arr corresponds to code point (char.MaxValue - arr.Length).
813 offset = char.MaxValue - arr.Length;
815 s = doc.SelectSingleNode ("/ldml/collations/collation[@type='pinyin']/rules/pc").InnerText;
817 foreach (char c in s) {
819 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
821 arr [(int) c - offset] = (ushort) v++;
827 // Chinese Traditional
830 offset = char.MaxValue - arr.Length;
831 s = doc.SelectSingleNode ("/ldml/collations/collation[@type='stroke']/rules/pc").InnerText;
833 foreach (char c in s) {
835 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
837 arr [(int) c - offset] = (ushort) v++;
// Third section: same scheme, reading the first collation's rules/pc text.
846 offset = char.MaxValue - arr.Length;
848 s = doc.SelectSingleNode ("/ldml/collations/collation/rules/pc").InnerText;
850 foreach (char c in s) {
852 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
854 arr [(int) c - offset] = (ushort) v++;
861 // Korean weight is somewhat complex. It first shifts
862 // Hangul category from 52-x to 80-x (they are anyways
863 // computed). CJK ideographs are placed at secondary
864 // weight, like XX YY 01 zz 01, where XX and YY are
865 // corresponding "reset" value and zz is 41,43,45...
867 // Unlike chs,cht and ja, Korean value is a combined
868 // ushort which is computed as category
872 offset = char.MaxValue - arr.Length;
874 foreach (XmlElement reset in doc.SelectNodes ("/ldml/collations/collation/rules/reset")) {
875 XmlElement sc = (XmlElement) reset.NextSibling;
876 // compute "category" and "level 1" for the
877 // target "reset" Hangul syllable
878 char rc = reset.InnerText [0];
// 1-based position of the reset character counted from U+AC00.
879 int ri = ((int) rc - 0xAC00) + 1;
881 ((ri / 254) * 256 + (ri % 254) + 2);
882 // Place the characters after the target.
885 foreach (char c in s) {
886 arr [(int) c - offset] = p;
887 cjkKOlv2 [(int) c - offset] = (byte) v;
// Fills ignorableFlags for every UTF-16 code unit by OR-ing in bit values:
// 1, 2 when IsIgnorableSymbol holds, and 4 when IsIgnorableNonSpacing holds.
// NOTE(review): the statement controlled by the OtherNotAssigned test and
// the condition in front of the "|= 1" line are not visible in this view.
897 void FillIgnorables ()
899 for (int i = 0; i <= char.MaxValue; i++) {
900 if (Char.GetUnicodeCategory ((char) i) ==
901 UnicodeCategory.OtherNotAssigned)
904 ignorableFlags [i] |= 1;
905 if (IsIgnorableSymbol (i))
906 ignorableFlags [i] |= 2;
907 if (IsIgnorableNonSpacing (i))
908 ignorableFlags [i] |= 4;
// Assigns diacritical weights to number characters: numberSecondaryWeightBounds
// is read as consecutive [start, end) pairs, every Char.IsNumber code point
// inside a pair gets the current weight, and the weight advances by one per
// pair. This overwrites (rather than ORs into) diacritical [cp].
// NOTE(review): the declaration and initial value of "weight" are not
// visible in this view.
912 void FillSecondaryValues ()
914 // number, secondary weights
916 int [] numarr = numberSecondaryWeightBounds;
917 for (int i = 0; i < numarr.Length; i += 2, weight++)
918 for (int cp = numarr [i]; cp < numarr [i + 1]; cp++)
919 if (Char.IsNumber ((char) cp))
920 diacritical [cp] = weight;
927 #region Specially ignored // 01
928 // This will raise "Defined" flag up.
929 foreach (char c in specialIgnore)
930 map [(int) c] = new CharMapEntry (0, 0, 0);
934 #region Variable weights
935 // Controls : 06 03 - 06 3D
937 for (int i = 0; i < 65536; i++) {
941 uc = Char.GetUnicodeCategory (c);
942 // NEL is whitespace but not ignored here.
943 if (uc == UnicodeCategory.Control &&
944 !Char.IsWhiteSpace (c) || c == '\u0085')
945 AddCharMap (c, 6, 1);
949 fillIndex [6] = 0x80;
950 AddCharMapGroup ('\'', 6, 1, 0);
951 AddCharMap ('\uFE63', 6, 1);
953 // Hyphen/Dash : 06 81 - 06 90
954 for (int i = 0; i < char.MaxValue; i++) {
955 if (Char.GetUnicodeCategory ((char) i)
956 == UnicodeCategory.DashPunctuation)
957 AddCharMapGroupTail ((char) i, 6, 1);
960 // Arabic variable weight chars 06 A0 -
961 fillIndex [6] = 0xA0;
963 for (int i = 0x64B; i <= 0x650; i++)
964 AddCharMapGroupTail ((char) i, 6, 1);
966 AddCharMapGroup ('\u0652', 6, 1, 0);
968 AddCharMapGroup ('\u0651', 6, 1, 0);
972 #region Nonspacing marks // 01
973 // FIXME: 01 03 - 01 B6 ... annoyance :(
975 // Combining diacritical marks: 01 DC -
977 fillIndex [0x1] = 0x41;
978 for (int i = 0x030E; i <= 0x0326; i++)
979 if (!IsIgnorable (i))
980 AddCharMap ((char) i, 0x1, 1);
981 for (int i = 0x0329; i <= 0x0334; i++)
982 if (!IsIgnorable (i))
983 AddCharMap ((char) i, 0x1, 1);
984 for (int i = 0x0339; i <= 0x0341; i++)
985 if (!IsIgnorable (i))
986 AddCharMap ((char) i, 0x1, 1);
987 fillIndex [0x1] = 0x72;
988 for (int i = 0x0346; i <= 0x0348; i++)
989 if (!IsIgnorable (i))
990 AddCharMap ((char) i, 0x1, 1);
991 for (int i = 0x02BE; i <= 0x02BF; i++)
992 if (!IsIgnorable (i))
993 AddCharMap ((char) i, 0x1, 1);
994 for (int i = 0x02C1; i <= 0x02C5; i++)
995 if (!IsIgnorable (i))
996 AddCharMap ((char) i, 0x1, 1);
997 for (int i = 0x02CE; i <= 0x02CF; i++)
998 if (!IsIgnorable (i))
999 AddCharMap ((char) i, 0x1, 1);
1000 for (int i = 0x02D1; i <= 0x02D3; i++)
1001 if (!IsIgnorable (i))
1002 AddCharMap ((char) i, 0x1, 1);
1003 AddCharMap ('\u02DE', 0x1, 1);
1004 for (int i = 0x02E4; i <= 0x02E9; i++)
1005 if (!IsIgnorable (i))
1006 AddCharMap ((char) i, 0x1, 1);
1008 // LAMESPEC: It should not stop at '\u20E1'. There are
1009 // a few more characters (that however results in
1010 // overflow of level 2 unless we start before 0xDD).
1011 fillIndex [0x1] = 0xDC;
1012 for (int i = 0x20d0; i <= 0x20e1; i++)
1013 AddCharMap ((char) i, 0x1, 1);
1017 #region Whitespaces // 07 03 -
1018 fillIndex [0x7] = 0x2;
1019 AddCharMap (' ', 0x7, 2);
1020 AddCharMap ('\u00A0', 0x7, 1);
1021 for (int i = 9; i <= 0xD; i++)
1022 AddCharMap ((char) i, 0x7, 1);
1023 for (int i = 0x2000; i <= 0x200B; i++)
1024 AddCharMap ((char) i, 0x7, 1);
1026 fillIndex [0x7] = 0x17;
1027 AddCharMapGroup ('\u2028', 0x7, 1, 0);
1028 AddCharMapGroup ('\u2029', 0x7, 1, 0);
1030 // Characters which used to represent layout control.
1031 // LAMESPEC: Windows developers seem to have thought
1032 // that those characters are kind of whitespaces,
1033 // while they aren't.
1034 AddCharMap ('\u2422', 0x7, 1, 0); // blank symbol
1035 AddCharMap ('\u2423', 0x7, 1, 0); // open box
1039 #region ASCII non-alphanumeric + 3001, 3002 // 07
1040 // non-alphanumeric ASCII except for: + - < = > '
1041 for (int i = 0x21; i < 0x7F; i++) {
1042 if (Char.IsLetterOrDigit ((char) i)
1043 || "+-<=>'".IndexOf ((char) i) >= 0)
1044 continue; // they are not added here.
1045 AddCharMapGroup2 ((char) i, 0x7, 1, 0);
1046 // Insert 3001 after ',' and 3002 after '.'
1048 AddCharMapGroup2 ('\u3001', 0x7, 1, 0);
1050 AddCharMapGroup2 ('\u3002', 0x7, 1, 0);
1052 AddCharMap ('\uFE30', 0x7, 1, 0);
1057 // FIXME: for 07 xx we need more love.
1059 // FIXME: 08 should be more complete.
1060 fillIndex [0x8] = 2;
1061 for (int cp = 0; cp < char.MaxValue; cp++)
1062 if (Char.GetUnicodeCategory ((char) cp) ==
1063 UnicodeCategory.MathSymbol)
1064 AddCharMapGroup2 ((char) cp, 0x8, 1, 0);
1066 // FIXME: implement 09
1068 // FIXME: implement 0A
1070 fillIndex [0xA] = 2;
1071 // byte currency symbols
1072 for (int cp = 0; cp < 0x100; cp++) {
1073 uc = Char.GetUnicodeCategory ((char) cp);
1074 if (uc == UnicodeCategory.CurrencySymbol &&
1076 AddCharMapGroup2 ((char) cp, 0xA, 1, 0);
1078 // byte other symbols
1079 for (int cp = 0; cp < 0x100; cp++) {
1080 uc = Char.GetUnicodeCategory ((char) cp);
1081 if (uc == UnicodeCategory.OtherSymbol)
1082 AddCharMapGroup2 ((char) cp, 0xA, 1, 0);
1086 #region Numbers // 0C 02 - 0C E1
1087 fillIndex [0xC] = 2;
1089 // 9F8 : Bengali "one less than the denominator"
1090 AddCharMap ('\u09F8', 0xC, 1);
1092 ArrayList numbers = new ArrayList ();
1093 for (int i = 0; i < 65536; i++)
1094 if (!IsIgnorable (i) &&
1095 Char.IsNumber ((char) i) &&
1096 (i < 0x3190 || 0x32C0 < i)) // they are CJK characters
1099 ArrayList numberValues = new ArrayList ();
1100 foreach (int i in numbers)
1101 numberValues.Add (new DictionaryEntry (i, decimalValue [(char) i]));
1102 numberValues.Sort (DictionaryValueComparer.Instance);
1104 //foreach (DictionaryEntry de in numberValues)
1105 //Console.Error.WriteLine ("****** number {0:X04} : {1} {2}", de.Key, de.Value, decompType [(int) de.Key]);
1107 decimal prevValue = -1;
1108 foreach (DictionaryEntry de in numberValues) {
1109 int cp = (int) de.Key;
1110 decimal currValue = (decimal) de.Value;
1111 bool addnew = false;
1112 if (prevValue < currValue &&
1113 prevValue - (int) prevValue == 0 &&
1117 // Process Hangzhou and Roman numbers
1119 // There are some SPECIAL cases.
1120 if (currValue != 4) // no increment for 4
1124 xcp = (int) prevValue + 0x2170 - 1;
1125 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1126 xcp = (int) prevValue + 0x2160 - 1;
1127 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1128 fillIndex [0xC] += 2;
1129 xcp = (int) prevValue + 0x3021 - 1;
1130 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1133 if (prevValue < currValue)
1134 prevValue = currValue;
1135 if (map [cp].Defined)
1137 // Hangzhou and Roman numbers are added later
1139 else if (0x3021 <= cp && cp < 0x302A
1140 || 0x2160 <= cp && cp < 0x216A
1141 || 0x2170 <= cp && cp < 0x217A)
1144 if (cp == 0x215B) // FIXME: why?
1145 fillIndex [0xC] += 2;
1146 else if (cp == 0x3021) // FIXME: why?
1148 AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp]);
1150 if (addnew || cp <= '9') {
1152 if (1 <= currValue && currValue <= 10) {
1153 xcp = cp - 0x31 + 0x2776;
1154 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1155 xcp = cp - 0x31 + 0x2780;
1156 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1157 xcp = cp - 0x31 + 0x278A;
1158 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1160 if (1 <= currValue && currValue <= 20) {
1161 xcp = cp - 0x31 + 0x2460;
1162 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1163 xcp = cp - 0x31 + 0x2474;
1164 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1165 xcp = cp - 0x31 + 0x2488;
1166 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1170 if (cp != 0x09E7 && cp != 0x09EA)
1173 // Add special cases that are not regarded as
1174 // numbers in UnicodeCategory speak.
1177 AddCharMapGroup ('\u01BD', 0xC, 0, 0);
1178 AddCharMapGroup ('\u01BC', 0xC, 1, 0);
1180 else if (cp == '6') // FIXME: why?
1185 fillIndex [0xC] = 0xFF;
1186 AddCharMap ('\u221E', 0xC, 1);
1189 #region Letters and NonSpacing Marks (general)
1192 for (int i = 0; i < alphabets.Length; i++)
1193 AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]);
1196 fillIndex [0xF] = 02;
1197 for (int i = 0x0380; i < 0x0390; i++)
1198 if (Char.IsLetter ((char) i))
1199 AddLetterMap ((char) i, 0xF, 1);
1200 fillIndex [0xF] = 02;
1201 for (int i = 0x0391; i < 0x03CF; i++)
1202 if (Char.IsLetter ((char) i))
1203 AddLetterMap ((char) i, 0xF, 1);
1204 fillIndex [0xF] = 0x40;
1205 for (int i = 0x03D0; i < 0x0400; i++)
1206 if (Char.IsLetter ((char) i))
1207 AddLetterMap ((char) i, 0xF, 1);
1209 // Cyrillic - UCA order w/ some modification
1210 fillIndex [0x10] = 0x3;
1211 // table which is mostly from UCA DUCET.
1212 for (int i = 0; i < orderedCyrillic.Length; i++) {
1213 char c = orderedCyrillic [i];
1214 if (Char.IsLetter (c))
1215 AddLetterMap (c, 0x10, 3);
1217 for (int i = 0x0460; i < 0x0481; i++) {
1218 if (Char.IsLetter ((char) i))
1219 AddLetterMap ((char) i, 0x10, 3);
1223 fillIndex [0x11] = 0x3;
1224 for (int i = 0x0531; i < 0x0586; i++)
1225 if (Char.IsLetter ((char) i))
1226 AddLetterMap ((char) i, 0x11, 1);
1230 fillIndex [0x12] = 0x3;
1231 for (int i = 0x05D0; i < 0x05FF; i++)
1232 if (Char.IsLetter ((char) i))
1233 AddLetterMap ((char) i, 0x12, 1);
1235 fillIndex [0x1] = 0x3;
1236 for (int i = 0x0591; i <= 0x05C2; i++)
1238 AddCharMap ((char) i, 0x1, 1);
1241 fillIndex [0x1] = 0x8E;
1242 fillIndex [0x13] = 0x3;
1243 for (int i = 0x0621; i <= 0x064A; i++) {
1245 if (Char.GetUnicodeCategory ((char) i)
1246 != UnicodeCategory.OtherLetter) {
1247 // FIXME: arabic nonspacing marks are
1248 // in different order.
1249 AddCharMap ((char) i, 0x1, 1);
1252 map [i] = new CharMapEntry (0x13,
1253 (byte) arabicLetterPrimaryValues [i], 1);
1255 fillIndex [0x13] = 0x84;
1256 for (int i = 0x0674; i < 0x06D6; i++)
1257 if (Char.IsLetter ((char) i))
1258 AddLetterMap ((char) i, 0x13, 1);
1261 // FIXME: it does seem straight codepoint mapping.
1262 fillIndex [0x14] = 04;
1263 for (int i = 0x0901; i < 0x0905; i++)
1264 if (!IsIgnorable (i))
1265 AddLetterMap ((char) i, 0x14, 2);
1266 fillIndex [0x14] = 0xB;
1267 for (int i = 0x0905; i < 0x093A; i++)
1268 if (Char.IsLetter ((char) i))
1269 AddLetterMap ((char) i, 0x14, 4);
1270 for (int i = 0x093E; i < 0x094F; i++)
1271 if (Char.IsLetter ((char) i))
1272 AddLetterMap ((char) i, 0x14, 2);
1276 fillIndex [0x15] = 02;
1277 for (int i = 0x0980; i < 0x9FF; i++) {
1278 if (IsIgnorable (i))
1281 fillIndex [0x15] = 0x3B;
1282 switch (Char.GetUnicodeCategory ((char) i)) {
1283 case UnicodeCategory.NonSpacingMark:
1284 case UnicodeCategory.DecimalDigitNumber:
1285 case UnicodeCategory.OtherNumber:
1288 AddLetterMap ((char) i, 0x15, 1);
1291 fillIndex [0x1] = 0x3;
1292 for (int i = 0x0981; i < 0x0A00; i++)
1293 if (Char.GetUnicodeCategory ((char) i) ==
1294 UnicodeCategory.NonSpacingMark)
1295 AddCharMap ((char) i, 0x1, 1);
1297 // Gurmukhi. orderedGurmukhi is from UCA
1298 // FIXME: it does not look equivalent to UCA.
1299 fillIndex [0x1] = 03;
1300 fillIndex [0x16] = 02;
1301 for (int i = 0; i < orderedGurmukhi.Length; i++) {
1302 char c = orderedGurmukhi [i];
1303 if (IsIgnorable ((int) c))
1305 if (!Char.IsLetter (c)) {
1306 AddLetterMap (c, 0x1, 1);
1309 if (c == '\u0A3C' || c == '\u0A4D' ||
1310 '\u0A66' <= c && c <= '\u0A71')
1312 AddLetterMap (c, 0x16, 4);
1315 // Gujarati. orderedGujarati is from UCA
1316 fillIndex [0x17] = 02;
1317 for (int i = 0; i < orderedGujarati.Length; i++)
1318 AddLetterMap (orderedGujarati [i], 0x17, 4);
1321 fillIndex [0x18] = 02;
1322 for (int i = 0x0B00; i < 0x0B7F; i++) {
1323 switch (Char.GetUnicodeCategory ((char) i)) {
1324 case UnicodeCategory.NonSpacingMark:
1325 case UnicodeCategory.DecimalDigitNumber:
1328 AddLetterMap ((char) i, 0x18, 1);
1332 fillIndex [0x19] = 2;
1333 AddCharMap ('\u0BD7', 0x19, 0);
1334 fillIndex [0x19] = 0xA;
1336 for (int i = 0x0BD7; i < 0x0B94; i++)
1337 if (Char.IsLetter ((char) i))
1338 AddCharMap ((char) i, 0x19, 2);
1340 fillIndex [0x19] = 0x24;
1341 AddCharMap ('\u0B94', 0x19, 0);
1342 fillIndex [0x19] = 0x26;
1343 // The array for Tamil consonants is a constant.
1344 // Windows has a sequence similar to TAM from
1345 // tamilnet, but a bit different in Grantha.
1346 for (int i = 0; i < orderedTamilConsonants.Length; i++)
1347 AddLetterMap (orderedTamilConsonants [i], 0x19, 4);
1349 fillIndex [0x19] = 0x82;
1350 for (int i = 0x0BBE; i < 0x0BCD; i++)
1351 if (Char.GetUnicodeCategory ((char) i) ==
1352 UnicodeCategory.SpacingCombiningMark
1354 AddLetterMap ((char) i, 0x19, 2);
1357 fillIndex [0x1A] = 0x4;
1358 for (int i = 0x0C00; i < 0x0C62; i++) {
1359 if (i == 0x0C55 || i == 0x0C56)
1361 AddCharMap ((char) i, 0x1A, 3);
1362 char supp = (i == 0x0C0B) ? '\u0C60':
1363 i == 0x0C0C ? '\u0C61' : char.MinValue;
1364 if (supp == char.MinValue)
1366 AddCharMap (supp, 0x1A, 3);
1370 fillIndex [0x1B] = 4;
1371 for (int i = 0x0C80; i < 0x0CE5; i++) {
1372 if (i == 0x0CD5 || i == 0x0CD6)
1374 AddCharMap ((char) i, 0x1B, 3);
1378 fillIndex [0x1C] = 2;
1379 for (int i = 0x0D02; i < 0x0D61; i++)
1380 // FIXME: I avoided MSCompatUnicodeTable usage
1381 // here (it results in recursion). So check if
1382 // using NonSpacingMark makes sense or not.
1383 if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark)
1384 // if (!MSCompatUnicodeTable.IsIgnorable ((char) i))
1385 AddCharMap ((char) i, 0x1C, 1);
1387 // Thai ... note that it breaks 0x1E wall after E2B!
1388 // Also, all Thai characters have level 2 value 3.
1389 fillIndex [0x1E] = 2;
1390 for (int i = 0xE44; i < 0xE48; i++)
1391 AddCharMap ((char) i, 0x1E, 1, 3);
1392 for (int i = 0xE01; i < 0xE2B; i++)
1393 AddCharMap ((char) i, 0x1E, 6, 0);
1394 fillIndex [0x1F] = 5;
1395 for (int i = 0xE2B; i < 0xE30; i++)
1396 AddCharMap ((char) i, 0x1F, 6, 0);
1397 for (int i = 0xE30; i < 0xE3B; i++)
1398 AddCharMap ((char) i, 0x1F, 1, 3);
1399 // some Thai characters remain.
1400 char [] specialThai = new char [] {'\u0E45', '\u0E46',
1401 '\u0E4E', '\u0E4F', '\u0E5A', '\u0E5B'};
1402 foreach (char c in specialThai)
1403 AddCharMap (c, 0x1F, 1);
1406 fillIndex [0x1F] = 2;
1407 for (int i = 0xE80; i < 0xEDF; i++)
1408 if (Char.IsLetter ((char) i))
1409 AddCharMap ((char) i, 0x1F, 1);
1411 // Georgian. orderedGeorgian is from UCA DUCET.
1412 fillIndex [0x21] = 5;
1413 for (int i = 0; i < orderedGeorgian.Length; i++)
1414 AddLetterMap (orderedGeorgian [i], 0x21, 5);
1417 fillIndex [0x22] = 2;
1418 int kanaOffset = 0x3041;
1419 byte [] kanaLines = new byte [] {2, 2, 2, 2, 1, 3, 1, 2, 1};
1421 for (int gyo = 0; gyo < 9; gyo++) {
1422 for (int dan = 0; dan < 5; dan++) {
1423 if (gyo == 7 && dan % 2 == 1) {
1426 kanaOffset -= 2; // There is no space for yi and ye.
1429 int cp = kanaOffset + dan * kanaLines [gyo];
1430 // small lines (a-gyo, ya-gyo)
1431 if (gyo == 0 || gyo == 7) {
1432 AddKanaMap (cp, 1); // small
1433 AddKanaMap (cp + 1, 1);
1436 AddKanaMap (cp, kanaLines [gyo]);
1440 // add small 'Tsu' (before normal one)
1441 AddKanaMap (0x3063, 1);
1445 fillIndex [0x22] += 3;
1446 kanaOffset += 5 * kanaLines [gyo];
1449 // Wa-gyo is almost special, so I just manually add.
1450 AddLetterMap ((char) 0x308E, 0x22, 0);
1451 AddLetterMap ((char) (0x308E + 0x60), 0x22, 0);
1452 AddLetterMap ((char) 0x308F, 0x22, 0);
1453 AddLetterMap ((char) (0x308F + 0x60), 0x22, 0);
1455 AddLetterMap ((char) 0x3090, 0x22, 0);
1456 AddLetterMap ((char) (0x3090 + 0x60), 0x22, 0);
1457 fillIndex [0x22] += 2;
1458 // no "Wu" in Japanese.
1459 AddLetterMap ((char) 0x3091, 0x22, 0);
1460 AddLetterMap ((char) (0x3091 + 0x60), 0x22, 0);
1462 AddLetterMap ((char) 0x3092, 0x22, 0);
1463 AddLetterMap ((char) (0x3092 + 0x60), 0x22, 0);
1465 fillIndex [0x22] = 0x80;
1466 AddLetterMap ((char) 0x3093, 0x22, 0);
1467 AddLetterMap ((char) (0x3093 + 0x60), 0x22, 0);
1469 // JIS Japanese square chars.
1470 fillIndex [0x22] = 0x97;
1471 jisJapanese.Sort (JISComparer.Instance);
1472 foreach (JISCharacter j in jisJapanese)
1473 AddCharMap ((char) j.CP, 0x22, 1);
1474 // non-JIS Japanese square chars.
1475 nonJisJapanese.Sort (NonJISComparer.Instance);
1476 foreach (NonJISCharacter j in nonJisJapanese)
1477 AddCharMap ((char) j.CP, 0x22, 1);
1480 fillIndex [0x23] = 0x02;
1481 for (int i = 0x3105; i <= 0x312C; i++)
1482 AddCharMap ((char) i, 0x23, 1);
1484 // Estrangela: ancient Syriac
1485 fillIndex [0x24] = 0x0B;
1486 // FIXME: is 0x71E really alternative form?
1487 ArrayList syriacAlternatives = new ArrayList (
1488 new int [] {0x714, 0x716, 0x71C, 0x71E, 0x724, 0x727});
1489 for (int i = 0x0710; i <= 0x072C; i++) {
1490 if (i == 0x0711) // NonSpacingMark
1492 if (syriacAlternatives.Contains (i))
1494 AddCharMap ((char) i, 0x24, 4);
1499 foreach (int cp in syriacAlternatives)
1500 map [cp] = new CharMapEntry (0x24,
1501 (byte) (map [cp - 1].Level1 + 2),
1505 // FIXME: it turned out that it does not look like UCA
1506 fillIndex [0x24] = 0x6E;
1507 for (int i = 0; i < orderedThaana.Length; i++) {
1508 if (IsIgnorableNonSpacing (i))
1510 AddCharMap (orderedThaana [i], 0x24, 2);
1514 #region Level2 adjustment
1516 diacritical [0x624] = 0x5;
1517 diacritical [0x626] = 0x7;
1518 diacritical [0x622] = 0x9;
1519 diacritical [0x623] = 0xA;
1520 diacritical [0x625] = 0xB;
1521 diacritical [0x649] = 0x5; // 'alif maqs.uurah
1522 diacritical [0x64A] = 0x7; // Yaa'
1525 for (int i = 0; i < 0x10000; i++) {
1526 switch (map [i].Category) {
1527 case 0xE: // Latin diacritics
1528 case 0x22: // Japanese: circled characters
1529 map [i] = new CharMapEntry (
1534 case 0x13: // Arabic
1535 if (diacritical [i] == 0)
1537 diacritical [i] = 0x8;
1538 map [i] = new CharMapEntry (0xE, map [i].Level1, diacritical [i]);
1544 // FIXME: Add more culture-specific letters (that are
1545 // not supported in Windows collation) here.
1547 // Surrogate ... they are computed.
1551 // Unlike UCA Windows Hangul sequence mixes Jongseong
1552 // with Choseong sequence as well as Jungseong,
1553 // adjusted to have the same primary weight for the
1554 // same base character. So it is impossible to compute
1557 // Here I introduce an ordered sequence of mixed
1558 // 'commands' and 'characters' that is similar to
1560 // - ',' increases primary weight.
1561 // - [A B] means a range, increasing index
1562 // - {A B} means a range, without increasing index
1563 // - '=' is no operation (it means the characters
1564 // of both sides have the same weight).
1565 // - '>' inserts a Hangul Syllable block that
1566 // contains 0x251 characters.
1567 // - '<' decreases the index
1568 // - '0'-'9' means skip count
1569 // - whitespaces are ignored
1572 string hangulSequence =
1573 + "\u1100=\u11A8 > \u1101=\u11A9 >"
1574 + "\u11C3, \u11AA, \u11C4, \u1102=\u11AB >"
1575 + "<{\u1113 \u1116}, \u3165,"
1576 + "\u11C5, \u11C6=\u3166,, \u11C7, \u11C8,"
1577 + "\u11AC, \u11C9, \u11AD, \u1103=\u11AE >"
1578 + "\u11CA, \u1104, \u11CB > \u1105 >"
1579 + "\u11B0, [\u11CC \u11D0], \u11B1, [\u11D1 \u11D2],"
1580 + "\u11B2, [\u11D3 \u11D5], \u11B3,"
1581 + "[\u11D6 \u11D7], \u11B4, \u11B5,"
1582 + "\u11B6=\u11D8, \u3140,, \u11D9, \u1106=\u11B7 >"
1583 + "[\u11DA \u11E2], \u1107=\u11B8 >"
1584 + "<{\u111E \u1120}, \u3172,, \u3173, "
1585 + "\u11E3, \u1108 >"
1586 + "\u11B9,,,,,,,,, [\u11E4 \u11E6],, \u1109=\u11BA,,,"
1587 + "\u3214=\u3274 <>"
1588 + "<{\u112D \u1133}, \u11E7,, [\u11E8 \u11E9],,"
1589 + "\u11EA,, \u110A=\u11BB,,, >"
1590 + "{\u1134 \u1140}, \u317E,,,,,, \u11EB,"
1591 + "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >"
1592 + "\u11EE, \u11EC, \u11ED,,,,, \u11F1,, \u11F2,,,"
1593 + "\u11EF,,, \u11F0, \u110C=\u11BD,, >"
1595 + "<{\u114E \u1151},, \u110E=\u11BE,, >"
1596 + "<{\u1152 \u1155},,, \u110F=\u11BF >"
1597 + "\u1110=\u11C0 > \u1111=\u11C1 >"
1598 + "\u11F3, \u11F4, \u1112=\u11C2 >"
1599 + "\u11F9, [\u11F5 \u11F8]"
1602 byte hangulCat = 0x52;
1603 fillIndex [hangulCat] = 0x2;
1605 int syllableBlock = 0;
1606 for (int n = 0; n < hangulSequence.Length; n++) {
1607 char c = hangulSequence [n];
1609 if (Char.IsWhiteSpace (c))
1615 IncrementSequentialIndex (ref hangulCat);
1618 if (fillIndex [hangulCat] == 2)
1619 throw new Exception ("FIXME: handle it correctly (yes it is hacky, it is really unfortunate).");
1620 fillIndex [hangulCat]--;
1623 IncrementSequentialIndex (ref hangulCat);
1624 for (int l = 0; l < 0x15; l++)
1625 for (int v = 0; v < 0x1C; v++) {
1627 (char) (0xAC00 + syllableBlock * 0x1C * 0x15 + l * 0x1C + v), hangulCat, 0);
1628 IncrementSequentialIndex (ref hangulCat);
1633 start = hangulSequence [n + 1];
1634 end = hangulSequence [n + 3];
1635 for (int i = start; i <= end; i++) {
1636 AddCharMap ((char) i, hangulCat, 0);
1638 IncrementSequentialIndex (ref hangulCat);
1640 n += 4; // consumes 5 characters for this operation
1643 start = hangulSequence [n + 1];
1644 end = hangulSequence [n + 3];
1645 for (int i = start; i <= end; i++)
1646 AddCharMap ((char) i, hangulCat, 0);
1647 n += 4; // consumes 5 characters for this operation
1650 AddCharMap (c, hangulCat, 0);
1655 // CJK unified ideograph.
1657 fillIndex [cjkCat] = 0x2;
1658 for (int cp = 0x4E00; cp <= 0x9FBB; cp++)
1659 if (!IsIgnorable (cp))
1660 AddCharMapGroupCJK ((char) cp, ref cjkCat);
1661 // CJK Extensions go here.
1662 // LAMESPEC: With this Windows style CJK layout, it is
1663 // impossible to add more CJK ideograph i.e. 0x9FA6-
1664 // 0x9FBB can never be added w/o breaking compat.
1665 for (int cp = 0xF900; cp <= 0xFA2D; cp++)
1666 if (!IsIgnorable (cp))
1667 AddCharMapGroupCJK ((char) cp, ref cjkCat);
1669 // PrivateUse ... computed.
1670 // remaining Surrogate ... computed.
1672 #region Special "biggest" area (FF FF)
1673 fillIndex [0xFF] = 0xFF;
1674 char [] specialBiggest = new char [] {
1675 '\u3005', '\u3031', '\u3032', '\u309D',
1676 '\u309E', '\u30FC', '\u30FD', '\u30FE',
1677 '\uFE7C', '\uFE7D', '\uFF70'};
1678 foreach (char c in specialBiggest)
1679 AddCharMap (c, 0xFF, 0);
1682 // Characters w/ diacritical marks (NFKD)
1683 for (int i = 0; i <= char.MaxValue; i++) {
1684 if (map [i].Defined || IsIgnorable (i))
1686 if (decompIndex [i] == 0)
1689 int start = decompIndex [i];
1690 int primaryChar = decompValues [start];
1691 if (map [primaryChar].Level1 == 0)
1695 for (int l = 1; l < decompLength [i]; l++) {
1696 int c = decompValues [start + l];
1697 if (map [c].Level1 != 0)
1699 secondary += diacritical [c];
1703 map [i] = new CharMapEntry (
1704 map [primaryChar].Category,
1705 map [primaryChar].Level1,
1711 // FIXME: this is hack but those which are
1712 // NonSpacingMark characters and still undefined
1713 // are likely to be nonspacing.
1714 for (int i = 0; i < char.MaxValue; i++)
1715 if (!map [i].Defined &&
1717 Char.GetUnicodeCategory ((char) i) ==
1718 UnicodeCategory.NonSpacingMark)
1719 AddCharMap ((char) i, 1, 1);
// Advances the fill index of the given category by one. fillIndex is a
// byte array, so incrementing past 0xFF wraps to 0; in that case the
// index is restarted at 0x2.
// NOTE(review): one statement of the wrap branch is not visible in this
// listing; since hangulCat is passed by ref it presumably also moves on
// to the next category - confirm against the full file.
1722 private void IncrementSequentialIndex (ref byte hangulCat)
1724 fillIndex [hangulCat]++;
1725 if (fillIndex [hangulCat] == 0) { // wrapped around past 0xFF
1727 fillIndex [hangulCat] = 0x2;
1731 // Reset fillIndex to fixed value and call AddLetterMap().
//
// c           - base letter to register.
// category    - category whose fill index is overwritten.
// alphaWeight - value stored into fillIndex [category] before mapping.
//
// After the base letter, every code point listed for c in latinMap is
// registered through AddLetterMap() as well, each with an update count
// of 0.
// NOTE(review): the lines between the latinMap lookup and the loop are
// not visible here; presumably they return early when latinMap has no
// list for c - confirm against the full file.
1732 private void AddAlphaMap (char c, byte category, byte alphaWeight)
1734 fillIndex [category] = alphaWeight;
1735 AddLetterMap (c, category, 0);
1737 ArrayList al = latinMap [c] as ArrayList;
1741 //Console.Error.WriteLine ("PROCESSING {0}: {1} entries", c, al.Count);
1742 //foreach (int cp in al) Console.Error.WriteLine (" {0:X04}", cp);
1743 foreach (int cp in al)
1744 AddLetterMap ((char) cp, category, 0);
// Registers `voices` consecutive code points starting at i in category
// 0x22, and for each of them also the code point 0x60 above it.
// The first code point (b == 0) gets level2 weight 0; each following one
// gets level2 weight b + 2. The update count passed down is always 0.
// NOTE(review): the +0x60 offset presumably maps a Hiragana code point to
// its Katakana counterpart - confirm.
1747 private void AddKanaMap (int i, byte voices)
1749 for (byte b = 0; b < voices; b++) {
1750 char c = (char) (i + b);
1751 byte arg = (byte) (b > 0 ? b + 2 : 0);
1753 AddLetterMapCore (c, 0x22, 0, arg);
1755 AddLetterMapCore ((char) (c + 0x60), 0x22, 0, arg);
// Convenience overload: forwards to AddLetterMapCore() with a level2
// weight of 0.
1759 private void AddLetterMap (char c, byte category, byte updateCount)
1761 AddLetterMapCore (c, category, updateCount, 0);
// Registers letter c in the given category with the given level2 weight.
// Visible steps, in order:
//  - the result of ToSmallForm (c) is passed to AddCharMapGroup2() with
//    updateCount;
//  - the invariant-culture lowercase of c is registered recursively with
//    an update count of 0 when it differs from c and is not yet defined;
//  - c itself is registered with an update count of 0 when not yet
//    defined;
//  - fillIndex [category] is advanced by updateCount.
// NOTE(review): several lines (the declaration of c2, the guard around
// the small-form call and the use of doUpdate) are missing from this
// listing, so the exact conditions are not documented here.
1764 private void AddLetterMapCore (char c, byte category, byte updateCount, byte level2)
1767 // <small> updates index
1768 c2 = ToSmallForm (c);
1770 AddCharMapGroup2 (c2, category, updateCount, level2);
1771 c2 = Char.ToLower (c, CultureInfo.InvariantCulture);
1772 if (c2 != c && !map [(int) c2].Defined)
1773 AddLetterMapCore (c2, category, 0, level2);
1774 bool doUpdate = true;
1775 if (!map [c].Defined)
1776 AddCharMapGroup2 (c, category, 0, level2);
1780 fillIndex [category] += updateCount;
// Convenience overload: forwards to the four-argument AddCharMap() with
// alt = 0.
1783 private void AddCharMap (char c, byte category, byte increment)
1785 AddCharMap (c, category, increment, 0);
// Stores a CharMapEntry for c and then advances fillIndex [category] by
// increment. Nothing happens when c is ignorable (IsIgnorable) or already
// has a defined entry.
// The CharMapEntry constructor takes (category, level1, level2):
//  - for category 1 the running fill index is stored as level2 and alt
//    as level1;
//  - for every other category the fill index is level1 and alt is level2.
1788 private void AddCharMap (char c, byte category, byte increment, byte alt)
1790 if (IsIgnorable ((int) c) || map [(int) c].Defined) {
1791 return; // do nothing
1794 map [(int) c] = new CharMapEntry (category,
1795 category == 1 ? alt : fillIndex [category],
1796 category == 1 ? fillIndex [category] : alt);
1797 fillIndex [category] += increment;
// Registers, in this order, the result of ToSmallFormTail (c), c itself,
// and then recurses on the result of ToFullWidthTail (c). Every entry is
// added with level2 (alt) = 0 and advances the fill index by updateCount.
// NOTE(review): the guards around the first call and around the recursion
// are missing from this listing; presumably they skip the call when the
// helper reports no such form - confirm against the full file.
1800 private void AddCharMapGroupTail (char c, byte category, byte updateCount)
1802 char c2 = ToSmallFormTail (c);
1804 AddCharMap (c2, category, updateCount, 0);
1806 AddCharMap (c, category, updateCount, 0);
1808 c2 = ToFullWidthTail (c);
1810 AddCharMapGroupTail (c2, category, updateCount);
1814 // Adds characters to table in the order below
1815 // (+ increases weight):
1819 // <full> | <super> | <sub>
1820 // <circle> | <wide> (| <narrow>)
1824 // level2 is fixed (does not increase).
//
// Decomposition type codes, in iteration order, whose variants
// AddCharMapGroup2() registers with an increment of 0.
// NOTE(review): some entries of this initializer are missing from this
// listing.
1825 int [] sameWeightItems = new int [] {
1826 0, // canonically compatible
1827 DecompositionFraction,
1831 DecompositionCircle,
1833 DecompositionNarrow,
// Registers c together with the variants recorded for it in nfkdMap.
// Order of registration as visible here:
//  1. the DecompositionSmall variant, advancing the fill index by
//     updateCount (added through the overload that uses alt = 0);
//  2. c itself, without advancing the fill index;
//  3. each variant whose decomposition type is listed in sameWeightItems,
//     without advancing the fill index;
//  4. fillIndex [category] is advanced by updateCount;
//  5. the DecompositionVertical variant, advancing the fill index again.
// char.MinValue is used as the "no such variant" marker for small and
// vertical.
// NOTE(review): the null checks on nfkd and on the looked-up values are
// missing from this listing - confirm against the full file.
1835 private void AddCharMapGroup2 (char c, byte category, byte updateCount, byte level2)
1837 char small = char.MinValue;
1838 char vertical = char.MinValue;
1839 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
1841 object smv = nfkd [(byte) DecompositionSmall];
1843 small = (char) ((int) smv);
1844 object vv = nfkd [(byte) DecompositionVertical];
1846 vertical = (char) ((int) vv);
1849 // <small> updates index
1850 if (small != char.MinValue)
1851 AddCharMap (small, category, updateCount);
1854 AddCharMap (c, category, 0, level2);
1857 foreach (int weight in sameWeightItems) {
1858 object wv = nfkd [(byte) weight];
1860 AddCharMap ((char) ((int) wv), category, 0, level2);
1864 // update index here.
1865 fillIndex [category] += updateCount;
1867 if (vertical != char.MinValue)
1868 AddCharMap (vertical, category, updateCount, level2);
// Registers c at the current fill index of the category (increment 0,
// alt 0) and then steps the index with IncrementSequentialIndex().
// When the position reached is category 0x9E, index 0xF9, one more step
// is taken so that this slot stays unused.
1871 private void AddCharMapCJK (char c, ref byte category)
1873 AddCharMap (c, category, 0, 0);
1874 IncrementSequentialIndex (ref category);
1876 // Special. I wonder why but Windows skips 9E F9.
1877 if (category == 0x9E && fillIndex [category] == 0xF9)
1878 IncrementSequentialIndex (ref category);
// Registers CJK character c through AddCharMapCJK() and then the
// characters recorded for it in nfkdMap, looked up for every
// decomposition type code from 0 to 17.
// Right after U+52DE, four enclosed characters (U+3298, U+3238, U+32A2,
// U+32A9) are registered explicitly; the same four code points are
// singled out by a case label inside the loop.
// NOTE(review): the null checks, the declaration of w, the switch header
// and the branch bodies are missing from this listing, so what happens
// for the special-cased values is not documented here.
1881 private void AddCharMapGroupCJK (char c, ref byte category)
1883 AddCharMapCJK (c, ref category);
1885 // LAMESPEC: see below.
1886 if (c == '\u52DE') {
1887 AddCharMapCJK ('\u3298', ref category);
1888 AddCharMapCJK ('\u3238', ref category);
1891 AddCharMapCJK ('\u32A2', ref category);
1893 // Especially this mapping order totally does
1894 // not make sense to me.
1895 AddCharMapCJK ('\u32A9', ref category);
1897 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
1900 for (byte weight = 0; weight <= 17; weight++) {
1901 object wv = nfkd [weight];
1906 // Special: they are ignored in this area.
1907 // FIXME: check if it is sane
1908 if (0xF900 <= w && w <= 0xFAD9)
1910 // LAMESPEC: on Windows some of CJK characters
1911 // in 3200-32B0 are incorrectly mapped. They
1912 // mix Chinese and Japanese Kanji when
1913 // ordering those characters.
1915 case 0x32A2: case 0x3298: case 0x3238: case 0x32A9:
1919 AddCharMapCJK ((char) w, ref category);
1923 // note that level2 is fixed
1924 // different order than AddCharMapGroup2()
//
// Registers c (advancing the fill index by updateCount), then walks the
// decomposition type codes 1..17 in nfkdMap for c. A recorded code point
// whose decompLength is 1 is registered recursively through this method.
// AddCharMapGroup2() is also called for c.
// NOTE(review): guard lines (including a null check on nfkd and whatever
// precedes the first AddCharMap call) are missing from this listing, so
// the exact conditions for each call are not documented here.
1925 private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2)
1929 AddCharMap (c, category, updateCount, level2);
1931 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
1935 // Here type of i must be byte since the constants
1936 // are stored as byte.
1937 for (byte i = 1; i <= 17; i++) {
1938 if (nfkd.ContainsKey (i)) {
1939 int cp = (int) nfkd [i];
1940 if (decompLength [cp] == 1) {
1941 AddCharMapGroup ((char) cp, category, updateCount, level2);
1946 AddCharMapGroup2 (c, category, updateCount, level2);
// Shorthand for ToDecomposed (c, DecompositionFull, false).
1949 char ToFullWidth (char c)
1951 return ToDecomposed (c, DecompositionFull, false);
// Shorthand for ToDecomposed (c, DecompositionFull, true).
1954 char ToFullWidthTail (char c)
1956 return ToDecomposed (c, DecompositionFull, true);
// Shorthand for ToDecomposed (c, DecompositionSmall, false).
1959 char ToSmallForm (char c)
1961 return ToDecomposed (c, DecompositionSmall, false);
// Shorthand for ToDecomposed (c, DecompositionSmall, true).
1964 char ToSmallFormTail (char c)
1966 return ToDecomposed (c, DecompositionSmall, true);
// Looks up the decomposition of c when its decomposition type
// (decompType) equals d, and returns one element of decompValues for it.
// The lookup starts at decompIndex [c]; adding decompLength [c] - 1 moves
// to the last element of the decomposition.
// NOTE(review): the statement taken when the type does not match and the
// condition on `tail` are missing from this listing; presumably c is
// returned unchanged on a mismatch and the last element is selected only
// when tail is true - confirm against the full file.
1969 char ToDecomposed (char c, byte d, bool tail)
1971 if (decompType [(int) c] != d)
1973 int idx = decompIndex [(int) c];
1975 idx += decompLength [(int) c] - 1;
1976 return (char) decompValues [idx];
// Iterates over the jisJapanese list of JISCharacter entries.
// NOTE(review): the loop body and the return statements are missing from
// this listing; presumably it reports whether cp matches any entry's CP -
// confirm against the full file.
1979 bool ExistsJIS (int cp)
1981 foreach (JISCharacter j in jisJapanese)
1989 #region Level 3 properties (Case/Width)
// Returns the raw level 3 weight of c plus 2 when the raw weight is
// non-zero; a raw weight of 0 is returned unchanged.
1991 private byte ComputeLevel3Weight (char c)
1993 byte b = ComputeLevel3WeightRaw (c);
1994 return b > 0 ? (byte) (b + 2) : b;
// Computes the raw level 3 weight of c; ComputeLevel3Weight() adds 2 to a
// non-zero result to obtain the sort key value.
// The visible logic tests c against a series of hard-coded code point
// ranges, then looks at its decomposition type (decompType), and finally
// at the isSmallCapital and isUppercase tables. For the <wide>, <sub> and
// <super> decomposition types the type code itself is OR-ed into the
// result.
// NOTE(review): the return statements and weight constants for each
// range, and the declaration of ret, are missing from this listing, so
// the individual weights are not documented here.
1997 private byte ComputeLevel3WeightRaw (char c) // add 2 for sortkey value
2000 if ('\u11A8' <= c && c <= '\u11F9')
2002 if ('\uFFA0' <= c && c <= '\uFFDC')
2004 if ('\u3130' <= c && c <= '\u3164')
2007 if ('\u2776' <= c && c <= '\u277F')
2009 if ('\u2780' <= c && c <= '\u2789')
2011 if ('\u2776' <= c && c <= '\u2793')
2013 if ('\u2160' <= c && c <= '\u216F')
2015 if ('\u2181' <= c && c <= '\u2182')
2018 if ('\u2135' <= c && c <= '\u2138')
2020 if ('\uFE80' <= c && c < '\uFE8E') {
2021 // 2(Isolated)/8(Final)/0x18(Medial)
2022 switch (decompType [(int) c]) {
2023 case DecompositionIsolated:
2025 case DecompositionFinal:
2027 case DecompositionMedial:
2032 // actually I dunno the reason why they have weights.
2055 switch (decompType [(int) c]) {
2056 case DecompositionWide: // <wide>
2057 case DecompositionSub: // <sub>
2058 case DecompositionSuper: // <super>
2059 ret |= decompType [(int) c];
2062 if (isSmallCapital [(int) c]) // grep "SMALL CAPITAL"
2064 if (isUppercase [(int) c]) // DerivedCoreProperties
2073 // FIXME: In the future use DerivedAge.txt to examine character
2074 // versions and set those ones that have higher version than
2075 // 1.0 as ignorable.
//
// Classifies code point i as ignorable or not. The visible logic
// consults, in order: explicit lists of single code points, a chain of
// hard-coded code point ranges, and finally the UnicodeCategory of
// (char) i.
// NOTE(review): the switch/if headers and every return statement are
// missing from this listing, so the result produced by each group is not
// documented here - confirm against the full file.
2076 static bool IsIgnorable (int i)
2080 // I guess, those characters are added between
2081 // Unicode 1.0 (LCMapString) and Unicode 3.1
2082 // (UnicodeCategory), so they used to be
2083 // something like OtherNotAssigned as of Unicode 1.1.
2084 case 0x2df: case 0x387:
2085 case 0x3d7: case 0x3d8: case 0x3d9:
2086 case 0x3f3: case 0x3f4: case 0x3f5: case 0x3f6:
2087 case 0x400: case 0x40d: case 0x450: case 0x45d:
2088 case 0x587: case 0x58a: case 0x5c4: case 0x640:
2089 case 0x653: case 0x654: case 0x655: case 0x66d:
2091 case 0x1e9b: case 0x202f: case 0x20ad:
2092 case 0x20ae: case 0x20af:
2093 case 0x20e2: case 0x20e3:
2094 case 0x2139: case 0x213a: case 0x2183:
2095 case 0x2425: case 0x2426: case 0x2619:
2096 case 0x2670: case 0x2671: case 0x3007:
2097 case 0x3190: case 0x3191:
2098 case 0xfffc: case 0xfffd:
2100 // exceptional characters filtered by the
2101 // following conditions. Originally those exceptional
2102 // ranges are incorrect (they should not be ignored)
2103 // and most of those characters are unfortunately in
2105 case 0x4d8: case 0x4d9:
2106 case 0x4e8: case 0x4e9:
2107 case 0x3036: case 0x303f:
2108 case 0x337b: case 0xfb1e:
2113 // The whole Sinhala characters.
2114 0x0D82 <= i && i <= 0x0DF4
2115 // The whole Tibetan characters.
2116 || 0x0F00 <= i && i <= 0x0FD1
2117 // The whole Myanmar characters.
2118 || 0x1000 <= i && i <= 0x1059
2119 // The whole Ethiopic, Cherokee,
2120 // Canadian Syllabic, Ogham, Runic,
2121 // Tagalog, Hanunoo, Philippine,
2122 // Buhid, Tagbanwa, Khmer and Mongolian
2124 || 0x1200 <= i && i <= 0x1DFF
2125 // Greek extension characters.
2126 || 0x1F00 <= i && i <= 0x1FFF
2127 // The whole Braille characters.
2128 || 0x2800 <= i && i <= 0x28FF
2129 // CJK radical characters.
2130 || 0x2E80 <= i && i <= 0x2EF3
2131 // Kangxi radical characters.
2132 || 0x2F00 <= i && i <= 0x2FD5
2133 // Ideographic description characters.
2134 || 0x2FF0 <= i && i <= 0x2FFB
2135 // Bopomofo letter and final
2136 || 0x31A0 <= i && i <= 0x31B7
2137 // White square with quadrant characters.
2138 || 0x25F0 <= i && i <= 0x25F7
2139 // Ideographic telegraph symbols.
2140 || 0x32C0 <= i && i <= 0x32CB
2141 || 0x3358 <= i && i <= 0x3370
2142 || 0x33E0 <= i && i <= 0x33FF
2143 // The whole YI characters.
2144 || 0xA000 <= i && i <= 0xA48C
2145 || 0xA490 <= i && i <= 0xA4C6
2146 // Armenian small ligatures
2147 || 0xFB13 <= i && i <= 0xFB17
2148 // hebrew, arabic, variation selector.
2149 || 0xFB1D <= i && i <= 0xFE2F
2150 // Arabic ligatures.
2151 || 0xFEF5 <= i && i <= 0xFEFC
2152 // FIXME: why are they excluded?
2153 || 0x01F6 <= i && i <= 0x01F9
2154 || 0x0218 <= i && i <= 0x0233
2155 || 0x02A9 <= i && i <= 0x02AD
2156 || 0x02EA <= i && i <= 0x02EE
2157 || 0x0349 <= i && i <= 0x036F
2158 || 0x0488 <= i && i <= 0x048F
2159 || 0x04D0 <= i && i <= 0x04FF
2160 || 0x0500 <= i && i <= 0x050F // actually it matters only for 2.0
2161 || 0x06D6 <= i && i <= 0x06ED
2162 || 0x06FA <= i && i <= 0x06FE
2163 || 0x2048 <= i && i <= 0x204D
2164 || 0x20e4 <= i && i <= 0x20ea
2165 || 0x213C <= i && i <= 0x214B
2166 || 0x21EB <= i && i <= 0x21FF
2167 || 0x22F2 <= i && i <= 0x22FF
2168 || 0x237B <= i && i <= 0x239A
2169 || 0x239B <= i && i <= 0x23CF
2170 || 0x24EB <= i && i <= 0x24FF
2171 || 0x2596 <= i && i <= 0x259F
2172 || 0x25F8 <= i && i <= 0x25FF
2173 || 0x2672 <= i && i <= 0x2689
2174 || 0x2768 <= i && i <= 0x2775
2175 || 0x27d0 <= i && i <= 0x27ff
2176 || 0x2900 <= i && i <= 0x2aff
2177 || 0x3033 <= i && i <= 0x303F
2178 || 0x31F0 <= i && i <= 0x31FF
2179 || 0x3250 <= i && i <= 0x325F
2180 || 0x32B1 <= i && i <= 0x32BF
2181 || 0x3371 <= i && i <= 0x337B
2182 || 0xFA30 <= i && i <= 0xFA6A
2186 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
2188 case UnicodeCategory.PrivateUse:
2189 case UnicodeCategory.Surrogate:
2191 // ignored by nature
2192 case UnicodeCategory.Format:
2193 case UnicodeCategory.OtherNotAssigned:
2200 // To check IsIgnorable sanity, try the driver below under MS.NET.
//
// Driver entry point: calls Dump() with the IsIgnorable() verdict for
// every code point from 0 to char.MaxValue.
// NOTE(review): the class already declares Main (string []); this driver
// is presumably disabled (commented out) in the full file - confirm.
2203 public static void Main ()
2205 for (int i = 0; i <= char.MaxValue; i++)
2206 Dump (i, IsIgnorable (i));
// Driver helper: compares the predicted verdict `ignore` for code point i
// with the result of an invariant-culture, case-insensitive string
// comparison, and writes a line with the verdict marker ("o" or "x"), the
// code point in hex and its Unicode category.
// PrivateUse and Surrogate code points are not checked.
// NOTE(review): the declaration of s1 and the statement guarded by the
// final `if` are missing from this listing; presumably matching results
// return without printing - confirm against the full file.
2209 static void Dump (int i, bool ignore)
2211 switch (Char.GetUnicodeCategory ((char) i)) {
2212 case UnicodeCategory.PrivateUse:
2213 case UnicodeCategory.Surrogate:
2214 return; // check nothing
2218 string s2 = new string ((char) i, 10);
2219 int ret = CultureInfo.InvariantCulture.CompareInfo.Compare (s1, s2, CompareOptions.IgnoreCase);
2220 if ((ret == 0) == ignore)
2222 Console.WriteLine ("{0} : {1:x} {2}", ignore ? "o" : "x", i, Char.GetUnicodeCategory ((char) i));
2225 #endregion // IsIgnorable
2227 #region IsIgnorableSymbol
// Classifies code point i for symbol-ignoring comparison. The visible
// logic first consults IsIgnorable (i), then explicit lists of single
// code points, hard-coded ranges, the UnicodeCategory of (char) i, a
// second group of ranges that sets `use` from Char.IsLetterOrDigit, and
// finally the Char.IsLetterOrDigit / IsNumber / IsControl / IsSeparator /
// IsPunctuation / IsSymbol predicates.
// NOTE(review): the switch/if headers and almost every return statement
// are missing from this listing, so the result produced by each group is
// not documented here - confirm against the full file.
2228 static bool IsIgnorableSymbol (int i)
2230 if (IsIgnorable (i))
2235 case 0x00b5: case 0x01C0: case 0x01C1:
2236 case 0x01C2: case 0x01C3: case 0x01F6:
2237 case 0x01F7: case 0x01F8: case 0x01F9:
2238 case 0x02D0: case 0x02EE: case 0x037A:
2239 case 0x03D7: case 0x03F3:
2240 case 0x0400: case 0x040d:
2241 case 0x0450: case 0x045d:
2242 case 0x048C: case 0x048D:
2243 case 0x048E: case 0x048F:
2244 case 0x0587: case 0x0640: case 0x06E5:
2245 case 0x06E6: case 0x06FA: case 0x06FB:
2246 case 0x06FC: case 0x093D: case 0x0950:
2247 case 0x1E9B: case 0x2139: case 0x3006:
2248 case 0x3033: case 0x3034: case 0x3035:
2249 case 0xFE7E: case 0xFE7F:
2251 case 0x16EE: case 0x16EF: case 0x16F0:
2253 case 0x2183: // ROMAN NUMERAL REVERSED ONE HUNDRED
2254 case 0x3007: // IDEOGRAPHIC NUMBER ZERO
2255 case 0x3038: // HANGZHOU NUMERAL TEN
2256 case 0x3039: // HANGZHOU NUMERAL TWENTY
2257 case 0x303a: // HANGZHOU NUMERAL THIRTY
2263 case 0x02B9: case 0x02BA: case 0x02C2:
2264 case 0x02C3: case 0x02C4: case 0x02C5:
2265 case 0x02C8: case 0x02CC: case 0x02CD:
2266 case 0x02CE: case 0x02CF: case 0x02D2:
2267 case 0x02D3: case 0x02D4: case 0x02D5:
2268 case 0x02D6: case 0x02D7: case 0x02DE:
2269 case 0x02E5: case 0x02E6: case 0x02E7:
2270 case 0x02E8: case 0x02E9:
2271 case 0x309B: case 0x309C:
2273 case 0x055A: // Armenian Apos
2274 case 0x05C0: // Hebrew Punct
2275 case 0x0E4F: // Thai FONGMAN
2276 case 0x0E5A: // Thai ANGKHANKHU
2277 case 0x0E5B: // Thai KHOMUT
2279 case 0x09F2: // Bengali Rupee Mark
2280 case 0x09F3: // Bengali Rupee Sign
2282 case 0x221e: // INF.
2291 if (0xFE70 <= i && i < 0xFE7C // ARABIC LIGATURES B
2293 || 0x0501 <= i && i <= 0x0510 // CYRILLIC KOMI
2294 || 0xFA30 <= i && i < 0xFA70 // CJK COMPAT
2299 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
2301 case UnicodeCategory.Surrogate:
2302 return false; // inconsistent
2304 case UnicodeCategory.SpacingCombiningMark:
2305 case UnicodeCategory.EnclosingMark:
2306 case UnicodeCategory.NonSpacingMark:
2307 case UnicodeCategory.PrivateUse:
2309 if (0x064B <= i && i <= 0x0652) // Arabic
2313 case UnicodeCategory.Format:
2314 case UnicodeCategory.OtherNotAssigned:
2321 // latin in a circle
2322 0x249A <= i && i <= 0x24E9
2323 || 0x2100 <= i && i <= 0x2132
2325 || 0x3196 <= i && i <= 0x31A0
2327 || 0x3200 <= i && i <= 0x321C
2329 || 0x322A <= i && i <= 0x3243
2331 || 0x3260 <= i && i <= 0x32B0
2332 || 0x32D0 <= i && i <= 0x3357
2333 || 0x337B <= i && i <= 0x33DD
2335 use = !Char.IsLetterOrDigit ((char) i);
2339 // This "Digit" rule is mystery.
2340 // It filters some symbols out.
2341 if (Char.IsLetterOrDigit ((char) i))
2343 if (Char.IsNumber ((char) i))
2345 if (Char.IsControl ((char) i)
2346 || Char.IsSeparator ((char) i)
2347 || Char.IsPunctuation ((char) i))
2349 if (Char.IsSymbol ((char) i))
2352 // FIXME: should check more
2357 // To check IsIgnorableSymbol sanity, try the driver below under MS.NET.
//
// Driver entry point: for every code point from 0 to char.MaxValue it
// compares "TEST " with "TEST " + (char) i using the invariant culture's
// CompareInfo and CompareOptions.IgnoreSymbols, and writes a line when
// IsIgnorableSymbol (i) disagrees with the comparison result.
// NOTE(review): the statement taken for Surrogate code points and the
// tail of the WriteLine call are missing from this listing; this driver
// is presumably disabled (commented out) in the full file - confirm.
2359 public static void Main ()
2361 CompareInfo ci = CultureInfo.InvariantCulture.CompareInfo;
2362 for (int i = 0; i <= char.MaxValue; i++) {
2363 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
2364 if (uc == UnicodeCategory.Surrogate)
2367 bool ret = IsIgnorableSymbol (i);
2369 string s1 = "TEST ";
2370 string s2 = "TEST " + (char) i;
2372 int result = ci.Compare (s1, s2, CompareOptions.IgnoreSymbols);
2374 if (ret != (result == 0))
2375 Console.WriteLine ("{0} : {1:x}[{2}]({3})",
2376 ret ? "should not ignore" :
// Classifies code point i for nonspacing-ignoring comparison. The visible
// logic first consults IsIgnorable (i), then two explicit lists of single
// code points and two groups of hard-coded ranges; any code point not
// decided earlier is reported as ignorable exactly when its
// UnicodeCategory is NonSpacingMark.
// NOTE(review): the switch headers and the return statements of the
// earlier groups are missing from this listing, so their results are not
// documented here - confirm against the full file.
2385 static bool IsIgnorableNonSpacing (int i)
2387 if (IsIgnorable (i))
2391 case 0x02C8: case 0x02DE: case 0x0559: case 0x055A:
2392 case 0x05C0: case 0x0ABD: case 0x0CD5: case 0x0CD6:
2393 case 0x309B: case 0x309C: case 0xFF9E: case 0xFF9F:
2395 case 0x02D0: case 0x0670: case 0x0901: case 0x0902:
2396 case 0x094D: case 0x0962: case 0x0963: case 0x0A41:
2397 case 0x0A42: case 0x0A47: case 0x0A48: case 0x0A4B:
2398 case 0x0A4C: case 0x0A81: case 0x0A82: case 0x0B82:
2399 case 0x0BC0: case 0x0CBF: case 0x0CC6: case 0x0CCC:
2400 case 0x0CCD: case 0x0E4E:
2404 if (0x02b9 <= i && i <= 0x02c5
2405 || 0x02cc <= i && i <= 0x02d7
2406 || 0x02e4 <= i && i <= 0x02ef
2407 || 0x20DD <= i && i <= 0x20E0
2411 if (0x064B <= i && i <= 0x00652
2412 || 0x0941 <= i && i <= 0x0948
2413 || 0x0AC1 <= i && i <= 0x0ACD
2414 || 0x0C3E <= i && i <= 0x0C4F
2415 || 0x0E31 <= i && i <= 0x0E3F
2419 return Char.GetUnicodeCategory ((char) i) ==
2420 UnicodeCategory.NonSpacingMark;
2423 // We can reuse IsIgnorableSymbol testcode
2424 // for IsIgnorableNonSpacing.
// Members of the per-character map entry (category, level1 and level2
// weights, plus a Defined flag).
// NOTE(review): the type header, the Level1 member and the rest of the
// constructor body are missing from this listing; presumably the
// constructor also stores level1/level2 and sets Defined - confirm.
2430 public byte Category;
2432 public byte Level2; // It is always single byte.
2433 public bool Defined;
2435 public CharMapEntry (byte category, byte level1, byte level2)
2437 Category = category;
// Pairs a code point (CP) with a JIS value (JIS); JISComparer orders
// these entries by JIS.
// NOTE(review): the type header and the constructor body are missing from
// this listing; presumably the constructor stores cp and cpJIS into the
// two fields - confirm.
2446 public readonly int CP;
2447 public readonly int JIS;
2449 public JISCharacter (int cp, int cpJIS)
// IComparer for JISCharacter entries, exposed through a shared Instance.
// Compare() returns j2.JIS - j1.JIS, i.e. the entry with the larger JIS
// value sorts first (descending JIS order).
// Both arguments must be JISCharacter; any other type makes the cast
// throw InvalidCastException.
2456 class JISComparer : IComparer
2458 public static readonly JISComparer Instance =
2461 public int Compare (object o1, object o2)
2463 JISCharacter j1 = (JISCharacter) o1;
2464 JISCharacter j2 = (JISCharacter) o2;
2465 return j2.JIS - j1.JIS;
// Pairs a code point (CP) with a name (Name); NonJISComparer orders these
// entries by Name.
// NOTE(review): the constructor body is missing from this listing;
// presumably it stores cp and name into the two fields - confirm.
2469 class NonJISCharacter
2471 public readonly int CP;
2472 public readonly string Name;
2474 public NonJISCharacter (int cp, string name)
// IComparer for NonJISCharacter entries, exposed through a shared
// Instance. Compare() orders entries by an ordinal (culture-independent)
// comparison of their Name.
// Both arguments must be NonJISCharacter; any other type makes the cast
// throw InvalidCastException.
2481 class NonJISComparer : IComparer
2483 public static readonly NonJISComparer Instance =
2484 new NonJISComparer ();
2486 public int Compare (object o1, object o2)
2488 NonJISCharacter j1 = (NonJISCharacter) o1;
2489 NonJISCharacter j2 = (NonJISCharacter) o2;
2490 return string.CompareOrdinal (j1.Name, j2.Name);
// IComparer for DictionaryEntry items, available only through the shared
// Instance (the constructor is private). Compare() compares the entries'
// values as decimals first; the integer keys are read afterwards.
// Values must be boxed decimal and keys boxed int, otherwise the unboxing
// casts throw InvalidCastException.
// NOTE(review): the statements that use ret, i1 and i2 are missing from
// this listing; presumably the keys break ties when the values compare
// equal - confirm against the full file.
2494 class DictionaryValueComparer : IComparer
2496 public static readonly DictionaryValueComparer Instance
2497 = new DictionaryValueComparer ();
2499 private DictionaryValueComparer ()
2503 public /*static*/ int Compare (object o1, object o2)
2505 DictionaryEntry e1 = (DictionaryEntry) o1;
2506 DictionaryEntry e2 = (DictionaryEntry) o2;
2507 // FIXME: in case of 0, compare decomposition categories
2508 int ret = Decimal.Compare ((decimal) e1.Value, (decimal) e2.Value);
2511 int i1 = (int) e1.Key;
2512 int i2 = (int) e2.Key;
2517 class UCAComparer : IComparer
2519 public static readonly UCAComparer Instance
2520 = new UCAComparer ();
2522 private UCAComparer ()
2526 public int Compare (object o1, object o2)
2528 char i1 = (char) o1;
2529 char i2 = (char) o2;
2531 int l1 = CollationElementTable.GetSortKeyCount (i1);
2532 int l2 = CollationElementTable.GetSortKeyCount (i2);
2533 int l = l1 > l2 ? l2 : l1;
2535 for (int i = 0; i < l; i++) {
2536 SortKeyValue k1 = CollationElementTable.GetSortKey (i1, i);
2537 SortKeyValue k2 = CollationElementTable.GetSortKey (i2, i);
2538 int v = k1.Primary - k2.Primary;
2541 v = k1.Secondary - k2.Secondary;
2544 v = k1.Thirtiary - k2.Thirtiary;
2547 v = k1.Quarternary - k2.Quarternary;