3 // There are two kinds of sort keys: those which are computed and those which
4 // are laid out as an indexed array. Computed sort keys are:
9 // Also, for composite characters it should prepare a different index table.
11 // Though it is possible to "compute" level 3 weights, they are still dumped
12 // to an array to avoid execution cost.
16 // * sortkey getter signature
18 // int GetSortKey (string s, int index, SortKeyBuffer buf)
19 // Stores sort key for corresponding character element into buf and
20 // returns the length of the consumed _source_ character element in s.
22 // * character length to consume
24 // If there are characters whose primary weight is 0, they are consumed
25 // and considered as a part of the character element.
30 using System.Collections;
31 using System.Globalization;
34 namespace Mono.Globalization.Unicode
36 internal class MSCompatSortKeyTableGenerator
// Program entry point: creates a generator instance and passes the
// command-line arguments on to Run().
38 public static void Main (string [] args)
40 new MSCompatSortKeyTableGenerator ().Run (args);
// Decomposition kind codes. ProcessUnidataLine() stores one of these per
// code point in decompType [], and the widthCompat table writer switches on
// Narrow, Wide, Super and Sub. The "fixed" remarks are the original author's.
// NOTE(review): the values are not in declaration order (Full is 0xE and
// Narrow is 0xD) - confirm this is intentional before renumbering anything.
43 const int DecompositionWide = 1; // fixed
44 const int DecompositionSub = 2; // fixed
45 const int DecompositionSmall = 3;
46 const int DecompositionIsolated = 4;
47 const int DecompositionInitial = 5;
48 const int DecompositionFinal = 6;
49 const int DecompositionMedial = 7;
50 const int DecompositionNoBreak = 8;
51 const int DecompositionVertical = 9;
52 const int DecompositionFraction = 0xA;
53 const int DecompositionFont = 0xB;
54 const int DecompositionSuper = 0xC; // fixed
55 const int DecompositionFull = 0xE;
56 const int DecompositionNarrow = 0xD;
57 const int DecompositionCircle = 0xF;
58 const int DecompositionSquare = 0x10;
59 const int DecompositionCompat = 0x11;
60 const int DecompositionCanonical = 0x12;
// Sink for the generated table source text; defaults to standard output.
62 TextWriter Result = Console.Out;
// One byte per category value. GenerateCore() assigns into it by category
// number (for example fillIndex [6] = 0x80).
64 byte [] fillIndex = new byte [256]; // by category
// One entry per UTF-16 code unit (0 .. char.MaxValue). The Category, Level1
// and Level2 members of each entry are what the table writer dumps.
65 CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1];
// Characters that GenerateCore() maps to CharMapEntry (0, 0, 0).
67 char [] specialIgnore = new char [] {
68 '\u3099', '\u309A', '\u309B', '\u309C', '\u0BCD',
69 '\u0E47', '\u0E4C', '\uFF9E', '\uFF9F'
// 29 characters: 'A' .. 'Z' followed by three extra letters. alphaWeights
// below also holds 29 bytes.
// NOTE(review): presumably alphaWeights [n] is the weight assigned to
// alphabets [n] - confirm at the use site.
72 // FIXME: need more love (as always)
73 char [] alphabets = new char [] {'A', 'B', 'C', 'D', 'E', 'F',
74 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
75 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
76 '\u0292', '\u01BE', '\u0298'};
77 byte [] alphaWeights = new byte [] {
78 2, 9, 0xA, 0x1A, 0x21,
79 0x23, 0x25, 0x2C, 0x32, 0x35,
80 0x36, 0x48, 0x51, 0x70, 0x7C,
81 0x7E, 0x89, 0x8A, 0x91, 0x99,
82 0x9F, 0xA2, 0xA4, 0xA6, 0xA7,
83 0xA9, 0xAA, 0xB3, 0xB4};
// Per-code-point flags. isSmallCapital is set by ProcessUnidataLine() when
// the data line contains "SMALL CAPITAL"; isUppercase is set by
// ProcessDerivedCorePropLine() for every code point in a matching range.
85 bool [] isSmallCapital = new bool [char.MaxValue + 1];
86 bool [] isUppercase = new bool [char.MaxValue + 1];
// Per-code-point decomposition data filled by ProcessUnidataLine():
// the Decomposition* kind, the start index into the decomposition value
// list, and the number of values in the decomposition.
88 byte [] decompType = new byte [char.MaxValue + 1];
89 int [] decompIndex = new int [char.MaxValue + 1];
90 int [] decompLength = new int [char.MaxValue + 1];
// Numeric value per code point, parsed from the numeric fields of the
// UnicodeData line.
92 decimal [] decimalValue = new decimal [char.MaxValue + 1];
// Per-code-point weight byte: ProcessUnidataLine() ORs diacriticWeights
// entries into it, and ModifyParsedValues() overwrites it for some ranges.
94 byte [] diacritical = new byte [char.MaxValue + 1];
// Character-name fragments searched for in each UnicodeData line by
// ProcessUnidataLine(). When diacritics [d] is found, diacriticWeights [d]
// is OR-ed into diacritical [cp], so the two arrays are index-aligned and
// must be edited together.
96 string [] diacritics = new string [] {
98 "WITH ACUTE;", "WITH GRAVE;", " DOT ABOVE;", " MIDDLE DOT;",
99 "WITH CIRCUMFLEX;", "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;",
100 " DIALYTIKA AND TONOS;", "WITH MACRON;", "WITH TILDE;", " RING ABOVE;",
101 " OGONEK;", " CEDILLA;",
102 " DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;",
103 " STROKE;", " CIRCUMFLEX AND ACUTE;",
104 " DIAERESIS AND ACUTE;", "WITH CIRCUMFLEX AND GRAVE;", " L SLASH;",
105 " DIAERESIS AND GRAVE;",
107 " CARON AND DOT ABOVE;", " BREVE AND GRAVE;",
108 " MACRON AND ACUTE;",
109 " MACRON AND GRAVE;",
110 " DIAERESIS AND CARON", " DOT ABOVE AND MACRON", " TILDE AND ACUTE",
111 " RING ABOVE AND ACUTE",
112 " DIAERESIS AND MACRON", " CEDILLA AND ACUTE", " MACRON AND DIAERESIS",
113 " CIRCUMFLEX AND TILDE",
114 " TILDE AND DIAERESIS",
117 " CEDILLA AND BREVE",
118 " OGONEK AND MACRON",
119 " HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;",
122 " PRECEDED BY APOSTROPHE",
124 " LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE",
127 " RETROFLEX;", "DIAERESIS BELOW",
129 " CIRCUMFLEX BELOW", "HORN AND ACUTE",
130 " BREVE BELOW;", " HORN AND GRAVE",
132 " DOT BELOW AND DOT ABOVE",
133 " RIGHT HALF RING", " HORN AND TILDE",
134 " CIRCUMFLEX AND DOT BELOW",
135 " BREVE AND DOT BELOW",
136 " DOT BELOW AND MACRON",
137 " HORN AND HOOK ABOVE",
139 // CIRCLED, PARENTHESIZED and so on
140 "CIRCLED DIGIT", "CIRCLED NUMBER", "CIRCLED LATIN", "CIRCLED KATAKANA",
141 "PARENTHESIZED DIGIT", "PARENTHESIZED NUMBER", "PARENTHESIZED LATIN",
// Weight bytes matched by position to the name fragments in diacritics.
143 byte [] diacriticWeights = new byte [] {
145 0xE, 0xF, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
146 0x17, 0x19, 0x1A, 0x1B, 0x1C,
147 0x1D, 0x1D, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F,
148 0x20, 0x21, 0x22, 0x22, 0x23, 0x24,
149 0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28,
150 0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30,
151 0x43, 0x43, 0x43, 0x44, 0x46, 0x48,
152 0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x5A,
153 0x60, 0x60, 0x61, 0x61, 0x63, 0x68,
154 0x69, 0x69, 0x6A, 0x6D, 0x6E,
156 // CIRCLED, PARENTHESIZED and so on.
157 0xEE, 0xEE, 0xEE, 0xEE, 0xF3, 0xF3, 0xF3, 0xF3
// Consecutive pairs of code points read by ModifyParsedValues() as
// half-open ranges [start, end). Every number character inside a pair's
// range receives the same diacritical weight, and the weight is incremented
// from one pair to the next.
160 int [] numberSecondaryWeightBounds = new int [] {
161 0x660, 0x680, 0x6F0, 0x700, 0x960, 0x970,
162 0x9E0, 0x9F0, 0x9F4, 0xA00, 0xA60, 0xA70,
163 0xAE0, 0xAF0, 0xB60, 0xB70, 0xBE0, 0xC00,
164 0xC60, 0xC70, 0xCE0, 0xCF0, 0xD60, 0xD70,
165 0xE50, 0xE60, 0xED0, 0xEE0
// Per-script character lists. ParseScripts() collects the non-ignorable
// code points of each script, sorts them with UCAComparer.Instance and
// stores the resulting arrays here.
168 char [] orderedCyrillic;
169 char [] orderedGurmukhi;
170 char [] orderedGujarati;
171 char [] orderedGeorgian;
172 char [] orderedThaana;
// Hand-ordered Tamil consonants (a fixed list, not derived from the parsed
// data files).
174 static readonly char [] orderedTamilConsonants = new char [] {
175 // based on traditional Tamil consonants, except for
176 // Grantha (where Microsoft breaks traditionalism).
177 // http://www.angelfire.com/empire/thamizh/padanGaL
178 '\u0B99', '\u0B9A', '\u0B9E', '\u0B9F', '\u0BA3',
179 '\u0BA4', '\u0BA8', '\u0BAA', '\u0BAE', '\u0BAF',
180 '\u0BB0', '\u0BB2', '\u0BB5', '\u0BB4', '\u0BB3',
181 '\u0BB1', '\u0BA9', '\u0B9C', '\u0BB8', '\u0BB7',
// Lookup structures filled while parsing. The three ArrayLists hold
// DictionaryEntry items added by ProcessUnidataLine().
184 // cp -> character name (only for some characters)
185 ArrayList sortableCharNames = new ArrayList ();
187 // cp -> arrow value (int)
188 ArrayList arrowValues = new ArrayList ();
190 // cp -> box value (int)
191 ArrayList boxValues = new ArrayList ();
193 // cp -> level1 value
194 Hashtable arabicLetterPrimaryValues = new Hashtable ();
// primary Arabic letter name -> the first cp registered under that name
197 Hashtable arabicNameMap = new Hashtable ();
199 // cp -> Hashtable [decompType] -> cp
200 Hashtable nfkdMap = new Hashtable ();
202 // Latin letter -> ArrayList [int]
203 Hashtable latinMap = new Hashtable ();
// JISCharacter entries from ParseJISOrder() and NonJISCharacter entries
// (code points 0x3300 .. 0x3357) from ProcessUnidataLine().
205 ArrayList jisJapanese = new ArrayList ();
206 ArrayList nonJisJapanese = new ArrayList ();
// CJK weight tables. Each is indexed by (cp - offset) where
// offset = char.MaxValue - Length, so cjkCHS starts at 0x3100 and the
// others at 0x4E00.
208 ushort [] cjkJA = new ushort [char.MaxValue - 0x4E00];
209 ushort [] cjkCHS = new ushort [char.MaxValue - 0x3100];
210 ushort [] cjkCHT = new ushort [char.MaxValue - 0x4E00];
211 ushort [] cjkKO = new ushort [char.MaxValue - 0x4E00];
212 byte [] cjkKOlv2 = new byte [char.MaxValue - 0x4E00];
// Bit flags per code point set by FillIgnorables():
// 1 = IsIgnorable, 2 = IsIgnorableSymbol, 4 = IsIgnorableNonSpacing.
214 byte [] ignorableFlags = new byte [char.MaxValue + 1];
// Value from DerivedAge.txt per code point, filled by ParseDerivedAge().
216 double [] unicodeAge = new double [char.MaxValue + 1];
// Drives the generator: parses the source data files, post-processes the
// parsed values and reports each finished phase on standard error.
// args [0], when present, names the directory that holds the input data
// files; otherwise the directory "downloaded" is used.
218 void Run (string [] args)
220 string dirname = args.Length == 0 ? "downloaded" : args [0];
223 ParseSources (dirname);
224 Console.Error.WriteLine ("parse done.");
226 ModifyParsedValues ();
228 Console.Error.WriteLine ("generation done.");
230 Console.Error.WriteLine ("serialization done.");
// Writes the generated tables to Result as C# array initializers. Each
// table is emitted one value per code point; after every 16th value the
// row is closed with a "// XXXX" comment holding the code point of the
// first entry in that row.
// NOTE(review): each value has a decimal and a hexadecimal Write form; the
// test choosing between them is presumably on the size of the value -
// confirm.
236 Result.WriteLine ("static byte [] ignorableFlags = new byte [] {");
237 for (int i = 0; i <= char.MaxValue; i++) {
238 byte value = ignorableFlags [i];
240 Result.Write ("{0},", value);
242 Result.Write ("0x{0:X02},", value);
243 if ((i & 0xF) == 0xF)
244 Result.WriteLine ("// {0:X04}", i - 0xF);
246 Result.WriteLine ("};");
// Category byte of every map entry.
250 Result.WriteLine ("static byte [] categories = new byte [] {");
251 for (int i = 0; i < map.Length; i++) {
252 byte value = map [i].Category;
254 Result.Write ("{0},", value);
256 Result.Write ("0x{0:X02},", value);
257 if ((i & 0xF) == 0xF)
258 Result.WriteLine ("// {0:X04}", i - 0xF);
260 Result.WriteLine ("};");
263 // Primary weight value
264 Result.WriteLine ("static byte [] level1 = new byte [] {");
265 for (int i = 0; i < map.Length; i++) {
266 byte value = map [i].Level1;
268 Result.Write ("{0},", value);
270 Result.Write ("0x{0:X02},", value);
271 if ((i & 0xF) == 0xF)
272 Result.WriteLine ("// {0:X04}", i - 0xF);
274 Result.WriteLine ("};");
// Level2 member of every map entry.
278 Result.WriteLine ("static byte [] level2 = new byte [] {");
279 for (int i = 0; i < map.Length; i++) {
280 int value = map [i].Level2;
282 Result.Write ("{0},", value);
284 Result.Write ("0x{0:X02},", value);
285 if ((i & 0xF) == 0xF)
286 Result.WriteLine ("// {0:X04}", i - 0xF);
288 Result.WriteLine ("};");
// Level 3 weights are not stored in map; they are computed per character
// by ComputeLevel3Weight() and dumped here.
292 Result.WriteLine ("static byte [] level3 = new byte [] {");
293 for (int i = 0; i < map.Length; i++) {
294 byte value = ComputeLevel3Weight ((char) i);
296 Result.Write ("{0},", value);
298 Result.Write ("0x{0:X02},", value);
299 if ((i & 0xF) == 0xF)
300 Result.WriteLine ("// {0:X04}", i - 0xF);
302 Result.WriteLine ("};");
305 // Width insensitivity mappings
306 // (for now it is more lightweight than dumping the
307 // entire NFKD table).
// NOTE(review): this loop uses "i < char.MaxValue", so unlike the tables
// above it emits no entry for 0xFFFF - confirm that the consumer expects a
// 65535-element array.
308 Result.WriteLine ("static int [] widthCompat = new int [] {");
309 for (int i = 0; i < char.MaxValue; i++) {
311 switch (decompType [i]) {
312 case DecompositionNarrow:
313 case DecompositionWide:
314 case DecompositionSuper:
315 case DecompositionSub:
316 // they are always 1 char
317 value = decompValues [decompIndex [i]];
321 Result.Write ("{0},", value);
323 Result.Write ("0x{0:X04},", value);
324 if ((i & 0xF) == 0xF)
325 Result.WriteLine ("// {0:X04}", i - 0xF);
327 Result.WriteLine ("};");
// CJK tables; the last argument is the code point limit passed on to
// SerializeCJK() as max.
331 SerializeCJK ("cjkCHS", cjkCHS, char.MaxValue);
332 SerializeCJK ("cjkCHT", cjkCHT, 0x9FB0);
333 SerializeCJK ("cjkJA", cjkJA, 0x9FB0);
334 SerializeCJK ("cjkKO", cjkKO, 0x9FB0);
335 SerializeCJK ("cjkKOlv2", cjkKOlv2, 0x9FB0);
// Writes one ushort CJK table to Result as a C# array initializer.
//   name - identifier used for the generated array
//   cjk  - table whose element i belongs to code point (i + offset)
//   max  - code point compared against (i + offset) on every iteration
338 void SerializeCJK (string name, ushort [] cjk, int max)
// Code point that corresponds to cjk [0].
340 int offset = char.MaxValue - cjk.Length;
341 Result.WriteLine ("static ushort [] {0} = new ushort [] {{", name);
342 for (int i = 0; i < cjk.Length; i++) {
// NOTE(review): presumably output stops once code point max is reached -
// confirm.
343 if (i + offset == max)
345 ushort value = cjk [i];
347 Result.Write ("{0},", value);
349 Result.Write ("0x{0:X04},", value);
// Close every 16-value row with the code point of its first entry.
350 if ((i & 0xF) == 0xF)
351 Result.WriteLine ("// {0:X04}", i - 0xF + offset);
353 Result.WriteLine ("};");
// Byte-table overload of SerializeCJK: writes one byte CJK table to Result
// as a C# array initializer.
//   name - identifier used for the generated array
//   cjk  - table whose element i belongs to code point (i + offset)
//   max  - code point compared against (i + offset) on every iteration
357 void SerializeCJK (string name, byte [] cjk, int max)
// Code point that corresponds to cjk [0].
359 int offset = char.MaxValue - cjk.Length;
360 Result.WriteLine ("static byte [] {0} = new byte [] {{", name);
361 for (int i = 0; i < cjk.Length; i++) {
// NOTE(review): presumably output stops once code point max is reached -
// confirm.
362 if (i + offset == max)
364 byte value = cjk [i];
366 Result.Write ("{0},", value);
368 Result.Write ("0x{0:X02},", value);
// Close every 16-value row with the code point of its first entry.
369 if ((i & 0xF) == 0xF)
370 Result.WriteLine ("// {0:X04}", i - 0xF + offset);
372 Result.WriteLine ("};");
// Builds the paths of all input files below dirname and runs the
// individual parsers in a fixed order.
//   dirname - directory containing the Unicode data files, CP932.TXT and
//             the common/collation XML files
378 void ParseSources (string dirname)
381 dirname + "/UnicodeData.txt";
382 string derivedCoreProps =
383 dirname + "/DerivedCoreProperties.txt";
385 dirname + "/Scripts.txt";
387 dirname + "/CP932.TXT";
389 dirname + "/DerivedAge.txt";
390 string chXML = dirname + "/common/collation/zh.xml";
391 string jaXML = dirname + "/common/collation/ja.xml";
392 string koXML = dirname + "/common/collation/ko.xml";
394 ParseDerivedAge (derivedAge);
395 ParseJISOrder (cp932); // runs prior to ParseUnidata()
396 ParseUnidata (unidata);
397 ParseDerivedCoreProperties (derivedCoreProps);
398 ParseScripts (scripts);
399 ParseCJK (chXML, jaXML, koXML);
// Reads the file line by line and records, for every code point in each
// listed range, the value that follows the ';' into unicodeAge [].
//   filename - path of the DerivedAge data file
402 void ParseDerivedAge (string filename)
404 using (StreamReader file =
405 new StreamReader (filename)) {
406 while (file.Peek () >= 0) {
407 string s = file.ReadLine ();
// Drop the trailing '#' comment, then split at the first ';'.
408 int idx = s.IndexOf ('#');
410 s = s.Substring (0, idx);
411 idx = s.IndexOf (';');
// cpspec is either a single hex code point or "start..end".
415 string cpspec = s.Substring (0, idx);
416 idx = cpspec.IndexOf ("..");
417 NumberStyles nf = NumberStyles.HexNumber |
418 NumberStyles.AllowTrailingWhite;
419 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
420 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
421 string value = s.Substring (cpspec.Length + 1).Trim ();
424 if (cp > char.MaxValue)
// NOTE(review): double.Parse (value) uses the current culture; a culture
// whose decimal separator is ',' would misread values such as "4.1" -
// confirm whether CultureInfo.InvariantCulture should be passed.
427 for (int i = cp; i <= cpEnd; i++)
428 unicodeAge [i] = double.Parse (value);
// Feeds every line of the file to ProcessUnidataLine(), collecting the
// decomposition values in a local list that is finally converted to an
// int [] and stored in this.decompValues.
//   filename - path of the UnicodeData file
433 void ParseUnidata (string filename)
435 ArrayList decompValues = new ArrayList ();
436 using (StreamReader unidata =
437 new StreamReader (filename)) {
// line is 1-based and is only used for the diagnostic below.
438 for (int line = 1; unidata.Peek () >= 0; line++) {
440 ProcessUnidataLine (unidata.ReadLine (), decompValues);
441 } catch (Exception) {
442 Console.Error.WriteLine ("**** At line " + line);
447 this.decompValues = (int [])
448 decompValues.ToArray (typeof (int));
// Processes a single UnicodeData line and updates the per-code-point tables
// (isSmallCapital, latinMap, arrowValues, boxValues, sortableCharNames,
// diacritical, the Arabic letter maps, nonJisJapanese, decompType /
// decompIndex / decompLength, nfkdMap and decimalValue).
//   s            - one raw line of the data file
//   decompValues - running list of decomposition code points; the values
//                  of this line are appended to it
451 void ProcessUnidataLine (string s, ArrayList decompValues)
// Drop the trailing '#' comment; the text before the first ';' is the hex
// code point and the rest is split into ';'-separated fields.
453 int idx = s.IndexOf ('#');
455 s = s.Substring (0, idx);
456 idx = s.IndexOf (';');
459 int cp = int.Parse (s.Substring (0, idx), NumberStyles.HexNumber);
460 string [] values = s.Substring (idx + 1).Split (';');
463 if (cp > char.MaxValue)
465 if (IsIgnorable (cp))
// values [0] is the character name.
468 string name = values [0];
471 if (s.IndexOf ("SMALL CAPITAL") > 0)
472 isSmallCapital [cp] = true;
474 // latin mapping by character name
475 if (s.IndexOf ("LATIN") > 0) {
// 15 is the length of "LETTER DOTLESS ", so offset points at the
// character that follows the matched phrase.
476 int lidx = s.IndexOf ("LETTER DOTLESS ");
477 int offset = lidx + 15;
479 lidx = s.IndexOf ("LETTER TURNED ");
483 lidx = s.IndexOf ("LETTER ");
// Accept only a single 'A' .. 'Z' letter that ends the line or is
// followed by a space.
486 char c = lidx > 0 ? s [offset] : char.MinValue;
487 if ('A' <= c && c <= 'Z' &&
488 (s.Length == offset + 1 || s [offset + 1] == ' ')) {
489 ArrayList entry = (ArrayList) latinMap [c];
491 entry = new ArrayList ();
492 latinMap [c] = entry;
// Arrow handling for code points 0x2000 .. 0x2FFF: hard-coded values for
// some code points, otherwise a search for the arrowTargets names.
499 if (0x2000 <= cp && cp < 0x3000) {
501 // SPECIAL CASES. FIXME: why?
503 case 0x21C5: value = -1; break; // E2
504 case 0x261D: value = 1; break;
505 case 0x27A6: value = 3; break;
506 case 0x21B0: value = 7; break;
507 case 0x21B1: value = 3; break;
508 case 0x21B2: value = 7; break;
509 case 0x21B4: value = 5; break;
510 case 0x21B5: value = 7; break;
511 case 0x21B9: value = -1; break; // E1
512 case 0x21CF: value = 7; break;
513 case 0x21D0: value = 3; break;
515 string [] arrowTargets = new string [] {
// The search starts at index 1 and runs only while value is still 0.
527 for (int i = 1; value == 0 && i < arrowTargets.Length; i++)
528 if (s.IndexOf (arrowTargets [i]) > 0 &&
529 s.IndexOf ("BARB " + arrowTargets [i]) < 0 &&
530 s.IndexOf (" OVER") < 0
534 arrowValues.Add (new DictionaryEntry (
// Box drawing and block elements: 0x2500 .. 0x25AF.
539 if (0x2500 <= cp && cp < 0x25B0) {
542 // up:1 down:2 right:4 left:8 vert:16 horiz:32
545 // [dr] [dl] [ur] [ul]
549 ArrayList flags = new ArrayList (new int [] {
552 4 + 2, 8 + 2, 4 + 1, 8 + 1,
553 16 + 4, 1 + 2 + 4, 16 + 8, 1 + 2 + 8,
554 32 + 2, 4 + 8 + 2, 32 + 1, 4 + 8 + 1,
555 16 + 32, 1 + 2 + 4 + 8, 4 + 8 + 16, 1 + 2 + 32
557 byte [] offsets = new byte [] {
564 if (s.IndexOf ("BOX DRAWINGS ") > 0) {
566 if (s.IndexOf (" UP") > 0)
568 if (s.IndexOf (" DOWN") > 0)
570 if (s.IndexOf (" RIGHT") > 0)
572 if (s.IndexOf (" LEFT") > 0)
574 if (s.IndexOf (" VERTICAL") > 0)
576 if (s.IndexOf (" HORIZONTAL") > 0)
// A flag combination missing from the list leaves value negative (the
// IndexOf result); otherwise the matching offsets entry is used.
579 int fidx = flags.IndexOf (flag);
580 value = fidx < 0 ? fidx : offsets [fidx];
581 } else if (s.IndexOf ("BLOCK") > 0) {
582 if (s.IndexOf ("ONE EIGHTH") > 0)
584 else if (s.IndexOf ("ONE QUARTER") > 0)
586 else if (s.IndexOf ("THREE EIGHTHS") > 0)
588 else if (s.IndexOf ("HALF") > 0)
590 else if (s.IndexOf ("FIVE EIGHTHS") > 0)
592 else if (s.IndexOf ("THREE QUARTERS") > 0)
594 else if (s.IndexOf ("SEVEN EIGHTHS") > 0)
600 boxValues.Add (new DictionaryEntry (
604 // For some characters store the name and sort later
605 // to determine sorting.
606 if (0x2100 <= cp && cp <= 0x213F &&
607 Char.IsSymbol ((char) cp))
608 sortableCharNames.Add (
609 new DictionaryEntry (cp, values [0]));
// For 0x3380 .. 0x33DD the first 7 characters of the name are dropped.
610 else if (0x3380 <= cp && cp <= 0x33DD)
611 sortableCharNames.Add (new DictionaryEntry (
612 cp, values [0].Substring (7)));
614 // diacritical weights by character name
615 for (int d = 0; d < diacritics.Length; d++)
616 if (s.IndexOf (diacritics [d]) > 0)
617 diacritical [cp] |= diacriticWeights [d];
618 // Two-step grep required for it.
619 if (s.IndexOf ("FULL STOP") > 0 &&
620 (s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0))
621 diacritical [cp] |= 0xF4;
623 // Arabic letter name
624 if (0x0621 <= cp && cp <= 0x064A &&
625 Char.GetUnicodeCategory ((char) cp)
626 == UnicodeCategory.OtherLetter) {
// Default value: derived from the number of distinct letter names that
// have been registered so far.
627 byte value = (byte) (arabicNameMap.Count * 4 + 0x0B);
632 // hamza, waw, yeh ... special cases.
637 value = 0x77; // special cases.
640 // Get primary letter name i.e.
641 // XXX part of ARABIC LETTER XXX yyy
642 // e.g. that of "TEH MARBUTA" is "TEH".
645 // 0x0640 is special: it does
646 // not start with ARABIC LETTER
648 values [0].Substring (14);
649 int tmpIdx = letterName.IndexOf (' ');
650 letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
651 //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
// A letter whose primary name is already known reuses the value stored
// for the code point registered under that name.
652 if (arabicNameMap.ContainsKey (letterName))
653 value = (byte) arabicLetterPrimaryValues [arabicNameMap [letterName]];
655 arabicNameMap [letterName] = cp;
658 arabicLetterPrimaryValues [cp] = value;
661 // Japanese square letter
662 if (0x3300 <= cp && cp <= 0x3357)
664 nonJisJapanese.Add (new NonJISCharacter (cp, values [0]));
// values [4] is the decomposition field: an optional "<type>" tag that is
// followed by space-separated hex code points.
667 string decomp = values [4];
668 idx = decomp.IndexOf ('<');
670 switch (decomp.Substring (idx + 1, decomp.IndexOf ('>') - 1)) {
672 decompType [cp] = DecompositionFull;
675 decompType [cp] = DecompositionSub;
678 decompType [cp] = DecompositionSuper;
681 decompType [cp] = DecompositionSmall;
684 decompType [cp] = DecompositionIsolated;
687 decompType [cp] = DecompositionInitial;
690 decompType [cp] = DecompositionFinal;
693 decompType [cp] = DecompositionMedial;
696 decompType [cp] = DecompositionNoBreak;
699 decompType [cp] = DecompositionCompat;
702 decompType [cp] = DecompositionFraction;
705 decompType [cp] = DecompositionFont;
708 decompType [cp] = DecompositionCircle;
711 decompType [cp] = DecompositionSquare;
714 decompType [cp] = DecompositionWide;
717 decompType [cp] = DecompositionNarrow;
720 decompType [cp] = DecompositionVertical;
723 throw new Exception ("Support NFKD type : " + decomp);
727 decompType [cp] = DecompositionCanonical;
// Strip the "<type> " prefix (tag plus one space) when a tag is present.
728 decomp = idx < 0 ? decomp : decomp.Substring (decomp.IndexOf ('>') + 2);
729 if (decomp.Length > 0) {
// Append the decomposition code points to the shared list and remember
// where this character's run starts and how long it is.
731 string [] velems = decomp.Split (' ');
732 int didx = decompValues.Count;
733 decompIndex [cp] = didx;
734 foreach (string v in velems)
735 decompValues.Add (int.Parse (v, NumberStyles.HexNumber));
736 decompLength [cp] = velems.Length;
738 // [decompType] -> this_cp
739 int targetCP = (int) decompValues [didx];
740 // for "(x)" it specially maps to 'x' .
741 // FIXME: check if it is sane
742 if (velems.Length == 3 &&
743 (int) decompValues [didx] == '(' &&
744 (int) decompValues [didx + 2] == ')')
745 targetCP = (int) decompValues [didx + 1];
746 // special: 0x215F "1/"
747 else if (cp == 0x215F)
749 else if (velems.Length > 1 &&
750 (targetCP < 0x4C00 || 0x9FBB < targetCP))
751 // skip them, except for CJK ideograph compat
// Register cp under its target code point, keyed by decomposition type.
755 Hashtable entry = (Hashtable) nfkdMap [targetCP];
757 entry = new Hashtable ();
758 nfkdMap [targetCP] = entry;
760 entry [(byte) decompType [cp]] = cp;
// Numeric value: the first non-empty field among values [5], [6] and [7]
// is used. values [7] may be a fraction "a/b", a value in parentheses, or
// a value with a trailing '.'.
// NOTE(review): decimal.Parse uses the current culture here - confirm
// whether the invariant culture is intended.
764 if (values [5].Length > 0)
765 decimalValue [cp] = decimal.Parse (values [5]);
766 else if (values [6].Length > 0)
767 decimalValue [cp] = decimal.Parse (values [6]);
768 else if (values [7].Length > 0) {
769 string decstr = values [7];
770 idx = decstr.IndexOf ('/');
771 if (cp == 0x215F) // special. "1/"
772 decimalValue [cp] = 0x1;
776 decimal.Parse (decstr.Substring (0, idx))
777 / decimal.Parse (decstr.Substring (idx + 1));
778 else if (decstr [0] == '(' &&
779 decstr [decstr.Length - 1] == ')')
782 decimal.Parse (decstr.Substring (1, decstr.Length - 2));
783 else if (decstr [decstr.Length - 1] == '.')
786 decimal.Parse (decstr.Substring (0, decstr.Length - 1));
788 decimalValue [cp] = decimal.Parse (decstr);
// Feeds every line of the file to ProcessDerivedCorePropLine() and reports
// the 1-based line number on standard error when a line raises an
// exception.
//   filename - path of the DerivedCoreProperties data file
792 void ParseDerivedCoreProperties (string filename)
795 using (StreamReader file =
796 new StreamReader (filename)) {
797 for (int line = 1; file.Peek () >= 0; line++) {
799 ProcessDerivedCorePropLine (file.ReadLine ());
800 } catch (Exception) {
801 Console.Error.WriteLine ("**** At line " + line);
// Parses one "cp ; property" or "start..end ; property" line and marks
// every code point of a matching range in isUppercase [].
// NOTE(review): the test on `value` that selects which property marks a
// range is presumably a check for the uppercase property - confirm.
//   s - one raw line of the data file
808 void ProcessDerivedCorePropLine (string s)
// Drop the trailing '#' comment, then split at the first ';'.
810 int idx = s.IndexOf ('#');
812 s = s.Substring (0, idx);
813 idx = s.IndexOf (';');
// cpspec is either a single hex code point or "start..end".
816 string cpspec = s.Substring (0, idx);
817 idx = cpspec.IndexOf ("..");
818 NumberStyles nf = NumberStyles.HexNumber |
819 NumberStyles.AllowTrailingWhite;
820 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
821 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
822 string value = s.Substring (cpspec.Length + 1).Trim ();
825 if (cp > char.MaxValue)
830 for (int x = cp; x <= cpEnd; x++)
831 isUppercase [x] = true;
// Collects the non-ignorable code points of five scripts from the file,
// sorts each list with UCAComparer.Instance and stores the results in the
// ordered* arrays.
// NOTE(review): the tests on `value` that pick the target list are
// presumably comparisons against script names - confirm.
//   filename - path of the Scripts data file
836 void ParseScripts (string filename)
838 ArrayList cyrillic = new ArrayList ();
839 ArrayList gurmukhi = new ArrayList ();
840 ArrayList gujarati = new ArrayList ();
841 ArrayList georgian = new ArrayList ();
842 ArrayList thaana = new ArrayList ();
844 using (StreamReader file =
845 new StreamReader (filename)) {
846 while (file.Peek () >= 0) {
847 string s = file.ReadLine ();
// Drop the trailing '#' comment, then split at the first ';'.
848 int idx = s.IndexOf ('#');
850 s = s.Substring (0, idx);
851 idx = s.IndexOf (';');
// cpspec is either a single hex code point or "start..end".
855 string cpspec = s.Substring (0, idx);
856 idx = cpspec.IndexOf ("..");
857 NumberStyles nf = NumberStyles.HexNumber |
858 NumberStyles.AllowTrailingWhite;
859 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
860 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
861 string value = s.Substring (cpspec.Length + 1).Trim ();
864 if (cp > char.MaxValue)
// One loop per script: add every non-ignorable code point of the range.
869 for (int x = cp; x <= cpEnd; x++)
870 if (!IsIgnorable (x))
871 cyrillic.Add ((char) x);
874 for (int x = cp; x <= cpEnd; x++)
875 if (!IsIgnorable (x))
876 gurmukhi.Add ((char) x);
879 for (int x = cp; x <= cpEnd; x++)
880 if (!IsIgnorable (x))
881 gujarati.Add ((char) x);
884 for (int x = cp; x <= cpEnd; x++)
885 if (!IsIgnorable (x))
886 georgian.Add ((char) x);
889 for (int x = cp; x <= cpEnd; x++)
890 if (!IsIgnorable (x))
891 thaana.Add ((char) x);
// Sort each script's characters and publish them as char arrays.
896 cyrillic.Sort (UCAComparer.Instance);
897 gurmukhi.Sort (UCAComparer.Instance);
898 gujarati.Sort (UCAComparer.Instance);
899 georgian.Sort (UCAComparer.Instance);
900 thaana.Sort (UCAComparer.Instance);
901 orderedCyrillic = (char []) cyrillic.ToArray (typeof (char));
902 orderedGurmukhi = (char []) gurmukhi.ToArray (typeof (char));
903 orderedGujarati = (char []) gujarati.ToArray (typeof (char));
904 orderedGeorgian = (char []) georgian.ToArray (typeof (char));
905 orderedThaana = (char []) thaana.ToArray (typeof (char));
// Reads pairs of "0x"-prefixed hex numbers (a JIS code followed by a
// Unicode code point) from the file and appends a JISCharacter for each
// pair to jisJapanese.
//   filename - path of the CP932 mapping file
908 void ParseJISOrder (string filename)
910 using (StreamReader file =
911 new StreamReader (filename)) {
912 while (file.Peek () >= 0) {
913 string s = file.ReadLine ();
// Drop the trailing '#' comment and surrounding whitespace.
914 int idx = s.IndexOf ('#');
916 s = s.Substring (0, idx).Trim ();
919 idx = s.IndexOf (' ');
922 // They start with "0x" so cut them out.
// NOTE(review): the second argument of Substring is a length, not an end
// index, so Substring (2, idx) takes idx characters after the "0x" and may
// run past the separator - confirm that the parsed text holds only the hex
// digits of the JIS code.
923 int jis = int.Parse (s.Substring (2, idx), NumberStyles.HexNumber);
924 int cp = int.Parse (s.Substring (idx + 3).Trim (), NumberStyles.HexNumber);
925 jisJapanese.Add (new JISCharacter (cp, jis));
// Fills the CJK weight tables from the collation XML documents. For the
// Chinese and Japanese tables the characters of the selected <pc> element
// receive consecutive values in document order; for Korean the characters
// are stored relative to the preceding "reset" Hangul syllable.
//   zhXML, jaXML, koXML - paths of the zh, ja and ko collation XML files
930 void ParseCJK (string zhXML, string jaXML, string koXML)
932 XmlDocument doc = new XmlDocument ();
// No resolver: external resources referenced by the documents are not
// fetched.
933 doc.XmlResolver = null;
940 // Chinese Simplified
// offset is the code point that corresponds to arr [0].
943 offset = char.MaxValue - arr.Length;
945 s = doc.SelectSingleNode ("/ldml/collations/collation[@type='pinyin']/rules/pc").InnerText;
947 foreach (char c in s) {
949 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
951 arr [(int) c - offset] = (ushort) v++;
957 // Chinese Traditional
960 offset = char.MaxValue - arr.Length;
961 s = doc.SelectSingleNode ("/ldml/collations/collation[@type='stroke']/rules/pc").InnerText;
963 foreach (char c in s) {
965 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
967 arr [(int) c - offset] = (ushort) v++;
// This third pass selects the rules without a collation type filter.
// NOTE(review): presumably the Japanese table - confirm.
976 offset = char.MaxValue - arr.Length;
978 s = doc.SelectSingleNode ("/ldml/collations/collation/rules/pc").InnerText;
980 foreach (char c in s) {
982 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
984 arr [(int) c - offset] = (ushort) v++;
991 // Korean weight is somewhat complex. It first shifts
992 // Hangul category from 52-x to 80-x (they are anyways
993 // computed). CJK ideographs are placed at secondary
994 // weight, like XX YY 01 zz 01, where XX and YY are
995 // corresponding "reset" value and zz is 41,43,45...
997 // Unlike chs,cht and ja, Korean value is a combined
998 // ushort which is computed as category
1002 offset = char.MaxValue - arr.Length;
1004 foreach (XmlElement reset in doc.SelectNodes ("/ldml/collations/collation/rules/reset")) {
1005 XmlElement sc = (XmlElement) reset.NextSibling;
1006 // compute "category" and "level 1" for the
1007 // target "reset" Hangul syllable
1008 char rc = reset.InnerText [0];
// 1-based position of the reset syllable counted from 0xAC00.
1009 int ri = ((int) rc - 0xAC00) + 1;
1011 ((ri / 254) * 256 + (ri % 254) + 2);
1012 // Place the characters after the target.
// Every character gets the reset value p in arr and its own byte v in
// cjkKOlv2.
1015 foreach (char c in s) {
1016 arr [(int) c - offset] = p;
1017 cjkKOlv2 [(int) c - offset] = (byte) v;
// Computes ignorableFlags [] for every code point 0 .. char.MaxValue:
// bit 1 = IsIgnorable, bit 2 = IsIgnorableSymbol,
// bit 4 = IsIgnorableNonSpacing.
1027 void FillIgnorables ()
1029 for (int i = 0; i <= char.MaxValue; i++) {
// NOTE(review): unassigned code points are presumably skipped here -
// confirm.
1030 if (Char.GetUnicodeCategory ((char) i) ==
1031 UnicodeCategory.OtherNotAssigned)
1033 if (IsIgnorable (i))
1034 ignorableFlags [i] |= 1;
1035 if (IsIgnorableSymbol (i))
1036 ignorableFlags [i] |= 2;
1037 if (IsIgnorableNonSpacing (i))
1038 ignorableFlags [i] |= 4;
// Post-processes the parsed data: assigns diacritical weights to number
// characters and to the Korean parenthesized ranges, then renames or
// removes selected entries of sortableCharNames.
1042 void ModifyParsedValues ()
1044 // number, secondary weights
// numarr holds [start, end) pairs; all number characters inside one pair
// share a weight, and the weight is incremented for the next pair.
1046 int [] numarr = numberSecondaryWeightBounds;
1047 for (int i = 0; i < numarr.Length; i += 2, weight++)
1048 for (int cp = numarr [i]; cp < numarr [i + 1]; cp++)
1049 if (Char.IsNumber ((char) cp))
1050 diacritical [cp] = weight;
1052 // Korean parens numbers
1053 for (int i = 0x3200; i <= 0x321C; i++)
1054 diacritical [i] = 0xA;
1055 for (int i = 0x3260; i <= 0x327B; i++)
1056 diacritical [i] = 0xC;
1058 // Update name part of named characters
1059 for (int i = 0; i < sortableCharNames.Count; i++) {
1060 DictionaryEntry de =
1061 (DictionaryEntry) sortableCharNames [i];
1062 int cp = (int) de.Key;
1063 string renamed = null;
1065 case 0x2101: renamed = "A_1"; break;
1066 case 0x33C3: renamed = "A_2"; break;
1067 case 0x2105: renamed = "C_1"; break;
1068 case 0x2106: renamed = "C_2"; break;
1069 case 0x211E: renamed = "R1"; break;
1070 case 0x211F: renamed = "R2"; break;
1071 // Remove some of them!
// NOTE(review): the list is modified inside an index loop; confirm that
// the index is adjusted after RemoveAt so the following entry is not
// skipped.
1082 sortableCharNames.RemoveAt (i);
// Replace the stored name only when a rename was selected above.
1086 if (renamed != null)
1087 sortableCharNames [i] =
1088 new DictionaryEntry (cp, renamed);
1092 void GenerateCore ()
1096 #region Specially ignored // 01
1097 // This will raise "Defined" flag up.
1098 foreach (char c in specialIgnore)
1099 map [(int) c] = new CharMapEntry (0, 0, 0);
1103 #region Variable weights
1104 // Controls : 06 03 - 06 3D
1106 for (int i = 0; i < 65536; i++) {
1107 if (IsIgnorable (i))
1110 uc = Char.GetUnicodeCategory (c);
1111 // NEL is whitespace but not ignored here.
1112 if (uc == UnicodeCategory.Control &&
1113 !Char.IsWhiteSpace (c) || c == '\u0085')
1114 AddCharMap (c, 6, 1);
1118 fillIndex [6] = 0x80;
1119 AddCharMapGroup ('\'', 6, 1, 0);
1120 AddCharMap ('\uFE63', 6, 1);
1122 // Hyphen/Dash : 06 81 - 06 90
1123 for (int i = 0; i < char.MaxValue; i++) {
1124 if (Char.GetUnicodeCategory ((char) i)
1125 == UnicodeCategory.DashPunctuation)
1126 AddCharMapGroupTail ((char) i, 6, 1);
1129 // Arabic variable weight chars 06 A0 -
1130 fillIndex [6] = 0xA0;
1132 for (int i = 0x64B; i <= 0x650; i++)
1133 AddCharMapGroupTail ((char) i, 6, 1);
1135 AddCharMapGroup ('\u0652', 6, 1, 0);
1137 AddCharMapGroup ('\u0651', 6, 1, 0);
1141 #region Nonspacing marks // 01
1142 // FIXME: 01 03 - 01 B6 ... annoyance :(
1144 // Combining diacritical marks: 01 DC -
1146 fillIndex [0x1] = 0x41;
1147 for (int i = 0x030E; i <= 0x0326; i++)
1148 if (!IsIgnorable (i))
1149 AddCharMap ((char) i, 0x1, 1);
1150 for (int i = 0x0329; i <= 0x0334; i++)
1151 if (!IsIgnorable (i))
1152 AddCharMap ((char) i, 0x1, 1);
1153 for (int i = 0x0339; i <= 0x0341; i++)
1154 if (!IsIgnorable (i))
1155 AddCharMap ((char) i, 0x1, 1);
1156 fillIndex [0x1] = 0x72;
1157 for (int i = 0x0346; i <= 0x0348; i++)
1158 if (!IsIgnorable (i))
1159 AddCharMap ((char) i, 0x1, 1);
1160 for (int i = 0x02BE; i <= 0x02BF; i++)
1161 if (!IsIgnorable (i))
1162 AddCharMap ((char) i, 0x1, 1);
1163 for (int i = 0x02C1; i <= 0x02C5; i++)
1164 if (!IsIgnorable (i))
1165 AddCharMap ((char) i, 0x1, 1);
1166 for (int i = 0x02CE; i <= 0x02CF; i++)
1167 if (!IsIgnorable (i))
1168 AddCharMap ((char) i, 0x1, 1);
1169 for (int i = 0x02D1; i <= 0x02D3; i++)
1170 if (!IsIgnorable (i))
1171 AddCharMap ((char) i, 0x1, 1);
1172 AddCharMap ('\u02DE', 0x1, 1);
1173 for (int i = 0x02E4; i <= 0x02E9; i++)
1174 if (!IsIgnorable (i))
1175 AddCharMap ((char) i, 0x1, 1);
1177 // LAMESPEC: It should not stop at '\u20E1'. There are
1178 // a few more characters (that however results in
1179 // overflow of level 2 unless we start before 0xDD).
1180 fillIndex [0x1] = 0xDC;
1181 for (int i = 0x20d0; i <= 0x20e1; i++)
1182 AddCharMap ((char) i, 0x1, 1);
1186 #region Whitespaces // 07 03 -
1187 fillIndex [0x7] = 0x2;
1188 AddCharMap (' ', 0x7, 2);
1189 AddCharMap ('\u00A0', 0x7, 1);
1190 for (int i = 9; i <= 0xD; i++)
1191 AddCharMap ((char) i, 0x7, 1);
1192 for (int i = 0x2000; i <= 0x200B; i++)
1193 AddCharMap ((char) i, 0x7, 1);
1195 fillIndex [0x7] = 0x17;
1196 AddCharMapGroup ('\u2028', 0x7, 1, 0);
1197 AddCharMapGroup ('\u2029', 0x7, 1, 0);
1199 // Characters which used to represent layout control.
1200 // LAMESPEC: Windows developers seem to have thought
1201 // that those characters are kind of whitespaces,
1202 // while they aren't.
1203 AddCharMap ('\u2422', 0x7, 1, 0); // blank symbol
1204 AddCharMap ('\u2423', 0x7, 1, 0); // open box
1207 // FIXME: 09 should be more complete.
1208 fillIndex [0x9] = 2;
1210 for (int cp = 0x2300; cp <= 0x237A; cp++)
1211 AddCharMap ((char) cp, 0x9, 1, 0);
1214 byte [] arrowLv2 = new byte [] {0, 3, 3, 3, 3, 3, 3, 3, 3};
1215 foreach (DictionaryEntry de in arrowValues) {
1216 int idx = (int) de.Value;
1217 int cp = (int) de.Key;
1218 if (map [cp].Defined)
1220 fillIndex [0x9] = (byte) (0xD8 + idx);
1221 AddCharMapGroup ((char) cp, 0x9, 0, arrowLv2 [idx]);
1225 byte [] boxLv2 = new byte [128];
1226 for (int i = 0; i < boxLv2.Length; i++)
1228 foreach (DictionaryEntry de in boxValues) {
1229 int cp = (int) de.Key;
1230 int idx = (int) de.Value;
1231 if (map [cp].Defined)
1233 fillIndex [0x9] = (byte) (0xE5 + idx);
1234 AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [idx]);
1237 // Some special characters (slanted)
1238 fillIndex [0x9] = 0xF4;
1239 AddCharMap ('\u2571', 0x9, 3);
1240 AddCharMap ('\u2572', 0x9, 3);
1241 AddCharMap ('\u2573', 0x9, 3);
1243 // FIXME: 08 should be more complete.
1244 fillIndex [0x8] = 2;
1245 for (int cp = 0; cp < char.MaxValue; cp++)
1246 if (!map [cp].Defined &&
1247 Char.GetUnicodeCategory ((char) cp) ==
1248 UnicodeCategory.MathSymbol)
1249 AddCharMapGroup ((char) cp, 0x8, 1, 0);
1251 // FIXME: implement 0A
1253 fillIndex [0xA] = 2;
1254 // byte currency symbols
1255 for (int cp = 0; cp < 0x100; cp++) {
1256 uc = Char.GetUnicodeCategory ((char) cp);
1257 if (!IsIgnorable (cp) &&
1258 uc == UnicodeCategory.CurrencySymbol &&
1260 AddCharMapGroup ((char) cp, 0xA, 1, 0);
1262 // byte other symbols
1263 for (int cp = 0; cp < 0x100; cp++) {
1264 uc = Char.GetUnicodeCategory ((char) cp);
1265 if (!IsIgnorable (cp) &&
1266 uc == UnicodeCategory.OtherSymbol)
1267 AddCharMapGroup ((char) cp, 0xA, 1, 0);
1270 fillIndex [0xA] = 0x2F; // FIXME: it won't be needed
1271 for (int cp = 0x2600; cp <= 0x2613; cp++)
1272 AddCharMap ((char) cp, 0xA, 1, 0);
1273 for (int cp = 0x2620; cp <= 0x2770; cp++)
1274 if (Char.IsSymbol ((char) cp))
1275 AddCharMap ((char) cp, 0xA, 1, 0);
1279 #region Numbers // 0C 02 - 0C E1
1280 fillIndex [0xC] = 2;
1282 // 9F8 : Bengali "one less than the denominator"
1283 AddCharMap ('\u09F8', 0xC, 1);
1285 ArrayList numbers = new ArrayList ();
1286 for (int i = 0; i < 65536; i++)
1287 if (!IsIgnorable (i) &&
1288 Char.IsNumber ((char) i) &&
1289 (i < 0x3190 || 0x32C0 < i)) // they are CJK characters
1292 ArrayList numberValues = new ArrayList ();
1293 foreach (int i in numbers)
1294 numberValues.Add (new DictionaryEntry (i, decimalValue [(char) i]));
1295 numberValues.Sort (DecimalDictionaryValueComparer.Instance);
1297 //foreach (DictionaryEntry de in numberValues)
1298 //Console.Error.WriteLine ("****** number {0:X04} : {1} {2}", de.Key, de.Value, decompType [(int) de.Key]);
1300 decimal prevValue = -1;
1301 foreach (DictionaryEntry de in numberValues) {
1302 int cp = (int) de.Key;
1303 decimal currValue = (decimal) de.Value;
1304 bool addnew = false;
1305 if (prevValue < currValue &&
1306 prevValue - (int) prevValue == 0 &&
1310 // Process Hangzhou and Roman numbers
1312 // There are some SPECIAL cases.
1313 if (currValue != 4) // no increment for 4
1317 xcp = (int) prevValue + 0x2170 - 1;
1318 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1319 xcp = (int) prevValue + 0x2160 - 1;
1320 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1321 fillIndex [0xC] += 2;
1322 xcp = (int) prevValue + 0x3021 - 1;
1323 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1326 if (prevValue < currValue)
1327 prevValue = currValue;
1328 if (map [cp].Defined)
1330 // Hangzhou and Roman numbers are added later
1332 else if (0x3021 <= cp && cp < 0x302A
1333 || 0x2160 <= cp && cp < 0x216A
1334 || 0x2170 <= cp && cp < 0x217A)
1337 if (cp == 0x215B) // FIXME: why?
1338 fillIndex [0xC] += 2;
1339 else if (cp == 0x3021) // FIXME: why?
1341 AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp]);
1343 if (addnew || cp <= '9') {
1345 if (1 <= currValue && currValue <= 10) {
1346 xcp = cp - 0x31 + 0x2776;
1347 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1348 xcp = cp - 0x31 + 0x2780;
1349 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1350 xcp = cp - 0x31 + 0x278A;
1351 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1353 if (1 <= currValue && currValue <= 20) {
1354 xcp = cp - 0x31 + 0x2460;
1355 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1356 xcp = cp - 0x31 + 0x2474;
1357 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1358 xcp = cp - 0x31 + 0x2488;
1359 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1363 if (cp != 0x09E7 && cp != 0x09EA)
1366 // Add special cases that are not regarded as
1367 // numbers in UnicodeCategory speak.
1370 AddCharMapGroup ('\u01BD', 0xC, 0, 0);
1371 AddCharMapGroup ('\u01BC', 0xC, 1, 0);
1373 else if (cp == '6') // FIXME: why?
1378 fillIndex [0xC] = 0xFF;
1379 AddCharMap ('\u221E', 0xC, 1);
1382 #region Letters and NonSpacing Marks (general)
1384 // ASCII Latin alphabets
1385 for (int i = 0; i < alphabets.Length; i++)
1386 AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]);
1389 // non-ASCII Latin alphabets
1390 // FIXME: there are no such characters that are placed
1391 // *after* "alphabets" array items. This is nothing
1392 // more than a hack that creates dummy weights for
1393 // primary characters.
1394 for (int i = 0x0080; i < 0x0300; i++) {
1395 if (!Char.IsLetter ((char) i))
1397 // Latin letters that have an NFKD decomposition are
1398 // not added as independent primary characters.
1399 if (decompIndex [i] != 0)
1402 // 1.some alphabets have primarily
1403 // equivalent ASCII alphabets.
1404 // 2.some have independent primary weights,
1405 // but inside a-to-z range.
1406 // 3.there are some expanded characters that
1407 // are not part of Unicode Standard NFKD.
1409 // 1. skipping them does not make sense
1410 // case 0xD0: case 0xF0: case 0x131: case 0x138:
1411 // case 0x184: case 0x185: case 0x186: case 0x189:
1412 // case 0x18D: case 0x18E: case 0x18F: case 0x190:
1413 // case 0x194: case 0x195: case 0x196: case 0x19A:
1414 // case 0x19B: case 0x19C:
1415 // 2. skipping them does not make sense
1416 // case 0x14A: // Ng
1417 // case 0x14B: // ng
1421 case 0xDE: // Icelandic Thorn
1422 case 0xFE: // Icelandic Thorn
1423 case 0xDF: // German ss
1424 case 0xFF: // German ss
1425 // not classified yet
1426 // case 0x1A6: case 0x1A7: case 0x1A8: case 0x1A9:
1427 // case 0x1AA: case 0x1B1: case 0x1B7: case 0x1B8:
1428 // case 0x1B9: case 0x1BA: case 0x1BB: case 0x1BF:
1429 // case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3:
1433 AddCharMapGroup ((char) i, 0xE, 1, 0);
1437 fillIndex [0xF] = 02;
1438 for (int i = 0x0380; i < 0x0390; i++)
1439 if (Char.IsLetter ((char) i))
1440 AddLetterMap ((char) i, 0xF, 1);
1441 fillIndex [0xF] = 02;
1442 for (int i = 0x0391; i < 0x03CF; i++)
1443 if (Char.IsLetter ((char) i))
1444 AddLetterMap ((char) i, 0xF, 1);
1445 fillIndex [0xF] = 0x40;
1446 for (int i = 0x03D0; i < 0x0400; i++)
1447 if (Char.IsLetter ((char) i))
1448 AddLetterMap ((char) i, 0xF, 1);
1450 // Cyrillic - UCA order w/ some modification
1451 fillIndex [0x10] = 0x3;
1452 // table which is mostly from UCA DUCET.
1453 for (int i = 0; i < orderedCyrillic.Length; i++) {
1454 char c = orderedCyrillic [i];
1455 if (Char.IsLetter (c))
1456 AddLetterMap (c, 0x10, 3);
1458 for (int i = 0x0460; i < 0x0481; i++) {
1459 if (Char.IsLetter ((char) i))
1460 AddLetterMap ((char) i, 0x10, 3);
1464 fillIndex [0x11] = 0x3;
1465 for (int i = 0x0531; i < 0x0586; i++)
1466 if (Char.IsLetter ((char) i))
1467 AddLetterMap ((char) i, 0x11, 1);
1471 fillIndex [0x12] = 0x3;
1472 for (int i = 0x05D0; i < 0x05FF; i++)
1473 if (Char.IsLetter ((char) i))
1474 AddLetterMap ((char) i, 0x12, 1);
1476 fillIndex [0x1] = 0x3;
1477 for (int i = 0x0591; i <= 0x05C2; i++)
1479 AddCharMap ((char) i, 0x1, 1);
1482 fillIndex [0x1] = 0x8E;
1483 fillIndex [0x13] = 0x3;
1484 for (int i = 0x0621; i <= 0x064A; i++) {
1486 if (Char.GetUnicodeCategory ((char) i)
1487 != UnicodeCategory.OtherLetter) {
1488 // FIXME: arabic nonspacing marks are
1489 // in different order.
1490 AddCharMap ((char) i, 0x1, 1);
1493 // map [i] = new CharMapEntry (0x13,
1494 // (byte) arabicLetterPrimaryValues [i], 1);
1496 (byte) arabicLetterPrimaryValues [i];
1497 AddLetterMap ((char) i, 0x13, 0);
1499 fillIndex [0x13] = 0x84;
1500 for (int i = 0x0674; i < 0x06D6; i++)
1501 if (Char.IsLetter ((char) i))
1502 AddLetterMap ((char) i, 0x13, 1);
1505 // FIXME: it does seem straight codepoint mapping.
1506 fillIndex [0x14] = 04;
1507 for (int i = 0x0901; i < 0x0905; i++)
1508 if (!IsIgnorable (i))
1509 AddLetterMap ((char) i, 0x14, 2);
1510 fillIndex [0x14] = 0xB;
1511 for (int i = 0x0905; i < 0x093A; i++)
1512 if (Char.IsLetter ((char) i))
1513 AddLetterMap ((char) i, 0x14, 4);
1514 for (int i = 0x093E; i < 0x094F; i++)
1515 if (!IsIgnorable (i))
1516 AddLetterMap ((char) i, 0x14, 2);
1520 fillIndex [0x15] = 02;
1521 for (int i = 0x0980; i < 0x9FF; i++) {
1522 if (IsIgnorable (i))
1525 fillIndex [0x15] = 0x3B;
1526 switch (Char.GetUnicodeCategory ((char) i)) {
1527 case UnicodeCategory.NonSpacingMark:
1528 case UnicodeCategory.DecimalDigitNumber:
1529 case UnicodeCategory.OtherNumber:
1532 AddLetterMap ((char) i, 0x15, 1);
1535 fillIndex [0x1] = 0x3;
1536 for (int i = 0x0981; i < 0x0A00; i++)
1537 if (Char.GetUnicodeCategory ((char) i) ==
1538 UnicodeCategory.NonSpacingMark)
1539 AddCharMap ((char) i, 0x1, 1);
1541 // Gurmukhi. orderedGurmukhi is from UCA
1542 // FIXME: it does not look equivalent to UCA.
1543 fillIndex [0x1] = 03;
1544 fillIndex [0x16] = 02;
1545 for (int i = 0; i < orderedGurmukhi.Length; i++) {
1546 char c = orderedGurmukhi [i];
1547 if (IsIgnorable ((int) c))
1549 if (!Char.IsLetter (c)) {
1550 AddLetterMap (c, 0x1, 1);
1553 if (c == '\u0A3C' || c == '\u0A4D' ||
1554 '\u0A66' <= c && c <= '\u0A71')
1556 AddLetterMap (c, 0x16, 4);
1559 // Gujarati. orderedGujarati is from UCA
1560 fillIndex [0x17] = 02;
1561 for (int i = 0; i < orderedGujarati.Length; i++)
1562 AddLetterMap (orderedGujarati [i], 0x17, 4);
1565 fillIndex [0x18] = 02;
1566 for (int i = 0x0B00; i < 0x0B7F; i++) {
1567 switch (Char.GetUnicodeCategory ((char) i)) {
1568 case UnicodeCategory.NonSpacingMark:
1569 case UnicodeCategory.DecimalDigitNumber:
1572 AddLetterMap ((char) i, 0x18, 1);
1576 fillIndex [0x19] = 2;
1577 AddCharMap ('\u0BD7', 0x19, 0);
1578 fillIndex [0x19] = 0xA;
1580 for (int i = 0x0BD7; i < 0x0B94; i++)
1581 if (Char.IsLetter ((char) i))
1582 AddCharMap ((char) i, 0x19, 2);
1584 fillIndex [0x19] = 0x24;
1585 AddCharMap ('\u0B94', 0x19, 0);
1586 fillIndex [0x19] = 0x26;
1587 // The array for Tamil consonants is a constant.
1588 // Windows have almost similar sequence to TAM from
1589 // tamilnet but a bit different in Grantha.
1590 for (int i = 0; i < orderedTamilConsonants.Length; i++)
1591 AddLetterMap (orderedTamilConsonants [i], 0x19, 4);
1593 fillIndex [0x19] = 0x82;
1594 for (int i = 0x0BBE; i < 0x0BCD; i++)
1595 if (Char.GetUnicodeCategory ((char) i) ==
1596 UnicodeCategory.SpacingCombiningMark
1598 AddLetterMap ((char) i, 0x19, 2);
1601 fillIndex [0x1A] = 0x4;
1602 for (int i = 0x0C00; i < 0x0C62; i++) {
1603 if (i == 0x0C55 || i == 0x0C56)
1605 AddCharMap ((char) i, 0x1A, 3);
1606 char supp = (i == 0x0C0B) ? '\u0C60':
1607 i == 0x0C0C ? '\u0C61' : char.MinValue;
1608 if (supp == char.MinValue)
1610 AddCharMap (supp, 0x1A, 3);
1614 fillIndex [0x1B] = 4;
1615 for (int i = 0x0C80; i < 0x0CE5; i++) {
1616 if (i == 0x0CD5 || i == 0x0CD6)
1618 AddCharMap ((char) i, 0x1B, 3);
1622 fillIndex [0x1C] = 2;
1623 for (int i = 0x0D02; i < 0x0D61; i++)
1624 // FIXME: I avoided MSCompatUnicodeTable usage
1625 // here (it results in recursion). So check if
1626 // using NonSpacingMark makes sense or not.
1627 if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark)
1628 // if (!MSCompatUnicodeTable.IsIgnorable ((char) i))
1629 AddCharMap ((char) i, 0x1C, 1);
1631 // Thai ... note that it breaks 0x1E wall after E2B!
1632 // Also, all Thai characters have level 2 value 3.
1633 fillIndex [0x1E] = 2;
1634 for (int i = 0xE44; i < 0xE48; i++)
1635 AddCharMap ((char) i, 0x1E, 1, 3);
1636 for (int i = 0xE01; i < 0xE2B; i++)
1637 AddCharMap ((char) i, 0x1E, 6, 0);
1638 fillIndex [0x1F] = 5;
1639 for (int i = 0xE2B; i < 0xE30; i++)
1640 AddCharMap ((char) i, 0x1F, 6, 0);
1641 for (int i = 0xE30; i < 0xE3B; i++)
1642 AddCharMap ((char) i, 0x1F, 1, 3);
1643 // some Thai characters remain.
1644 char [] specialThai = new char [] {'\u0E45', '\u0E46',
1645 '\u0E4E', '\u0E4F', '\u0E5A', '\u0E5B'};
1646 foreach (char c in specialThai)
1647 AddCharMap (c, 0x1F, 1);
1650 fillIndex [0x1F] = 2;
1651 for (int i = 0xE80; i < 0xEDF; i++)
1652 if (Char.IsLetter ((char) i))
1653 AddCharMap ((char) i, 0x1F, 1);
1655 // Georgian. orderedGeorgian is from UCA DUCET.
1656 fillIndex [0x21] = 5;
1657 for (int i = 0; i < orderedGeorgian.Length; i++)
1658 AddLetterMap (orderedGeorgian [i], 0x21, 5);
1661 fillIndex [0x22] = 2;
1662 int kanaOffset = 0x3041;
1663 byte [] kanaLines = new byte [] {2, 2, 2, 2, 1, 3, 1, 2, 1};
1665 for (int gyo = 0; gyo < 9; gyo++) {
1666 for (int dan = 0; dan < 5; dan++) {
1667 if (gyo == 7 && dan % 2 == 1) {
1670 kanaOffset -= 2; // There is no space for yi and ye.
1673 int cp = kanaOffset + dan * kanaLines [gyo];
1674 // small lines (a-gyo, ya-gyo)
1675 if (gyo == 0 || gyo == 7) {
1676 AddKanaMap (cp, 1); // small
1677 AddKanaMap (cp + 1, 1);
1680 AddKanaMap (cp, kanaLines [gyo]);
1684 // add small 'Tsu' (before normal one)
1685 AddKanaMap (0x3063, 1);
1689 fillIndex [0x22] += 3;
1690 kanaOffset += 5 * kanaLines [gyo];
1693 // Wa-gyo is almost special, so I just manually add.
1694 AddLetterMap ((char) 0x308E, 0x22, 0);
1695 AddLetterMap ((char) (0x308E + 0x60), 0x22, 0);
1696 AddLetterMap ((char) 0x308F, 0x22, 0);
1697 AddLetterMap ((char) (0x308F + 0x60), 0x22, 0);
1699 AddLetterMap ((char) 0x3090, 0x22, 0);
1700 AddLetterMap ((char) (0x3090 + 0x60), 0x22, 0);
1701 fillIndex [0x22] += 2;
1702 // no "Wu" in Japanese.
1703 AddLetterMap ((char) 0x3091, 0x22, 0);
1704 AddLetterMap ((char) (0x3091 + 0x60), 0x22, 0);
1706 AddLetterMap ((char) 0x3092, 0x22, 0);
1707 AddLetterMap ((char) (0x3092 + 0x60), 0x22, 0);
1709 fillIndex [0x22] = 0x80;
1710 AddLetterMap ((char) 0x3093, 0x22, 0);
1711 AddLetterMap ((char) (0x3093 + 0x60), 0x22, 0);
1713 // JIS Japanese square chars.
1714 fillIndex [0x22] = 0x97;
1715 jisJapanese.Sort (JISComparer.Instance);
1716 foreach (JISCharacter j in jisJapanese)
1717 AddCharMap ((char) j.CP, 0x22, 1);
1718 // non-JIS Japanese square chars.
1719 nonJisJapanese.Sort (NonJISComparer.Instance);
1720 foreach (NonJISCharacter j in nonJisJapanese)
1721 AddCharMap ((char) j.CP, 0x22, 1);
1724 fillIndex [0x23] = 0x02;
1725 for (int i = 0x3105; i <= 0x312C; i++)
1726 AddCharMap ((char) i, 0x23, 1);
1728 // Estrangela: ancient Syriac
1729 fillIndex [0x24] = 0x0B;
1730 // FIXME: is 0x71E really alternative form?
1731 ArrayList syriacAlternatives = new ArrayList (
1732 new int [] {0x714, 0x716, 0x71C, 0x71E, 0x724, 0x727});
1733 for (int i = 0x0710; i <= 0x072C; i++) {
1734 if (i == 0x0711) // NonSpacingMark
1736 if (syriacAlternatives.Contains (i))
1738 AddCharMap ((char) i, 0x24, 4);
1743 foreach (int cp in syriacAlternatives)
1744 map [cp] = new CharMapEntry (0x24,
1745 (byte) (map [cp - 1].Level1 + 2),
1749 // FIXME: it turned out that it does not look like UCA
1750 fillIndex [0x24] = 0x6E;
1751 for (int i = 0; i < orderedThaana.Length; i++) {
1752 if (IsIgnorableNonSpacing (i))
1754 AddCharMap (orderedThaana [i], 0x24, 2);
1758 // FIXME: Add more culture-specific letters (that are
1759 // not supported in Windows collation) here.
1761 // Surrogate ... they are computed.
1766 // Unlike UCA Windows Hangul sequence mixes Jongseong
1767 // with Choseong sequence as well as Jungseong,
1768 // adjusted to have the same primary weight for the
1769 // same base character. So it is impossible to compute
1772 // Here I introduce an ordered sequence of mixed
1773 // 'commands' and 'characters' that is similar to
1775 // - ',' increases primary weight.
1776 // - [A B] means a range, increasing index
1777 // - {A B} means a range, without increasing index
1778 // - '=' is no operation (it means the characters
1779 // of both sides have the same weight).
1780 // - '>' inserts a Hangul Syllable block that
1781 // contains 0x251 characters.
1782 // - '<' decreases the index
1783 // - '0'-'9' means skip count
1784 // - whitespaces are ignored
1787 string hangulSequence =
1788 + "\u1100=\u11A8 > \u1101=\u11A9 >"
1789 + "\u11C3, \u11AA, \u11C4, \u1102=\u11AB >"
1790 + "<{\u1113 \u1116}, \u3165,"
1791 + "\u11C5, \u11C6=\u3166,, \u11C7, \u11C8,"
1792 + "\u11AC, \u11C9, \u11AD, \u1103=\u11AE >"
1793 + "\u11CA, \u1104, \u11CB > \u1105 >"
1794 + "\u11B0, [\u11CC \u11D0], \u11B1, [\u11D1 \u11D2],"
1795 + "\u11B2, [\u11D3 \u11D5], \u11B3,"
1796 + "[\u11D6 \u11D7], \u11B4, \u11B5,"
1797 + "\u11B6=\u11D8, \u3140,, \u11D9, \u1106=\u11B7 >"
1798 + "[\u11DA \u11E2], \u1107=\u11B8 >"
1799 + "<{\u111E \u1120}, \u3172,, \u3173, "
1800 + "\u11E3, \u1108 >"
1801 + "\u11B9,,,,,,,,, [\u11E4 \u11E6],, \u1109=\u11BA,,,"
1802 + "\u3214=\u3274 <>"
1803 + "<{\u112D \u1133}, \u11E7,, [\u11E8 \u11E9],,"
1804 + "\u11EA,, \u110A=\u11BB,,, >"
1805 + "{\u1134 \u1140}, \u317E,,,,,, \u11EB,"
1806 + "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >"
1807 + "\u11EE, \u11EC, \u11ED,,,,, \u11F1,, \u11F2,,,"
1808 + "\u11EF,,, \u11F0, \u110C=\u11BD,, >"
1810 + "<{\u114E \u1151},, \u110E=\u11BE,, >"
1811 + "<{\u1152 \u1155},,, \u110F=\u11BF >"
1812 + "\u1110=\u11C0 > \u1111=\u11C1 >"
1813 + "\u11F3, \u11F4, \u1112=\u11C2 >"
1814 + "\u11F9, [\u11F5 \u11F8]"
1817 byte hangulCat = 0x52;
1818 fillIndex [hangulCat] = 0x2;
1820 int syllableBlock = 0;
1821 for (int n = 0; n < hangulSequence.Length; n++) {
1822 char c = hangulSequence [n];
1824 if (Char.IsWhiteSpace (c))
1830 IncrementSequentialIndex (ref hangulCat);
1833 if (fillIndex [hangulCat] == 2)
1834 throw new Exception ("FIXME: handle it correctly (yes it is hacky, it is really unfortunate).");
1835 fillIndex [hangulCat]--;
1838 IncrementSequentialIndex (ref hangulCat);
1839 for (int l = 0; l < 0x15; l++)
1840 for (int v = 0; v < 0x1C; v++) {
1842 (char) (0xAC00 + syllableBlock * 0x1C * 0x15 + l * 0x1C + v), hangulCat, 0);
1843 IncrementSequentialIndex (ref hangulCat);
1848 start = hangulSequence [n + 1];
1849 end = hangulSequence [n + 3];
1850 for (int i = start; i <= end; i++) {
1851 AddCharMap ((char) i, hangulCat, 0);
1853 IncrementSequentialIndex (ref hangulCat);
1855 n += 4; // consumes 5 characters for this operation
1858 start = hangulSequence [n + 1];
1859 end = hangulSequence [n + 3];
1860 for (int i = start; i <= end; i++)
1861 AddCharMap ((char) i, hangulCat, 0);
1862 n += 4; // consumes 5 characters for this operation
1865 AddCharMap (c, hangulCat, 0);
1872 // Letterlike characters and CJK compatibility square
1873 sortableCharNames.Sort (StringDictionaryValueComparer.Instance);
1874 int [] counts = new int ['Z' - 'A' + 1];
1875 char [] namedChars = new char [sortableCharNames.Count];
1877 foreach (DictionaryEntry de in sortableCharNames) {
1878 counts [((string) de.Value) [0] - 'A']++;
1879 namedChars [nCharNames++] = (char) ((int) de.Key);
1881 nCharNames = 0; // reset
1882 for (int a = 0; a < counts.Length; a++) {
1883 fillIndex [0xE] = (byte) (alphaWeights [a + 1] - counts [a]);
1884 for (int i = 0; i < counts [a]; i++)
1885 //Console.Error.WriteLine ("---- {0:X04} : {1:x02} / {2} {3}", (int) namedChars [nCharNames], fillIndex [0xE], ((DictionaryEntry) sortableCharNames [nCharNames]).Value, Char.GetUnicodeCategory (namedChars [nCharNames]));
1886 AddCharMap (namedChars [nCharNames++], 0xE, 1);
1889 // CJK unified ideograph.
1891 fillIndex [cjkCat] = 0x2;
1892 for (int cp = 0x4E00; cp <= 0x9FBB; cp++)
1893 if (!IsIgnorable (cp))
1894 AddCharMapGroupCJK ((char) cp, ref cjkCat);
1895 // CJK Extensions go here.
1896 // LAMESPEC: With this Windows style CJK layout, it is
1897 // impossible to add more CJK ideograph i.e. 0x9FA6-
1898 // 0x9FBB can never be added w/o breaking compat.
1899 for (int cp = 0xF900; cp <= 0xFA2D; cp++)
1900 if (!IsIgnorable (cp))
1901 AddCharMapGroupCJK ((char) cp, ref cjkCat);
1903 // PrivateUse ... computed.
1904 // remaining Surrogate ... computed.
1906 #region Special "biggest" area (FF FF)
1907 fillIndex [0xFF] = 0xFF;
1908 char [] specialBiggest = new char [] {
1909 '\u3005', '\u3031', '\u3032', '\u309D',
1910 '\u309E', '\u30FC', '\u30FD', '\u30FE',
1911 '\uFE7C', '\uFE7D', '\uFF70'};
1912 foreach (char c in specialBiggest)
1913 AddCharMap (c, 0xFF, 0);
1916 #region 07 - ASCII non-alphanumeric + 3001, 3002 // 07
1917 // non-alphanumeric ASCII except for: + - < = > '
1918 for (int i = 0x21; i < 0x7F; i++) {
1919 if (Char.IsLetterOrDigit ((char) i)
1920 || "+-<=>'".IndexOf ((char) i) >= 0)
1921 continue; // they are not added here.
1922 AddCharMapGroup2 ((char) i, 0x7, 1, 0);
1923 // Insert 3001 after ',' and 3002 after '.'
1925 AddCharMapGroup2 ('\u3001', 0x7, 1, 0);
1926 else if (i == 0x2E) {
1928 AddCharMapGroup2 ('\u3002', 0x7, 1, 0);
1931 AddCharMap ('\uFE30', 0x7, 1, 0);
1935 #region 07 - Punctuations and something else
1936 for (int i = 0xA0; i < char.MaxValue; i++) {
1937 if (IsIgnorable (i))
1948 switch (Char.GetUnicodeCategory ((char) i)) {
1949 case UnicodeCategory.OtherPunctuation:
1950 case UnicodeCategory.ClosePunctuation:
1951 case UnicodeCategory.InitialQuotePunctuation:
1952 case UnicodeCategory.FinalQuotePunctuation:
1953 case UnicodeCategory.ModifierSymbol:
1954 // SPECIAL CASES: // 0xA
1955 if (0x2020 <= i && i <= 0x2042)
1957 AddCharMapGroup ((char) i, 0x7, 1, 0);
1960 if (i == 0xA6) // SPECIAL CASE. FIXME: why?
1961 goto case UnicodeCategory.OtherPunctuation;
1967 // FIXME: for 07 xx we need more love.
1969 // Characters w/ diacritical marks (NFKD)
1970 for (int i = 0; i <= char.MaxValue; i++) {
1971 if (map [i].Defined || IsIgnorable (i))
1973 if (decompIndex [i] == 0)
1976 int start = decompIndex [i];
1977 int primaryChar = decompValues [start];
1980 int length = decompLength [i];
1981 // special processing for parenthesized ones.
1983 decompValues [start] == '(' &&
1984 decompValues [start + 2] == ')') {
1985 primaryChar = decompValues [start + 1];
1989 if (map [primaryChar].Level1 == 0)
1992 for (int l = 1; l < length; l++) {
1993 int c = decompValues [start + l];
1994 if (map [c].Level1 != 0)
1996 secondary += diacritical [c];
2000 map [i] = new CharMapEntry (
2001 map [primaryChar].Category,
2002 map [primaryChar].Level1,
2007 #region Level2 adjustment
2009 diacritical [0x624] = 0x5;
2010 diacritical [0x626] = 0x7;
2011 diacritical [0x622] = 0x9;
2012 diacritical [0x623] = 0xA;
2013 diacritical [0x625] = 0xB;
2014 diacritical [0x649] = 0x5; // 'alif maqs.uurah
2015 diacritical [0x64A] = 0x7; // Yaa'
2018 for (int i = 0; i < char.MaxValue; i++) {
2020 byte cat = map [i].Category;
2022 case 0xE: // Latin diacritics
2023 case 0x22: // Japanese: circled characters
2024 mod = diacritical [i];
2026 case 0x13: // Arabic
2027 if (diacritical [i] == 0)
2028 mod = 0x8; // default for arabic
2031 if (0x52 <= cat && cat <= 0x7F) // Hangul
2032 mod = diacritical [i];
2034 map [i] = new CharMapEntry (
2035 cat, map [i].Level1, mod);
2039 // FIXME: this is hack but those which are
2040 // NonSpacingMark characters and still undefined
2041 // are likely to be nonspacing.
2042 for (int i = 0; i < char.MaxValue; i++)
2043 if (!map [i].Defined &&
2045 Char.GetUnicodeCategory ((char) i) ==
2046 UnicodeCategory.NonSpacingMark)
2047 AddCharMap ((char) i, 1, 1);
// Advances the fill index of the category referenced by hangulCat by one.
// fillIndex entries are bytes, so a value of 0 right after the increment
// means the counter wrapped around; in that case the fill index is
// restarted at 0x2.
// NOTE(review): hangulCat is a ref parameter, so the overflow branch
// presumably also moves on to the next category before the reset; that
// statement is not visible in this extract -- confirm against the full file.
2050 private void IncrementSequentialIndex (ref byte hangulCat)
2052 fillIndex [hangulCat]++;
2053 if (fillIndex [hangulCat] == 0) { // overflowed: the byte counter wrapped around
2055 fillIndex [hangulCat] = 0x2;
2059 // Resets fillIndex [category] to the fixed value alphaWeight and calls AddLetterMap()
// for c with an update count of 0 (so the fill index is not advanced here).
// Afterwards the entry latinMap [c] is read as an ArrayList of code points
// and each of them is passed to AddLetterMap() the same way.
// NOTE(review): "as ArrayList" can yield null; a guard between the lookup
// and the foreach is presumably present but not visible in this extract.
2060 private void AddAlphaMap (char c, byte category, byte alphaWeight)
2062 fillIndex [category] = alphaWeight;
2063 AddLetterMap (c, category, 0);
2065 ArrayList al = latinMap [c] as ArrayList;
2069 foreach (int cp in al)
2070 AddLetterMap ((char) cp, category, 0);
// Maps a run of "voices" consecutive code points starting at i into
// category 0x22. For every offset b in [0, voices) both (i + b) and the
// code point 0x60 above it are passed to AddLetterMapCore() with an update
// count of 0. The level-2 argument is 0 for the first code point (b == 0)
// and b + 2 for the following ones.
// NOTE(review): the +0x60 counterpart is presumably the Katakana form of a
// Hiragana code point -- confirm against the callers.
2073 private void AddKanaMap (int i, byte voices)
2075 for (byte b = 0; b < voices; b++) {
2076 char c = (char) (i + b);
2077 byte arg = (byte) (b > 0 ? b + 2 : 0);
2079 AddLetterMapCore (c, 0x22, 0, arg);
2081 AddLetterMapCore ((char) (c + 0x60), 0x22, 0, arg);
// Convenience overload: forwards to AddLetterMapCore() with a level-2
// value of 0.
2085 private void AddLetterMap (char c, byte category, byte updateCount)
2087 AddLetterMapCore (c, category, updateCount, 0);
// Registers letter c (and related forms) in the given category with the
// given level-2 value. Visible steps, in order:
//   1. the result of ToSmallForm (c) is passed to AddCharMapGroup() with
//      updateCount;
//   2. if the invariant-culture lowercase of c differs from c and is not
//      yet defined in map, it is registered recursively with update count 0;
//   3. c itself is passed to AddCharMapGroup() with update count 0;
//   4. fillIndex [category] is advanced by updateCount.
// NOTE(review): the declaration of c2 and the conditions guarding steps 1,
// 3 and 4 (including how doUpdate is consumed) are not visible in this
// extract -- confirm against the full file.
2090 private void AddLetterMapCore (char c, byte category, byte updateCount, byte level2)
2093 // <small> updates index
2094 c2 = ToSmallForm (c);
2096 AddCharMapGroup (c2, category, updateCount, level2);
2097 c2 = Char.ToLower (c, CultureInfo.InvariantCulture);
2098 if (c2 != c && !map [(int) c2].Defined)
2099 AddLetterMapCore (c2, category, 0, level2);
2100 bool doUpdate = true;
2101 if (IsIgnorable ((int) c) || map [(int) c].Defined)
2104 AddCharMapGroup (c, category, 0, level2);
2106 fillIndex [category] += updateCount;
// Convenience overload: forwards to the four-argument AddCharMap() with
// alt = 0 and returns its result.
2109 private bool AddCharMap (char c, byte category, byte increment)
2111 return AddCharMap (c, category, increment, 0);
// Creates the map entry for c unless c is ignorable or already defined,
// in which case nothing is changed and false is returned.
// The new CharMapEntry receives the category plus two values whose order
// depends on the category: for category 1 they are (alt, fillIndex
// [category]); for every other category they are (fillIndex [category],
// alt). After the entry is stored, fillIndex [category] is advanced by
// "increment".
// NOTE(review): the return statement of the success path is not visible in
// this extract (presumably "return true").
2114 private bool AddCharMap (char c, byte category, byte increment, byte alt)
2116 if (IsIgnorable ((int) c) || map [(int) c].Defined)
2117 return false; // do nothing
2118 map [(int) c] = new CharMapEntry (category,
2119 category == 1 ? alt : fillIndex [category],
2120 category == 1 ? fillIndex [category] : alt);
2121 fillIndex [category] += increment;
// Registers c together with its "tail" variants, each with level-2 value 0:
// first the result of ToSmallFormTail (c), then c itself, and finally the
// result of ToFullWidthTail (c), which is handled by a recursive call.
// NOTE(review): the conditions that guard the small-form and full-width
// calls are not visible in this extract -- confirm against the full file.
2125 private void AddCharMapGroupTail (char c, byte category, byte updateCount)
2127 char c2 = ToSmallFormTail (c);
2129 AddCharMap (c2, category, updateCount, 0);
2131 AddCharMap (c, category, updateCount, 0);
2133 c2 = ToFullWidthTail (c);
2135 AddCharMapGroupTail (c2, category, updateCount);
2139 // Adds characters to table in the order below
2140 // (+ increases weight):
2144 // <full> | <super> | <sub>
2145 // <circle> | <wide> (| <narrow>)
2149 // level2 is fixed (does not increase).
// Decomposition kinds that AddCharMapGroup() iterates over; the character
// found for each kind is added with an increment of 0, i.e. without
// advancing the fill index.
// NOTE(review): some entries of this initializer are not visible in this
// extract.
2150 int [] sameWeightItems = new int [] {
2151 DecompositionFraction,
2155 DecompositionCircle,
2157 DecompositionNarrow,
// Registers c and the characters related to it through nfkdMap [c].
// Visible steps, in order:
//   - the method starts with a check whether c is already defined in map
//     (the action taken is not visible; presumably an early return);
//   - the <small> and <vertical> variants are looked up in the nfkd table;
//   - the <small> variant, if any, is added with updateCount (level-2 0);
//   - c itself is added with increment 0 and the given level2;
//   - for every kind in sameWeightItems the mapped character is added with
//     increment 0 and the given level2;
//   - fillIndex [category] is advanced by updateCount;
//   - the <vertical> variant, if any, is added with updateCount and level2.
// NOTE(review): null checks on nfkd / smv / vv / wv are presumably present
// on lines that are not visible in this extract.
2159 private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2)
2161 if (map [(int) c].Defined)
2164 char small = char.MinValue;
2165 char vertical = char.MinValue;
2166 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
2168 object smv = nfkd [(byte) DecompositionSmall];
2170 small = (char) ((int) smv);
2171 object vv = nfkd [(byte) DecompositionVertical];
2173 vertical = (char) ((int) vv);
2176 // <small> updates index
2177 if (small != char.MinValue)
2178 AddCharMap (small, category, updateCount);
2181 AddCharMap (c, category, 0, level2);
2184 foreach (int weight in sameWeightItems) {
2185 object wv = nfkd [(byte) weight];
2187 AddCharMap ((char) ((int) wv), category, 0, level2);
2191 // update index here.
2192 fillIndex [category] += updateCount;
2194 if (vertical != char.MinValue)
2195 AddCharMap (vertical, category, updateCount, level2);
// Adds c to the current CJK category (increment 0, level-2 0) and then
// advances the sequential index via IncrementSequentialIndex(). category
// is passed by ref all the way down to that helper.
// The position category 0x9E / index 0xF9 is skipped by advancing the
// index once more.
// NOTE(review): AddCharMap() may decline to add c (ignorable or already
// defined); whether the index advance is conditional on its result is not
// visible in this extract.
2198 private void AddCharMapCJK (char c, ref byte category)
2200 AddCharMap (c, category, 0, 0);
2201 IncrementSequentialIndex (ref category);
2203 // Special. I wonder why but Windows skips 9E F9.
2204 if (category == 0x9E && fillIndex [category] == 0xF9)
2205 IncrementSequentialIndex (ref category);
// Adds the CJK character c through AddCharMapCJK() and then the characters
// related to it through nfkdMap [c].
// U+52DE gets special treatment: U+3298 and U+3238 are added right after
// it. U+32A2 and U+32A9 are also added explicitly; the conditions that
// select them are not visible in this extract.
// For the remaining related characters, every decomposition kind from 0 to
// 0x12 is looked up in the nfkd table. Values in 0xF900..0xFAD9 and the
// four code points handled explicitly above get special handling
// (presumably they are skipped); everything else is added through
// AddCharMapCJK().
// NOTE(review): the declaration of w, the null checks and the
// continue/break statements are not visible here -- confirm against the
// full file.
2208 private void AddCharMapGroupCJK (char c, ref byte category)
2210 AddCharMapCJK (c, ref category);
2212 // LAMESPEC: see below.
2213 if (c == '\u52DE') {
2214 AddCharMapCJK ('\u3298', ref category);
2215 AddCharMapCJK ('\u3238', ref category);
2218 AddCharMapCJK ('\u32A2', ref category);
2220 // Especially this mapping order totally does
2221 // not make sense to me.
2222 AddCharMapCJK ('\u32A9', ref category);
2224 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
2227 for (byte weight = 0; weight <= 0x12; weight++) {
2228 object wv = nfkd [weight];
2233 // Special: they are ignored in this area.
2234 // FIXME: check if it is sane
2235 if (0xF900 <= w && w <= 0xFAD9)
2237 // LAMESPEC: on Windows some of CJK characters
2238 // in 3200-32B0 are incorrectly mapped. They
2239 // mix Chinese and Japanese Kanji when
2240 // ordering those characters.
2242 case 0x32A2: case 0x3298: case 0x3238: case 0x32A9:
2246 AddCharMapCJK ((char) w, ref category);
2250 // For now it is only for 0x7 category.
// Variant of AddCharMapGroup() in which every AddCharMap() call passes
// updateCount, so each character that is actually added advances the fill
// index. In the visible lines there is no up-front check of
// map [c].Defined.
// Order of registration:
//   - the <small> variant of c, except U+2024;
//   - c itself;
//   - every code point c2 whose decomposition has length 1, equals c, and
//     is of kind DecompositionCompat (found by scanning all code points
//     below char.MaxValue instead of using nfkdMap);
//   - the <vertical> variant of c, except U+FE33 and U+FE34.
// NOTE(review): null checks on nfkd / smv / vv and further case labels of
// the switch may exist on lines that are not visible in this extract.
2251 private void AddCharMapGroup2 (char c, byte category, byte updateCount, byte level2)
2253 char small = char.MinValue;
2254 char vertical = char.MinValue;
2255 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
2257 object smv = nfkd [(byte) DecompositionSmall];
2259 small = (char) ((int) smv);
2260 object vv = nfkd [(byte) DecompositionVertical];
2262 vertical = (char) ((int) vv);
2265 // <small> updates index
2266 if (small != char.MinValue)
2267 // SPECIAL CASE excluded (FIXME: why?)
2268 if (small != '\u2024')
2269 AddCharMap (small, category, updateCount);
2272 AddCharMap (c, category, updateCount, level2);
2274 // Since nfkdMap is problematic to have two or more
2275 // NFKD to an identical character, here I iterate all.
2276 for (int c2 = 0; c2 < char.MaxValue; c2++) {
2277 if (decompLength [c2] == 1 &&
2278 (int) (decompValues [decompIndex [c2]]) == (int) c) {
2279 switch (decompType [c2]) {
2280 case DecompositionCompat:
2281 AddCharMap ((char) c2, category, updateCount, level2);
2287 if (vertical != char.MinValue)
2288 // SPECIAL CASE excluded (FIXME: why?)
2289 if (vertical != '\uFE33' && vertical != '\uFE34')
2290 AddCharMap (vertical, category, updateCount, level2);
// Shorthand for ToDecomposed (c, DecompositionFull, false).
2293 char ToFullWidth (char c)
2295 return ToDecomposed (c, DecompositionFull, false);
// Shorthand for ToDecomposed (c, DecompositionFull, true).
2298 char ToFullWidthTail (char c)
2300 return ToDecomposed (c, DecompositionFull, true);
// Shorthand for ToDecomposed (c, DecompositionSmall, false).
2303 char ToSmallForm (char c)
2305 return ToDecomposed (c, DecompositionSmall, false);
// Shorthand for ToDecomposed (c, DecompositionSmall, true).
2308 char ToSmallFormTail (char c)
2310 return ToDecomposed (c, DecompositionSmall, true);
// Looks up one character of the decomposition of c, provided that the
// decomposition kind recorded in decompType [c] equals d.
// The lookup starts at decompIndex [c]; adding decompLength [c] - 1 moves
// it to the last element of the decomposition.
// NOTE(review): the value returned when the kind does not match, and the
// condition on "tail" that presumably guards the move to the last element,
// are on lines not visible in this extract -- confirm against the full file.
2313 char ToDecomposed (char c, byte d, bool tail)
2315 if (decompType [(int) c] != d)
2317 int idx = decompIndex [(int) c];
2319 idx += decompLength [(int) c] - 1;
2320 return (char) decompValues [idx];
// Iterates over the jisJapanese collection of JISCharacter entries.
// NOTE(review): the loop body and the return statements are not visible in
// this extract; presumably this reports whether an entry with code point
// cp exists -- confirm against the full file.
2323 bool ExistsJIS (int cp)
2325 foreach (JISCharacter j in jisJapanese)
2333 #region Level 3 properties (Case/Width)
// Returns the level 3 weight for c as stored in the sort key: the raw
// value from ComputeLevel3WeightRaw() plus 2 when the raw value is
// non-zero, and 0 when the raw value is 0.
2335 private byte ComputeLevel3Weight (char c)
2337 byte b = ComputeLevel3WeightRaw (c);
2338 return b > 0 ? (byte) (b + 2) : b;
// Computes the raw level 3 weight of c; ComputeLevel3Weight() adds 2 to
// any non-zero result.
// The visible structure is a series of code point range tests, a switch on
// the decomposition kind for U+FE80..U+FE8D, and a final section that ORs
// the decomposition kind into "ret" for <wide>, <sub> and <super> and then
// tests the isSmallCapital and isUppercase tables.
// NOTE(review): the values returned by the individual branches, the
// declaration of "ret" and a sizeable part of the body are on lines not
// visible in this extract -- confirm against the full file before relying
// on any specific weight.
2341 private byte ComputeLevel3WeightRaw (char c) // add 2 for sortkey value
2344 if ('\u11A8' <= c && c <= '\u11F9')
2346 if ('\uFFA0' <= c && c <= '\uFFDC')
2348 if ('\u3130' <= c && c <= '\u3164')
2351 if ('\u2776' <= c && c <= '\u277F')
2353 if ('\u2780' <= c && c <= '\u2789')
// Reached only when the two narrower tests above did not match.
2355 if ('\u2776' <= c && c <= '\u2793')
2357 if ('\u2160' <= c && c <= '\u216F')
2359 if ('\u2181' <= c && c <= '\u2182')
2362 if ('\u2135' <= c && c <= '\u2138')
2364 if ('\uFE80' <= c && c < '\uFE8E') {
2365 // 2(Isolated)/8(Final)/0x18(Medial)
2366 switch (decompType [(int) c]) {
2367 case DecompositionIsolated:
2369 case DecompositionFinal:
2371 case DecompositionMedial:
2376 // actually I dunno the reason why they have weights.
2399 switch (decompType [(int) c]) {
2400 case DecompositionWide: // <wide>
2401 case DecompositionSub: // <sub>
2402 case DecompositionSuper: // <super>
2403 ret |= decompType [(int) c];
2406 if (isSmallCapital [(int) c]) // grep "SMALL CAPITAL"
2408 if (isUppercase [(int) c]) // DerivedCoreProperties
2417 // FIXME: In the future use DerivedAge.txt to examine character
2418 // versions and set those ones that have higher version than
2419 // 1.0 as ignorable.
//
// Classifies code point i as ignorable or not for this table generator.
// The visible structure is, in order: a list of individual code points, a
// list of exceptional code points that the range tests below would
// otherwise catch, a chain of code point ranges, and finally a test on
// Char.GetUnicodeCategory ((char) i) covering PrivateUse, Surrogate,
// Format and OtherNotAssigned.
// NOTE(review): the switch/if headers and all return statements are on
// lines not visible in this extract, so which result each group yields
// must be confirmed against the full file.
2420 static bool IsIgnorable (int i)
2424 // I guess, those characters are added between
2425 // Unicode 1.0 (LCMapString) and Unicode 3.1
2426 // (UnicodeCategory), so they used to be
2427 // something like OtherNotAssigned as of Unicode 1.1.
2428 case 0x2df: case 0x387:
2429 case 0x3d7: case 0x3d8: case 0x3d9:
2430 case 0x3f3: case 0x3f4: case 0x3f5: case 0x3f6:
2431 case 0x400: case 0x40d: case 0x450: case 0x45d:
2432 case 0x587: case 0x58a: case 0x5c4: case 0x640:
2433 case 0x653: case 0x654: case 0x655: case 0x66d:
2435 case 0x1e9b: case 0x202f: case 0x20ad:
2436 case 0x20ae: case 0x20af:
2437 case 0x20e2: case 0x20e3:
2438 case 0x2139: case 0x213a: case 0x2183:
2439 case 0x2425: case 0x2426: case 0x2619:
2440 case 0x2670: case 0x2671: case 0x3007:
2441 case 0x3190: case 0x3191:
2442 case 0xfffc: case 0xfffd:
2444 // exceptional characters filtered by the
2445 // following conditions. Originally those exceptional
2446 // ranges are incorrect (they should not be ignored)
2447 // and most of those characters are unfortunately in
2449 case 0x4d8: case 0x4d9:
2450 case 0x4e8: case 0x4e9:
2451 case 0x3036: case 0x303f:
2452 case 0x337b: case 0xfb1e:
2457 // The whole Sinhala characters.
2458 0x0D82 <= i && i <= 0x0DF4
2459 // The whole Tibetan characters.
2460 || 0x0F00 <= i && i <= 0x0FD1
2461 // The whole Myanmar characters.
2462 || 0x1000 <= i && i <= 0x1059
2463 // The whole Ethiopic, Cherokee,
2464 // Canadian Syllabics, Ogham, Runic,
2465 // Tagalog, Hanunoo, Philippine,
2466 // Buhid, Tagbanwa, Khmer and Mongolian
2468 || 0x1200 <= i && i <= 0x1DFF
2469 // Greek extension characters.
2470 || 0x1F00 <= i && i <= 0x1FFF
2471 // The whole Braille characters.
2472 || 0x2800 <= i && i <= 0x28FF
2473 // CJK radical characters.
2474 || 0x2E80 <= i && i <= 0x2EF3
2475 // Kangxi radical characters.
2476 || 0x2F00 <= i && i <= 0x2FD5
2477 // Ideographic description characters.
2478 || 0x2FF0 <= i && i <= 0x2FFB
2479 // Bopomofo letter and final
2480 || 0x31A0 <= i && i <= 0x31B7
2481 // White square with quadrant characters.
2482 || 0x25F0 <= i && i <= 0x25F7
2483 // Ideographic telegraph symbols.
2484 || 0x32C0 <= i && i <= 0x32CB
2485 || 0x3358 <= i && i <= 0x3370
2486 || 0x33E0 <= i && i <= 0x33FF
2487 // The whole YI characters.
2488 || 0xA000 <= i && i <= 0xA48C
2489 || 0xA490 <= i && i <= 0xA4C6
2490 // Armenian small ligatures
2491 || 0xFB13 <= i && i <= 0xFB17
2492 // hebrew, arabic, variation selector.
2493 || 0xFB1D <= i && i <= 0xFE2F
2494 // Arabic ligatures.
2495 || 0xFEF5 <= i && i <= 0xFEFC
2496 // FIXME: why are they excluded?
2497 || 0x01F6 <= i && i <= 0x01F9
2498 || 0x0218 <= i && i <= 0x0233
2499 || 0x02A9 <= i && i <= 0x02AD
2500 || 0x02EA <= i && i <= 0x02EE
2501 || 0x0349 <= i && i <= 0x036F
2502 || 0x0488 <= i && i <= 0x048F
2503 || 0x04D0 <= i && i <= 0x04FF
2504 || 0x0500 <= i && i <= 0x050F // actually it matters only for 2.0
2505 || 0x06D6 <= i && i <= 0x06ED
2506 || 0x06FA <= i && i <= 0x06FE
2507 || 0x2048 <= i && i <= 0x204D
2508 || 0x20e4 <= i && i <= 0x20ea
2509 || 0x213C <= i && i <= 0x214B
2510 || 0x21EB <= i && i <= 0x21FF
2511 || 0x22F2 <= i && i <= 0x22FF
2512 || 0x237B <= i && i <= 0x239A
2513 || 0x239B <= i && i <= 0x23CF
2514 || 0x24EB <= i && i <= 0x24FF
2515 || 0x2596 <= i && i <= 0x259F
2516 || 0x25F8 <= i && i <= 0x25FF
2517 || 0x2672 <= i && i <= 0x2689
2518 || 0x2768 <= i && i <= 0x2775
2519 || 0x27d0 <= i && i <= 0x27ff
2520 || 0x2900 <= i && i <= 0x2aff
2521 || 0x3033 <= i && i <= 0x303F
2522 || 0x31F0 <= i && i <= 0x31FF
2523 || 0x3250 <= i && i <= 0x325F
2524 || 0x32B1 <= i && i <= 0x32BF
2525 || 0x3371 <= i && i <= 0x337B
2526 || 0xFA30 <= i && i <= 0xFA6A
2530 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
2532 case UnicodeCategory.PrivateUse:
2533 case UnicodeCategory.Surrogate:
2535 // ignored by nature
2536 case UnicodeCategory.Format:
2537 case UnicodeCategory.OtherNotAssigned:
2544 // To check IsIgnorable sanity, try the driver below under MS.NET.
2547 public static void Main ()
2549 for (int i = 0; i <= char.MaxValue; i++)
2550 Dump (i, IsIgnorable (i));
2553 static void Dump (int i, bool ignore)
2555 switch (Char.GetUnicodeCategory ((char) i)) {
2556 case UnicodeCategory.PrivateUse:
2557 case UnicodeCategory.Surrogate:
2558 return; // check nothing
2562 string s2 = new string ((char) i, 10);
2563 int ret = CultureInfo.InvariantCulture.CompareInfo.Compare (s1, s2, CompareOptions.IgnoreCase);
2564 if ((ret == 0) == ignore)
2566 Console.WriteLine ("{0} : {1:x} {2}", ignore ? "o" : "x", i, Char.GetUnicodeCategory ((char) i));
2569 #endregion // IsIgnorable
2571 #region IsIgnorableSymbol
// Reports whether the character value i (cast to char for all Char.* queries)
// is classified as an "ignorable symbol". The sanity driver that follows this
// method compares the result against CompareInfo.Compare (...,
// CompareOptions.IgnoreSymbols), so that option is the behavior being modeled.
// Checks are made in this order: IsIgnorable (i), individually listed code
// points, listed ranges, the character's UnicodeCategory, and finally the
// generic Char.Is* predicates.
// NOTE(review): confirm the value returned by each group of labels/conditions
// against its return statement before relying on the grouping below.
2572 static bool IsIgnorableSymbol (int i)
2574 if (IsIgnorable (i)) // the general ignorable test is consulted first
// Individually listed code points.
2579 case 0x00b5: case 0x01C0: case 0x01C1:
2580 case 0x01C2: case 0x01C3: case 0x01F6:
2581 case 0x01F7: case 0x01F8: case 0x01F9:
2582 case 0x02D0: case 0x02EE: case 0x037A:
2583 case 0x03D7: case 0x03F3:
2584 case 0x0400: case 0x040d:
2585 case 0x0450: case 0x045d:
2586 case 0x048C: case 0x048D:
2587 case 0x048E: case 0x048F:
2588 case 0x0587: case 0x0640: case 0x06E5:
2589 case 0x06E6: case 0x06FA: case 0x06FB:
2590 case 0x06FC: case 0x093D: case 0x0950:
2591 case 0x1E9B: case 0x2139: case 0x3006:
2592 case 0x3033: case 0x3034: case 0x3035:
2593 case 0xFE7E: case 0xFE7F:
2595 case 0x16EE: case 0x16EF: case 0x16F0:
2597 case 0x2183: // ROMAN NUMERAL REVERSED ONE HUNDRED
2598 case 0x3007: // IDEOGRAPHIC NUMBER ZERO
2599 case 0x3038: // HANGZHOU NUMERAL TEN
2600 case 0x3039: // HANGZHOU NUMERAL TWENTY
2601 case 0x303a: // HANGZHOU NUMERAL THIRTY
2607 case 0x02B9: case 0x02BA: case 0x02C2:
2608 case 0x02C3: case 0x02C4: case 0x02C5:
2609 case 0x02C8: case 0x02CC: case 0x02CD:
2610 case 0x02CE: case 0x02CF: case 0x02D2:
2611 case 0x02D3: case 0x02D4: case 0x02D5:
2612 case 0x02D6: case 0x02D7: case 0x02DE:
2613 case 0x02E5: case 0x02E6: case 0x02E7:
2614 case 0x02E8: case 0x02E9:
2615 case 0x309B: case 0x309C:
2617 case 0x055A: // ARMENIAN APOSTROPHE
2618 case 0x05C0: // HEBREW PUNCTUATION PASEQ
2619 case 0x0E4F: // THAI CHARACTER FONGMAN
2620 case 0x0E5A: // THAI CHARACTER ANGKHANKHU
2621 case 0x0E5B: // THAI CHARACTER KHOMUT
2623 case 0x09F2: // BENGALI RUPEE MARK
2624 case 0x09F3: // BENGALI RUPEE SIGN
2626 case 0x221e: // INFINITY
// Listed ranges (note the mix of exclusive "<" and inclusive "<=" upper bounds).
2635 if (0xFE70 <= i && i < 0xFE7C // ARABIC LIGATURES B
2637 || 0x0501 <= i && i <= 0x0510 // CYRILLIC KOMI
2638 || 0xFA30 <= i && i < 0xFA70 // CJK COMPAT
2643 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i); // the category drives the next set of decisions
2645 case UnicodeCategory.Surrogate:
2646 return false; // inconsistent
2648 case UnicodeCategory.SpacingCombiningMark:
2649 case UnicodeCategory.EnclosingMark:
2650 case UnicodeCategory.NonSpacingMark:
2651 case UnicodeCategory.PrivateUse:
2653 if (0x064B <= i && i <= 0x0652) // Arabic
2657 case UnicodeCategory.Format:
2658 case UnicodeCategory.OtherNotAssigned:
2665 // latin in a circle
2666 0x249A <= i && i <= 0x24E9
2667 || 0x2100 <= i && i <= 0x2132
2669 || 0x3196 <= i && i <= 0x31A0
2671 || 0x3200 <= i && i <= 0x321C
2673 || 0x322A <= i && i <= 0x3243
2675 || 0x3260 <= i && i <= 0x32B0
2676 || 0x32D0 <= i && i <= 0x3357
2677 || 0x337B <= i && i <= 0x33DD
2679 use = !Char.IsLetterOrDigit ((char) i); // the flag is set only for characters that are neither letters nor digits
2683 // This "Digit" rule is a mystery.
2684 // It filters some symbols out.
2685 if (Char.IsLetterOrDigit ((char) i))
2687 if (Char.IsNumber ((char) i))
2689 if (Char.IsControl ((char) i)
2690 || Char.IsSeparator ((char) i)
2691 || Char.IsPunctuation ((char) i))
2693 if (Char.IsSymbol ((char) i))
2696 // FIXME: should check more
2701 // To check IsIgnorableSymbol sanity, try the driver below under MS.NET.
2703 public static void Main ()
2705 CompareInfo ci = CultureInfo.InvariantCulture.CompareInfo;
2706 for (int i = 0; i <= char.MaxValue; i++) {
2707 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
2708 if (uc == UnicodeCategory.Surrogate)
2711 bool ret = IsIgnorableSymbol (i);
2713 string s1 = "TEST ";
2714 string s2 = "TEST " + (char) i;
2716 int result = ci.Compare (s1, s2, CompareOptions.IgnoreSymbols);
2718 if (ret != (result == 0))
2719 Console.WriteLine ("{0} : {1:x}[{2}]({3})",
2720 ret ? "should not ignore" :
// Reports whether the character value i is classified as an ignorable
// non-spacing character. IsIgnorable (i) is consulted first, then two groups
// of individually listed code points, then two groups of ranges; anything not
// decided by those falls through to a plain "is the Unicode category
// NonSpacingMark?" test.
// NOTE(review): confirm which of the two label groups / range groups yields
// true and which yields false against their return statements.
2729 static bool IsIgnorableNonSpacing (int i)
2731 if (IsIgnorable (i)) // the general ignorable test is consulted first
// First group of individually listed code points.
2735 case 0x02C8: case 0x02DE: case 0x0559: case 0x055A:
2736 case 0x05C0: case 0x0ABD: case 0x0CD5: case 0x0CD6:
2737 case 0x309B: case 0x309C: case 0xFF9E: case 0xFF9F:
// Second group of individually listed code points.
2739 case 0x02D0: case 0x0670: case 0x0901: case 0x0902:
2740 case 0x094D: case 0x0962: case 0x0963: case 0x0A41:
2741 case 0x0A42: case 0x0A47: case 0x0A48: case 0x0A4B:
2742 case 0x0A4C: case 0x0A81: case 0x0A82: case 0x0B82:
2743 case 0x0BC0: case 0x0CBF: case 0x0CC6: case 0x0CCC:
2744 case 0x0CCD: case 0x0E4E:
// First group of ranges.
2748 if (0x02b9 <= i && i <= 0x02c5
2749 || 0x02cc <= i && i <= 0x02d7
2750 || 0x02e4 <= i && i <= 0x02ef
2751 || 0x20DD <= i && i <= 0x20E0
// Second group of ranges.
2755 if (0x064B <= i && i <= 0x00652 // 0x00652 is just 0x0652; same Arabic range that IsIgnorableSymbol tests
2756 || 0x0941 <= i && i <= 0x0948
2757 || 0x0AC1 <= i && i <= 0x0ACD
2758 || 0x0C3E <= i && i <= 0x0C4F
2759 || 0x0E31 <= i && i <= 0x0E3F
// Default rule: ignorable exactly when the category is NonSpacingMark.
2763 return Char.GetUnicodeCategory ((char) i) ==
2764 UnicodeCategory.NonSpacingMark;
2767 // We can reuse IsIgnorableSymbol testcode
2768 // for IsIgnorableNonSpacing.
2774 public byte Category;
2776 public byte Level2; // It is always single byte.
2777 public bool Defined;
2779 public CharMapEntry (byte category, byte level1, byte level2)
2781 Category = category;
2790 public readonly int CP;
2791 public readonly int JIS;
2793 public JISCharacter (int cp, int cpJIS)
// Non-generic comparer that orders JISCharacter entries by ascending JIS
// code, matching the first-versus-second ordering used by the other
// comparers in this file (NonJISComparer, DecimalDictionaryValueComparer,
// UCAComparer).
class JISComparer : IComparer
{
	// Shared stateless instance.
	public static readonly JISComparer Instance =
		new JISComparer ();

	// Returns a negative value when o1 has the smaller JIS code, zero when
	// both codes are equal and a positive value when o1 has the larger one.
	// Both arguments are cast to JISCharacter.
	public int Compare (object o1, object o2)
	{
		JISCharacter j1 = (JISCharacter) o1;
		JISCharacter j2 = (JISCharacter) o2;
		// Compare first against second (not second against first) so that
		// sorting is ascending; CompareTo also avoids any subtraction
		// overflow.
		return j1.JIS.CompareTo (j2.JIS);
	}
}
// Immutable pair of an int (CP) and a string (Name). NonJISComparer orders
// instances of this class by Name.
2813 class NonJISCharacter
2815 public readonly int CP; // presumably the character's code point - confirm against callers
2816 public readonly string Name; // sort key used by NonJISComparer (ordinal comparison)
// Takes the values for the two readonly fields above.
2818 public NonJISCharacter (int cp, string name)
// Non-generic comparer that orders NonJISCharacter objects by their Name
// field using ordinal (code-unit) string comparison.
class NonJISComparer : IComparer
{
	// Shared stateless instance.
	public static readonly NonJISComparer Instance = new NonJISComparer ();

	// Casts both arguments to NonJISCharacter and returns the result of
	// string.CompareOrdinal on their names.
	public int Compare (object o1, object o2)
	{
		NonJISCharacter left = (NonJISCharacter) o1;
		NonJISCharacter right = (NonJISCharacter) o2;
		string leftName = left.Name;
		string rightName = right.Name;
		return string.CompareOrdinal (leftName, rightName);
	}
}
// Non-generic comparer for DictionaryEntry items whose Value is a boxed
// decimal and whose Key is a boxed int. Entries are compared by Value using
// Decimal.Compare.
2838 class DecimalDictionaryValueComparer : IComparer
// Single shared instance; the constructor is private.
2840 public static readonly DecimalDictionaryValueComparer Instance
2841 = new DecimalDictionaryValueComparer ();
2843 private DecimalDictionaryValueComparer ()
// Both arguments are unboxed as DictionaryEntry; other types make the casts throw.
2847 public int Compare (object o1, object o2)
2849 DictionaryEntry e1 = (DictionaryEntry) o1;
2850 DictionaryEntry e2 = (DictionaryEntry) o2;
2851 // FIXME: in case of 0, compare decomposition categories
2852 int ret = Decimal.Compare ((decimal) e1.Value, (decimal) e2.Value);
// NOTE(review): the int keys are presumably the tie-breaker used when the
// decimal values compare equal - confirm against the return statements.
2855 int i1 = (int) e1.Key;
2856 int i2 = (int) e2.Key;
// Non-generic comparer for DictionaryEntry items whose Value is a string and
// whose Key is a boxed int. Entries are compared by Value using
// String.Compare (string, string).
2861 class StringDictionaryValueComparer : IComparer
// Single shared instance; the constructor is private.
2863 public static readonly StringDictionaryValueComparer Instance
2864 = new StringDictionaryValueComparer ();
2866 private StringDictionaryValueComparer ()
// Both arguments are unboxed as DictionaryEntry; other types make the casts throw.
2870 public int Compare (object o1, object o2)
2872 DictionaryEntry e1 = (DictionaryEntry) o1;
2873 DictionaryEntry e2 = (DictionaryEntry) o2;
// NOTE(review): this String.Compare overload is culture-sensitive (current
// culture), so the ordering can vary with the machine's locale - confirm
// that this, rather than an ordinal comparison, is intended.
2874 int ret = String.Compare ((string) e1.Value, (string) e2.Value);
// NOTE(review): the int keys are presumably the tie-breaker used when the
// string values compare equal - confirm against the return statements.
2877 int i1 = (int) e1.Key;
2878 int i2 = (int) e2.Key;
2883 class UCAComparer : IComparer
2885 public static readonly UCAComparer Instance
2886 = new UCAComparer ();
2888 private UCAComparer ()
2892 public int Compare (object o1, object o2)
2894 char i1 = (char) o1;
2895 char i2 = (char) o2;
2897 int l1 = CollationElementTable.GetSortKeyCount (i1);
2898 int l2 = CollationElementTable.GetSortKeyCount (i2);
2899 int l = l1 > l2 ? l2 : l1;
2901 for (int i = 0; i < l; i++) {
2902 SortKeyValue k1 = CollationElementTable.GetSortKey (i1, i);
2903 SortKeyValue k2 = CollationElementTable.GetSortKey (i2, i);
2904 int v = k1.Primary - k2.Primary;
2907 v = k1.Secondary - k2.Secondary;
2910 v = k1.Thirtiary - k2.Thirtiary;
2913 v = k1.Quarternary - k2.Quarternary;