3 // There are two kinds of sort keys: those which are computed and those which are laid out
4 // as an indexed array. Computed sort keys are:
9 // Also, for composite characters it should prepare a different index table.
11 // Though it is possible to "compute" level 3 weights, they are still dumped
12 // to an array to avoid execution cost.
18 using System.Collections;
19 using System.Globalization;
23 using UUtil = Mono.Globalization.Unicode.MSCompatUnicodeTableUtil;
25 namespace Mono.Globalization.Unicode
27 internal class MSCompatSortKeyTableGenerator
// Program entry point: creates a generator instance and passes the
// command-line arguments straight through to Run ().
29 public static void Main (string [] args)
31 new MSCompatSortKeyTableGenerator ().Run (args);
// Numeric codes for the decomposition kinds. ProcessUnidataLine () stores one
// of these per code point in decompType [].
// Note that the values do not follow declaration order everywhere:
// DecompositionFull is 0xE while DecompositionNarrow is 0xD.
34 const int DecompositionWide = 1; // fixed
35 const int DecompositionSub = 2; // fixed
36 const int DecompositionSmall = 3;
37 const int DecompositionIsolated = 4;
38 const int DecompositionInitial = 5;
39 const int DecompositionFinal = 6;
40 const int DecompositionMedial = 7;
41 const int DecompositionNoBreak = 8;
42 const int DecompositionVertical = 9;
43 const int DecompositionFraction = 0xA;
44 const int DecompositionFont = 0xB;
45 const int DecompositionSuper = 0xC; // fixed
46 const int DecompositionFull = 0xE;
47 const int DecompositionNarrow = 0xD;
48 const int DecompositionCircle = 0xF;
49 const int DecompositionSquare = 0x10;
50 const int DecompositionCompat = 0x11;
51 const int DecompositionCanonical = 0x12;
// Destination of the generated C# source text (standard output).
53 TextWriter CSResult = Console.Out;
// Destination of the generated C source text. It is discarded until Run ()
// replaces it with a writer on "collation-tables.h".
54 TextWriter CResult = TextWriter.Null;
56 byte [] fillIndex = new byte [256]; // by category
// One entry per UTF-16 code unit (0 .. char.MaxValue). The serializer reads
// Category, Level1 and Level2 from each entry.
57 CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1];
// NOTE(review): the use of this list is not visible in this part of the file;
// presumably these code points get special "ignore" handling -- confirm.
59 char [] specialIgnore = new char [] {
60 '\u3099', '\u309A', '\u309B', '\u309C', '\u0BCD',
61 '\u0E47', '\u0E4C', '\uFF9E', '\uFF9F'
64 // FIXME: need more love (as always)
// Latin base letters (A-Z plus three extra letters) and, in alphaWeights, a
// byte value for each of them. Both arrays have 29 elements.
// NOTE(review): presumably alphaWeights [i] is the weight assigned to
// alphabets [i]; the code that pairs them is not visible here -- confirm.
65 char [] alphabets = new char [] {'A', 'B', 'C', 'D', 'E', 'F',
66 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
67 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
68 '\u0292', '\u01BE', '\u0298'};
69 byte [] alphaWeights = new byte [] {
70 2, 9, 0xA, 0x1A, 0x21,
71 0x23, 0x25, 0x2C, 0x32, 0x35,
72 0x36, 0x48, 0x51, 0x70, 0x7C,
73 0x7E, 0x89, 0x8A, 0x91, 0x99,
74 0x9F, 0xA2, 0xA4, 0xA6, 0xA7,
75 0xA9, 0xAA, 0xB3, 0xB4};
// Per-code-point tables, all indexed by UTF-16 code unit.
// isSmallCapital is set by ProcessUnidataLine () when the data line contains
// "SMALL CAPITAL".
77 bool [] isSmallCapital = new bool [char.MaxValue + 1];
78 bool [] isUppercase = new bool [char.MaxValue + 1];
// decompType holds one of the Decomposition* codes. decompIndex is used by the
// serializer as an index into the decompValues array.
80 byte [] decompType = new byte [char.MaxValue + 1];
81 int [] decompIndex = new int [char.MaxValue + 1];
82 int [] decompLength = new int [char.MaxValue + 1];
84 decimal [] decimalValue = new decimal [char.MaxValue + 1];
// Diacritical weight per code point, filled in by ProcessUnidataLine () from
// the character name and other special cases.
86 byte [] diacritical = new byte [char.MaxValue + 1];
// Character-name fragments that identify a diacritic, and (in
// diacriticWeights) the weight for each fragment. The two arrays are parallel:
// ProcessUnidataLine () throws when their lengths differ, scans them from the
// last element to the first, and adds diacriticWeights [d] to the code point's
// diacritical value when diacritics [d] is found in the data line at an index
// greater than zero.
// A trailing ';' in a fragment only matches when the fragment is immediately
// followed by the ';' field separator of the data line.
88 string [] diacritics = new string [] {
89 // LATIN, CYRILLIC etc.
90 "VERTICAL LINE ABOVE", "UPTURN", "DOUBLE-STRUCK",
92 "MIDDLE HOOK", "WITH VERTICAL LINE ABOVE;", "WITH TONOS",
93 "WITH ACUTE ACCENT;", "WITH GRAVE ACCENT;",
94 "WITH ACUTE;", "WITH GRAVE;",
96 "WITH DOT ABOVE;", " MIDDLE DOT;",
97 "WITH CIRCUMFLEX ACCENT;", "WITH CIRCUMFLEX;",
99 "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;",
100 "DIALYTIKA TONOS", "DIALYTIKA AND TONOS",
101 "ABKHASIAN CHE WITH DESCENDER",
102 "WITH MACRON;", "WITH TILDE;", "WITH RING ABOVE;",
103 "WITH OGONEK;", "WITH CEDILLA;",
105 " DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;",
106 "WITH STROKE;", " CIRCUMFLEX AND ACUTE;",
108 " DIAERESIS AND ACUTE;", "WITH CIRCUMFLEX AND GRAVE;", " L SLASH;",
109 " DIAERESIS AND GRAVE;",
111 " CARON AND DOT ABOVE;", " BREVE AND GRAVE;",
112 " MACRON AND ACUTE;",
113 " MACRON AND GRAVE;",
115 " DIAERESIS AND CARON", " DOT ABOVE AND MACRON", " TILDE AND ACUTE",
116 " RING ABOVE AND ACUTE",
117 " DIAERESIS AND MACRON", " CEDILLA AND ACUTE", " MACRON AND DIAERESIS",
118 " CIRCUMFLEX AND TILDE",
119 " TILDE AND DIAERESIS",
122 " CEDILLA AND BREVE",
123 " OGONEK AND MACRON",
125 "WITH OVERLINE", "DOUBLE VERTICAL LINE ABOVE",
126 "WITH HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;",
130 " PRECEDED BY APOSTROPHE",
132 " LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE",
135 " RETROFLEX;", "DIAERESIS BELOW", "RETROFLEX HOOK",
136 " RING BELOW", "LOW VERTICAL LINE",
138 " CIRCUMFLEX BELOW", "HORN AND ACUTE",
139 " BREVE BELOW;", " HORN AND GRAVE",
143 " DOT BELOW AND DOT ABOVE",
144 " RIGHT HALF RING", " HORN AND TILDE",
145 " CIRCUMFLEX AND DOT BELOW",
146 " BREVE AND DOT BELOW",
147 " DOT BELOW AND MACRON",
149 " HORN AND HOOK ABOVE",
151 // CIRCLED, PARENTHESIZED and so on
152 "CIRCLED DIGIT", "CIRCLED NUMBER", "CIRCLED LATIN",
153 "CIRCLED KATAKANA", "CIRCLED SANS-SERIF",
154 "PARENTHESIZED DIGIT", "PARENTHESIZED NUMBER", "PARENTHESIZED LATIN",
// Weights for the fragments above, in the same order.
156 byte [] diacriticWeights = new byte [] {
162 0x10, 0x11, 0x12, 0x12, 0x13, 0x13, 0x14, 0x15, 0x16,
163 0x16, 0x17, 0x17, 0x19, 0x1A, 0x1B, 0x1C,
165 0x1D, 0x1D, 0x1E, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F,
166 0x20, 0x21, 0x22, 0x22, 0x23, 0x24,
168 0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28,
169 0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30,
171 0x40, 0x41, 0x43, 0x43, 0x43, 0x44, 0x46, 0x47, 0x48,
172 0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x59,
175 0x60, 0x60, 0x61, 0x61, 0x62, 0x63, 0x68, 0x68,
176 0x69, 0x69, 0x6A, 0x6D, 0x6E,
178 // CIRCLED, PARENTHESIZED and so on.
179 0xEE, 0xEE, 0xEE, 0xEE, 0xEE,
// Ascending list of code point values.
// NOTE(review): the name and the values suggest consecutive (start, end)
// pairs bounding digit ranges that receive a secondary weight; the code that
// consumes this array is not visible here -- confirm before relying on it.
183 int [] numberSecondaryWeightBounds = new int [] {
184 0x660, 0x680, 0x6F0, 0x700, 0x960, 0x970,
185 0x9E0, 0x9F0, 0x9F4, 0xA00, 0xA60, 0xA70,
186 0xAE0, 0xAF0, 0xB60, 0xB70, 0xBE0, 0xC00,
187 0xC60, 0xC70, 0xCE0, 0xCF0, 0xD60, 0xD70,
188 0xE50, 0xE60, 0xED0, 0xEE0
// Per-script character orderings. These four are declared without an
// initializer; where they are assigned is not visible in this part of the file.
191 char [] orderedGurmukhi;
192 char [] orderedGujarati;
193 char [] orderedGeorgian;
194 char [] orderedThaana;
// Hard-coded ordering of Tamil consonants (see the note inside the array).
196 static readonly char [] orderedTamilConsonants = new char [] {
197 // based on traditional Tamil consonants, except for
198 // Grantha (where Microsoft breaks traditionalism).
199 // http://www.angelfire.com/empire/thamizh/padanGaL
200 '\u0B95', '\u0B99', '\u0B9A', '\u0B9E', '\u0B9F',
201 '\u0BA3', '\u0BA4', '\u0BA8', '\u0BAA', '\u0BAE',
202 '\u0BAF', '\u0BB0', '\u0BB2', '\u0BB5', '\u0BB4',
203 '\u0BB3', '\u0BB1', '\u0BA9', '\u0B9C', '\u0BB8',
206 // cp -> character name (only for some characters)
// Filled by ProcessUnidataLine () with DictionaryEntry (cp, name) items.
207 ArrayList sortableCharNames = new ArrayList ();
209 // cp -> arrow value (int)
// Filled by ProcessUnidataLine () with DictionaryEntry items.
210 ArrayList arrowValues = new ArrayList ();
212 // cp -> box value (int)
// Filled by ProcessUnidataLine () with DictionaryEntry items.
213 ArrayList boxValues = new ArrayList ();
215 // cp -> level1 value
216 Hashtable arabicLetterPrimaryValues = new Hashtable ();
// primary Arabic letter name -> the code point stored for that name
219 Hashtable arabicNameMap = new Hashtable ();
221 // cp -> Hashtable [decompType] -> cp
222 Hashtable nfkdMap = new Hashtable ();
224 // Latin letter -> ArrayList [int]
225 Hashtable latinMap = new Hashtable ();
// nonJisJapanese receives a NonJISCharacter for each code point in
// U+3300..U+3357 for which ExistsJIS () returns false.
227 ArrayList jisJapanese = new ArrayList ();
228 ArrayList nonJisJapanese = new ArrayList ();
// CJK tables, one slot per UTF-16 code unit. They are compressed with
// CompressArray () before being written out by SerializeCJK ().
230 ushort [] cjkJA = new ushort [char.MaxValue +1];// - 0x4E00];
231 ushort [] cjkCHS = new ushort [char.MaxValue +1];// - 0x3100];
232 ushort [] cjkCHT = new ushort [char.MaxValue +1];// - 0x4E00];
233 ushort [] cjkKO = new ushort [char.MaxValue +1];// - 0x4E00];
234 byte [] cjkKOlv2 = new byte [char.MaxValue +1];// - 0x4E00];
236 byte [] ignorableFlags = new byte [char.MaxValue + 1];
// Age value per code point, populated by ParseDerivedAge (). Entry 0 is forced
// to double.MaxValue there.
238 static double [] unicodeAge = new double [char.MaxValue + 1];
// List of Tailoring objects; enumerated by SerializeTailorings ().
240 ArrayList tailorings = new ArrayList ();
// Drives the tool: parses the source data, adjusts the parsed values, emits
// the tables, and finally writes a diagnostic log to "agelog.txt".
// args [0], when present, is the directory that holds the input data files;
// with no arguments the directory "downloaded" is used.
242 void Run (string [] args)
244 string dirname = args.Length == 0 ? "downloaded" : args [0];
245 ParseSources (dirname);
246 Console.Error.WriteLine ("parse done.");
248 ModifyParsedValues ();
250 Console.Error.WriteLine ("generation done.");
// From here on the C output goes to "collation-tables.h"; the second argument
// (false) means the file is overwritten rather than appended to.
251 CResult = new StreamWriter ("collation-tables.h", false);
254 Console.Error.WriteLine ("serialization done.");
// Diagnostic dump of ignorable status per code point.
// NOTE(review): no disposal of this writer is visible here -- confirm it is
// closed so the log is flushed.
256 StreamWriter sw = new StreamWriter ("agelog.txt");
// NOTE(review): the bound is `< char.MaxValue`, so U+FFFF itself is never
// visited -- confirm this is intended.
257 for (int i = 0; i < char.MaxValue; i++) {
// shouldBe is the expected ignorable status: true for the Format and
// OtherNotAssigned categories.
258 bool shouldBe = false;
259 switch (Char.GetUnicodeCategory ((char) i)) {
260 case UnicodeCategory.Format: case UnicodeCategory.OtherNotAssigned:
261 shouldBe = true; break;
263 if (unicodeAge [i] >= 3.1)
265 //if (IsIgnorable (i) != shouldBe)
// Columns: age, IsIgnorable, IsIgnorableSymbol, code point (hex), Unicode
// category, and '!' when IsIgnorable disagrees with shouldBe.
266 sw.WriteLine ("{1} {2} {3} {0:X04} {4} {5}", i, unicodeAge [i], IsIgnorable (i), IsIgnorableSymbol (i), char.GetUnicodeCategory ((char) i), IsIgnorable (i) != shouldBe ? '!' : ' ');
// Typed convenience wrapper: forwards a byte table and its indexer to
// CodePointIndexer.CompressArray () and casts the result back to byte [].
272 byte [] CompressArray (byte [] source, CodePointIndexer i)
274 return (byte []) CodePointIndexer.CompressArray (
275 source, typeof (byte), i);
// Typed convenience wrapper: forwards a ushort table and its indexer to
// CodePointIndexer.CompressArray () and casts the result back to ushort [].
278 ushort [] CompressArray (ushort [] source, CodePointIndexer i)
280 return (ushort []) CodePointIndexer.CompressArray (
281 source, typeof (ushort), i);
284 void WriteByte (byte value)
292 SerializeTailorings ();
294 byte [] categories = new byte [map.Length];
295 byte [] level1 = new byte [map.Length];
296 byte [] level2 = new byte [map.Length];
297 byte [] level3 = new byte [map.Length];
298 // widthCompat is now removed from the mapping table.
299 // If it turned out that it is still required, grep this source and uncomment
300 // widthCompat related lines. FIXME: remove those lines in the future.
301 // ushort [] widthCompat = new ushort [map.Length];
302 for (int i = 0; i < map.Length; i++) {
303 categories [i] = map [i].Category;
304 level1 [i] = map [i].Level1;
305 level2 [i] = map [i].Level2;
306 level3 [i] = ComputeLevel3Weight ((char) i);
308 // For Japanese Half-width characters, don't
309 // map widthCompat. It is IgnoreKanaType that
310 // handles those width differences.
311 if (0xFF6D <= i && i <= 0xFF9D)
313 switch (decompType [i]) {
314 case DecompositionNarrow:
315 case DecompositionWide:
316 case DecompositionSuper:
317 case DecompositionSub:
318 // they are always 1 char
319 widthCompat [i] = (ushort) decompValues [decompIndex [i]];
326 ignorableFlags = CompressArray (ignorableFlags,
328 categories = CompressArray (categories, UUtil.Category);
329 level1 = CompressArray (level1, UUtil.Level1);
330 level2 = CompressArray (level2, UUtil.Level2);
331 level3 = CompressArray (level3, UUtil.Level3);
332 // widthCompat = (ushort []) CodePointIndexer.CompressArray (
333 // widthCompat, typeof (ushort), UUtil.WidthCompat);
334 cjkCHS = CompressArray (cjkCHS, UUtil.CjkCHS);
335 cjkCHT = CompressArray (cjkCHT,UUtil.Cjk);
336 cjkJA = CompressArray (cjkJA, UUtil.Cjk);
337 cjkKO = CompressArray (cjkKO, UUtil.Cjk);
338 cjkKOlv2 = CompressArray (cjkKOlv2, UUtil.Cjk);
341 CResult.WriteLine ("static const guint8* collation_table_ignorableFlags [] = {");
342 CSResult.WriteLine ("static readonly byte [] ignorableFlagsArr = new byte [] {");
344 MemoryStream ms = new MemoryStream ();
345 BinaryWriter binary = new BinaryWriter (ms);
346 binary.Write (UUtil.ResourceVersion);
347 binary.Write (ignorableFlags.Length);
349 for (int i = 0; i < ignorableFlags.Length; i++) {
350 byte value = ignorableFlags [i];
352 CSResult.Write ("{0},", value);
354 CSResult.Write ("0x{0:X02},", value);
355 CResult.Write ("{0},", value);
357 binary.Write (value);
359 if ((i & 0xF) == 0xF) {
360 CSResult.WriteLine ("// {0:X04}",
361 UUtil.Ignorable.ToCodePoint (i - 0xF));
362 CResult.WriteLine ();
365 CSResult.WriteLine ("};");
366 CSResult.WriteLine ();
369 CResult.WriteLine ("static const guint8* collation_table_category [] = {");
370 CSResult.WriteLine ("static readonly byte [] categoriesArr = new byte [] {");
372 binary.Write (categories.Length);
374 for (int i = 0; i < categories.Length; i++) {
375 byte value = categories [i];
377 CSResult.Write ("{0},", value);
379 CSResult.Write ("0x{0:X02},", value);
380 CResult.Write ("{0},", value);
382 binary.Write (value);
384 if ((i & 0xF) == 0xF) {
385 CSResult.WriteLine ("// {0:X04}",
386 UUtil.Category.ToCodePoint (i - 0xF));
387 CResult.WriteLine ();
390 CResult.WriteLine ("};");
391 CSResult.WriteLine ("};");
392 CSResult.WriteLine ();
394 // Primary weight value
395 CResult.WriteLine ("static const guint8* collation_table_level1 [] = {");
396 CSResult.WriteLine ("static readonly byte [] level1Arr = new byte [] {");
398 binary.Write (level1.Length);
400 for (int i = 0; i < level1.Length; i++) {
401 byte value = level1 [i];
403 CSResult.Write ("{0},", value);
405 CSResult.Write ("0x{0:X02},", value);
406 CResult.Write ("{0},", value);
408 binary.Write (value);
410 if ((i & 0xF) == 0xF) {
411 CSResult.WriteLine ("// {0:X04}",
412 UUtil.Level1.ToCodePoint (i - 0xF));
413 CResult.WriteLine ();
416 CResult.WriteLine ("0};");
417 CSResult.WriteLine ("};");
418 CSResult.WriteLine ();
421 CResult.WriteLine ("static const guint8* collation_table_level2 [] = {");
422 CSResult.WriteLine ("static readonly byte [] level2Arr = new byte [] {");
424 binary.Write (level2.Length);
426 for (int i = 0; i < level2.Length; i++) {
427 byte value = level2 [i];
429 CSResult.Write ("{0},", value);
431 CSResult.Write ("0x{0:X02},", value);
432 CResult.Write ("{0},", value);
434 binary.Write (value);
436 if ((i & 0xF) == 0xF) {
437 CSResult.WriteLine ("// {0:X04}",
438 UUtil.Level2.ToCodePoint (i - 0xF));
439 CResult.WriteLine ();
442 CResult.WriteLine ("0};");
443 CSResult.WriteLine ("};");
444 CSResult.WriteLine ();
447 CResult.WriteLine ("static const guint8* collation_table_level3 [] = {");
448 CSResult.WriteLine ("static readonly byte [] level3Arr = new byte [] {");
450 binary.Write (level3.Length);
452 for (int i = 0; i < level3.Length; i++) {
453 byte value = level3 [i];
455 CSResult.Write ("{0},", value);
457 CSResult.Write ("0x{0:X02},", value);
458 CResult.Write ("{0},", value);
460 binary.Write (value);
462 if ((i & 0xF) == 0xF) {
463 CSResult.WriteLine ("// {0:X04}",
464 UUtil.Level3.ToCodePoint (i - 0xF));
465 CResult.WriteLine ();
468 CResult.WriteLine ("0};");
469 CSResult.WriteLine ("};");
470 CSResult.WriteLine ();
473 // Width insensitivity mappings
474 // (for now it is more lightweight than dumping the
475 // entire NFKD table).
476 CResult.WriteLine ("static const guint16* widthCompat [] = {");
477 CSResult.WriteLine ("static readonly ushort [] widthCompatArr = new ushort [] {");
479 binary.Write (widthCompat.Length);
481 for (int i = 0; i < widthCompat.Length; i++) {
482 ushort value = widthCompat [i];
484 CSResult.Write ("{0},", value);
486 CSResult.Write ("0x{0:X02},", value);
487 CResult.Write ("{0},", value);
489 binary.Write (value);
491 if ((i & 0xF) == 0xF) {
492 CSResult.WriteLine ("// {0:X04}",
493 UUtil.WidthCompat.ToCodePoint (i - 0xF));
494 CResult.WriteLine ();
497 CResult.WriteLine ("0};");
498 CSResult.WriteLine ("};");
499 CSResult.WriteLine ();
503 using (FileStream fs = File.Create ("../collation.core.bin")) {
504 byte [] array = ms.ToArray ();
505 fs.Write (array, 0, array.Length);
510 SerializeCJK ("cjkCHS", cjkCHS, char.MaxValue);
511 SerializeCJK ("cjkCHT", cjkCHT, 0x9FB0);
512 SerializeCJK ("cjkJA", cjkJA, 0x9FB0);
513 SerializeCJK ("cjkKO", cjkKO, 0x9FB0);
514 SerializeCJK ("cjkKOlv2", cjkKOlv2, 0x9FB0);
// Writes one 16-bit CJK table as bytes to three destinations: the C source
// (CResult), the C# source (CSResult) and the binary resource file
// "../collation.<name>.bin".
// The table is emitted in two passes: first the high byte of every entry, then
// the low byte of every entry.
//   name       - table name used in the generated identifiers and file name.
//   cjk        - the (already compressed) table to write.
//   max_unused - not referenced in the visible part of this method.
// NOTE(review): each loop shows two CSResult.Write calls with different
// formats back to back; presumably a preprocessor conditional that is not
// visible here selects one of them -- confirm.
517 void SerializeCJK (string name, ushort [] cjk, int max_unused)
519 CResult.WriteLine ("static const int collation_table_collation_cjk_{0}_size [] = {1};", name, cjk.Length);
520 CSResult.WriteLine ("const int {0}ArrLength = {1};", name, cjk.Length);
522 CResult.WriteLine ("static const guint8* collation_table_collation_cjk_{0} [] = {{", name);
523 CSResult.WriteLine ("static byte [] {0}Arr = new byte [] {{", name);
// The binary image starts with the resource version and the entry count.
525 MemoryStream ms = new MemoryStream ();
526 BinaryWriter binary = new BinaryWriter (ms);
527 binary.Write (UUtil.ResourceVersion);
528 binary.Write (cjk.Length); // the actual size is *2.
// Pass 1: high byte of every entry.
531 for (int i = 0; i < cjk.Length; i++) {
534 byte value = (byte) (cjk [i] >> 8);
536 CSResult.Write ("{0},", value);
538 CSResult.Write ("0x{0:X02},", value);
539 CResult.Write ("{0},", value);
541 binary.Write (value);
// After every 16th value, end the row; the C# row gets a comment with the
// index of its first element.
543 if ((i & 0xF) == 0xF) {
544 CSResult.WriteLine ("// {0:X04}", i - 0xF);
545 CResult.WriteLine ();
// Pass 2: low byte of every entry.
550 for (int i = 0; i < cjk.Length; i++) {
553 byte value = (byte) (cjk [i] & 0xFF);
555 CSResult.Write ("{0},", value);
557 CSResult.Write ("0x{0:X02},", value);
558 CResult.Write ("{0},", value);
560 binary.Write (value);
562 if ((i & 0xF) == 0xF) {
563 CSResult.WriteLine ("// {0:X04}", i - 0xF);
564 CResult.WriteLine ();
// The C array is closed with an extra trailing 0 element.
568 CResult.WriteLine ("0};");
569 CSResult.WriteLine ("};");
570 CSResult.WriteLine ();
// Save the binary image next to the other collation resources.
572 using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) {
573 byte [] array = ms.ToArray ();
574 fs.Write (array, 0, array.Length);
// Writes one 8-bit CJK table to the C source (CResult), the C# source
// (CSResult) and the binary resource file "../collation.<name>.bin".
//   name - table name used in the generated identifiers and file name.
//   cjk  - the (already compressed) table to write.
//   max  - not referenced in the visible part of this method.
// NOTE(review): unlike the ushort overload, no entry count is written to the
// binary image in the visible lines; only the resource version precedes the
// data -- confirm that the reader expects this.
579 void SerializeCJK (string name, byte [] cjk, int max)
581 CResult.WriteLine ("static const guint8* collation_table_collation_cjk_{0} [] = {{", name);
582 CSResult.WriteLine ("static byte [] {0}Arr = new byte [] {{", name);
584 MemoryStream ms = new MemoryStream ();
585 BinaryWriter binary = new BinaryWriter (ms);
586 binary.Write (UUtil.ResourceVersion);
588 for (int i = 0; i < cjk.Length; i++) {
591 byte value = cjk [i];
593 CSResult.Write ("{0},", value);
595 CSResult.Write ("0x{0:X02},", value);
596 CResult.Write ("{0},", value);
598 binary.Write (value);
// After every 16th value, end the row; the C# row gets a comment with the
// index of its first element.
600 if ((i & 0xF) == 0xF) {
601 CSResult.WriteLine ("// {0:X04}", i - 0xF);
602 CResult.WriteLine ();
// The C array is closed with an extra trailing 0 element.
605 CResult.WriteLine ("0};");
606 CSResult.WriteLine ("};");
607 CSResult.WriteLine ();
609 using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) {
610 byte [] array = ms.ToArray ();
611 fs.Write (array, 0, array.Length);
// Emits all tailorings in two parts:
//   1. one flat array containing the characters of every tailoring, and
//   2. an info table with one record per tailoring (LCID, start index into the
//      flat array, length, FrenchSort flag).
// Both parts go to the C source, the C# source and the binary resource file
// "../collation.tailoring.bin".
// Throws Exception when a tailoring's alias refers to an LCID that has no
// entry in the flat array.
616 void SerializeTailorings ()
// LCID -> start index in the flat array / LCID -> number of characters.
618 Hashtable indexes = new Hashtable ();
619 Hashtable counts = new Hashtable ();
620 CResult.WriteLine ("static const guint16*collation_table_tailoring = {");
621 CSResult.WriteLine ("static char [] tailorings = new char [] {");
// This first stream collects only the raw character data; it is appended to
// the final image further below.
624 MemoryStream ms = new MemoryStream ();
625 BinaryWriter binary = new BinaryWriter (ms);
626 // Here we don't need to output resource version.
629 foreach (Tailoring t in tailorings) {
632 CResult.Write ("/*{0}*/", t.LCID);
633 CSResult.Write ("/*{0}*/", t.LCID);
// `count` is the running number of characters written so far, i.e. the offset
// at which this tailoring starts (its declaration is not visible here).
634 indexes.Add (t.LCID, count);
635 char [] values = t.ItemToCharArray ();
636 counts.Add (t.LCID, values.Length);
637 foreach (char c in values) {
638 CSResult.Write ("'\\x{0:X}', ", (int) c);
639 CResult.Write ("{0},", (int) c);
// Break the row after every 16 characters.
640 if (++count % 16 == 0) {
641 CSResult.WriteLine (" // {0:X04}", count - 16);
642 CResult.WriteLine ();
// Each character is stored as a 16-bit value in the binary data.
645 binary.Write ((ushort) c);
649 CResult.WriteLine ("0};");
650 CSResult.WriteLine ("};");
652 CResult.WriteLine ("static const int collation_tailoring_count = {0};", tailorings.Count);
653 CResult.WriteLine ("static const int* collation_tailoring_infos = {");
654 CSResult.WriteLine ("static TailoringInfo [] tailoringInfos = new TailoringInfo [] {");
// Keep the raw character data and start a new stream for the final image:
// resource version, tailoring count, info records, then the raw data.
656 byte [] rawdata = ms.ToArray ();
657 ms = new MemoryStream ();
658 binary = new BinaryWriter (ms);
659 binary.Write (UUtil.ResourceVersion);
660 binary.Write (tailorings.Count);
662 foreach (Tailoring t in tailorings) {
// A non-zero Alias means this tailoring uses the data of another LCID.
663 int target = t.Alias != 0 ? t.Alias : t.LCID;
664 if (!indexes.ContainsKey (target)) {
665 throw new Exception (String.Format ("WARNING: no corresponding definition for tailoring alias. From {0} to {1}", t.LCID, t.Alias));
668 int idx = (int) indexes [target];
669 int cnt = (int) counts [target];
670 bool french = t.FrenchSort;
// NOTE(review): this loop matches on t.LCID rather than on `target`, so for an
// aliased tailoring it does not pick up the alias target's FrenchSort --
// confirm whether that is intended.
672 foreach (Tailoring t2 in tailorings)
673 if (t2.LCID == t.LCID)
674 french = t2.FrenchSort;
675 CSResult.WriteLine ("new TailoringInfo ({0}, 0x{1:X}, {2}, {3}), ", t.LCID, idx, cnt, french ? "true" : "false");
676 CResult.WriteLine ("{0},{1},{2},{3},", t.LCID, idx, cnt, french ? 1 : 0);
678 binary.Write (t.LCID);
681 binary.Write (french);
684 CResult.WriteLine ("0};");
685 CSResult.WriteLine ("};");
// Two 0xFF bytes precede the character data, followed by the number of 16-bit
// values (the byte length divided by two) and the data itself.
687 binary.Write ((byte) 0xFF);
688 binary.Write ((byte) 0xFF);
689 binary.Write (rawdata.Length / 2);
690 binary.Write (rawdata, 0, rawdata.Length);
693 using (FileStream fs = File.Create ("../collation.tailoring.bin")) {
694 byte [] array = ms.ToArray ();
695 fs.Write (array, 0, array.Length);
// Builds the paths of all input files below `dirname` and runs the individual
// parsers in a fixed order.
//   dirname - directory that contains the Unicode data files and the
//             "common/collation" XML files.
// The tailoring source is the exception: it is read from the fixed relative
// path "mono-tailoring-source.txt", not from `dirname`.
702 void ParseSources (string dirname)
705 dirname + "/UnicodeData.txt";
706 string derivedCoreProps =
707 dirname + "/DerivedCoreProperties.txt";
709 dirname + "/Scripts.txt";
711 dirname + "/CP932.TXT";
713 dirname + "/DerivedAge.txt";
714 string chXML = dirname + "/common/collation/zh.xml";
715 string jaXML = dirname + "/common/collation/ja.xml";
716 string koXML = dirname + "/common/collation/ko.xml";
718 ParseDerivedAge (derivedAge);
722 ParseJISOrder (cp932); // runs before ParseUnidata (), whose line processing calls ExistsJIS ()
723 ParseUnidata (unidata);
725 ParseDerivedCoreProperties (derivedCoreProps);
726 ParseScripts (scripts);
727 ParseCJK (chXML, jaXML, koXML);
729 ParseTailorings ("mono-tailoring-source.txt");
// Reads the tailoring source file line by line and feeds each trimmed line to
// ProcessTailoringLine (), which updates the current Tailoring `t` by reference.
//   filename - path of the tailoring source text file.
// When processing throws, the line number is reported on standard error.
// NOTE(review): whether the exception is rethrown afterwards is not visible
// in this part of the file.
732 void ParseTailorings (string filename)
736 using (StreamReader sr = new StreamReader (filename)) {
738 while (sr.Peek () >= 0) {
740 ProcessTailoringLine (ref t,
741 sr.ReadLine ().Trim ());
743 } catch (Exception) {
744 Console.Error.WriteLine ("ERROR at line {0}", line);
750 // For now this is enough.
// Converts a value from the tailoring source into the string it denotes.
// A backslash followed by 'u' and four further characters is recognized as an
// escape; those four characters are parsed as a hexadecimal number.
// The `i + 5 < s.Length` guard ensures all six characters of the escape exist
// before they are read.
// Returns the text accumulated in the StringBuilder.
751 string ParseTailoringSourceValue (string s)
753 StringBuilder sb = new StringBuilder ();
754 for (int i = 0; i < s.Length; i++) {
755 if (i + 5 < s.Length &&
756 s [i] == '\\' && s [i + 1] == 'u') {
759 s.Substring (i + 2, 4),
760 NumberStyles.HexNumber),
767 return sb.ToString ();
// Interprets one line of the tailoring source and applies it to `t`.
// Line forms handled in the visible code:
//   - text from '#' onward is cut off; empty lines are skipped;
//   - a header line whose first character is skipped and whose remainder is
//     one integer, or two integers separated by '=', which starts a new
//     Tailoring (NOTE(review): the marker character itself and the test that
//     selects this branch are not visible here);
//   - "*FrenchSort";
//   - "*Diacritical XX -> YY", both values hexadecimal bytes;
//   - "source : b0 b1 b2 b3", a sort key mapping of four hexadecimal bytes;
//   - "source = replacement", a replacement mapping.
//   t - the tailoring currently being built; replaced when a header is read.
//   s - the line to process (the caller passes it already trimmed).
770 void ProcessTailoringLine (ref Tailoring t, string s)
772 int idx = s.IndexOf ('#');
774 s = s.Substring (0, idx).Trim ();
775 if (s.Length == 0 || s [0] == '#')
778 idx = s.IndexOf ('=');
781 int.Parse (s.Substring (1, idx - 1)),
782 int.Parse (s.Substring (idx + 1)));
784 t = new Tailoring (int.Parse (s.Substring (1)));
788 if (s.StartsWith ("*FrenchSort")) {
792 string d = "*Diacritical";
793 if (s.StartsWith (d)) {
// The text between the keyword and "->" is the first value, the text after
// "->" is the second value.
794 idx = s.IndexOf ("->");
795 t.AddDiacriticalMap (
796 byte.Parse (s.Substring (d.Length, idx - d.Length).Trim (),
797 NumberStyles.HexNumber),
798 byte.Parse (s.Substring (idx + 2).Trim (),
799 NumberStyles.HexNumber));
802 idx = s.IndexOf (':');
804 string source = s.Substring (0, idx).Trim ();
// The right-hand side is split on single spaces into hexadecimal byte values.
805 string [] l = s.Substring (idx + 1).Trim ().Split (' ');
806 byte [] b = new byte [4];
807 for (int i = 0; i < 4; i++) {
811 b [i] = byte.Parse (l [i],
812 NumberStyles.HexNumber);
814 t.AddSortKeyMap (ParseTailoringSourceValue (source),
817 idx = s.IndexOf ('=');
819 t.AddReplacementMap (
820 ParseTailoringSourceValue (
821 s.Substring (0, idx).Trim ()),
822 ParseTailoringSourceValue (
823 s.Substring (idx + 1).Trim ()));
// Reads the age data file. Each data line has the form
// "<cp>[..<cpEnd>] ; <age>", with code points in hexadecimal and an optional
// '#' comment that is cut off first.
//   filename - path of the DerivedAge data file.
// Code points above char.MaxValue are tested for and not stored (the action
// taken is not visible here).
// NOTE(review): the statement inside the final `for` loop is not visible here;
// presumably it stores `v` into unicodeAge [i] -- confirm.
// NOTE(review): double.Parse (value) uses the current culture; a culture with
// ',' as decimal separator would mis-parse values such as "3.1".
826 void ParseDerivedAge (string filename)
828 using (StreamReader file =
829 new StreamReader (filename)) {
830 while (file.Peek () >= 0) {
831 string s = file.ReadLine ();
832 int idx = s.IndexOf ('#');
834 s = s.Substring (0, idx);
835 idx = s.IndexOf (';');
// cpspec is either a single code point or a "first..last" range.
839 string cpspec = s.Substring (0, idx);
840 idx = cpspec.IndexOf ("..");
841 NumberStyles nf = NumberStyles.HexNumber |
842 NumberStyles.AllowTrailingWhite;
843 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
844 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
845 string value = s.Substring (cpspec.Length + 1).Trim ();
848 if (cp > char.MaxValue)
851 double v = double.Parse (value);
852 for (int i = cp; i <= cpEnd; i++)
856 unicodeAge [0] = double.MaxValue; // never be supported
// Reads the Unicode data file line by line and passes every line to
// ProcessUnidataLine () together with a shared list of decomposition values.
// After the whole file has been read, the list is converted to an int array
// and stored in the decompValues field.
//   filename - path of the UnicodeData file.
// When a line fails, its 1-based line number is reported on standard error.
// NOTE(review): whether the exception is rethrown is not visible here.
859 void ParseUnidata (string filename)
861 ArrayList decompValues = new ArrayList ();
862 using (StreamReader unidata =
863 new StreamReader (filename)) {
864 for (int line = 1; unidata.Peek () >= 0; line++) {
866 ProcessUnidataLine (unidata.ReadLine (), decompValues);
867 } catch (Exception) {
868 Console.Error.WriteLine ("**** At line " + line);
// The local list shadows the field of the same name, hence `this.`.
873 this.decompValues = (int [])
874 decompValues.ToArray (typeof (int));
// State carried across calls to ProcessUnidataLine ():
// previousLatinTarget is the last plain Latin letter seen (reset to
// char.MinValue after 'Z'); it is the fallback target for Latin characters
// that have no explicit target.
877 char previousLatinTarget = char.MinValue;
// One counter per letter 'A'..'Z', incremented each time a Latin variant is
// given a synthetic diacritical weight for that letter.
878 byte [] diacriticalOffset = new byte ['Z' - 'A' + 1];
880 void ProcessUnidataLine (string s, ArrayList decompValues)
882 int idx = s.IndexOf ('#');
884 s = s.Substring (0, idx);
885 idx = s.IndexOf (';');
888 int cp = int.Parse (s.Substring (0, idx), NumberStyles.HexNumber);
889 string [] values = s.Substring (idx + 1).Split (';');
892 if (cp > char.MaxValue)
894 if (IsIgnorable (cp))
897 string name = values [0];
899 // SPECIAL CASE: rename some characters for diacritical
900 // remapping. FIXME: why are they different?
901 // FIXME: it's still not working.
902 if (cp == 0x018B || cp == 0x018C)
903 name = name.Replace ("TOPBAR", "STROKE");
906 if (s.IndexOf ("SMALL CAPITAL") > 0)
907 isSmallCapital [cp] = true;
909 // latin mapping by character name
910 if (s.IndexOf ("LATIN") >= 0) {
911 int lidx = s.IndexOf ("LETTER DOTLESS ");
912 int offset = lidx + 15;
914 lidx = s.IndexOf ("LETTER TURNED ");
918 lidx = s.IndexOf ("LETTER CAPITAL ");
922 lidx = s.IndexOf ("LETTER SCRIPT ");
926 lidx = s.IndexOf ("LETTER ");
929 char c = lidx > 0 ? s [offset] : char.MinValue;
930 char n = s [offset + 1];
931 char target = char.MinValue;
932 if ('A' <= c && c <= 'Z' &&
933 (n == ' ') || n == ';') {
935 // FIXME: After 'Z', I cannot reset this state.
936 previousLatinTarget = c == 'Z' ? char.MinValue : c;
939 if (s.Substring (offset).StartsWith ("ALPHA"))
941 else if (s.Substring (offset).StartsWith ("TONE SIX"))
943 else if (s.Substring (offset).StartsWith ("OPEN O"))
945 else if (s.Substring (offset).StartsWith ("ETH"))
947 else if (s.Substring (offset).StartsWith ("SCHWA"))
949 else if (s.Substring (offset).StartsWith ("OI;")) // 01A2,01A3
951 else if (s.Substring (offset).StartsWith ("YR;")) // 01A2,01A3
953 else if (s.Substring (offset).StartsWith ("TONE TWO"))
955 else if (s.Substring (offset).StartsWith ("ESH"))
957 else if (s.Substring (offset).StartsWith ("OUNCE"))
960 // For remaining IPA chars, direct mapping is
963 case 0x0166: case 0x0167:
964 // Though they are 'T', they have different weight
965 target = char.MinValue; break;
966 case 0x0299: target = 'B'; break;
967 case 0x029A: target = 'E'; break;
968 case 0x029B: target = 'G'; break;
969 case 0x029C: target = 'H'; break;
970 case 0x029D: target = 'J'; break;
971 case 0x029E: target = 'K'; break;
972 case 0x029F: target = 'L'; break;
973 case 0x02A0: target = 'Q'; break;
974 case 0x02A7: target = 'T'; break;
975 case 0x02A8: target = 'T'; break;
978 if (target == char.MinValue)
979 target = previousLatinTarget;
981 if (target != char.MinValue) {
982 ArrayList entry = (ArrayList) latinMap [target];
984 entry = new ArrayList ();
985 latinMap [target] = entry;
988 // FIXME: This secondary weight is hack.
989 // They are here because they must not
990 // be identical to the corresponding
992 if (c != target && diacritical [cp] == 0) {
993 diacriticalOffset [c - 'A']++;
994 diacritical [cp] = (byte) (diacriticalOffset [c - 'A'] + 0x7C);
1000 if (0x2000 <= cp && cp < 0x3000) {
1002 // SPECIAL CASES. FIXME: why?
1004 case 0x21C5: value = -1; break; // E2
1005 case 0x261D: value = 1; break;
1006 case 0x27A6: value = 3; break;
1007 case 0x21B0: value = 7; break;
1008 case 0x21B1: value = 3; break;
1009 case 0x21B2: value = 7; break;
1010 case 0x21B4: value = 5; break;
1011 case 0x21B5: value = 7; break;
1012 case 0x21B9: value = -1; break; // E1
1013 case 0x21CF: value = 7; break;
1014 case 0x21D0: value = 3; break;
1016 string [] arrowTargets = new string [] {
1029 if (s.IndexOf ("RIGHTWARDS") >= 0 &&
1030 s.IndexOf ("LEFTWARDS") >= 0)
1031 value = 0xE1 - 0xD8;
1032 else if (s.IndexOf ("UPWARDS") >= 0 &&
1033 s.IndexOf ("DOWNWARDS") >= 0)
1034 value = 0xE2 - 0xD8;
1035 else if (s.IndexOf ("ARROW") >= 0 &&
1036 s.IndexOf ("COMBINING") < 0 &&
1037 s.IndexOf ("CLOCKWISE") >= 0)
1038 value = s.IndexOf ("ANTICLOCKWISE") >= 0 ? 0xE4 - 0xD8 : 0xE3 - 0xD8;
1040 for (int i = 1; value == 0 && i < arrowTargets.Length; i++)
1041 if (s.IndexOf (arrowTargets [i]) > 0 &&
1042 s.IndexOf ("BARB " + arrowTargets [i]) < 0 &&
1043 s.IndexOf (" OVER") < 0
1047 arrowValues.Add (new DictionaryEntry (
1052 if (0x2500 <= cp && cp < 0x2600) {
1053 int value = int.MinValue;
1055 // up:1 down:2 right:4 left:8 vert:16 horiz:32
1058 // [dr] [dl] [ur] [ul]
1059 // [vr,udr] [vl,vdl]
1060 // [hd,rld] [hu,rlu]
1061 // [hv,udrl,rlv,udh]
1062 ArrayList flags = new ArrayList (new int [] {
1065 4 + 2, 8 + 2, 4 + 1, 8 + 1,
1066 16 + 4, 1 + 2 + 4, 16 + 8, 1 + 2 + 8,
1067 32 + 2, 4 + 8 + 2, 32 + 1, 4 + 8 + 1,
1068 16 + 32, 1 + 2 + 4 + 8, 4 + 8 + 16, 1 + 2 + 32
1070 byte [] offsets = new byte [] {
1077 if (s.IndexOf ("BOX DRAWINGS ") >= 0) {
1079 if (s.IndexOf (" UP") >= 0)
1081 if (s.IndexOf (" DOWN") >= 0)
1083 if (s.IndexOf (" RIGHT") >= 0)
1085 if (s.IndexOf (" LEFT") >= 0)
1087 if (s.IndexOf (" VERTICAL") >= 0)
1089 if (s.IndexOf (" HORIZONTAL") >= 0)
1092 int fidx = flags.IndexOf (flag);
1094 value = offsets [fidx];
1095 } else if (s.IndexOf ("BLOCK") >= 0) {
1096 if (s.IndexOf ("ONE EIGHTH") >= 0)
1098 else if (s.IndexOf ("ONE QUARTER") >= 0)
1100 else if (s.IndexOf ("THREE EIGHTHS") >= 0)
1102 else if (s.IndexOf ("HALF") >= 0)
1104 else if (s.IndexOf ("FIVE EIGHTHS") >= 0)
1106 else if (s.IndexOf ("THREE QUARTERS") >= 0)
1108 else if (s.IndexOf ("SEVEN EIGHTHS") >= 0)
1113 else if (s.IndexOf ("SHADE") >= 0)
1115 else if (s.IndexOf ("SQUARE") >= 0)
1116 value = 0xBC - 0xE5;
1117 else if (s.IndexOf ("VERTICAL RECTANGLE") >= 0)
1118 value = 0xBE - 0xE5;
1119 else if (s.IndexOf ("RECTANGLE") >= 0)
1120 value = 0xBD - 0xE5;
1121 else if (s.IndexOf ("PARALLELOGRAM") >= 0)
1122 value = 0xBF - 0xE5;
1123 else if (s.IndexOf ("TRIANGLE") >= 0) {
1124 if (s.IndexOf ("UP-POINTING") >= 0)
1125 value = 0xC0 - 0xE5;
1126 else if (s.IndexOf ("RIGHT-POINTING") >= 0)
1127 value = 0xC1 - 0xE5;
1128 else if (s.IndexOf ("DOWN-POINTING") >= 0)
1129 value = 0xC2 - 0xE5;
1130 else if (s.IndexOf ("LEFT-POINTING") >= 0)
1131 value = 0xC3 - 0xE5;
1133 else if (s.IndexOf ("POINTER") >= 0) {
1134 if (s.IndexOf ("RIGHT-POINTING") >= 0)
1135 value = 0xC4 - 0xE5;
1136 else if (s.IndexOf ("LEFT-POINTING") >= 0)
1137 value = 0xC5 - 0xE5;
1139 else if (s.IndexOf ("DIAMOND") >= 0)
1140 value = 0xC6 - 0xE5;
1141 else if (s.IndexOf ("FISHEYE") >= 0)
1142 value = 0xC7 - 0xE5;
1143 else if (s.IndexOf ("LOZENGE") >= 0)
1144 value = 0xC8 - 0xE5;
1145 else if (s.IndexOf ("BULLSEYE") >= 0)
1146 value = 0xC9 - 0xE5;
1147 else if (s.IndexOf ("CIRCLE") >= 0) {
1148 if (cp == 0x25D6) // it could be IndexOf ("LEFT HALF BLACK CIRCLE")
1149 value = 0xCA - 0xE5;
1150 else if (cp == 0x25D7) // it could be IndexOf ("RIGHT HALF BLACK CIRCLE")
1151 value = 0xCB - 0xE5;
1153 value = 0xC9 - 0xE5;
1155 else if (s.IndexOf ("BULLET") >= 0)
1156 value = 0xCC - 0xE5;
1157 if (0x25DA <= cp && cp <= 0x25E5)
1158 value = 0xCD + cp - 0x25DA - 0xE5;
1160 // SPECIAL CASE: BOX DRAWING DIAGONAL patterns
1162 case 0x2571: value = 0xF; break;
1163 case 0x2572: value = 0x10; break;
1164 case 0x2573: value = 0x11; break;
1166 if (value != int.MinValue)
1167 boxValues.Add (new DictionaryEntry (
1171 // For some characters store the name and sort later
1172 // to determine sorting.
1173 if (0x2100 <= cp && cp <= 0x213F &&
1174 Char.IsSymbol ((char) cp))
1175 sortableCharNames.Add (
1176 new DictionaryEntry (cp, name));
1177 else if (0x3380 <= cp && cp <= 0x33DD)
1178 sortableCharNames.Add (new DictionaryEntry (
1179 cp, name.Substring (7)));
1181 if (Char.GetUnicodeCategory ((char) cp) ==
1182 UnicodeCategory.MathSymbol) {
1183 if (name.StartsWith ("CIRCLED "))
1184 diacritical [cp] = 0xEE;
1185 if (name.StartsWith ("SQUARED "))
1186 diacritical [cp] = 0xEF;
1189 // diacritical weights by character name
1190 if (diacritics.Length != diacriticWeights.Length)
1191 throw new Exception (String.Format ("Should not happen. weights are {0} while labels are {1}", diacriticWeights.Length, diacritics.Length));
1192 for (int d = diacritics.Length - 1; d >= 0; d--) {
1193 if (s.IndexOf (diacritics [d]) > 0) {
1194 diacritical [cp] += diacriticWeights [d];
1195 if (s.IndexOf ("COMBINING") >= 0)
1196 diacritical [cp] -= (byte) 2;
1199 // also process "COMBINING blah" here
1200 // For now it is limited to cp < 0x0370
1201 // if (cp < 0x0300 || cp >= 0x0370)
1203 string tmp = diacritics [d].TrimEnd (';');
1204 if (tmp.IndexOf ("WITH ") == 0)
1205 tmp = tmp.Substring (4);
1206 tmp = String.Concat ("COMBINING", (tmp [0] != ' ' ? " " : ""), tmp);
1208 diacritical [cp] = (byte) (diacriticWeights [d] - 2);
1212 //Console.Error.WriteLine ("======= {2:X04} : '{0}' / '{1}'", name, tmp, cp);
1214 // Two-step grep required for it.
1215 if (s.IndexOf ("FULL STOP") > 0 &&
1216 (s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0))
1217 diacritical [cp] |= 0xF4;
1218 if (s.StartsWith ("SCRIPT") || s.IndexOf (" SCRIPT ") > 0)
1219 diacritical [cp] = (byte) (s.IndexOf ("SMALL") > 0 ? 3 :
1220 s.IndexOf ("CAPITAL") > 0 ? 5 : 4);
1222 // Arabic letter name
1223 if (0x0621 <= cp && cp <= 0x064A &&
1224 Char.GetUnicodeCategory ((char) cp)
1225 == UnicodeCategory.OtherLetter) {
1226 byte value = (byte) (arabicNameMap.Count * 4 + 0x0B);
1231 // hamza, waw, yeh ... special cases.
1236 value = 0x77; // special cases.
1239 // Get primary letter name i.e.
1240 // XXX part of ARABIC LETTER XXX yyy
1241 // e.g. that of "TEH MARBUTA" is "TEH".
1244 // 0x0640 is special: it does
1245 // not start with ARABIC LETTER
1247 name.Substring (14);
1248 int tmpIdx = letterName.IndexOf (' ');
1249 letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
1250 //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
1251 if (arabicNameMap.ContainsKey (letterName))
1252 value = (byte) arabicLetterPrimaryValues [arabicNameMap [letterName]];
1254 arabicNameMap [letterName] = cp;
1257 arabicLetterPrimaryValues [cp] = value;
1260 // Japanese square letter
1261 if (0x3300 <= cp && cp <= 0x3357)
1262 if (!ExistsJIS (cp))
1263 nonJisJapanese.Add (new NonJISCharacter (cp, name));
1265 // normalizationType
1266 string decomp = values [4];
1267 idx = decomp.IndexOf ('<');
1269 switch (decomp.Substring (idx + 1, decomp.IndexOf ('>') - 1)) {
1271 decompType [cp] = DecompositionFull;
1274 decompType [cp] = DecompositionSub;
1277 decompType [cp] = DecompositionSuper;
1280 decompType [cp] = DecompositionSmall;
1283 decompType [cp] = DecompositionIsolated;
1286 decompType [cp] = DecompositionInitial;
1289 decompType [cp] = DecompositionFinal;
1292 decompType [cp] = DecompositionMedial;
1295 decompType [cp] = DecompositionNoBreak;
1298 decompType [cp] = DecompositionCompat;
1301 decompType [cp] = DecompositionFraction;
1304 decompType [cp] = DecompositionFont;
1307 decompType [cp] = DecompositionCircle;
1310 decompType [cp] = DecompositionSquare;
1313 decompType [cp] = DecompositionWide;
1316 decompType [cp] = DecompositionNarrow;
1319 decompType [cp] = DecompositionVertical;
1322 throw new Exception ("Support NFKD type : " + decomp);
1326 decompType [cp] = DecompositionCanonical;
1327 decomp = idx < 0 ? decomp : decomp.Substring (decomp.IndexOf ('>') + 2);
1328 if (decomp.Length > 0) {
1330 string [] velems = decomp.Split (' ');
1331 int didx = decompValues.Count;
1332 decompIndex [cp] = didx;
1333 foreach (string v in velems)
1334 decompValues.Add (int.Parse (v, NumberStyles.HexNumber));
1335 decompLength [cp] = velems.Length;
1337 // [decompType] -> this_cp
1338 int targetCP = (int) decompValues [didx];
1339 // for "(x)" it specially maps to 'x' .
1340 // FIXME: check if it is sane
1341 if (velems.Length == 3 &&
1342 (int) decompValues [didx] == '(' &&
1343 (int) decompValues [didx + 2] == ')')
1344 targetCP = (int) decompValues [didx + 1];
1345 // special: 0x215F "1/"
1346 else if (cp == 0x215F)
1348 else if (velems.Length > 1 &&
1349 (targetCP < 0x4C00 || 0x9FBB < targetCP))
1350 // skip them, except for CJK ideograph compat
1353 if (targetCP != 0) {
1354 Hashtable entry = (Hashtable) nfkdMap [targetCP];
1355 if (entry == null) {
1356 entry = new Hashtable ();
1357 nfkdMap [targetCP] = entry;
1359 entry [(byte) decompType [cp]] = cp;
1363 if (values [5].Length > 0)
1364 decimalValue [cp] = decimal.Parse (values [5]);
1365 else if (values [6].Length > 0)
1366 decimalValue [cp] = decimal.Parse (values [6]);
1367 else if (values [7].Length > 0) {
1368 string decstr = values [7];
1369 idx = decstr.IndexOf ('/');
1370 if (cp == 0x215F) // special. "1/"
1371 decimalValue [cp] = 0x1;
1375 decimal.Parse (decstr.Substring (0, idx))
1376 / decimal.Parse (decstr.Substring (idx + 1));
1377 else if (decstr [0] == '(' &&
1378 decstr [decstr.Length - 1] == ')')
1381 decimal.Parse (decstr.Substring (1, decstr.Length - 2));
1382 else if (decstr [decstr.Length - 1] == '.')
1385 decimal.Parse (decstr.Substring (0, decstr.Length - 1));
1387 decimalValue [cp] = decimal.Parse (decstr);
// Reads the file named by `filename` one line at a time and hands each line
// to ProcessDerivedCorePropLine. `line` is a 1-based counter used only for
// the diagnostic written to stderr when processing a line throws.
// NOTE(review): the statements following the diagnostic in the catch clause
// are not visible here; presumably the exception is rethrown - confirm.
1391 void ParseDerivedCoreProperties (string filename)
1394 using (StreamReader file =
1395 new StreamReader (filename)) {
// Peek () yields a negative value once no more characters are available.
1396 for (int line = 1; file.Peek () >= 0; line++) {
1398 ProcessDerivedCorePropLine (file.ReadLine ());
1399 } catch (Exception) {
1400 Console.Error.WriteLine ("**** At line " + line);
// Handles one line of the data file read by ParseDerivedCoreProperties.
// The line is expected to look like "<code point spec>;<value>", optionally
// followed by a '#' comment. The outcome visible here is that isUppercase is
// set to true for every code point in the parsed range.
// NOTE(review): the guards around the Substring calls and the test on
// `value` that selects the isUppercase loop are not visible here; presumably
// the loop runs only for the uppercase property - confirm.
1407 void ProcessDerivedCorePropLine (string s)
// Drop everything from '#' onwards.
1409 int idx = s.IndexOf ('#');
1411 s = s.Substring (0, idx);
// The text before ';' is the code point specification.
1412 idx = s.IndexOf (';');
1415 string cpspec = s.Substring (0, idx);
// A spec containing ".." is an inclusive range "first..last"; otherwise it
// is a single code point. Both forms are hexadecimal and may carry trailing
// whitespace.
1416 idx = cpspec.IndexOf ("..");
1417 NumberStyles nf = NumberStyles.HexNumber |
1418 NumberStyles.AllowTrailingWhite;
1419 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1420 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
// The remainder after the separator, with surrounding whitespace removed.
1421 string value = s.Substring (cpspec.Length + 1).Trim ();
// Code points that do not fit in a char get special treatment (the action
// taken is not visible here).
1424 if (cp > char.MaxValue)
1429 for (int x = cp; x <= cpEnd; x++)
1430 isUppercase [x] = true;
// Reads the script data file named by `filename` and collects the
// non-ignorable characters of four scripts into separate lists. Each list is
// then sorted with UCAComparer.Instance and stored in the corresponding
// orderedGurmukhi / orderedGujarati / orderedGeorgian / orderedThaana array.
// NOTE(review): the dispatch that decides which list a line contributes to
// is not visible here; presumably it switches on `value` (the script name) -
// confirm.
1435 void ParseScripts (string filename)
1437 ArrayList gurmukhi = new ArrayList ();
1438 ArrayList gujarati = new ArrayList ();
1439 ArrayList georgian = new ArrayList ();
1440 ArrayList thaana = new ArrayList ();
1442 using (StreamReader file =
1443 new StreamReader (filename)) {
1444 while (file.Peek () >= 0) {
1445 string s = file.ReadLine ();
// Drop everything from '#' onwards.
1446 int idx = s.IndexOf ('#');
1448 s = s.Substring (0, idx);
// The text before ';' is the code point specification.
1449 idx = s.IndexOf (';');
1453 string cpspec = s.Substring (0, idx);
// "first..last" is an inclusive hexadecimal range; without ".." the spec is
// a single code point.
1454 idx = cpspec.IndexOf ("..");
1455 NumberStyles nf = NumberStyles.HexNumber |
1456 NumberStyles.AllowTrailingWhite;
1457 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1458 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
// The remainder after the separator, with surrounding whitespace removed.
1459 string value = s.Substring (cpspec.Length + 1).Trim ();
// Code points that do not fit in a char get special treatment (the action
// taken is not visible here).
1462 if (cp > char.MaxValue)
// Each loop below appends the non-ignorable code points of the range to one
// script's list.
1467 for (int x = cp; x <= cpEnd; x++)
1468 if (!IsIgnorable (x))
1469 gurmukhi.Add ((char) x);
1472 for (int x = cp; x <= cpEnd; x++)
1473 if (!IsIgnorable (x))
1474 gujarati.Add ((char) x);
1477 for (int x = cp; x <= cpEnd; x++)
1478 if (!IsIgnorable (x))
1479 georgian.Add ((char) x);
1482 for (int x = cp; x <= cpEnd; x++)
1483 if (!IsIgnorable (x))
1484 thaana.Add ((char) x);
// Order every script's characters with the UCA comparer, then freeze the
// lists into char arrays.
1489 gurmukhi.Sort (UCAComparer.Instance);
1490 gujarati.Sort (UCAComparer.Instance);
1491 georgian.Sort (UCAComparer.Instance);
1492 thaana.Sort (UCAComparer.Instance);
1493 orderedGurmukhi = (char []) gurmukhi.ToArray (typeof (char));
1494 orderedGujarati = (char []) gujarati.ToArray (typeof (char));
1495 orderedGeorgian = (char []) georgian.ToArray (typeof (char));
1496 orderedThaana = (char []) thaana.ToArray (typeof (char));
// Reads the file named by `filename` line by line and passes each line to
// ProcessJISOrderLine. When an exception escapes, the value of the `line`
// counter is written to stderr.
// NOTE(review): the declaration/initial value of `line` and whatever follows
// the diagnostic in the catch clause are not visible here - confirm whether
// the exception is rethrown.
1499 void ParseJISOrder (string filename)
1503 using (StreamReader file =
1504 new StreamReader (filename)) {
// Peek () yields a negative value once no more characters are available.
1505 for (;file.Peek () >= 0; line++)
1506 ProcessJISOrderLine (file.ReadLine ());
1508 } catch (Exception) {
1509 Console.Error.WriteLine ("---- line {0}", line);
// Field separators (tab and space) used by ProcessJISOrderLine to split a
// line into its two columns.
1514 char [] ws = new char [] {'\t', ' '};
// Parses one line of the JIS ordering file and appends the resulting
// JISCharacter (code point, JIS code) to jisJapanese. A line consists of two
// whitespace-separated hexadecimal columns, each written with a two-character
// "0x" prefix: first the JIS code, then the code point.
// NOTE(review): the guards for lines without '#' or without a separator are
// not visible here - confirm how such lines are handled.
1516 void ProcessJISOrderLine (string s)
// Drop everything from '#' onwards and trim the rest.
1518 int idx = s.IndexOf ('#');
1520 s = s.Substring (0, idx).Trim ();
// Position of the first tab or space, i.e. the end of the first column.
1523 idx = s.IndexOfAny (ws);
1526 // Both columns start with "0x", so skip those two characters.
1527 int jis = int.Parse (s.Substring (2, idx - 2), NumberStyles.HexNumber);
1528 int cp = int.Parse (s.Substring (idx).Trim ().Substring (2), NumberStyles.HexNumber);
1529 jisJapanese.Add (new JISCharacter (cp, jis));
// Builds the CJK weight tables for Chinese Simplified, Chinese Traditional,
// Japanese and Korean.
//  - Chinese Simplified: characters are numbered in the order they appear in
//    the "pinyin" collation rules.
//  - Chinese Traditional: characters are numbered in the order they appear in
//    the "stroke" collation rules.
//  - Japanese: characters are numbered following jisJapanese, with a handful
//    of fixed entries and with characters whose decomposition contains the
//    current character placed right after it.
//  - Korean: ideographs receive the primary value computed from the Hangul
//    syllable of the preceding "reset" rule, plus a level 2 value stored in
//    cjkKOlv2.
// NOTE(review): the declarations of arr, offset, v, s, category and p, the
// loading of zhXML / jaXML / koXML into `doc`, and several conditions are not
// visible here; the comments below describe only the visible statements.
1532 void ParseCJK (string zhXML, string jaXML, string koXML)
1534 XmlDocument doc = new XmlDocument ();
// No resolver is assigned.
// NOTE(review): presumably to keep external resources such as DTDs from
// being fetched - confirm.
1535 doc.XmlResolver = null;
1542 // Chinese Simplified
1545 offset = 0;//char.MaxValue - arr.Length;
1547 s = doc.SelectSingleNode ("/ldml/collations/collation[@type='pinyin']/rules/pc").InnerText;
// Every character of the rule text gets the next sequential value.
1549 foreach (char c in s) {
1551 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1553 arr [(int) c - offset] = (ushort) v++;
1559 // Chinese Traditional
1562 offset = 0;//char.MaxValue - arr.Length;
1563 s = doc.SelectSingleNode ("/ldml/collations/collation[@type='stroke']/rules/pc").InnerText;
// Every character of the rule text gets the next sequential value.
1565 foreach (char c in s) {
1567 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1569 arr [(int) c - offset] = (ushort) v++;
1578 offset = 0;//char.MaxValue - arr.Length;
// Fixed entries that are assigned before the JIS-ordered characters.
1581 arr [0x4EDD] = 0x8002; // Chinese repetition mark?
1582 arr [0x337B] = 0x8004; // Those 4 characters are Gengou
1583 arr [0x337E] = 0x8005;
1584 arr [0x337D] = 0x8006;
1585 arr [0x337C] = 0x8007;
1588 foreach (JISCharacter jc in jisJapanese) {
// Entries with a JIS code below 0x8800 get special treatment (the action
// taken is not visible here).
1589 if (jc.JIS < 0x8800)
1591 char c = (char) jc.CP;
1594 // Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1597 arr [(int) c - offset] = (ushort) v++;
// Special handling for specific ideographs; the trailing comments name the
// related compatibility code points (the actions are not visible here).
1602 if (c == '\u662D') // U+337C
1604 if (c == '\u5927') // U+337D
1606 if (c == '\u5E73') // U+337B
1608 if (c == '\u660E') // U+337E
1610 if (c == '\u9686') // U+F9DC
1613 // FIXME: there are still remaining
1614 // characters after U+FA0C.
1615 // for (int k = 0; k < char.MaxValue; k++) {
// Scan the code points below U+FA0D that have a decomposition: when the
// first decomposition value is c, or the decomposition has three values and
// the second one is c, that code point gets the next sequential value.
1616 for (int k = 0; k < '\uFA0D'; k++) {
1617 if (decompIndex [k] == 0 || IsIgnorable (k))
1619 if (decompValues [decompIndex [k]] == c /*&&
1620 decompLength [k] == 1*/ ||
1621 decompLength [k] == 3 &&
1622 decompValues [decompIndex [k] + 1] == c) {
1623 arr [k - offset] = (ushort) v++;
1632 // Korean weight is somewhat complex. It first shifts
1633 // Hangul category from 52-x to 80-x (they are anyways
1634 // computed). CJK ideographs are placed at secondary
1635 // weight, like XX YY 01 zz 01, where XX and YY are
1636 // corresponding "reset" value and zz is 41,43,45...
1638 // Unlike chs,cht and ja, Korean value is a combined
1639 // ushort which is computed as category
1643 offset = 0;//char.MaxValue - arr.Length;
1645 foreach (XmlElement reset in doc.SelectNodes ("/ldml/collations/collation/rules/reset")) {
// The element following each "reset" is taken as the one that lists the
// characters to place.
1646 XmlElement sc = (XmlElement) reset.NextSibling;
1647 // compute "category" and "level 1" for the
1648 // target "reset" Hangul syllable
1649 char rc = reset.InnerText [0];
// 1-based position of the reset syllable counted from U+AC00.
1650 int ri = ((int) rc - 0xAC00) + 1;
1652 ((ri / 254) * 256 + (ri % 254) + 2);
1653 // Place the characters after the target.
1656 foreach (char c in s) {
1657 arr [(int) c - offset] = p;
1658 cjkKOlv2 [(int) c - offset] = (byte) v;
// Fills ignorableFlags for every UTF-16 code unit value (0..char.MaxValue).
// The flags are bits: 1 when IsIgnorable is true, 2 when IsIgnorableSymbol is
// true, 4 when IsIgnorableNonSpacing is true.
// NOTE(review): the action taken for unassigned code points is not visible
// here; presumably they are skipped - confirm.
1668 void FillIgnorables ()
1670 for (int i = 0; i <= char.MaxValue; i++) {
1671 if (Char.GetUnicodeCategory ((char) i) ==
1672 UnicodeCategory.OtherNotAssigned)
1674 if (IsIgnorable (i))
1675 ignorableFlags [i] |= 1;
1676 if (IsIgnorableSymbol (i))
1677 ignorableFlags [i] |= 2;
1678 if (IsIgnorableNonSpacing (i))
1679 ignorableFlags [i] |= 4;
// Applies hand-written corrections on top of the values parsed from the
// Unicode data: extra uppercase flags, removed or replaced decomposition
// mappings, and a few fixed diacritical weights. The decomposition values
// are edited in a local ArrayList copy (so new values can be appended) and
// written back to this.decompValues as an int array at the end.
1683 void ModifyUnidata ()
// Local, growable copy; it shadows the field of the same name until the end.
1685 ArrayList decompValues = new ArrayList (this.decompValues);
1687 // Hebrew uppercase letters.
1688 foreach (int i in new int []
1689 {0x05DB, 0x05DE, 0x05E0, 0x05E4, 0x05E6})
1690 isUppercase [i] = true;
1693 // Modify some decomposition equivalence
// An index and length of 0 mean "no decomposition" for the code point.
1694 for (int i = 0xFE31; i <= 0xFE34; i++) {
1696 decompIndex [i] = 0;
1697 decompLength [i] = 0;
1699 decompType [0x037E] = 0;
1700 decompIndex [0x037E] = 0;
1701 decompLength [0x037E] = 0;
1704 for (int i = 0x3021; i <= 0x3029; i++)
1705 diacritical [i] = 0x4E;
1706 // Korean parens numbers
1707 for (int i = 0x3200; i <= 0x321C; i++)
1708 diacritical [i] = 0xA;
1709 for (int i = 0x3260; i <= 0x327B; i++)
1710 diacritical [i] = 0xC;
1712 // LAMESPEC: these remappings should not be done.
1713 // Windows has incorrect CJK compat mappings.
// Each assignment overwrites the first decomposition value of the code
// point; where decompLength is set to 1 the mapping is also cut down to that
// single value.
1714 decompValues [decompIndex [0x32A9]] = 0x91AB;
1715 decompLength [0x323B] = 1;
1716 decompValues [decompIndex [0x323B]] = 0x5B78;
1717 decompValues [decompIndex [0x32AB]] = 0x5B78;
1718 decompValues [decompIndex [0x32A2]] = 0x5BEB;
1719 decompLength [0x3238] = 1;
1720 decompValues [decompIndex [0x3238]] = 0x52DE;
1721 decompValues [decompIndex [0x3298]] = 0x52DE;
1723 // LAMESPEC: custom remappings (these are not bugs, but they are not standard-compliant either)
// New single-value mappings are appended at the end of the list, so the
// index recorded for the code point is the list's current Count.
1724 decompIndex [0xFA0C] = decompValues.Count;
1725 decompValues.Add ((int) 0x5140);
1726 decompLength [0xFA0C] = 1;
1727 decompIndex [0xF929] = decompLength [0xF929] = 0;
1729 decompValues [decompIndex [0xF92C]] = 0x90DE;
1731 decompIndex [0x2125] = decompValues.Count;
1732 decompValues.Add ((int) 0x005A);
1733 decompLength [0x2125] = 1;
1734 decompType [0x2125] = DecompositionFont;
// Publish the edited list back into the field.
1736 this.decompValues = decompValues.ToArray (typeof (int)) as int [];
// Overrides values derived from parsing with hand-picked ones: diacritical
// weights for individual Latin, letterlike and Cyrillic code points,
// secondary weights for number ranges, fixed weights for some Gurmukhi and
// Oriya letters, and finally the names stored in sortableCharNames (some
// entries are renamed, some are removed).
1739 void ModifyParsedValues ()
1741 // Sometimes STROKE does not work well
1742 diacritical [0xD8] = diacritical [0xF8] = 0x21;
1743 diacritical [0x141] = diacritical [0x142] = 0x1F;
1745 diacritical [0xAA] = diacritical [0xBA] = 3;
1746 diacritical [0xD0] = diacritical [0xF0] = 0x68;
1747 diacritical [0x131] = 3;
1748 diacritical [0x138] = 3;
1749 // TOPBAR does not work as an identifier for the weight
1750 diacritical [0x182] = diacritical [0x183] = 0x68; // B
1751 diacritical [0x18B] = diacritical [0x18C] = 0x1E; // D
1753 diacritical [0x1A7] = diacritical [0x1A8] = 0x87;
1755 diacritical [0x184] = diacritical [0x185] = 0x87;
1757 diacritical [0x190] = diacritical [0x25B] = 0x7B;
1758 // There are many letters w/ diacritical weight 0x7B
1759 diacritical [0x0192] = diacritical [0x0194] =
1760 diacritical [0x0195] = diacritical [0x0196] =
1761 diacritical [0x019C] = diacritical [0x019E] =
1762 diacritical [0x01A6] = diacritical [0x01B1] =
1763 diacritical [0x01B2] = diacritical [0x01BF] = 0x7B;
1764 // ... as well as 0x7C
1765 diacritical [0x01A2] = diacritical [0x01A3] = 0x7C;
1767 // <font> NFKD characters seem to have diacritical
1768 // weight as 3,4,5... but the order does not look
1769 // by codepoint and I have no idea how they are sorted.
1770 diacritical [0x210E] = 3;
1771 diacritical [0x210F] = 0x68;
1772 diacritical [0x2110] = 4;
1773 diacritical [0x2111] = 5;
1774 diacritical [0x2112] = 4;
1775 diacritical [0x2113] = 4;
1776 diacritical [0x211B] = 4;
1777 diacritical [0x211C] = 5;
1779 // some cyrillic diacritical weight. They seem to be
1780 // based on old character names, so it's quicker to
1781 // set them directly here.
1782 // FIXME: they are by mostly unknown reason
1783 diacritical [0x0496] = diacritical [0x0497] = 7;
1784 diacritical [0x0498] = diacritical [0x0499] = 0x1A;
1785 diacritical [0x049A] = diacritical [0x049B] = 0x17;
1786 diacritical [0x049C] = diacritical [0x049D] = 9;
1787 diacritical [0x049E] = diacritical [0x049F] = 4;
1788 diacritical [0x04A0] = diacritical [0x04A1] = 0xA;
1789 diacritical [0x04A2] = diacritical [0x04A3] = 7;
1790 diacritical [0x04A4] = diacritical [0x04A5] = 8;
1791 diacritical [0x04AA] = diacritical [0x04AB] = 0x1A; // ES CEDILLA?
1792 diacritical [0x04AC] = diacritical [0x04AD] = 7; // RIGHT DESCENDER? but U+4B2
1793 diacritical [0x04AE] = diacritical [0x04AF] = 0xB; // STRAIGHT U?
1794 diacritical [0x04B2] = diacritical [0x04B3] = 0x17; // RIGHT DESCENDER? but U+4AC
1795 diacritical [0x04B4] = diacritical [0x04B5] = 3;
1796 diacritical [0x04B6] = 8;
1797 diacritical [0x04B7] = 7;
1798 diacritical [0x04B8] = diacritical [0x04B9] = 9;
1799 diacritical [0x04BA] = diacritical [0x04BB] = 9;
1801 // number, secondary weights
// numberSecondaryWeightBounds is read as consecutive pairs: a lower bound
// (inclusive) followed by an upper bound (exclusive). Every numeric character
// within a pair's range gets the current `weight`, which advances by one per
// pair. (`weight` is declared on a line not visible here.)
1803 int [] numarr = numberSecondaryWeightBounds;
1804 for (int i = 0; i < numarr.Length; i += 2, weight++)
1805 for (int cp = numarr [i]; cp < numarr [i + 1]; cp++)
1806 if (Char.IsNumber ((char) cp))
1807 diacritical [cp] = weight;
1809 // Gurmukhi special letters' diacritical weight
1810 for (int i = 0x0A50; i < 0x0A60; i++)
1811 diacritical [i] = 4;
1812 // Oriya special letters' diacritical weight
1813 for (int i = 0x0B5C; i < 0x0B60; i++)
1814 diacritical [i] = 6;
1816 // Update name part of named characters
// NOTE(review): the switch header and the cases leading to RemoveAt are not
// visible here; confirm how the loop index is adjusted after a removal.
1817 for (int i = 0; i < sortableCharNames.Count; i++) {
1818 DictionaryEntry de =
1819 (DictionaryEntry) sortableCharNames [i];
1820 int cp = (int) de.Key;
// Stays null unless one of the cases below supplies a replacement name.
1821 string renamed = null;
1823 case 0x2101: renamed = "A_1"; break;
1824 case 0x33C3: renamed = "A_2"; break;
1825 case 0x2105: renamed = "C_1"; break;
1826 case 0x2106: renamed = "C_2"; break;
1827 case 0x211E: renamed = "R1"; break;
1828 case 0x211F: renamed = "R2"; break;
1829 // Remove some of them!
1840 sortableCharNames.RemoveAt (i);
// Replace the entry with one that keeps the code point but carries the new
// name.
1844 if (renamed != null)
1845 sortableCharNames [i] =
1846 new DictionaryEntry (cp, renamed);
1850 void GenerateCore ()
1854 #region Specially ignored // 01
1855 // This will raise "Defined" flag up.
1856 // FIXME: Check if it is really fine. Actually, for
1857 // Japanese voice marks this code does remapping.
1858 foreach (char c in specialIgnore)
1859 map [(int) c] = new CharMapEntry (0, 0, 0);
1862 #region Extenders (FF FF)
1863 fillIndex [0xFF] = 0xFF;
1864 char [] specialBiggest = new char [] {
1865 '\u3005', '\u3031', '\u3032', '\u309D',
1866 '\u309E', '\u30FC', '\u30FD', '\u30FE',
1867 '\uFE7C', '\uFE7D', '\uFF70'};
1868 foreach (char c in specialBiggest)
1869 AddCharMap (c, 0xFF, 0);
1872 #region Variable weights
1873 // Controls : 06 03 - 06 3D
1874 fillIndex [0x6] = 3;
1875 for (int i = 0; i < 65536; i++) {
1876 if (IsIgnorable (i))
1879 uc = Char.GetUnicodeCategory (c);
1880 // NEL is whitespace but not ignored here.
1881 if (uc == UnicodeCategory.Control &&
1882 !Char.IsWhiteSpace (c) || c == '\u0085')
1883 AddCharMap (c, 6, 1);
1887 fillIndex [0x6] = 0x80;
1888 AddCharMap ('\'', 6, 0);
1889 AddCharMap ('\uFF07', 6, 1);
1890 AddCharMap ('\uFE63', 6, 1);
1892 // SPECIAL CASE: fill FE32 here, before it would be added
1893 // at 2013. Windows does not always respect NFKD.
1894 map [0xFE32] = new CharMapEntry (6, 0x90, 0);
1896 // Hyphen/Dash : 06 81 - 06 90
1897 for (int i = 0; i < char.MaxValue; i++) {
1898 if (!IsIgnorable (i) &&
1899 Char.GetUnicodeCategory ((char) i) ==
1900 UnicodeCategory.DashPunctuation) {
1901 AddCharMapGroup2 ((char) i, 6, 1, 0);
1903 // SPECIAL: add 2027 and 2043
1904 // Maybe they are regarded the
1905 // same hyphens in "central"
1907 AddCharMap ('\u2027', 6, 1);
1908 AddCharMap ('\u2043', 6, 1);
1912 // They are regarded as primarily equivalent to '-'
1913 map [0x208B] = new CharMapEntry (6, 0x82, 0);
1914 map [0x207B] = new CharMapEntry (6, 0x82, 0);
1915 map [0xFF0D] = new CharMapEntry (6, 0x82, 0);
1917 // Arabic variable weight chars 06 A0 -
1918 fillIndex [6] = 0xA0;
1920 for (int i = 0x64B; i <= 0x650; i++)
1921 AddArabicCharMap ((char) i, 6, 1, 0);
1923 AddCharMapGroup ('\u0652', 6, 1, 0);
1925 AddCharMapGroup ('\u0651', 6, 1, 0);
1929 #region Nonspacing marks // 01
1930 // FIXME: 01 03 - 01 B6 ... annoyance :(
1932 // Combining diacritical marks: 01 DC -
1934 fillIndex [0x1] = 0x41;
1935 for (int i = 0x030E; i <= 0x0326; i++)
1936 if (!IsIgnorable (i))
1937 AddCharMap ((char) i, 0x1, 1);
1938 for (int i = 0x0329; i <= 0x0334; i++)
1939 if (!IsIgnorable (i))
1940 AddCharMap ((char) i, 0x1, 1);
1942 for (int i = 0x0339; i <= 0x0341; i++)
1943 if (!IsIgnorable (i))
1944 AddCharMap ((char) i, 0x1, 1);
1945 fillIndex [0x1] = 0x74;
1946 for (int i = 0x0346; i <= 0x0348; i++)
1947 if (!IsIgnorable (i))
1948 AddCharMap ((char) i, 0x1, 1);
1949 for (int i = 0x02BE; i <= 0x02BF; i++)
1950 if (!IsIgnorable (i))
1951 AddCharMap ((char) i, 0x1, 1);
1952 for (int i = 0x02C1; i <= 0x02C5; i++)
1953 if (!IsIgnorable (i))
1954 AddCharMap ((char) i, 0x1, 1);
1955 for (int i = 0x02CE; i <= 0x02CF; i++)
1956 if (!IsIgnorable (i))
1957 AddCharMap ((char) i, 0x1, 1);
1959 for (int i = 0x02D1; i <= 0x02D3; i++)
1960 if (!IsIgnorable (i))
1961 AddCharMap ((char) i, 0x1, 1);
1962 AddCharMap ('\u02DE', 0x1, 1);
1963 for (int i = 0x02E4; i <= 0x02E9; i++)
1964 if (!IsIgnorable (i))
1965 AddCharMap ((char) i, 0x1, 1);
1968 // FIXME: needs more love here (it should eliminate
1969 // all the hacky code above).
1970 for (int i = 0x0300; i < 0x0370; i++)
1971 if (!IsIgnorable (i) && diacritical [i] != 0
1972 && !map [i].Defined)
1973 map [i] = new CharMapEntry (
1974 0x1, 0x1, diacritical [i]);
1976 // Cyrillic and Armenian nonspacing mark
1977 fillIndex [0x1] = 0x94;
1978 for (int i = 0x400; i < 0x580; i++)
1979 if (!IsIgnorable (i) &&
1980 Char.GetUnicodeCategory ((char) i) ==
1981 UnicodeCategory.NonSpacingMark)
1982 AddCharMap ((char) i, 1, 1);
1984 fillIndex [0x1] = 0x8D;
1985 // syriac dotted nonspacing marks (1)
1986 AddCharMap ('\u0740', 0x1, 1);
1987 AddCharMap ('\u0741', 0x1, 1);
1988 AddCharMap ('\u0742', 0x1, 1);
1989 // syriac oblique nonspacing marks
1990 AddCharMap ('\u0747', 0x1, 1);
1991 AddCharMap ('\u0748', 0x1, 1);
1992 // syriac dotted nonspacing marks (2)
1993 fillIndex [0x1] = 0x94; // this reset is mandatory
1994 AddCharMap ('\u0732', 0x1, 1);
1995 AddCharMap ('\u0735', 0x1, 1);
1996 AddCharMap ('\u0738', 0x1, 1);
1997 AddCharMap ('\u0739', 0x1, 1);
1998 AddCharMap ('\u073C', 0x1, 1);
1999 // SPECIAL CASES: superscripts
2000 AddCharMap ('\u073F', 0x1, 1);
2001 AddCharMap ('\u0711', 0x1, 1);
2003 for (int i = 0x0743; i <= 0x0746; i++)
2004 AddCharMap ((char) i, 0x1, 1);
2005 for (int i = 0x0730; i <= 0x0780; i++)
2006 if (!map [i].Defined &&
2007 Char.GetUnicodeCategory ((char) i) ==
2008 UnicodeCategory.NonSpacingMark)
2009 AddCharMap ((char) i, 0x1, 1);
2011 // LAMESPEC: It should not stop at '\u20E1'. There are
2012 // a few more characters (that however results in
2013 // overflow of level 2 unless we start before 0xDD).
2014 fillIndex [0x1] = 0xDD;
2015 for (int i = 0x20D0; i <= 0x20DC; i++)
2016 AddCharMap ((char) i, 0x1, 1);
2017 fillIndex [0x1] = 0xEC;
2018 for (int i = 0x20DD; i <= 0x20E1; i++)
2019 AddCharMap ((char) i, 0x1, 1);
2020 fillIndex [0x1] = 0x4;
2021 AddCharMap ('\u0CD5', 0x1, 1);
2022 AddCharMap ('\u0CD6', 0x1, 1);
2023 AddCharMap ('\u093C', 0x1, 1);
2024 for (int i = 0x302A; i <= 0x302D; i++)
2025 AddCharMap ((char) i, 0x1, 1);
2026 AddCharMap ('\u0C55', 0x1, 1);
2027 AddCharMap ('\u0C56', 0x1, 1);
2029 fillIndex [0x1] = 0x50; // I wonder how they are sorted
2030 for (int i = 0x02D4; i <= 0x02D7; i++)
2031 AddCharMap ((char) i, 0x1, 1);
2033 // They are not part of Nonspacing marks, but have
2034 // only diacritical weight.
2035 for (int i = 0x3099; i <= 0x309C; i++)
2036 map [i] = new CharMapEntry (1, 1, 1);
2037 map [0xFF9E] = new CharMapEntry (1, 1, 1);
2038 map [0xFF9F] = new CharMapEntry (1, 1, 2);
2039 map [0x309D] = new CharMapEntry (0xFF, 0xFF, 1);
2040 map [0x309E] = new CharMapEntry (0xFF, 0xFF, 1);
2041 for (int i = 0x30FC; i <= 0x30FE; i++)
2042 map [i] = new CharMapEntry (0xFF, 0xFF, 1);
2044 fillIndex [0x1] = 0xA;
2045 for (int i = 0x0951; i <= 0x0954; i++)
2046 AddCharMap ((char) i, 0x1, 2);
2051 #region Whitespaces // 07 03 -
2052 fillIndex [0x7] = 0x2;
2053 AddCharMap (' ', 0x7, 2);
2054 AddCharMap ('\u00A0', 0x7, 1);
2055 for (int i = 9; i <= 0xD; i++)
2056 AddCharMap ((char) i, 0x7, 1);
2057 for (int i = 0x2000; i <= 0x200B; i++)
2058 AddCharMap ((char) i, 0x7, 1);
2060 fillIndex [0x7] = 0x17;
2061 AddCharMapGroup ('\u2028', 0x7, 1, 0);
2062 AddCharMapGroup ('\u2029', 0x7, 1, 0);
2064 // Characters which used to represent layout control.
2065 // LAMESPEC: Windows developers seem to have thought
2066 // that those characters are kind of whitespaces,
2067 // while they aren't.
2068 AddCharMap ('\u2422', 0x7, 1, 0); // blank symbol
2069 AddCharMap ('\u2423', 0x7, 1, 0); // open box
2073 // category 09 - continued symbols from 08
2074 fillIndex [0x9] = 2;
2076 for (int cp = 0x2300; cp <= 0x237A; cp++)
2077 AddCharMap ((char) cp, 0x9, 1, 0);
2080 byte [] arrowLv2 = new byte [] {0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
2081 foreach (DictionaryEntry de in arrowValues) {
2082 int idx = (int) de.Value;
2083 int cp = (int) de.Key;
2084 if (map [cp].Defined)
2086 fillIndex [0x9] = (byte) (0xD8 + idx);
2087 AddCharMapGroup ((char) cp, 0x9, 0, arrowLv2 [idx]);
2091 byte [] boxLv2 = new byte [128];
2092 // 0-63 will be used for those offsets are positive,
2093 // and 64-127 are for negative ones.
2094 for (int i = 0; i < boxLv2.Length; i++)
2096 foreach (DictionaryEntry de in boxValues) {
2097 int cp = (int) de.Key;
2098 int off = (int) de.Value;
2099 if (map [cp].Defined)
2102 fillIndex [0x9] = (byte) (0xE5 + off);
2103 AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [128 + off]++);
2106 fillIndex [0x9] = (byte) (0xE5 + off);
2107 AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [off]++);
2110 // Some special characters (slanted)
2111 fillIndex [0x9] = 0xF4;
2112 AddCharMap ('\u2571', 0x9, 3);
2113 AddCharMap ('\u2572', 0x9, 3);
2114 AddCharMap ('\u2573', 0x9, 3);
2116 // FIXME: implement 0A
2118 fillIndex [0xA] = 2;
2119 // byte currency symbols
2120 for (int cp = 0; cp < 0x100; cp++) {
2121 uc = Char.GetUnicodeCategory ((char) cp);
2122 if (!IsIgnorable (cp) &&
2123 uc == UnicodeCategory.CurrencySymbol &&
2125 AddCharMapGroup ((char) cp, 0xA, 1, 0);
2127 // byte other symbols
2128 for (int cp = 0; cp < 0x100; cp++) {
2130 continue; // SPECIAL: skip FIXME: why?
2131 uc = Char.GetUnicodeCategory ((char) cp);
2132 if (!IsIgnorable (cp) &&
2133 uc == UnicodeCategory.OtherSymbol ||
2134 cp == '\u00AC' || cp == '\u00B5' || cp == '\u00B7')
2135 AddCharMapGroup ((char) cp, 0xA, 1, 0);
2138 AddCharMapGroup ('\u30FB', 0xA, 1, 0);
2140 for (int cp = 0x2020; cp <= 0x2031; cp++)
2141 if (Char.IsPunctuation ((char) cp))
2142 AddCharMap ((char) cp, 0xA, 1, 0);
2143 // SPECIAL CASES: why?
2144 AddCharMap ('\u203B', 0xA, 1, 0);
2145 AddCharMap ('\u2040', 0xA, 1, 0);
2146 AddCharMap ('\u2041', 0xA, 1, 0);
2147 AddCharMap ('\u2042', 0xA, 1, 0);
2149 for (int cp = 0x20A0; cp <= 0x20AB; cp++)
2150 AddCharMap ((char) cp, 0xA, 1, 0);
2152 // 3004 is skipped at first...
2153 for (int cp = 0x3010; cp <= 0x3040; cp++)
2154 if (Char.IsSymbol ((char) cp))
2155 AddCharMap ((char) cp, 0xA, 1, 0);
2156 // SPECIAL CASES: added here
2157 AddCharMap ('\u3004', 0xA, 1, 0);
2158 AddCharMap ('\u327F', 0xA, 1, 0);
2160 for (int cp = 0x2600; cp <= 0x2613; cp++)
2161 AddCharMap ((char) cp, 0xA, 1, 0);
2163 for (int cp = 0x2620; cp <= 0x2770; cp++)
2164 if (Char.IsSymbol ((char) cp))
2165 AddCharMap ((char) cp, 0xA, 1, 0);
2167 for (int i = 0x2440; i < 0x2460; i++)
2168 AddCharMap ((char) i, 0xA, 1, 0);
2170 // SPECIAL CASES: why?
2171 AddCharMap ('\u0E3F', 0xA, 1, 0);
2172 AddCharMap ('\u2117', 0xA, 1, 0);
2173 AddCharMap ('\u20AC', 0xA, 1, 0);
2176 #region Numbers // 0C 02 - 0C E1
2177 fillIndex [0xC] = 2;
2179 // 9F8 : Bengali "one less than the denominator"
2180 AddCharMap ('\u09F8', 0xC, 1, 0x3C);
2182 ArrayList numbers = new ArrayList ();
2183 for (int i = 0; i < 65536; i++)
2184 if (!IsIgnorable (i) &&
2185 Char.IsNumber ((char) i) &&
2186 (i < 0x3190 || 0x32C0 < i)) // they are CJK characters
2189 ArrayList numberValues = new ArrayList ();
2190 foreach (int i in numbers)
2191 numberValues.Add (new DictionaryEntry (i, decimalValue [(char) i]));
2192 // SPECIAL CASE: Cyrillic Thousand sign
2193 numberValues.Add (new DictionaryEntry (0x0482, 1000m));
2194 numberValues.Sort (DecimalDictionaryValueComparer.Instance);
2196 //foreach (DictionaryEntry de in numberValues)
2197 //Console.Error.WriteLine ("****** number {0:X04} : {1} {2}", de.Key, de.Value, decompType [(int) de.Key]);
2199 // FIXME: fillIndex adjustment lines are too
2200 // complicated. It must be simpler.
2201 decimal prevValue = -1;
2202 foreach (DictionaryEntry de in numberValues) {
2203 int cp = (int) de.Key;
2204 decimal currValue = (decimal) de.Value;
2205 bool addnew = false;
2206 if (prevValue < currValue &&
2207 prevValue - (int) prevValue == 0 &&
2211 // Process Hangzhou and Roman numbers
2213 // There are some SPECIAL cases.
2214 if (currValue != 4) // no increment for 4
2218 if (currValue <= 13) {
2222 if (currValue == 11)
2223 AddCharMap ('\u0BF0', 0xC, 1);
2224 xcp = (int) prevValue + 0x2160 - 1;
2225 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2226 xcp = (int) prevValue + 0x2170 - 1;
2227 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2232 if (currValue <= 10) {
2233 xcp = (int) prevValue + 0x3021 - 1;
2234 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2238 if (prevValue < currValue)
2239 prevValue = currValue;
2240 if (map [cp].Defined)
2242 // Hangzhou and Roman numbers are added later
2244 if (0x3021 <= cp && cp < 0x302A
2245 || 0x2160 <= cp && cp < 0x216C
2246 || 0x2170 <= cp && cp < 0x217C)
2249 if (cp == 0x215B) // FIXME: why?
2250 fillIndex [0xC] += 2;
2251 else if (cp == 0x3021) // FIXME: why?
2253 if (addnew || cp <= '9') {
2254 int mod = (int) currValue - 1;
2256 if (1 <= currValue && currValue <= 11) {
2258 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2260 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2262 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2264 if (1 <= currValue && currValue <= 20) {
2266 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2268 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2270 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2273 if (addnew && currValue >= 10 && currValue < 13 || cp == 0x09F9)
2275 AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp], true);
2278 // Maybe Bengali digit numbers do not increase
2279 // indexes, but 0x09E6 does.
2280 case 0x09E7: case 0x09E8: case 0x09E9:
2283 case 0x0BF0: case 0x2180: case 0x2181:
2290 if (currValue < 11 || currValue == 1000)
2295 // Add special cases that are not regarded as
2296 // numbers in UnicodeCategory speak.
2299 AddCharMapGroup ('\u01BD', 0xC, 0, 0);
2300 AddCharMapGroup ('\u01BC', 0xC, 1, 0);
2302 else if (cp == '2' || cp == '6') // FIXME: why?
2307 fillIndex [0xC] = 0xFF;
2308 AddCharMap ('\u221E', 0xC, 1);
2311 #region Letters and NonSpacing Marks (general)
2313 // ASCII Latin alphabets
2314 for (int i = 0; i < alphabets.Length; i++)
2315 AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]);
2317 // non-ASCII Latin alphabets
2318 // FIXME: there are no such characters that are placed
2319 // *after* "alphabets" array items. This is nothing
2320 // more than a hack that creates dummy weight for
2321 // primary characters.
2322 for (int i = 0x0080; i < 0x0300; i++) {
2323 if (!Char.IsLetter ((char) i))
2325 // Latin letters which have an NFKD mapping are
2326 // not added as independent primary characters.
2327 if (decompIndex [i] != 0)
2330 // 1.some alphabets have primarily
2331 // equivalent ASCII alphabets.
2332 // 2.some have independent primary weights,
2333 // but inside a-to-z range.
2334 // 3.there are some expanded characters that
2335 // are not part of Unicode Standard NFKD.
2336 // 4. some characters are letter in IsLetter
2337 // but not in sortkeys (maybe unicode version
2338 // difference caused it).
2340 // 1. skipping them does not make sense
2341 // case 0xD0: case 0xF0: case 0x131: case 0x138:
2342 // case 0x184: case 0x185: case 0x186: case 0x189:
2343 // case 0x18D: case 0x18E: case 0x18F: case 0x190:
2344 // case 0x194: case 0x195: case 0x196: case 0x19A:
2345 // case 0x19B: case 0x19C:
2346 // 2. skipping them does not make sense
2347 // case 0x14A: // Ng
2348 // case 0x14B: // ng
2352 case 0xDE: // Icelandic Thorn
2353 case 0xFE: // Icelandic Thorn
2354 case 0xDF: // German ss
2355 case 0xFF: // German ss
2357 case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3:
2358 // not classified yet
2359 // case 0x1A6: case 0x1A7: case 0x1A8: case 0x1A9:
2360 // case 0x1AA: case 0x1B1: case 0x1B7: case 0x1B8:
2361 // case 0x1B9: case 0x1BA: case 0x1BB: case 0x1BF:
2365 AddCharMapGroup ((char) i, 0xE, 1, 0);
2369 // FIXME: this results in not equivalent values to
2370 // Windows, but is safer for comparison.
2371 char [] ipaArray = new char [0x300 - 0x250 + 0x20];
2372 for (int i = 0x40; i < 0x60; i++)
2373 if (Char.IsLetter ((char) i))
2374 ipaArray [i - 0x40] = (char) (i);
2375 for (int i = 0x250; i < 0x300; i++)
2376 if (Char.IsLetter ((char) i))
2377 ipaArray [i - 0x250 + 0x20] = (char) i;
2378 Array.Sort (ipaArray, UCAComparer.Instance);
2379 int targetASCII = 0;
2380 byte latinDiacritical = 0x7B;
2381 foreach (char c in ipaArray) {
2384 latinDiacritical = 0x7B;
2387 map [(int) c] = new CharMapEntry (
2389 map [targetASCII].Level1,
2390 latinDiacritical++);
2395 // FIXME: this is (mysterious and) incomplete.
2396 for (int i = 0x0380; i < 0x0400; i++)
2397 if (diacritical [i] == 0 &&
2398 decompLength [i] == 1 &&
2399 decompType [i] == DecompositionCompat)
2400 diacritical [i] = 3;
2402 fillIndex [0xF] = 2;
2403 for (int i = 0x0391; i < 0x03AA; i++)
2405 AddCharMap ((char) i, 0xF, 1,
2407 fillIndex [0xF] = 2;
2408 for (int i = 0x03B1; i < 0x03CA; i++)
2410 AddCharMap ((char) i, 0xF, 1,
2413 map [0x03C2] = new CharMapEntry (0xF,
2414 map [0x03C3].Level1, map [0x03C3].Level2);
2416 fillIndex [0xF] = 0x40;
2417 for (int i = 0x03DA; i < 0x03F0; i++)
2418 AddCharMap ((char) i, 0xF,
2419 (byte) (i % 2 == 0 ? 0 : 2),
2423 for (int i = 0x0386; i <= 0x0400; i++)
2424 FillLetterNFKD (i, true, true);
2427 // Cyrillic letters are sorted like Latin letters i.e.
2428 // containing culture-specific letters between the
2429 // standard Cyrillic sequence.
2431 // We can't use UCA here; it has different sorting.
2432 char [] orderedCyrillic = new char [] {
2433 '\u0430', '\u0431', '\u0432', '\u0433', '\u0434',
2434 '\u0452', // DJE for Serbocroatian
2436 '\u0454', // IE for Ukrainian
2440 '\u0456', // Byelorussian-Ukrainian I
2450 '\u043F', '\u0440', '\u0441', '\u0442',
2451 '\u045B', // TSHE for Serbocroatian
2453 '\u045E', // Short U for Byelorussian
2454 '\u04B1', // Straight U w/ stroke (diacritical!)
2455 '\u0444', '\u0445', '\u0446', '\u0447',
2457 '\u0448', '\u0449', '\u044A', '\u044B', '\u044C',
2458 '\u044D', '\u044E', '\u044F'};
2460 // Some characters here are mapped to basic Cyrillic
2461 // letters. See the character names in UnicodeData.txt
2462 // for the sources. Here I simply declare an equivalence
2463 // array. Its entries are the mappings for U+0490 (, U+0491)
2464 // onward, skipping small letters.
2465 char [] cymap_src = new char [] {
2466 '\u0433', '\u0433', '\u0433', '\u0436',
2467 '\u0437', '\u043A', '\u043A', '\u043A',
2468 '\u043A', '\u043D', '\u043D', '\u043F',
2469 '\u0445', '\u0441', '\u0442', '\u0443',
2470 '\u0443', '\u0445', '\u0446', '\u0447',
2471 '\u0447', '\u0432', '\u0435', '\u0435',
2472 '\u0406', '\u0436', '\u043A', '\u043D',
2473 '\u0447', '\u0435'};
2475 fillIndex [0x10] = 0x8D;
2476 for (int i = 0x0460; i < 0x0481; i++) {
2477 if (Char.IsLetter ((char) i)) {
2479 // U+476/477 have the same
2480 // primary weight as U+474/475.
2481 fillIndex [0x10] -= 3;
2482 AddLetterMap ((char) i, 0x10, 3);
2486 fillIndex [0x10] = 0x6;
2487 for (int i = 0; i < orderedCyrillic.Length; i++) {
2488 char c = Char.ToUpper (orderedCyrillic [i], CultureInfo.InvariantCulture);
2489 if (!IsIgnorable ((int) c) &&
2490 Char.IsLetter (c) &&
2492 AddLetterMap (c, 0x10, 0);
2493 fillIndex [0x10] += 3;
2498 for (int i = 0x0401; i <= 0x045F; i++)
2499 FillLetterNFKD (i, false, false);
2501 for (int i = 0; i < cymap_src.Length; i++) {
2502 char c = cymap_src [i];
2503 fillIndex [0x10] = map [c].Level1;
2504 int c2 = 0x0490 + i * 2;
2505 AddLetterMapCore ((char) c2, 0x10, 0, diacritical [c2], false);
2509 fillIndex [0x11] = 0x3;
2510 fillIndex [0x1] = 0x98;
2511 for (int i = 0x0531; i < 0x0586; i++) {
2512 if (i == 0x0559 || i == 0x55A)
2513 AddCharMap ((char) i, 1, 1);
2514 if (Char.IsLetter ((char) i))
2515 AddLetterMap ((char) i, 0x11, 1);
2520 fillIndex [0x12] = 0x2;
2521 for (int i = 0x05D0; i < 0x05FF; i++)
2522 if (Char.IsLetter ((char) i)) {
2523 if (isUppercase [i]) {
2525 AddLetterMap ((char) i, 0x12, 2);
2528 AddLetterMap ((char) i, 0x12, 1);
2531 fillIndex [0x1] = 0x3;
2532 for (int i = 0x0591; i <= 0x05C2; i++) {
2533 if (i == 0x05A3 || i == 0x05BB)
2536 AddCharMap ((char) i, 0x1, 1);
2540 fillIndex [0x1] = 0x8E;
2541 fillIndex [0x13] = 0x3;
2542 for (int i = 0x0621; i <= 0x064A; i++) {
2544 if (Char.GetUnicodeCategory ((char) i)
2545 != UnicodeCategory.OtherLetter) {
2546 // FIXME: arabic nonspacing marks are
2547 // in different order.
2548 AddCharMap ((char) i, 0x1, 1);
2551 // map [i] = new CharMapEntry (0x13,
2552 // (byte) arabicLetterPrimaryValues [i], 1);
2554 (byte) arabicLetterPrimaryValues [i];
2555 byte formDiacritical = 8; // default
2558 case 0x0622: formDiacritical = 9; break;
2559 case 0x0623: formDiacritical = 0xA; break;
2560 case 0x0624: formDiacritical = 5; break;
2561 case 0x0625: formDiacritical = 0xB; break;
2562 case 0x0626: formDiacritical = 7; break;
2563 case 0x0649: formDiacritical = 5; break;
2564 case 0x064A: formDiacritical = 7; break;
2566 // AddLetterMapCore ((char) i, 0x13, 1, formDiacritical, false);
2567 AddArabicCharMap ((char) i, 0x13, 1, formDiacritical);
2569 for (int i = 0x0670; i < 0x0673; i++)
2570 map [i] = new CharMapEntry (0x13, 0xB, (byte) (0xC + i - 0x670));
2571 fillIndex [0x13] = 0x84;
2572 for (int i = 0x0674; i < 0x06D6; i++)
2573 if (Char.IsLetter ((char) i))
2574 AddLetterMapCore ((char) i, 0x13, 1, 0, false);
2578 // FIXME: this could be fixed in more decent way
2579 for (int i = 0x0958; i <= 0x095F; i++)
2580 diacritical [i] = 8;
2582 // FIXME: it does seem to be a straight codepoint mapping.
2583 fillIndex [0x14] = 04;
2584 for (int i = 0x0901; i < 0x0905; i++)
2585 if (!IsIgnorable (i))
2586 AddLetterMap ((char) i, 0x14, 2);
2587 fillIndex [0x14] = 0xB;
2588 for (int i = 0x0905; i < 0x093A; i++) {
2590 AddCharMap ('\u0929', 0x14, 0, 8);
2592 AddCharMap ('\u0931', 0x14, 0, 8);
2594 AddCharMap ('\u0934', 0x14, 0, 8);
2595 if (Char.IsLetter ((char) i))
2596 AddLetterMap ((char) i, 0x14, 4);
2598 AddCharMap ('\u0960', 0x14, 4);
2600 AddCharMap ('\u0961', 0x14, 4);
2602 fillIndex [0x14] = 0xDA;
2603 for (int i = 0x093E; i < 0x0945; i++)
2604 if (!IsIgnorable (i))
2605 AddLetterMap ((char) i, 0x14, 2);
2606 fillIndex [0x14] = 0xEC;
2607 for (int i = 0x0945; i < 0x094F; i++)
2608 if (!IsIgnorable (i))
2609 AddLetterMap ((char) i, 0x14, 2);
2613 fillIndex [0x15] = 02;
2614 for (int i = 0x0980; i < 0x9FF; i++) {
2615 if (IsIgnorable (i))
2618 fillIndex [0x15] = 0x3B;
2619 switch (Char.GetUnicodeCategory ((char) i)) {
2620 case UnicodeCategory.NonSpacingMark:
2621 case UnicodeCategory.DecimalDigitNumber:
2622 case UnicodeCategory.OtherNumber:
2625 AddLetterMap ((char) i, 0x15, 1);
2628 fillIndex [0x1] = 0x3;
2629 for (int i = 0x0981; i < 0x0A00; i++)
2630 if (Char.GetUnicodeCategory ((char) i) ==
2631 UnicodeCategory.NonSpacingMark)
2632 AddCharMap ((char) i, 0x1, 1);
2634 // Gurmukhi. orderedGurmukhi is from UCA
2635 // FIXME: it does not look equivalent to UCA.
2636 fillIndex [0x16] = 04;
2637 fillIndex [0x1] = 3;
2638 for (int i = 0; i < orderedGurmukhi.Length; i++) {
2639 char c = orderedGurmukhi [i];
2640 if (IsIgnorable ((int) c))
2642 if (IsIgnorableNonSpacing (c)) {
2643 AddLetterMap (c, 0x1, 1);
2646 if (c == '\u0A3C' || c == '\u0A4D' ||
2647 '\u0A66' <= c && c <= '\u0A71')
2652 case '\u0A33': case '\u0A36': case '\u0A16':
2653 case '\u0A17': case '\u0A5B': case '\u0A5E':
2657 if (c == '\u0A3E') // Skip
2658 fillIndex [0x16] = 0xC0;
2659 AddLetterMap (c, 0x16, shift);
2662 // Gujarati. orderedGujarati is from UCA
2663 fillIndex [0x17] = 0x4;
2665 map [0x0A4D] = new CharMapEntry (1, 0, 0x3);
2666 map [0x0ABD] = new CharMapEntry (1, 0, 0x3);
2667 map [0x0A3C] = new CharMapEntry (1, 0, 0x4);
2668 map [0x0A71] = new CharMapEntry (1, 0, 0x6);
2669 map [0x0ABC] = new CharMapEntry (1, 0, 0xB);
2670 map [0x0A70] = new CharMapEntry (1, 0, 0xE);
2671 // letters go first.
2672 for (int i = 0; i < orderedGujarati.Length; i++) {
2674 char c = orderedGujarati [i];
2675 if (Char.IsLetter (c)) {
2677 if (c == '\u0AB3' || c == '\u0A32')
2679 if (c == '\u0A33') {
2680 AddCharMap ('\u0A32', 0x17, 0);
2681 AddCharMap ('\u0A33', 0x17, 4, 4);
2685 AddCharMap ('\u0AE0', 0x17, 0, 5);
2686 AddCharMap (c, 0x17, 4);
2689 AddCharMap ('\u0AB3', 0x17, 6);
2693 byte gujaratiShift = 4;
2694 fillIndex [0x17] = 0xC0;
2695 for (int i = 0; i < orderedGujarati.Length; i++) {
2696 char c = orderedGujarati [i];
2697 if (fillIndex [0x17] == 0xCC)
2699 if (!Char.IsLetter (c)) {
2702 AddCharMap ('\u0A81', 0x17, 2);
2705 AddLetterMap (c, 0x17, gujaratiShift);
2710 fillIndex [0x1] = 03;
2711 fillIndex [0x18] = 02;
2712 for (int i = 0x0B00; i < 0x0B7F; i++) {
2713 switch (Char.GetUnicodeCategory ((char) i)) {
2714 case UnicodeCategory.NonSpacingMark:
2715 case UnicodeCategory.DecimalDigitNumber:
2716 AddLetterMap ((char) i, 0x1, 1);
2719 AddLetterMapCore ((char) i, 0x18, 1, 0, true);
2723 fillIndex [0x19] = 2;
2724 AddCharMap ('\u0BD7', 0x19, 0);
2725 fillIndex [0x19] = 0xA;
2727 for (int i = 0x0B82; i <= 0x0B94; i++)
2728 if (!IsIgnorable ((char) i))
2729 AddCharMap ((char) i, 0x19, 2);
2731 fillIndex [0x19] = 0x28;
2732 // The array for Tamil consonants is a constant.
2733 // Windows has almost the same sequence as TAM from
2734 // tamilnet, but it differs slightly in Grantha.
2735 for (int i = 0; i < orderedTamilConsonants.Length; i++)
2736 AddLetterMap (orderedTamilConsonants [i], 0x19, 4);
2738 fillIndex [0x19] = 0x82;
2739 for (int i = 0x0BBE; i < 0x0BCD; i++)
2740 if (Char.GetUnicodeCategory ((char) i) ==
2741 UnicodeCategory.SpacingCombiningMark
2743 AddLetterMap ((char) i, 0x19, 2);
2746 fillIndex [0x1A] = 0x4;
2747 for (int i = 0x0C00; i < 0x0C62; i++) {
2748 if (i == 0x0C55 || i == 0x0C56)
2750 AddCharMap ((char) i, 0x1A, 3);
2751 char supp = (i == 0x0C0B) ? '\u0C60':
2752 i == 0x0C0C ? '\u0C61' : char.MinValue;
2753 if (supp == char.MinValue)
2755 AddCharMap (supp, 0x1A, 3);
2759 fillIndex [0x1B] = 4;
2760 for (int i = 0x0C80; i < 0x0CE5; i++) {
2761 if (i == 0x0CD5 || i == 0x0CD6)
2763 if (i == 0x0CB1 || i == 0x0CB3 || i == 0x0CDE)
2764 continue; // shift after 0xCB9
2765 AddCharMap ((char) i, 0x1B, 3);
2767 // SPECIAL CASES: but why?
2768 AddCharMap ('\u0CB1', 0x1B, 3); // RRA
2769 AddCharMap ('\u0CB3', 0x1B, 3); // LLA
2770 AddCharMap ('\u0CDE', 0x1B, 3); // FA
2773 AddCharMap ('\u0CE1', 0x1B, 3); // vocalic LL
2777 fillIndex [0x1C] = 2;
2778 fillIndex [0x1] = 3;
2779 for (int i = 0x0D02; i < 0x0D61; i++) {
2780 // FIXME: I avoided MSCompatUnicodeTable usage
2781 // here (it results in recursion). So check if
2782 // using NonSpacingMark makes sense or not.
2783 if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark)
2784 // if (!MSCompatUnicodeTable.IsIgnorable ((char) i))
2785 AddCharMap ((char) i, 0x1C, 1);
2786 else if (!IsIgnorable ((char) i))
2787 AddCharMap ((char) i, 1, 1);
2790 // Thai ... note that it breaks 0x1E wall after E2B!
2791 // Also, all Thai characters have level 2 value 3.
2792 fillIndex [0x1E] = 2;
2793 fillIndex [0x1] = 3;
2794 for (int i = 0xE40; i <= 0xE44; i++)
2795 AddCharMap ((char) i, 0x1E, 1, 3);
2796 for (int i = 0xE01; i < 0xE2B; i++)
2797 AddCharMap ((char) i, 0x1E, 6, 3);
2798 fillIndex [0x1F] = 5;
2799 for (int i = 0xE2B; i < 0xE30; i++)
2800 AddCharMap ((char) i, 0x1F, 6, 3);
2801 fillIndex [0x1F] = 0x1E;
2802 for (int i = 0xE30; i < 0xE3B; i++)
2803 AddCharMap ((char) i, 0x1F, 1, 3);
2804 // Some Thai characters remain.
2805 char [] specialThai = new char [] {'\u0E45', '\u0E46',
2806 '\u0E4E', '\u0E4F', '\u0E5A', '\u0E5B'};
2807 foreach (char c in specialThai)
2808 AddCharMap (c, 0x1F, 1, 3);
2810 for (int i = 0xE00; i < 0xE80; i++)
2811 if (Char.GetUnicodeCategory ((char) i) ==
2812 UnicodeCategory.NonSpacingMark)
2813 AddCharMap ((char) i, 1, 1);
2816 fillIndex [0x1F] = 2;
2817 fillIndex [0x1] = 3;
2818 for (int i = 0xE80; i < 0xEDF; i++) {
2819 if (IsIgnorable ((char) i))
2821 else if (Char.IsLetter ((char) i))
2822 AddCharMap ((char) i, 0x1F, 1);
2823 else if (Char.GetUnicodeCategory ((char) i) ==
2824 UnicodeCategory.NonSpacingMark)
2825 AddCharMap ((char) i, 1, 1);
2828 // Georgian. orderedGeorgian is from UCA DUCET.
2829 fillIndex [0x21] = 5;
2830 for (int i = 0; i < orderedGeorgian.Length; i++) {
2831 char c = orderedGeorgian [i];
2832 if (map [(int) c].Defined)
2834 AddCharMap (c, 0x21, 0);
2836 AddCharMap ((char) (c - 0x30), 0x21, 0);
2837 fillIndex [0x21] += 5;
2841 fillIndex [0x22] = 2;
2842 int kanaOffset = 0x3041;
2843 byte [] kanaLines = new byte [] {2, 2, 2, 2, 1, 3, 1, 2, 1};
2845 for (int gyo = 0; gyo < 9; gyo++) {
2846 for (int dan = 0; dan < 5; dan++) {
2847 if (gyo == 7 && dan % 2 == 1) {
2850 kanaOffset -= 2; // There is no space for yi and ye.
2853 int cp = kanaOffset + dan * kanaLines [gyo];
2854 // small lines (a-gyo, ya-gyo)
2855 if (gyo == 0 || gyo == 7) {
2856 AddKanaMap (cp, 1); // small
2857 AddKanaMap (cp + 1, 1);
2860 AddKanaMap (cp, kanaLines [gyo]);
2864 // add small 'ka' (before normal one)
2865 AddKanaMap (0x30F5, 1);
2869 // add small 'ke' (before normal one)
2870 AddKanaMap (0x30F6, 1);
2874 // add small 'Tsu' (before normal one)
2875 AddKanaMap (0x3063, 1);
2879 fillIndex [0x22] += 3;
2880 kanaOffset += 5 * kanaLines [gyo];
2883 // Wa-gyo is almost special, so I just manually add.
2884 AddLetterMap ((char) 0x308E, 0x22, 0);
2885 AddLetterMap ((char) (0x308E + 0x60), 0x22, 0);
2886 AddLetterMap ((char) 0x308F, 0x22, 0);
2887 AddLetterMap ((char) (0x308F + 0x60), 0x22, 0);
2889 AddLetterMap ((char) 0x3090, 0x22, 0);
2890 AddLetterMap ((char) (0x3090 + 0x60), 0x22, 0);
2891 fillIndex [0x22] += 2;
2892 // no "Wu" in Japanese.
2893 AddLetterMap ((char) 0x3091, 0x22, 0);
2894 AddLetterMap ((char) (0x3091 + 0x60), 0x22, 0);
2896 AddLetterMap ((char) 0x3092, 0x22, 0);
2897 AddLetterMap ((char) (0x3092 + 0x60), 0x22, 0);
2899 fillIndex [0x22] = 0x80;
2900 AddLetterMap ((char) 0x3093, 0x22, 0);
2901 AddLetterMap ((char) (0x3093 + 0x60), 0x22, 0);
2903 map [0x3094] = new CharMapEntry (map [0x30A6].Category,
2904 map [0x30A6].Level1, 3);// voiced hiragana U
2905 map [0x30F4] = new CharMapEntry (map [0x30A6].Category,
2906 map [0x30A6].Level1, 3);// voiced katakana U
2908 map [0x30F5] = new CharMapEntry (map [0x30AB].Category,
2909 map [0x30AB].Level1, 0);// small katakana Ka
2910 map [0x30F6] = new CharMapEntry (map [0x30B1].Category,
2911 map [0x30B1].Level1, 0);// small katakana Ke
2913 for (int i = 0x30F7; i < 0x30FB; i++)
2914 map [i] = new CharMapEntry (map [i - 8].Category,
2918 // JIS Japanese square chars.
2919 fillIndex [0x22] = 0x97;
2920 jisJapanese.Sort (JISComparer.Instance);
2921 foreach (JISCharacter j in jisJapanese)
2922 if (0x3300 <= j.CP && j.CP <= 0x3357)
2923 AddCharMap ((char) j.CP, 0x22, 1);
2924 // non-JIS Japanese square chars.
2925 nonJisJapanese.Sort (NonJISComparer.Instance);
2926 foreach (NonJISCharacter j in nonJisJapanese)
2927 AddCharMap ((char) j.CP, 0x22, 1);
2930 fillIndex [0x23] = 0x02;
2931 for (int i = 0x3105; i <= 0x312C; i++)
2932 AddCharMap ((char) i, 0x23, 1);
2934 // Estrangela: ancient Syriac
2935 fillIndex [0x24] = 0x0B;
2936 // FIXME: is 0x71E really alternative form?
2937 ArrayList syriacAlternatives = new ArrayList (
2938 new int [] {0x714, 0x716, 0x71C, 0x71E, 0x724, 0x727});
2939 for (int i = 0x0710; i <= 0x072C; i++) {
2940 if (i == 0x0711) // NonSpacingMark
2942 if (syriacAlternatives.Contains (i))
2944 AddCharMap ((char) i, 0x24, 4);
2949 foreach (int cp in syriacAlternatives)
2950 map [cp] = new CharMapEntry (0x24,
2951 (byte) (map [cp - 1].Level1 + 2),
2953 // FIXME: Syriac NonSpacingMark should go here.
2956 // FIXME: it turned out that it does not look like UCA
2957 fillIndex [0x24] = 0x6E;
2958 fillIndex [0x1] = 0xAC;
2959 for (int i = 0; i < orderedThaana.Length; i++) {
2960 char c = orderedThaana [i];
2961 if (IsIgnorableNonSpacing ((int) c))
2962 AddCharMap (c, 1, 1);
2963 AddCharMap (c, 0x24, 2);
2964 if (c == '\u0782') // SPECIAL CASE: why?
2965 fillIndex [0x24] += 2;
2969 // FIXME: Add more culture-specific letters (that are
2970 // not supported in Windows collation) here.
2972 // Surrogate ... they are computed.
2977 // Unlike UCA Windows Hangul sequence mixes Jongseong
2978 // with Choseong sequence as well as Jungseong,
2979 // adjusted to have the same primary weight for the
2980 // same base character. So it is impossible to compute
2983 // Here I introduce an ordered sequence of mixed
2984 // 'commands' and 'characters' that is similar to
2986 // - ',' increases primary weight.
2987 // - [A B] means a range, increasing index
2988 // - {A B} means a range, without increasing index
2989 // - '=' is no operation (it means the characters
2990 // of both sides have the same weight).
2991 // - '>' inserts a Hangul Syllable block that
2992 // contains 0x251 characters.
2993 // - '<' decreases the index
2994 // - '0'-'9' means skip count
2995 // - whitespaces are ignored
2998 string hangulSequence =
2999 + "\u1100=\u11A8 > \u1101=\u11A9 >"
3000 + "\u11C3, \u11AA, \u11C4, \u1102=\u11AB >"
3001 + "<{\u1113 \u1116}, \u3165,"
3002 + "\u11C5, \u11C6=\u3166,, \u11C7, \u11C8,"
3003 + "\u11AC, \u11C9, \u11AD, \u1103=\u11AE >"
3004 + "<\u1117, \u11CA, \u1104, \u11CB > \u1105=\u11AF >"
3005 + "<{\u1118 \u111B}, \u11B0, [\u11CC \u11D0], \u11B1,"
3006 + "[\u11D1 \u11D2], \u11B2,"
3007 + "[\u11D3 \u11D5], \u11B3,"
3008 + "[\u11D6 \u11D7], \u11B4, \u11B5,"
3009 + "\u11B6=\u11D8, \u3140,, \u11D9, \u1106=\u11B7 >"
3010 + "<{\u111C \u111D}, [\u11DA \u11E2], \u1107=\u11B8 >"
3011 + "<{\u111E \u1120}, \u3172,, \u3173, \u11E3, \u1108 >"
3012 + "<{\u1121 \u112C}, \u3144 \u11B9, \u3174, \u3175,,,, "
3013 + "\u3176,, \u3177, [\u11E4 \u11E6] \u3178,"
3014 + "\u3179, \u1109=\u11BA,,, \u3214=\u3274 <>"
3015 + "<{\u112D \u1133}, \u11E7 \u317A, \u317B, \u317C "
3016 + "[\u11E8 \u11E9],, \u11EA \u317D,, \u110A=\u11BB,,, >"
3017 + "<{\u1134 \u1140}, \u317E,,,,,,, \u11EB,"
3018 + "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >"
3019 + "<{\u1141 \u114C}, \u3180=\u11EE, \u11EC, \u11ED,,,,, "
3020 + "\u11F1,, \u11F2,,,"
3021 + "\u11EF,,, \u3181=\u11F0, \u110C=\u11BD,, >"
3022 + "<\u114D, \u110D,, >"
3023 + "<{\u114E \u1151},, \u110E=\u11BE,, >"
3024 + "<{\u1152 \u1155},,, \u110F=\u11BF >"
3025 + "\u1110=\u11C0 > \u1111=\u11C1 >"
3026 + "<\u1156=\u1157, \u11F3, \u11F4, \u1112=\u11C2 >"
3027 + "<\u1158=\u1159=\u115F, \u3185, \u11F9,"
3031 byte hangulCat = 0x52;
3032 fillIndex [hangulCat] = 0x2;
3034 int syllableBlock = 0;
3035 for (int n = 0; n < hangulSequence.Length; n++) {
3036 char c = hangulSequence [n];
3038 if (Char.IsWhiteSpace (c))
3044 IncrementSequentialIndex (ref hangulCat);
3047 if (fillIndex [hangulCat] == 2)
3048 throw new Exception ("FIXME: handle it correctly (yes it is hacky, it is really unfortunate).");
3049 fillIndex [hangulCat]--;
3052 IncrementSequentialIndex (ref hangulCat);
3053 for (int l = 0; l < 0x15; l++)
3054 for (int v = 0; v < 0x1C; v++) {
3056 (char) (0xAC00 + syllableBlock * 0x1C * 0x15 + l * 0x1C + v), hangulCat, 0);
3057 IncrementSequentialIndex (ref hangulCat);
3062 start = hangulSequence [n + 1];
3063 end = hangulSequence [n + 3];
3064 for (int i = start; i <= end; i++) {
3065 AddCharMap ((char) i, hangulCat, 0);
3067 IncrementSequentialIndex (ref hangulCat);
3069 n += 4; // consumes 5 characters for this operation
3072 start = hangulSequence [n + 1];
3073 end = hangulSequence [n + 3];
3074 for (int i = start; i <= end; i++)
3075 AddCharMap ((char) i, hangulCat, 0);
3076 n += 4; // consumes 5 characters for this operation
3079 AddCharMap (c, hangulCat, 0);
3085 for (int i = 0x3200; i < 0x3300; i++) {
3086 if (IsIgnorable (i) || map [i].Defined)
3090 if (decompLength [i] == 4 &&
3091 decompValues [decompIndex [i]] == '(')
3092 ch = decompIndex [i] + 1;
3094 else if (decompLength [i] == 2 &&
3095 decompValues [decompIndex [i] + 1] == '\u1161')
3096 ch = decompIndex [i];
3097 else if (decompLength [i] == 1)
3098 ch = decompIndex [i];
3101 ch = decompValues [ch];
3102 if (ch < 0x1100 || 0x1200 < ch &&
3103 ch < 0xAC00 || 0xD800 < ch)
3107 int offset = i < 0x3260 ? 1 : 0;
3108 if (0x326E <= i && i <= 0x3273)
3111 map [i] = new CharMapEntry (map [ch].Category,
3112 (byte) (map [ch].Level1 + offset),
3114 // Console.Error.WriteLine ("Jamo {0:X04} -> {1:X04}", i, decompValues [decompIndex [i] + 1]);
3120 // Letterlike characters and CJK compatibility square
3121 sortableCharNames.Sort (StringDictionaryValueComparer.Instance);
3122 int [] counts = new int ['Z' - 'A' + 1];
3123 char [] namedChars = new char [sortableCharNames.Count];
3125 foreach (DictionaryEntry de in sortableCharNames) {
3126 counts [((string) de.Value) [0] - 'A']++;
3127 namedChars [nCharNames++] = (char) ((int) de.Key);
3129 nCharNames = 0; // reset
3130 for (int a = 0; a < counts.Length; a++) {
3131 fillIndex [0xE] = (byte) (alphaWeights [a + 1] - counts [a]);
3132 for (int i = 0; i < counts [a]; i++)
3133 //Console.Error.WriteLine ("---- {0:X04} : {1:x02} / {2} {3}", (int) namedChars [nCharNames], fillIndex [0xE], ((DictionaryEntry) sortableCharNames [nCharNames]).Value, Char.GetUnicodeCategory (namedChars [nCharNames]));
3134 AddCharMap (namedChars [nCharNames++], 0xE, 1);
3137 // CJK unified ideograph.
3139 fillIndex [cjkCat] = 0x2;
3140 for (int cp = 0x4E00; cp <= 0x9FBB; cp++)
3141 if (!IsIgnorable (cp))
3142 AddCharMapGroupCJK ((char) cp, ref cjkCat);
3143 // CJK Extensions go here.
3144 // LAMESPEC: With this Windows style CJK layout, it is
3145 // impossible to add more CJK ideograph i.e. 0x9FA6-
3146 // 0x9FBB can never be added w/o breaking compat.
3147 for (int cp = 0xF900; cp <= 0xFA2D; cp++)
3148 if (!IsIgnorable (cp))
3149 AddCharMapGroupCJK ((char) cp, ref cjkCat);
3151 // PrivateUse ... computed.
3152 // remaining Surrogate ... computed.
3154 #region 07 - ASCII non-alphanumeric + 3001, 3002 // 07
3155 // non-alphanumeric ASCII except for: + - < = > '
3156 for (int i = 0x21; i < 0x7F; i++) {
3157 // SPECIAL CASE: 02C6 seems to be regarded as
3158 // equivalent to '^', which does not conform
3159 // to the Unicode Character Database.
3161 AddCharMap ('\u2045', 0x7, 0, 0x1C);
3163 AddCharMap ('\u2046', 0x7, 0, 0x1C);
3165 AddCharMap ('\u02C6', 0x7, 0, 3);
3167 AddCharMap ('\u02CB', 0x7, 0, 3);
3169 if (Char.IsLetterOrDigit ((char) i)
3170 || "+-<=>'".IndexOf ((char) i) >= 0)
3171 continue; // they are not added here.
3173 AddCharMapGroup2 ((char) i, 0x7, 1, 0);
3174 // Insert 3001 after ',' and 3002 after '.'
3176 AddCharMapGroup2 ('\u3001', 0x7, 1, 0);
3178 AddCharMapGroup2 ('\u3002', 0x7, 1, 0);
3180 AddCharMap ('\uFE30', 0x7, 1, 0);
3184 #region 07 - Punctuations and something else
3185 for (int i = 0xA0; i < char.MaxValue; i++) {
3186 if (IsIgnorable (i))
3189 // FIXME: actually those reset should not be
3190 // done but here I put for easy goal.
3194 fillIndex [0x7] = 0xE2;
3196 fillIndex [0x7] = 0x77;
3198 fillIndex [0x7] = 0x93;
3200 if (0x02C8 <= i && i <= 0x02CD)
3201 continue; // nonspacing marks
3203 // SPECIAL CASE: maybe they could be allocated
3204 // dummy NFKD mapping and no special processing
3205 // would be required here.
3207 AddCharMap ('\u02C9', 0x7, 0, 3);
3209 AddCharMap ('\u02CA', 0x7, 0, 3);
3211 AddCharMap ('\u02D8', 0x7, 0, 3);
3225 switch (Char.GetUnicodeCategory ((char) i)) {
3226 case UnicodeCategory.OtherPunctuation:
3227 case UnicodeCategory.ClosePunctuation:
3228 case UnicodeCategory.OpenPunctuation:
3229 case UnicodeCategory.ConnectorPunctuation:
3230 case UnicodeCategory.InitialQuotePunctuation:
3231 case UnicodeCategory.FinalQuotePunctuation:
3232 case UnicodeCategory.ModifierSymbol:
3233 // SPECIAL CASES: // 0xA
3234 if (0x2020 <= i && i <= 0x2031)
3236 if (i == 0x3003) // added later
3238 AddCharMapGroup2 ((char) i, 0x7, 1, 0);
3241 if (i == 0xA6 || i == 0x1C3 || i == 0x037A) // SPECIAL CASE. FIXME: why?
3242 goto case UnicodeCategory.OtherPunctuation;
3248 // FIXME: it should not be necessary to reset level 1,
3249 // but it is done here as a shortcut.
3250 fillIndex [0x7] = 0xB6;
3251 for (int i = 0x2400; i <= 0x2424; i++)
3252 AddCharMap ((char) i, 0x7, 1, 0);
3254 // FIXME: what are they?
3255 AddCharMap ('\u3003', 0x7, 1);
3256 AddCharMap ('\u3006', 0x7, 1);
3257 AddCharMap ('\u02D0', 0x7, 1);
3258 AddCharMap ('\u10FB', 0x7, 1);
3259 AddCharMap ('\u0950', 0x7, 1);
3260 AddCharMap ('\u093D', 0x7, 1);
3261 AddCharMap ('\u0964', 0x7, 1);
3262 AddCharMap ('\u0965', 0x7, 1);
3263 AddCharMap ('\u0970', 0x7, 1);
3267 #region category 08 - symbols
3268 fillIndex [0x8] = 2;
3269 // Here the Windows mapping is not straightforward. It
3270 // is not computed; it seems to be manually sorted.
3271 AddCharMapGroup ('+', 0x8, 1, 0); // plus
3272 AddCharMapGroup ('\u2212', 0x8, 1); // minus
3273 AddCharMapGroup ('\u229D', 0x8, 1); // minus
3274 AddCharMapGroup ('\u2297', 0x8, 1); // mul
3275 AddCharMapGroup ('\u2044', 0x8, 1); // div
3276 AddCharMapGroup ('\u2215', 0x8, 0); // div
3277 AddCharMapGroup ('\u2298', 0x8, 1); // div slash
3278 AddCharMapGroup ('\u2217', 0x8, 0); // mul
3279 AddCharMapGroup ('\u229B', 0x8, 1); // asterisk oper
3280 AddCharMapGroup ('\u2218', 0x8, 0); // ring
3281 AddCharMapGroup ('\u229A', 0x8, 1); // ring
3282 AddCharMapGroup ('\u2219', 0x8, 0); // bullet
3283 AddCharMapGroup ('\u2299', 0x8, 1); // dot oper
3284 AddCharMapGroup ('\u2213', 0x8, 1); // minus-or-plus
3285 AddCharMapGroup ('\u003C', 0x8, 1); // <
3286 AddCharMapGroup ('\u227A', 0x8, 1); // precedes relation
3287 AddCharMapGroup ('\u22B0', 0x8, 1); // precedes under relation
3289 for (int cp = 0; cp < 0x2300; cp++) {
3290 if (cp == 0xAC) // SPECIAL CASE: skip
3293 cp = 0x2200; // skip to 2200
3294 fillIndex [0x8] = 0x21;
3297 fillIndex [0x8] = 0x3;
3299 fillIndex [0x8] = 0xAB;
3301 fillIndex [0x8] = 0xB9;
3302 if (!map [cp].Defined &&
3303 // Char.GetUnicodeCategory ((char) cp) ==
3304 // UnicodeCategory.MathSymbol)
3305 Char.IsSymbol ((char) cp))
3306 AddCharMapGroup ((char) cp, 0x8, 1);
3307 // SPECIAL CASES: no idea why Windows sorts as such
3310 AddCharMap ('\u227B', 0x8, 1, 0);
3311 AddCharMap ('\u22B1', 0x8, 1, 0);
3314 AddCharMapGroup ('\u00AB', 0x8, 1);
3315 AddCharMapGroup ('\u226A', 0x8, 1);
3316 AddCharMapGroup ('\u00BB', 0x8, 1);
3317 AddCharMapGroup ('\u226B', 0x8, 1);
3320 AddCharMap ('\u01C0', 0x8, 1, 0);
3321 AddCharMap ('\u01C1', 0x8, 1, 0);
3322 AddCharMap ('\u01C2', 0x8, 1, 0);
3330 // Characters w/ diacritical marks (NFKD)
3331 for (int i = 0; i <= char.MaxValue; i++) {
3332 if (map [i].Defined || IsIgnorable (i))
3334 if (decompIndex [i] == 0)
3337 int start = decompIndex [i];
3338 int primaryChar = decompValues [start];
3339 int secondary = diacritical [i];
3341 int length = decompLength [i];
3342 // special processing for parenthesized ones.
3344 decompValues [start] == '(' &&
3345 decompValues [start + 2] == ')') {
3346 primaryChar = decompValues [start + 1];
3350 if (map [primaryChar].Level1 == 0)
3353 for (int l = 1; l < length; l++) {
3354 int c = decompValues [start + l];
3355 if (map [c].Level1 != 0)
3357 secondary += diacritical [c];
3361 map [i] = new CharMapEntry (
3362 map [primaryChar].Category,
3363 map [primaryChar].Level1,
3368 // Diacritical weight adjustment
3371 diacritical [0x624] = 0x5;
3372 diacritical [0x626] = 0x7;
3373 diacritical [0x622] = 0x9;
3374 diacritical [0x623] = 0xA;
3375 diacritical [0x625] = 0xB;
3376 diacritical [0x649] = 0x5; // 'alif maqs.uurah
3377 diacritical [0x64A] = 0x7; // Yaa'
3379 for (int i = 0; i < char.MaxValue; i++) {
3381 byte cat = map [i].Category;
3383 case 0xE: // Latin diacritics
3384 case 0x22: // Japanese: circled characters
3385 mod = diacritical [i];
3387 case 0x13: // Arabic
3390 if (diacritical [i] == 0 && decompLength [i] != 0)
3391 diacritical [i] = map [decompValues [decompIndex [i]]].Level2;
3392 if (diacritical [i] == 0 && i >= 0xFE8D)
3393 mod = 0x8; // default for arabic
3396 if (0x52 <= cat && cat <= 0x7F) // Hangul
3397 mod = diacritical [i];
3399 map [i] = new CharMapEntry (
3400 cat, map [i].Level1, mod);
3403 // FIXME: this is halfly hack but those NonSpacingMark
3404 // characters and still undefined are likely to
3406 for (int i = 0; i < char.MaxValue; i++) {
3407 if (map [i].Defined ||
3416 if (Char.GetUnicodeCategory ((char) i) !=
3417 UnicodeCategory.NonSpacingMark)
3421 if (diacritical [i] != 0)
3422 map [i] = new CharMapEntry (1, 1, diacritical [i]);
3424 AddCharMap ((char) i, 1, 1);
// Invariant-culture TextInfo; FillLetterNFKD uses it to upper-case code points.
3430 TextInfo ti = CultureInfo.InvariantCulture.TextInfo;
// Defines map [i] for a letter by following its decomposition data.
//   i          - code point to fill.
//   checkUpper - when true and the invariant upper-case form of i has a
//                category 0xF entry, the upper-case form is processed first
//                and map [i] receives a category 0xF entry.
//   greekRemap - enables special handling of decomposed characters whose
//                Level2 is 0xC (the handling itself is not visible here).
// Both flags are passed unchanged to the recursive calls.
// NOTE(review): several statements of this method are missing from this
// excerpt; the comments describe only what the visible lines establish.
3432 private void FillLetterNFKD (int i, bool checkUpper, bool greekRemap)
// Guard on already-mapped code points (guarded statement not shown;
// presumably an early return - confirm).
3434 if (map [i].Defined)
3436 int up = (int) ti.ToUpper ((char) i);
3437 if (checkUpper && map [up].Category == 0xF) {
// Make sure the upper-case form is filled before deriving map [i] from it.
3440 FillLetterNFKD (up, checkUpper, greekRemap);
3441 map [i] = new CharMapEntry (0xF,
3445 int idx = decompIndex [i];
// The first decomposed character is filled first so that its category
// and weights can be copied below.
3448 int primary = decompValues [decompIndex [i]];
3449 FillLetterNFKD (primary, checkUpper, greekRemap);
3451 int lv2 = map [primary].Level2;
// Walk the remaining decomposed characters; their Level2 values are
// accumulated into off (declared on a line not shown here).
3453 for (int l = 1; l < decompLength [i]; l++) {
3454 int tmp = decompValues [idx + l];
3455 if (map [tmp].Category != 1)
3457 if (greekRemap && map [tmp].Level2 == 0xC)
3460 off += map [tmp].Level2;
3467 // ... but override if the value already exists.
3468 if (diacritical [i] != 0)
3469 lv2 = diacritical [i];
// The new entry reuses the primary character's category and level 1 weight.
3470 map [i] = new CharMapEntry (
3471 map [primary].Category,
3472 map [primary].Level1,
// Advances fillIndex for the given category by one. When the byte wraps
// around to 0, fillIndex [hangulCat] is set to 0x2.
// NOTE(review): hangulCat is passed by ref and a line inside the overflow
// branch is not visible here; presumably it moves to the next category
// before the reset - confirm.
// Despite the parameter name, AddCharMapCJK also calls this with its category.
3477 private void IncrementSequentialIndex (ref byte hangulCat)
3479 fillIndex [hangulCat]++;
3480 if (fillIndex [hangulCat] == 0) { // overflown
3482 fillIndex [hangulCat] = 0x2;
3486 // Reset fillIndex to fixed value and call AddLetterMap().
//   c           - base letter to map.
//   category    - sort key category whose fillIndex is overwritten.
//   alphaWeight - value stored into fillIndex [category] before mapping.
// Every AddLetterMap call here passes updateCount 0, so c and the code
// points listed for it in latinMap are all added at the same fillIndex.
3487 private void AddAlphaMap (char c, byte category, byte alphaWeight)
3489 fillIndex [category] = alphaWeight;
3490 AddLetterMap (c, category, 0);
// latinMap [c], when it is an ArrayList, holds further code points that
// share the weight of c. (Null handling is on lines not shown here.)
3492 ArrayList al = latinMap [c] as ArrayList;
3496 foreach (int cp in al)
3497 AddLetterMap ((char) cp, category, 0);
// Maps `voices` consecutive code points starting at i into category 0x22,
// and for each of them also the code point 0x60 above it (elsewhere in this
// file the same +0x60 offset pairs a hiragana with its katakana).
// fillIndex is not advanced here (updateCount is 0); level 2 tells the
// variants apart.
// NOTE(review): the lines between the two calls are not visible in this
// excerpt; they may carry conditions - confirm.
3500 private void AddKanaMap (int i, byte voices)
3502 for (byte b = 0; b < voices; b++) {
3503 char c = (char) (i + b);
// Level 2 is 0 for the first code point and b + 2 for the following ones.
3504 byte arg = (byte) (b > 0 ? b + 2 : 0);
3506 AddLetterMapCore (c, 0x22, 0, arg, false);
3508 AddLetterMapCore ((char) (c + 0x60), 0x22, 0, arg, false);
// Convenience overload of AddLetterMapCore: level2 is 0 and deferLevel2 is
// true, so AddCharMapGroup may take level 2 from the diacritical table.
3512 private void AddLetterMap (char c, byte category, byte updateCount)
3514 AddLetterMapCore (c, category, updateCount, 0, true);
// Adds a letter and its related forms to the map in a fixed order: the
// form returned by ToSmallForm (c), then the invariant lower-case form,
// then c itself; finally fillIndex [category] is advanced by updateCount.
//   c           - letter to add.
//   category    - sort key category.
//   updateCount - amount added to fillIndex [category].
//   level2      - level 2 weight passed on to AddCharMapGroup.
//   deferLevel2 - passed on to AddCharMapGroup.
// NOTE(review): some lines (the declaration of c2 and the conditions around
// doUpdate) are not visible in this excerpt.
3517 private void AddLetterMapCore (char c, byte category, byte updateCount, byte level2, bool deferLevel2)
3520 // <small> updates index
3521 c2 = ToSmallForm (c);
3523 AddCharMapGroup (c2, category, updateCount, level2, deferLevel2);
// The lower-case form is added recursively with updateCount 0.
3524 c2 = Char.ToLower (c, CultureInfo.InvariantCulture);
3525 if (c2 != c && !map [(int) c2].Defined)
3526 AddLetterMapCore (c2, category, 0, level2, deferLevel2);
// doUpdate presumably decides whether the final fillIndex update happens;
// the statements that clear and test it are not shown - confirm.
3527 bool doUpdate = true;
3528 if (IsIgnorable ((int) c) || map [(int) c].Defined)
3531 AddCharMapGroup (c, category, 0, level2, deferLevel2);
3533 fillIndex [category] += updateCount;
// Convenience overload: same as the four-argument AddCharMap with alt = 0.
3536 private bool AddCharMap (char c, byte category, byte increment)
3538 return AddCharMap (c, category, increment, 0);
// Creates the map entry for c from the current fillIndex of the category,
// then advances that fillIndex by `increment`.
//   c         - character to map.
//   category  - sort key category.
//   increment - amount added to fillIndex [category] after mapping.
//   alt       - the other weight of the entry (see below).
// Returns false, leaving the map unchanged, when c is ignorable or already
// defined. (The return on the success path is on a line not shown here.)
3541 private bool AddCharMap (char c, byte category, byte increment, byte alt)
3543 if (IsIgnorable ((int) c) || map [(int) c].Defined)
3544 return false; // do nothing
// Other uses in this file pass (category, Level1, Level2) to CharMapEntry.
// For category 1 the roles are swapped: alt becomes the second argument
// and fillIndex the third; for every other category fillIndex is the
// second argument and alt the third.
3545 map [(int) c] = new CharMapEntry (category,
3546 category == 1 ? alt : fillIndex [category],
3547 category == 1 ? fillIndex [category] : alt);
3548 fillIndex [category] += increment;
3553 // Adds characters to table in the order below
3554 // (+ increases weight):
3558 // <full> | <super> | <sub>
3559 // <circle> | <wide> (| <narrow>)
3563 // level2 is fixed (does not increase).
// Decomposition types looked up by AddCharMapGroup for each character; the
// characters found are added with increment 0, i.e. without advancing
// fillIndex. (Some entries of this list are not visible in this excerpt.)
3564 int [] sameWeightItems = new int [] {
3565 DecompositionFraction,
3569 DecompositionCircle,
3571 DecompositionNarrow,
// Convenience overload: level2 is 0 and deferLevel2 is true.
3573 private void AddCharMapGroup (char c, byte category, byte updateCount)
3575 AddCharMapGroup (c, category, updateCount, 0, true);
// Convenience overload with an explicit level2; deferLevel2 is false.
3578 private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2)
3580 AddCharMapGroup (c, category, updateCount, level2, false);
// Adds c together with the characters recorded for it in nfkdMap, in this
// order: the <small> form, c itself, the forms listed in sameWeightItems,
// and finally the <vertical> form.
//   c           - base character.
//   category    - sort key category.
//   updateCount - amount by which fillIndex [category] is advanced.
//   level2      - level 2 weight; may be replaced from the diacritical table.
//   deferLevel2 - when true and level2 is still 0, level 2 for the <small>
//                 and <vertical> forms is read from the diacritical table.
// NOTE(review): several lines (null checks and some conditions) are not
// visible in this excerpt; comments cover only the visible statements.
3583 private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2, bool deferLevel2)
// Guard on already-defined characters (guarded statement not shown).
3585 if (map [(int) c].Defined)
3589 level2 = diacritical [(int) c];
// char.MinValue means no such form was found.
3591 char small = char.MinValue;
3592 char vertical = char.MinValue;
// nfkdMap [c] is indexed by decomposition type (as a byte) and yields a
// boxed int code point.
3593 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
3595 object smv = nfkd [(byte) DecompositionSmall];
3597 small = (char) ((int) smv);
3598 object vv = nfkd [(byte) DecompositionVertical];
3600 vertical = (char) ((int) vv);
3603 // <small> updates index
3604 if (small != char.MinValue) {
3605 if (level2 == 0 && deferLevel2)
3606 level2 = diacritical [small];
3607 AddCharMap (small, category, updateCount, level2);
// c itself and the sameWeightItems forms are added with increment 0.
3611 AddCharMap (c, category, 0, level2);
3614 foreach (int weight in sameWeightItems) {
3615 object wv = nfkd [(byte) weight];
3618 level2 = diacritical [(int) wv];
3619 AddCharMap ((char) ((int) wv), category, 0, level2);
3624 // update index here.
3625 fillIndex [category] += updateCount;
// The <vertical> form is added after the index update.
3627 if (vertical != char.MinValue) {
3628 if (level2 == 0 && deferLevel2)
3629 level2 = diacritical [vertical];
3630 AddCharMap (vertical, category, updateCount, level2);
// Maps one CJK character via AddCharMap (c, category, 0, 0) and then
// moves to the next slot with IncrementSequentialIndex. category is
// passed by ref, so the helper may change it.
3634 private void AddCharMapCJK (char c, ref byte category)
3636 AddCharMap (c, category, 0, 0);
3637 IncrementSequentialIndex (ref category);
3639 // Special. I wonder why but Windows skips 9E F9.
// Skip one more slot when the position is category 0x9E, index 0xF9.
3640 if (category == 0x9E && fillIndex [category] == 0xF9)
3641 IncrementSequentialIndex (ref category);
// Maps c with AddCharMapCJK, then maps some hard-coded enclosed
// ideographs (U+32xx) for specific characters, and finally walks
// nfkdMap [c] for decomposition kinds 0..0x12, mapping variants via
// AddCharMapCJK. category is passed by ref through every call.
3644 private void AddCharMapGroupCJK (char c, ref byte category)
3646 AddCharMapCJK (c, ref category);
3648 // LAMESPEC: see below.
3649 if (c == '\u5B78') {
3650 AddCharMapCJK ('\u32AB', ref category);
3651 AddCharMapCJK ('\u323B', ref category);
3653 if (c == '\u52DE') {
3654 AddCharMapCJK ('\u3298', ref category);
3655 AddCharMapCJK ('\u3238', ref category);
3658 AddCharMapCJK ('\u32A2', ref category);
3660 // Especially this mapping order totally does
3661 // not make sense to me.
3662 AddCharMapCJK ('\u32A9', ref category);
3664 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
// 0x12 is DecompositionCanonical, the largest Decomposition* constant.
3667 for (byte weight = 0; weight <= 0x12; weight++) {
3668 object wv = nfkd [weight];
3673 // Special: they are ignored in this area.
3674 // FIXME: check if it is sane
3675 if (0xF900 <= w && w <= 0xFAD9)
3677 // LAMESPEC: on Windows some of CJK characters
3678 // in 3200-32B0 are incorrectly mapped. They
3679 // mix Chinese and Japanese Kanji when
3680 // ordering those characters.
// These six code points are the ones mapped explicitly above.
3682 case 0x32A2: case 0x3298: case 0x3238:
3683 case 0x32A9: case 0x323B: case 0x32AB:
3687 AddCharMapCJK ((char) w, ref category);
3691 // For now it is only for 0x7 category.
// Registers c together with the code points whose one-character
// decomposition is c. Three scans over the code point range select
// candidates by decompType: first <small>, then <sub>/<super>/<wide>/
// <narrow>, and, after c itself has been added, those kinds once more.
// Each scan considers only entries that are not yet Defined in map.
// NOTE(review): every scan uses "c2 < char.MaxValue", so U+FFFF is never
// visited -- confirm that this is intended.
3692 private void AddCharMapGroup2 (char c, byte category, byte updateCount, byte level2)
3694 if (map [(int) c].Defined)
3697 bool updateWeight = false;
3698 // Process in advance (lower primary weight)
3699 for (int c2 = 0; c2 < char.MaxValue; c2++) {
3700 if (!map [c2].Defined &&
3701 decompLength [c2] == 1 &&
3702 (int) (decompValues [decompIndex [c2]]) == (int) c) {
3703 switch (decompType [c2]) {
3704 case DecompositionSmall:
3705 updateWeight = true;
3706 AddCharMap ((char) c2, category,
3713 fillIndex [category] = (byte)
3714 (fillIndex [category] + updateCount);
3717 for (int c2 = 0; c2 < char.MaxValue; c2++) {
3718 if (!map [c2].Defined &&
3719 decompLength [c2] == 1 &&
3720 (int) (decompValues [decompIndex [c2]]) == (int) c) {
3721 switch (decompType [c2]) {
3722 case DecompositionSub:
3723 case DecompositionSuper:
3724 case DecompositionWide:
3725 case DecompositionNarrow:
3726 AddCharMap ((char) c2, category,
3734 AddCharMap (c, category, updateCount, level2);
3736 // Since nfkdMap is problematic to have two or more
3737 // NFKD to an identical character, here I iterate all.
3738 for (int c2 = 0; c2 < char.MaxValue; c2++) {
3739 if (!map [c2].Defined &&
3740 decompLength [c2] == 1 &&
3741 (int) (decompValues [decompIndex [c2]]) == (int) c) {
3742 switch (decompType [c2]) {
3743 case DecompositionWide:
3744 case DecompositionNarrow:
3745 case DecompositionSmall:
3746 case DecompositionSub:
3747 case DecompositionSuper:
3750 AddCharMap ((char) c2, category, updateCount, level2);
// Maps c with a zero increment argument, then scans code points whose
// decomposition ends with c (the last decomposed value is compared) and
// maps those as well; finally advances fillIndex [category] by updateCount.
// NOTE(review): the scan uses "c2 < char.MaxValue", so U+FFFF is never
// visited -- confirm that this is intended.
3757 private void AddArabicCharMap (char c, byte category, byte updateCount, byte level2)
3760 AddCharMap (c, category, 0, level2);
3762 // Since nfkdMap is problematic to have two or more
3763 // NFKD to an identical character, here I iterate all.
3764 for (int c2 = 0; c2 < char.MaxValue; c2++) {
// Code points without any decomposition are of no interest here.
3765 if (decompLength [c2] == 0)
// idx addresses the last value of c2's decomposition sequence.
3767 int idx = decompIndex [c2] + decompLength [c2] - 1;
3768 if ((int) (decompValues [idx]) == (int) c)
3769 AddCharMap ((char) c2, category,
3772 fillIndex [category] += updateCount;
// Shorthand for ToDecomposed (c, DecompositionSmall, false).
3775 char ToSmallForm (char c)
3777 return ToDecomposed (c, DecompositionSmall, false);
// Looks up a value from c's decomposition, but only when c's
// decomposition type equals d. The result is decompValues [idx], where
// idx starts at decompIndex [c]; tail presumably selects the last value
// of the sequence instead of the first (TODO confirm the guard).
3780 char ToDecomposed (char c, byte d, bool tail)
// Decomposition type differs from the requested one.
3782 if (decompType [(int) c] != d)
3784 int idx = decompIndex [(int) c];
// Moves idx to the last element of the decomposition sequence.
3786 idx += decompLength [(int) c] - 1;
3787 return (char) decompValues [idx];
// Walks the jisJapanese collection of JISCharacter entries; presumably
// reports whether one of them has code point cp (TODO confirm).
3790 bool ExistsJIS (int cp)
3792 foreach (JISCharacter j in jisJapanese)
3800 #region Level 3 properties (Case/Width)
// Returns the level 3 weight for c: the raw weight plus 2 when the raw
// weight is non-zero, otherwise 0 unchanged.
3802 private byte ComputeLevel3Weight (char c)
3804 byte b = ComputeLevel3WeightRaw (c);
3805 return b > 0 ? (byte) (b + 2) : b;
// Computes the raw (unshifted) level 3 weight of c. The body is a chain
// of code point range tests followed by switches on decompType [c] and
// lookups in isSmallCapital / isUppercase. ComputeLevel3Weight adds 2 to
// non-zero results.
3808 private byte ComputeLevel3WeightRaw (char c) // add 2 for sortkey value
3811 if ('\u3192' <= c && c <= '\u319F')
3814 // They have <narrow> NFKD mapping, and on Windows
3815 // those narrow characters are regarded as "normal",
3816 // thus those characters themselves are regarded as
3817 // "wide". grep "<narrow>" and you can pick them up
3818 // (ignoring Kana, Hangul etc.)
3835 if ('\u11A8' <= c && c <= '\u11F9')
3837 if ('\uFFA0' <= c && c <= '\uFFDC')
3839 if ('\u3130' <= c && c <= '\u3164')
3841 if ('\u3165' <= c && c <= '\u318E')
3843 // Georgian Capital letters
3844 if ('\u10A0' <= c && c <= '\u10C5')
3847 if ('\u2776' <= c && c <= '\u277F')
3849 if ('\u2780' <= c && c <= '\u2789')
// NOTE(review): this range overlaps the two tests above; if those return,
// only U+278A..U+2793 can reach this branch -- confirm.
3851 if ('\u2776' <= c && c <= '\u2793')
3853 if ('\u2160' <= c && c <= '\u216F')
3855 if ('\u2181' <= c && c <= '\u2182')
3858 if ('\u2135' <= c && c <= '\u2138')
3860 // I believe that Windows has a bug on setting level 3
3861 // weight here. NFKD results in different values.
// Both bounds are exclusive: U+FE81..U+FEFF.
3862 if ('\uFE80' < c && c < '\uFF00') {
3863 // 2(Isolated)/8(Final)/0x18(Medial)
3864 switch (decompType [(int) c]) {
3865 case DecompositionIsolated:
3867 case DecompositionFinal:
3869 case DecompositionMedial:
3871 case DecompositionInitial:
3876 // I have no idea why those symbols have level 3 weight
3877 if (c == '\u2104' || c == '\u212B')
3879 if ('\u211E' <= c && c <= '\u212B')
3882 // actually I dunno the reason why they have weights.
3911 switch (decompType [(int) c]) {
3912 case DecompositionWide: // <wide>
3913 case DecompositionSub: // <sub>
3914 case DecompositionSuper: // <super>
// The decomposition type constant itself is OR-ed into the result.
3915 ret |= decompType [(int) c];
3918 if (isSmallCapital [(int) c]) // grep "SMALL CAPITAL"
3920 if (isUppercase [(int) c]) // DerivedCoreProperties
// Variant of IsIgnorable based on unicodeAge [i] (compared against 3.1)
// and on the UnicodeCategory of i. A second IsIgnorable (int) with the
// same signature follows, so presumably only one of the two is compiled
// (TODO confirm the conditional-compilation setup).
3930 static bool IsIgnorable (int i)
3932 if (unicodeAge [i] >= 3.1)
3934 switch (char.GetUnicodeCategory ((char) i)) {
3935 case UnicodeCategory.OtherNotAssigned:
3936 case UnicodeCategory.Format:
3943 // FIXME: In the future use DerivedAge.txt to examine character
3944 // versions and set those ones that have higher version than
3945 // 1.0 as ignorable.
// Table-driven ignorability test for code point i. It combines a switch
// over individual code points, a long list of inclusive code point
// ranges, and a final switch on Char.GetUnicodeCategory ((char) i).
3946 static bool IsIgnorable (int i)
3950 // I guess, those characters are added between
3951 // Unicode 1.0 (LCMapString) and Unicode 3.1
3952 // (UnicodeCategory), so they used to be
3953 // something like OtherNotAssigned as of Unicode 1.1.
3954 case 0x2df: case 0x387:
3955 case 0x3d7: case 0x3d8: case 0x3d9:
3956 case 0x3f3: case 0x3f4: case 0x3f5: case 0x3f6:
3957 case 0x400: case 0x40d: case 0x450: case 0x45d:
3958 case 0x587: case 0x58a: case 0x5c4: case 0x640:
3959 case 0x653: case 0x654: case 0x655: case 0x66d:
3961 case 0x1e9b: case 0x202f: case 0x20ad:
3962 case 0x20ae: case 0x20af:
3963 case 0x20e2: case 0x20e3:
3964 case 0x2139: case 0x213a: case 0x2183:
3965 case 0x2425: case 0x2426: case 0x2619:
3966 case 0x2670: case 0x2671: case 0x3007:
3967 case 0x3190: case 0x3191:
3968 case 0xfffc: case 0xfffd:
3970 // exceptional characters filtered by the
3971 // following conditions. Originally those exceptional
3972 // ranges are incorrect (they should not be ignored)
3973 // and most of those characters are unfortunately in
3975 case 0x4d8: case 0x4d9:
3976 case 0x4e8: case 0x4e9:
3978 case 0x3036: case 0x303f:
3979 case 0x337b: case 0xfb1e:
3984 // The whole Sinhala characters.
3985 0x0D82 <= i && i <= 0x0DF4
3986 // The whole Tibetan characters.
3987 || 0x0F00 <= i && i <= 0x0FD1
3988 // The whole Myanmar characters.
3989 || 0x1000 <= i && i <= 0x1059
3990 // The whole Ethiopic, Cherokee,
3991 // Canadian Syllabic, Ogham, Runic,
3992 // Tagalog, Hanunoo, Philippine,
3993 // Buhid, Tagbanwa, Khmer and Mongolian
3995 || 0x1200 <= i && i <= 0x1DFF
3996 // Greek extension characters.
3997 || 0x1F00 <= i && i <= 0x1FFF
3998 // The whole Braille characters.
3999 || 0x2800 <= i && i <= 0x28FF
4000 // CJK radical characters.
4001 || 0x2E80 <= i && i <= 0x2EF3
4002 // Kangxi radical characters.
4003 || 0x2F00 <= i && i <= 0x2FD5
4004 // Ideographic description characters.
4005 || 0x2FF0 <= i && i <= 0x2FFB
4006 // Bopomofo letter and final
4007 || 0x31A0 <= i && i <= 0x31B7
4008 // White square with quadrant characters.
4009 || 0x25F0 <= i && i <= 0x25F7
4010 // Ideographic telegraph symbols.
4011 || 0x32C0 <= i && i <= 0x32CB
4012 || 0x3358 <= i && i <= 0x3370
4013 || 0x33E0 <= i && i <= 0x33FF
4014 // The whole YI characters.
4015 || 0xA000 <= i && i <= 0xA48C
4016 || 0xA490 <= i && i <= 0xA4C6
4017 // Armenian small ligatures
4018 || 0xFB13 <= i && i <= 0xFB17
4019 // hebrew, arabic, variation selector.
4020 || 0xFB1D <= i && i <= 0xFE2F
4021 // Arabic ligatures.
4022 || 0xFEF5 <= i && i <= 0xFEFC
4023 // FIXME: why are they excluded?
4024 || 0x01F6 <= i && i <= 0x01F9
4025 || 0x0218 <= i && i <= 0x0233
4026 || 0x02A9 <= i && i <= 0x02AD
4027 || 0x02EA <= i && i <= 0x02EE
4028 || 0x0349 <= i && i <= 0x036F
4029 || 0x0488 <= i && i <= 0x048F
4030 || 0x04D0 <= i && i <= 0x04FF
4031 || 0x0500 <= i && i <= 0x050F // actually it matters only for 2.0
4032 || 0x06D6 <= i && i <= 0x06ED
4033 || 0x06FA <= i && i <= 0x06FE
4034 || 0x2048 <= i && i <= 0x204D
4035 || 0x20e4 <= i && i <= 0x20ea
4036 || 0x213C <= i && i <= 0x214B
4037 || 0x21EB <= i && i <= 0x21FF
4038 || 0x22F2 <= i && i <= 0x22FF
4039 || 0x237B <= i && i <= 0x239A
4040 || 0x239B <= i && i <= 0x23CF
4041 || 0x24EB <= i && i <= 0x24FF
4042 || 0x2596 <= i && i <= 0x259F
4043 || 0x25F8 <= i && i <= 0x25FF
4044 || 0x2672 <= i && i <= 0x2689
4045 || 0x2768 <= i && i <= 0x2775
4046 || 0x27d0 <= i && i <= 0x27ff
4047 || 0x2900 <= i && i <= 0x2aff
4048 || 0x3033 <= i && i <= 0x303F
4049 || 0x31F0 <= i && i <= 0x31FF
4050 || 0x3250 <= i && i <= 0x325F
4051 || 0x32B1 <= i && i <= 0x32BF
4052 || 0x3371 <= i && i <= 0x337B
4053 || 0xFA30 <= i && i <= 0xFA6A
// Last resort: classify by the Unicode general category of i.
4057 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
4059 case UnicodeCategory.PrivateUse:
4060 case UnicodeCategory.Surrogate:
4062 // ignored by nature
4063 case UnicodeCategory.Format:
4064 case UnicodeCategory.OtherNotAssigned:
4071 // To check IsIgnorable sanity, try the driver below under MS.NET.
// Sanity-check driver entry point: calls Dump for every code point from
// 0 through char.MaxValue together with IsIgnorable's answer for it.
4074 public static void Main ()
4076 for (int i = 0; i <= char.MaxValue; i++)
4077 Dump (i, IsIgnorable (i));
// Compares the expected ignorability of code point i with the result of
// CompareInfo.Compare (InvariantCulture, IgnoreCase) on two strings and
// prints a line for i. Private-use and surrogate code points are skipped.
4080 static void Dump (int i, bool ignore)
4082 switch (Char.GetUnicodeCategory ((char) i)) {
4083 case UnicodeCategory.PrivateUse:
4084 case UnicodeCategory.Surrogate:
4085 return; // check nothing
// s2 is the character under test repeated ten times.
4089 string s2 = new string ((char) i, 10);
4090 int ret = CultureInfo.InvariantCulture.CompareInfo.Compare (s1, s2, CompareOptions.IgnoreCase);
// (ret == 0) == ignore means the platform comparison agrees with the
// expectation passed in.
4091 if ((ret == 0) == ignore)
// Output columns: "o"/"x" for ignore, the code point in hex, its category.
4093 Console.WriteLine ("{0} : {1:x} {2}", ignore ? "o" : "x", i, Char.GetUnicodeCategory ((char) i));
4096 #endregion // IsIgnorable
4098 #region IsIgnorableSymbol
// Symbol-ignorability test for code point i. It first consults
// IsIgnorable (i), then several explicit code point lists and ranges,
// then the UnicodeCategory of i, and finally the Char.IsLetterOrDigit /
// IsNumber / IsControl / IsSeparator / IsPunctuation / IsSymbol tests.
4099 static bool IsIgnorableSymbol (int i)
4101 if (IsIgnorable (i))
4106 case 0x00b5: case 0x01C0: case 0x01C1:
4107 case 0x01C2: case 0x01C3: case 0x01F6:
4108 case 0x01F7: case 0x01F8: case 0x01F9:
4109 case 0x02D0: case 0x02EE: case 0x037A:
4110 case 0x03D7: case 0x03F3:
4111 case 0x0400: case 0x040d:
4112 case 0x0450: case 0x045d:
4113 case 0x048C: case 0x048D:
4114 case 0x048E: case 0x048F:
4115 case 0x0587: case 0x0640: case 0x06E5:
4116 case 0x06E6: case 0x06FA: case 0x06FB:
4117 case 0x06FC: case 0x093D: case 0x0950:
4118 case 0x1E9B: case 0x2139: case 0x3006:
4119 case 0x3033: case 0x3034: case 0x3035:
4120 case 0xFE7E: case 0xFE7F:
4122 case 0x16EE: case 0x16EF: case 0x16F0:
4124 case 0x2183: // ROMAN NUMERAL REVERSED ONE HUNDRED
4125 case 0x3007: // IDEOGRAPHIC NUMBER ZERO
4126 case 0x3038: // HANGZHOU NUMERAL TEN
4127 case 0x3039: // HANGZHOU NUMERAL TWENTY
4128 case 0x303a: // HANGZHOU NUMERAL THIRTY
4134 case 0x02B9: case 0x02BA: case 0x02C2:
4135 case 0x02C3: case 0x02C4: case 0x02C5:
4136 case 0x02C8: case 0x02CC: case 0x02CD:
4137 case 0x02CE: case 0x02CF: case 0x02D2:
4138 case 0x02D3: case 0x02D4: case 0x02D5:
4139 case 0x02D6: case 0x02D7: case 0x02DE:
4140 case 0x02E5: case 0x02E6: case 0x02E7:
4141 case 0x02E8: case 0x02E9:
4142 case 0x309B: case 0x309C:
4144 case 0x055A: // Armenian Apostrophe
4145 case 0x05C0: // Hebrew Punct
4146 case 0x0E4F: // Thai FONGMAN
4147 case 0x0E5A: // Thai ANGKHANKHU
4148 case 0x0E5B: // Thai KHOMUT
4150 case 0x09F2: // Bengali Rupee Mark
4151 case 0x09F3: // Bengali Rupee Sign
4153 case 0x221e: // INF.
// Note the mixed bounds below: some upper limits are exclusive (<).
4162 if (0xFE70 <= i && i < 0xFE7C // ARABIC LIGATURES B
4164 || 0x0501 <= i && i <= 0x0510 // CYRILLIC KOMI
4165 || 0xFA30 <= i && i < 0xFA70 // CJK COMPAT
4170 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
4172 case UnicodeCategory.Surrogate:
4173 return false; // inconsistent
4175 case UnicodeCategory.SpacingCombiningMark:
4176 case UnicodeCategory.EnclosingMark:
4177 case UnicodeCategory.NonSpacingMark:
4178 case UnicodeCategory.PrivateUse:
4180 if (0x064B <= i && i <= 0x0652) // Arabic
4184 case UnicodeCategory.Format:
4185 case UnicodeCategory.OtherNotAssigned:
4192 // latin in a circle
4193 0x249A <= i && i <= 0x24E9
4194 || 0x2100 <= i && i <= 0x2132
4196 || 0x3196 <= i && i <= 0x31A0
4198 || 0x3200 <= i && i <= 0x321C
4200 || 0x322A <= i && i <= 0x3243
4202 || 0x3260 <= i && i <= 0x32B0
4203 || 0x32D0 <= i && i <= 0x3357
4204 || 0x337B <= i && i <= 0x33DD
4206 use = !Char.IsLetterOrDigit ((char) i);
4210 // This "Digit" rule is a mystery.
4211 // It filters some symbols out.
4212 if (Char.IsLetterOrDigit ((char) i))
4214 if (Char.IsNumber ((char) i))
4216 if (Char.IsControl ((char) i)
4217 || Char.IsSeparator ((char) i)
4218 || Char.IsPunctuation ((char) i))
4220 if (Char.IsSymbol ((char) i))
4223 // FIXME: should check more
4228 // To check IsIgnorableSymbol sanity, try the driver below under MS.NET.
// Sanity-check driver for IsIgnorableSymbol: for each non-surrogate code
// point it compares "TEST " with "TEST " + (char) i under
// CompareOptions.IgnoreSymbols (InvariantCulture) and prints a line when
// IsIgnorableSymbol's answer differs from "the strings compared equal".
4230 public static void Main ()
4232 CompareInfo ci = CultureInfo.InvariantCulture.CompareInfo;
4233 for (int i = 0; i <= char.MaxValue; i++) {
4234 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
4235 if (uc == UnicodeCategory.Surrogate)
4238 bool ret = IsIgnorableSymbol (i);
4240 string s1 = "TEST ";
4241 string s2 = "TEST " + (char) i;
4243 int result = ci.Compare (s1, s2, CompareOptions.IgnoreSymbols);
// Mismatch between the predicate and the platform comparison.
4245 if (ret != (result == 0))
4246 Console.WriteLine ("{0} : {1:x}[{2}]({3})",
4247 ret ? "should not ignore" :
// Non-spacing-mark ignorability test for code point i. It consults
// IsIgnorable (i), two explicit code point lists and two groups of
// ranges, and otherwise reports whether the UnicodeCategory of i is
// NonSpacingMark.
4256 static bool IsIgnorableNonSpacing (int i)
4258 if (IsIgnorable (i))
4262 case 0x02C8: case 0x02DE: case 0x0559: case 0x055A:
4263 case 0x05C0: case 0x0ABD: case 0x0CD5: case 0x0CD6:
4264 case 0x309B: case 0x309C: case 0xFF9E: case 0xFF9F:
4266 case 0x02D0: case 0x0670: case 0x0901: case 0x0902:
4267 case 0x094D: case 0x0962: case 0x0963: case 0x0A41:
4268 case 0x0A42: case 0x0A47: case 0x0A48: case 0x0A4B:
4269 case 0x0A4C: case 0x0A81: case 0x0A82: case 0x0B82:
4270 case 0x0BC0: case 0x0CBF: case 0x0CC6: case 0x0CCC:
4271 case 0x0CCD: case 0x0E4E:
4275 if (0x02b9 <= i && i <= 0x02c5
4276 || 0x02cc <= i && i <= 0x02d7
4277 || 0x02e4 <= i && i <= 0x02ef
4278 || 0x20DD <= i && i <= 0x20E0
// NOTE(review): 0x00652 carries an extra leading zero; its value is
// still 0x0652, the same upper bound IsIgnorableSymbol uses.
4282 if (0x064B <= i && i <= 0x00652
4283 || 0x0941 <= i && i <= 0x0948
4284 || 0x0AC1 <= i && i <= 0x0ACD
4285 || 0x0C3E <= i && i <= 0x0C4F
4286 || 0x0E31 <= i && i <= 0x0E3F
4290 return Char.GetUnicodeCategory ((char) i) ==
4291 UnicodeCategory.NonSpacingMark;
4294 // We can reuse IsIgnorableSymbol testcode
4295 // for IsIgnorableNonSpacing.
// Members of the per-character map entry: the category byte, the level 2
// weight, and a Defined flag that callers test before adding a mapping.
4301 public byte Category;
4303 public byte Level2; // It is always single byte.
4304 public bool Defined;
// Builds an entry from category, level1 and level2 weights.
4306 public CharMapEntry (byte category, byte level1, byte level2)
4308 Category = category;
// Pairs a code point (CP) with a JIS value (JIS); JISComparer orders
// instances by the JIS field.
4317 public readonly int CP;
4318 public readonly int JIS;
// cp and cpJIS presumably initialize CP and JIS respectively (TODO confirm).
4320 public JISCharacter (int cp, int cpJIS)
// IComparer that orders JISCharacter objects by their JIS field.
4327 class JISComparer : IComparer
// Shared instance.
4329 public static readonly JISComparer Instance =
// Both arguments must be JISCharacter; the casts throw otherwise.
4332 public int Compare (object o1, object o2)
4334 JISCharacter j1 = (JISCharacter) o1;
4335 JISCharacter j2 = (JISCharacter) o2;
// Negative, zero or positive according to the difference of JIS values.
4336 return j1.JIS - j2.JIS;
// Pairs a code point (CP) with a Name string; NonJISComparer orders
// instances by Name.
4340 class NonJISCharacter
4342 public readonly int CP;
4343 public readonly string Name;
// cp and name presumably initialize CP and Name (TODO confirm).
4345 public NonJISCharacter (int cp, string name)
// IComparer that orders NonJISCharacter objects by ordinal comparison of
// their Name fields.
4352 class NonJISComparer : IComparer
// Shared instance.
4354 public static readonly NonJISComparer Instance =
4355 new NonJISComparer ();
// Both arguments must be NonJISCharacter; the casts throw otherwise.
4357 public int Compare (object o1, object o2)
4359 NonJISCharacter j1 = (NonJISCharacter) o1;
4360 NonJISCharacter j2 = (NonJISCharacter) o2;
4361 return string.CompareOrdinal (j1.Name, j2.Name);
// IComparer over DictionaryEntry items whose Value is a boxed decimal and
// whose Key is a boxed int. Entries are compared by Value; the keys are
// presumably used to break ties (TODO confirm).
4365 class DecimalDictionaryValueComparer : IComparer
// Shared instance; the constructor is private.
4367 public static readonly DecimalDictionaryValueComparer Instance
4368 = new DecimalDictionaryValueComparer ();
4370 private DecimalDictionaryValueComparer ()
// Both arguments must be DictionaryEntry; the casts throw otherwise.
4374 public int Compare (object o1, object o2)
4376 DictionaryEntry e1 = (DictionaryEntry) o1;
4377 DictionaryEntry e2 = (DictionaryEntry) o2;
4378 // FIXME: in case of 0, compare decomposition categories
4379 int ret = Decimal.Compare ((decimal) e1.Value, (decimal) e2.Value);
4382 int i1 = (int) e1.Key;
4383 int i2 = (int) e2.Key;
// IComparer over DictionaryEntry items whose Value is a string and whose
// Key is a boxed int. Entries are compared by Value; the keys are
// presumably used to break ties (TODO confirm).
// NOTE(review): String.Compare (string, string) is a culture-sensitive
// comparison; confirm that this is intended rather than an ordinal one.
4388 class StringDictionaryValueComparer : IComparer
// Shared instance; the constructor is private.
4390 public static readonly StringDictionaryValueComparer Instance
4391 = new StringDictionaryValueComparer ();
4393 private StringDictionaryValueComparer ()
// Both arguments must be DictionaryEntry; the casts throw otherwise.
4397 public int Compare (object o1, object o2)
4399 DictionaryEntry e1 = (DictionaryEntry) o1;
4400 DictionaryEntry e2 = (DictionaryEntry) o2;
4401 int ret = String.Compare ((string) e1.Value, (string) e2.Value);
4404 int i1 = (int) e1.Key;
4405 int i2 = (int) e2.Key;
// IComparer over boxed char values that compares their sort keys from
// CollationElementTable. For each index shared by both characters, the
// Primary, Secondary, Thirtiary and Quarternary fields are compared in
// that order.
4410 class UCAComparer : IComparer
// Shared instance; the constructor is private.
4412 public static readonly UCAComparer Instance
4413 = new UCAComparer ();
4415 private UCAComparer ()
// Both arguments must be boxed char; the casts throw otherwise.
4419 public int Compare (object o1, object o2)
4421 char i1 = (char) o1;
4422 char i2 = (char) o2;
4424 int l1 = CollationElementTable.GetSortKeyCount (i1);
4425 int l2 = CollationElementTable.GetSortKeyCount (i2);
// l is the smaller of the two sort key counts.
4426 int l = l1 > l2 ? l2 : l1;
4428 for (int i = 0; i < l; i++) {
4429 SortKeyValue k1 = CollationElementTable.GetSortKey (i1, i);
4430 SortKeyValue k2 = CollationElementTable.GetSortKey (i2, i);
4431 int v = k1.Primary - k2.Primary;
4434 v = k1.Secondary - k2.Secondary;
4437 v = k1.Thirtiary - k2.Thirtiary;
4440 v = k1.Quarternary - k2.Quarternary;
// Members of the tailoring description: an ordered list of ITailoringMap
// items plus lcid, alias and frenchSort values exposed through the
// accessors below.
4453 ArrayList items = new ArrayList ();
4455 public Tailoring (int lcid)
4460 public Tailoring (int lcid, int alias)
4467 get { return lcid; }
4471 get { return alias; }
4474 public bool FrenchSort {
4475 get { return frenchSort; }
4476 set { frenchSort = value; }
// Appends a DiacriticalMap (target, replace) to items.
4479 public void AddDiacriticalMap (byte target, byte replace)
4481 items.Add (new DiacriticalMap (target, replace));
// Appends a SortKeyMap (source, sortkey) to items.
4484 public void AddSortKeyMap (string source, byte [] sortkey)
4486 items.Add (new SortKeyMap (source, sortkey));
// Appends a ReplacementMap (source, replace) to items.
4489 public void AddReplacementMap (string source, string replace)
4491 items.Add (new ReplacementMap (source, replace));
// Concatenates ToCharArray () of every item, in insertion order, into a
// single char array.
4494 public char [] ItemToCharArray ()
4496 ArrayList al = new ArrayList ();
4497 foreach (ITailoringMap m in items)
4498 al.AddRange (m.ToCharArray ());
4499 return al.ToArray (typeof (char)) as char [];
// Common interface of tailoring items; each item serializes itself into
// a char array, which Tailoring.ItemToCharArray concatenates.
4502 interface ITailoringMap
4504 char [] ToCharArray ();
// Tailoring item holding a Target byte and a Replace byte.
4507 class DiacriticalMap : ITailoringMap
4509 public readonly byte Target;
4510 public readonly byte Replace;
4512 public DiacriticalMap (byte target, byte replace)
// Serialized form is three chars: kind marker 2, Target, Replace.
4518 public char [] ToCharArray ()
4520 char [] ret = new char [3];
4521 ret [0] = (char) 02; // kind:DiacriticalMap
4522 ret [1] = (char) Target;
4523 ret [2] = (char) Replace;
// Tailoring item holding a Source string and a SortKey byte array.
4528 class SortKeyMap : ITailoringMap
4530 public readonly string Source;
4531 public readonly byte [] SortKey;
4533 public SortKeyMap (string source, byte [] sortkey)
// Serialized layout (array length Source.Length + 7):
//   [0]                      kind marker 1
//   [1 .. Source.Length]     the Source characters
//   [Source.Length + 2 .. Source.Length + 5]  SortKey [0..3]
// Indexes Source.Length + 1 and Source.Length + 6 are not written by the
// statements shown here -- presumably a separator and further data
// (TODO confirm). SortKey must have at least four elements.
4539 public char [] ToCharArray ()
4541 char [] ret = new char [Source.Length + 7];
4542 ret [0] = (char) 01; // kind:SortKeyMap
4543 for (int i = 0; i < Source.Length; i++)
4544 ret [i + 1] = Source [i];
4546 for (int i = 0; i < 4; i++)
4547 ret [i + Source.Length + 2] = (char) SortKey [i];
// Tailoring item holding a Source string and its Replace string.
4552 class ReplacementMap : ITailoringMap
4554 public readonly string Source;
4555 public readonly string Replace;
4557 public ReplacementMap (string source, string replace)
// Serialized form: kind marker 3, then the Source characters, then the
// Replace characters, written through a running index pos. The array
// has Source.Length + Replace.Length + 3 elements, i.e. two more than
// the marker and both strings need -- presumably separators/terminators
// (TODO confirm).
4563 public char [] ToCharArray ()
4565 char [] ret = new char [Source.Length + Replace.Length + 3];
4566 ret [0] = (char) 03; // kind:ReplaceMap
4568 for (int i = 0; i < Source.Length; i++)
4569 ret [pos++] = Source [i];
4572 for (int i = 0; i < Replace.Length; i++)
4573 ret [pos++] = Replace [i];