3 // There are two kinds of sort keys: those which are computed and those which are laid out
4 // as an indexed array. Computed sort keys are:
9 // Also, for composite characters it should prepare a different index table.
11 // Though it is possible to "compute" level 3 weights, they are still dumped
12 // to an array to avoid execution cost.
16 // * sortkey getter signature
18 // int GetSortKey (string s, int index, SortKeyBuffer buf)
19 // Stores sort key for corresponding character element into buf and
20 // returns the length of the consumed _source_ character element in s.
22 // * character length to consume
24 // If there are characters whose primary weight is 0, they are consumed
25 // and considered as a part of the character element.
31 using System.Collections;
32 using System.Globalization;
36 using UUtil = Mono.Globalization.Unicode.MSCompatUnicodeTableUtil;
38 namespace Mono.Globalization.Unicode
40 internal class MSCompatSortKeyTableGenerator
// Program entry point: creates a generator instance and hands the
// command-line arguments to Run ().
42 public static void Main (string [] args)
44 new MSCompatSortKeyTableGenerator ().Run (args);
// Numeric codes for the decomposition kind of a character.
// ProcessUnidataLine () stores these values into decompType [cp], and the
// serialization code switches on them (Narrow/Wide/Super/Sub).
// NOTE(review): the meaning of the "fixed" remarks is not shown in this
// file section -- presumably those values must not be renumbered; confirm.
// Note that Full (0xE) and Narrow (0xD) are declared out of numeric order.
47 const int DecompositionWide = 1; // fixed
48 const int DecompositionSub = 2; // fixed
49 const int DecompositionSmall = 3;
50 const int DecompositionIsolated = 4;
51 const int DecompositionInitial = 5;
52 const int DecompositionFinal = 6;
53 const int DecompositionMedial = 7;
54 const int DecompositionNoBreak = 8;
55 const int DecompositionVertical = 9;
56 const int DecompositionFraction = 0xA;
57 const int DecompositionFont = 0xB;
58 const int DecompositionSuper = 0xC; // fixed
59 const int DecompositionFull = 0xE;
60 const int DecompositionNarrow = 0xD;
61 const int DecompositionCircle = 0xF;
62 const int DecompositionSquare = 0x10;
63 const int DecompositionCompat = 0x11;
64 const int DecompositionCanonical = 0x12;
// Output sinks. CSResult receives the generated C# source (standard output).
// CResult receives the generated C declarations; it starts as a null writer
// and Run () replaces it with a StreamWriter on "collation-tables.h".
66 TextWriter CSResult = Console.Out;
67 TextWriter CResult = TextWriter.Null;
69 byte [] fillIndex = new byte [256]; // by category
// One entry per UTF-16 code unit. The serialization code reads the
// Category, Level1 and Level2 members of each entry.
70 CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1];
// NOTE(review): the consumer of this list is not visible in this section;
// presumably these characters get special "ignore" treatment -- confirm.
72 char [] specialIgnore = new char [] {
73 '\u3099', '\u309A', '\u309B', '\u309C', '\u0BCD',
74 '\u0E47', '\u0E4C', '\uFF9E', '\uFF9F'
77 // FIXME: need more love (as always)
// Letters with explicitly listed weights. alphabets and alphaWeights both
// hold 29 entries; presumably they are index-aligned -- confirm against
// the code that consumes them.
78 char [] alphabets = new char [] {'A', 'B', 'C', 'D', 'E', 'F',
79 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
80 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
81 '\u0292', '\u01BE', '\u0298'};
82 byte [] alphaWeights = new byte [] {
83 2, 9, 0xA, 0x1A, 0x21,
84 0x23, 0x25, 0x2C, 0x32, 0x35,
85 0x36, 0x48, 0x51, 0x70, 0x7C,
86 0x7E, 0x89, 0x8A, 0x91, 0x99,
87 0x9F, 0xA2, 0xA4, 0xA6, 0xA7,
88 0xA9, 0xAA, 0xB3, 0xB4};
// Per-code-point flags. isSmallCapital [cp] is set by ProcessUnidataLine ()
// when the UnicodeData line contains "SMALL CAPITAL"; the code that fills
// isUppercase is not visible in this section.
90 bool [] isSmallCapital = new bool [char.MaxValue + 1];
91 bool [] isUppercase = new bool [char.MaxValue + 1];
// Per-code-point decomposition data: decompType holds one of the
// Decomposition* constants; decompIndex is used as an index into
// decompValues by the serialization code.
93 byte [] decompType = new byte [char.MaxValue + 1];
94 int [] decompIndex = new int [char.MaxValue + 1];
95 int [] decompLength = new int [char.MaxValue + 1];
97 decimal [] decimalValue = new decimal [char.MaxValue + 1];
// Per-code-point diacritical weight, assigned in ProcessUnidataLine ()
// from the character name (see diacritics / diacriticWeights).
99 byte [] diacritical = new byte [char.MaxValue + 1];
// Character-name fragments that identify a diacritic (or a circled /
// parenthesized form). ProcessUnidataLine () searches each UnicodeData line
// for these fragments, walking the array from the last entry to the first,
// and adds the weight at the same index in diacriticWeights to
// diacritical [cp]. The two arrays must have the same length; that is
// checked at run time and an exception is thrown on mismatch.
101 string [] diacritics = new string [] {
102 // LATIN, CYRILLIC etc.
103 "VERTICAL LINE ABOVE", "UPTURN", "DOUBLE-STRUCK",
105 "MIDDLE HOOK", "WITH VERTICAL LINE ABOVE;", "WITH TONOS",
106 "WITH ACUTE ACCENT;", "WITH GRAVE ACCENT;",
107 "WITH ACUTE;", "WITH GRAVE;",
109 "WITH DOT ABOVE;", " MIDDLE DOT;",
110 "WITH CIRCUMFLEX ACCENT;", "WITH CIRCUMFLEX;",
112 "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;",
113 "DIALYTIKA TONOS", "DIALYTIKA AND TONOS",
114 "ABKHASIAN CHE WITH DESCENDER",
115 "WITH MACRON;", "WITH TILDE;", "WITH RING ABOVE;",
116 "WITH OGONEK;", "WITH CEDILLA;",
118 " DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;",
119 "WITH STROKE;", " CIRCUMFLEX AND ACUTE;",
121 " DIAERESIS AND ACUTE;", "WITH CIRCUMFLEX AND GRAVE;", " L SLASH;",
122 " DIAERESIS AND GRAVE;",
124 " CARON AND DOT ABOVE;", " BREVE AND GRAVE;",
125 " MACRON AND ACUTE;",
126 " MACRON AND GRAVE;",
128 " DIAERESIS AND CARON", " DOT ABOVE AND MACRON", " TILDE AND ACUTE",
129 " RING ABOVE AND ACUTE",
130 " DIAERESIS AND MACRON", " CEDILLA AND ACUTE", " MACRON AND DIAERESIS",
131 " CIRCUMFLEX AND TILDE",
132 " TILDE AND DIAERESIS",
135 " CEDILLA AND BREVE",
136 " OGONEK AND MACRON",
138 "WITH OVERLINE", "DOUBLE VERTICAL LINE ABOVE",
139 "WITH HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;",
143 " PRECEDED BY APOSTROPHE",
145 " LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE",
148 " RETROFLEX;", "DIAERESIS BELOW", "RETROFLEX HOOK",
149 " RING BELOW", "LOW VERTICAL LINE",
151 " CIRCUMFLEX BELOW", "HORN AND ACUTE",
152 " BREVE BELOW;", " HORN AND GRAVE",
156 " DOT BELOW AND DOT ABOVE",
157 " RIGHT HALF RING", " HORN AND TILDE",
158 " CIRCUMFLEX AND DOT BELOW",
159 " BREVE AND DOT BELOW",
160 " DOT BELOW AND MACRON",
162 " HORN AND HOOK ABOVE",
164 // CIRCLED, PARENTHESIZED and so on
165 "CIRCLED DIGIT", "CIRCLED NUMBER", "CIRCLED LATIN",
166 "CIRCLED KATAKANA", "CIRCLED SANS-SERIF",
167 "PARENTHESIZED DIGIT", "PARENTHESIZED NUMBER", "PARENTHESIZED LATIN",
// Weights paired by index with the entries of diacritics above.
169 byte [] diacriticWeights = new byte [] {
175 0x10, 0x11, 0x12, 0x12, 0x13, 0x13, 0x14, 0x15, 0x16,
176 0x16, 0x17, 0x17, 0x19, 0x1A, 0x1B, 0x1C,
178 0x1D, 0x1D, 0x1E, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F,
179 0x20, 0x21, 0x22, 0x22, 0x23, 0x24,
181 0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28,
182 0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30,
184 0x40, 0x41, 0x43, 0x43, 0x43, 0x44, 0x46, 0x47, 0x48,
185 0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x59,
188 0x60, 0x60, 0x61, 0x61, 0x62, 0x63, 0x68, 0x68,
189 0x69, 0x69, 0x6A, 0x6D, 0x6E,
191 // CIRCLED, PARENTHESIZED and so on.
192 0xEE, 0xEE, 0xEE, 0xEE, 0xEE,
// NOTE(review): the consumer of these bounds is not visible in this section.
// The values look like code-point boundaries listed in ascending order,
// presumably taken in pairs -- confirm against the code that reads them.
196 int [] numberSecondaryWeightBounds = new int [] {
197 0x660, 0x680, 0x6F0, 0x700, 0x960, 0x970,
198 0x9E0, 0x9F0, 0x9F4, 0xA00, 0xA60, 0xA70,
199 0xAE0, 0xAF0, 0xB60, 0xB70, 0xBE0, 0xC00,
200 0xC60, 0xC70, 0xCE0, 0xCF0, 0xD60, 0xD70,
201 0xE50, 0xE60, 0xED0, 0xEE0
// Script-specific character orderings; they are not initialized here and
// the code that fills them is outside this section.
204 char [] orderedGurmukhi;
205 char [] orderedGujarati;
206 char [] orderedGeorgian;
207 char [] orderedThaana;
209 static readonly char [] orderedTamilConsonants = new char [] {
210 // based on traditional Tamil consonants, except for
211 // Grantha (where Microsoft breaks traditionalism).
212 // http://www.angelfire.com/empire/thamizh/padanGaL
213 '\u0B95', '\u0B99', '\u0B9A', '\u0B9E', '\u0B9F',
214 '\u0BA3', '\u0BA4', '\u0BA8', '\u0BAA', '\u0BAE',
215 '\u0BAF', '\u0BB0', '\u0BB2', '\u0BB5', '\u0BB4',
216 '\u0BB3', '\u0BB1', '\u0BA9', '\u0B9C', '\u0BB8',
// Collections filled while parsing UnicodeData.txt. The entries added in
// ProcessUnidataLine () are DictionaryEntry (cp, value) pairs.
219 // cp -> character name (only for some characters)
220 ArrayList sortableCharNames = new ArrayList ();
222 // cp -> arrow value (int)
223 ArrayList arrowValues = new ArrayList ();
225 // cp -> box value (int)
226 ArrayList boxValues = new ArrayList ();
228 // cp -> level1 value
229 Hashtable arabicLetterPrimaryValues = new Hashtable ();
// Arabic primary letter name -> cp of the character that first used that
// name (ProcessUnidataLine () stores cp the first time a name is seen).
232 Hashtable arabicNameMap = new Hashtable ();
234 // cp -> Hashtable [decompType] -> cp
235 Hashtable nfkdMap = new Hashtable ();
237 // Latin letter -> ArrayList [int]
238 Hashtable latinMap = new Hashtable ();
// nonJisJapanese receives a NonJISCharacter for each code point in
// 0x3300..0x3357 for which ExistsJIS () is false; the code that fills
// jisJapanese is not visible in this section.
240 ArrayList jisJapanese = new ArrayList ();
241 ArrayList nonJisJapanese = new ArrayList ();
// Per-language CJK weight tables, allocated for the full UTF-16 range and
// later compressed with CompressArray () before being serialized.
243 ushort [] cjkJA = new ushort [char.MaxValue +1];// - 0x4E00];
244 ushort [] cjkCHS = new ushort [char.MaxValue +1];// - 0x3100];
245 ushort [] cjkCHT = new ushort [char.MaxValue +1];// - 0x4E00];
246 ushort [] cjkKO = new ushort [char.MaxValue +1];// - 0x4E00];
247 byte [] cjkKOlv2 = new byte [char.MaxValue +1];// - 0x4E00];
249 byte [] ignorableFlags = new byte [char.MaxValue + 1];
// Unicode version ("age") per code point, populated by ParseDerivedAge ().
251 static double [] unicodeAge = new double [char.MaxValue + 1];
// Tailoring objects; SerializeTailorings () enumerates them.
253 ArrayList tailorings = new ArrayList ();
// Drives the whole generation: parses the source data files, post-processes
// the parsed values, opens the C header output and finally writes a
// diagnostic log ("agelog.txt") listing age / ignorability per code point.
// args [0], when given, is the directory holding the downloaded data files;
// otherwise "downloaded" is used.
// NOTE(review): several statements of this method are not visible in this
// section (e.g. the generation / serialization calls between the progress
// messages, and whatever closes the writers) -- confirm in the full file.
255 void Run (string [] args)
257 string dirname = args.Length == 0 ? "downloaded" : args [0];
258 ParseSources (dirname);
259 Console.Error.WriteLine ("parse done.");
261 ModifyParsedValues ();
263 Console.Error.WriteLine ("generation done.");
// From here on the C declarations go to this file instead of a null writer.
264 CResult = new StreamWriter ("collation-tables.h", false);
267 Console.Error.WriteLine ("serialization done.");
269 StreamWriter sw = new StreamWriter ("agelog.txt");
// NOTE(review): the bound is "< char.MaxValue", so U+FFFF itself is never
// logged -- confirm whether that is intended.
270 for (int i = 0; i < char.MaxValue; i++) {
// shouldBe: expected ignorability derived from the runtime's own Unicode
// category (Format and unassigned code points are expected to be ignorable).
271 bool shouldBe = false;
272 switch (Char.GetUnicodeCategory ((char) i)) {
273 case UnicodeCategory.Format: case UnicodeCategory.OtherNotAssigned:
274 shouldBe = true; break;
276 if (unicodeAge [i] >= 3.1)
278 //if (IsIgnorable (i) != shouldBe)
// The last column is '!' when IsIgnorable (i) disagrees with shouldBe.
279 sw.WriteLine ("{1} {2} {3} {0:X04} {4} {5}", i, unicodeAge [i], IsIgnorable (i), IsIgnorableSymbol (i), char.GetUnicodeCategory ((char) i), IsIgnorable (i) != shouldBe ? '!' : ' ');
// Typed convenience wrapper: forwards to CodePointIndexer.CompressArray ()
// with typeof (byte) and casts the result back to byte [].
285 byte [] CompressArray (byte [] source, CodePointIndexer i)
287 return (byte []) CodePointIndexer.CompressArray (
288 source, typeof (byte), i);
// Typed convenience wrapper: forwards to CodePointIndexer.CompressArray ()
// with typeof (ushort) and casts the result back to ushort [].
291 ushort [] CompressArray (ushort [] source, CodePointIndexer i)
293 return (ushort []) CodePointIndexer.CompressArray (
294 source, typeof (ushort), i);
297 void WriteByte (byte value)
305 SerializeTailorings ();
307 byte [] categories = new byte [map.Length];
308 byte [] level1 = new byte [map.Length];
309 byte [] level2 = new byte [map.Length];
310 byte [] level3 = new byte [map.Length];
311 // widthCompat is now removed from the mapping table.
312 // If it turned out that it is still required, grep this source and uncomment
313 // widthCompat related lines. FIXME: remove those lines in the future.
314 // ushort [] widthCompat = new ushort [map.Length];
315 for (int i = 0; i < map.Length; i++) {
316 categories [i] = map [i].Category;
317 level1 [i] = map [i].Level1;
318 level2 [i] = map [i].Level2;
319 level3 [i] = ComputeLevel3Weight ((char) i);
321 // For Japanese Half-width characters, don't
322 // map widthCompat. It is IgnoreKanaType that
323 // handles those width differences.
324 if (0xFF6D <= i && i <= 0xFF9D)
326 switch (decompType [i]) {
327 case DecompositionNarrow:
328 case DecompositionWide:
329 case DecompositionSuper:
330 case DecompositionSub:
331 // they are always 1 char
332 widthCompat [i] = (ushort) decompValues [decompIndex [i]];
339 ignorableFlags = CompressArray (ignorableFlags,
341 categories = CompressArray (categories, UUtil.Category);
342 level1 = CompressArray (level1, UUtil.Level1);
343 level2 = CompressArray (level2, UUtil.Level2);
344 level3 = CompressArray (level3, UUtil.Level3);
345 // widthCompat = (ushort []) CodePointIndexer.CompressArray (
346 // widthCompat, typeof (ushort), UUtil.WidthCompat);
347 cjkCHS = CompressArray (cjkCHS, UUtil.CjkCHS);
348 cjkCHT = CompressArray (cjkCHT,UUtil.Cjk);
349 cjkJA = CompressArray (cjkJA, UUtil.Cjk);
350 cjkKO = CompressArray (cjkKO, UUtil.Cjk);
351 cjkKOlv2 = CompressArray (cjkKOlv2, UUtil.Cjk);
354 CResult.WriteLine ("static const guint8* collation_table_ignorableFlags [] = {");
355 CSResult.WriteLine ("static readonly byte [] ignorableFlagsArr = new byte [] {");
357 MemoryStream ms = new MemoryStream ();
358 BinaryWriter binary = new BinaryWriter (ms);
359 binary.Write (UUtil.ResourceVersion);
360 binary.Write (ignorableFlags.Length);
362 for (int i = 0; i < ignorableFlags.Length; i++) {
363 byte value = ignorableFlags [i];
365 CSResult.Write ("{0},", value);
367 CSResult.Write ("0x{0:X02},", value);
368 CResult.Write ("{0},", value);
370 binary.Write (value);
372 if ((i & 0xF) == 0xF) {
373 CSResult.WriteLine ("// {0:X04}",
374 UUtil.Ignorable.ToCodePoint (i - 0xF));
375 CResult.WriteLine ();
378 CSResult.WriteLine ("};");
379 CSResult.WriteLine ();
382 CResult.WriteLine ("static const guint8* collation_table_category [] = {");
383 CSResult.WriteLine ("static readonly byte [] categoriesArr = new byte [] {");
385 binary.Write (categories.Length);
387 for (int i = 0; i < categories.Length; i++) {
388 byte value = categories [i];
390 CSResult.Write ("{0},", value);
392 CSResult.Write ("0x{0:X02},", value);
393 CResult.Write ("{0},", value);
395 binary.Write (value);
397 if ((i & 0xF) == 0xF) {
398 CSResult.WriteLine ("// {0:X04}",
399 UUtil.Category.ToCodePoint (i - 0xF));
400 CResult.WriteLine ();
403 CResult.WriteLine ("};");
404 CSResult.WriteLine ("};");
405 CSResult.WriteLine ();
407 // Primary weight value
408 CResult.WriteLine ("static const guint8* collation_table_level1 [] = {");
409 CSResult.WriteLine ("static readonly byte [] level1Arr = new byte [] {");
411 binary.Write (level1.Length);
413 for (int i = 0; i < level1.Length; i++) {
414 byte value = level1 [i];
416 CSResult.Write ("{0},", value);
418 CSResult.Write ("0x{0:X02},", value);
419 CResult.Write ("{0},", value);
421 binary.Write (value);
423 if ((i & 0xF) == 0xF) {
424 CSResult.WriteLine ("// {0:X04}",
425 UUtil.Level1.ToCodePoint (i - 0xF));
426 CResult.WriteLine ();
429 CResult.WriteLine ("0};");
430 CSResult.WriteLine ("};");
431 CSResult.WriteLine ();
434 CResult.WriteLine ("static const guint8* collation_table_level2 [] = {");
435 CSResult.WriteLine ("static readonly byte [] level2Arr = new byte [] {");
437 binary.Write (level2.Length);
439 for (int i = 0; i < level2.Length; i++) {
440 byte value = level2 [i];
442 CSResult.Write ("{0},", value);
444 CSResult.Write ("0x{0:X02},", value);
445 CResult.Write ("{0},", value);
447 binary.Write (value);
449 if ((i & 0xF) == 0xF) {
450 CSResult.WriteLine ("// {0:X04}",
451 UUtil.Level2.ToCodePoint (i - 0xF));
452 CResult.WriteLine ();
455 CResult.WriteLine ("0};");
456 CSResult.WriteLine ("};");
457 CSResult.WriteLine ();
460 CResult.WriteLine ("static const guint8* collation_table_level3 [] = {");
461 CSResult.WriteLine ("static readonly byte [] level3Arr = new byte [] {");
463 binary.Write (level3.Length);
465 for (int i = 0; i < level3.Length; i++) {
466 byte value = level3 [i];
468 CSResult.Write ("{0},", value);
470 CSResult.Write ("0x{0:X02},", value);
471 CResult.Write ("{0},", value);
473 binary.Write (value);
475 if ((i & 0xF) == 0xF) {
476 CSResult.WriteLine ("// {0:X04}",
477 UUtil.Level3.ToCodePoint (i - 0xF));
478 CResult.WriteLine ();
481 CResult.WriteLine ("0};");
482 CSResult.WriteLine ("};");
483 CSResult.WriteLine ();
486 // Width insensitivity mappings
487 // (for now it is more lightweight than dumping the
488 // entire NFKD table).
489 CResult.WriteLine ("static const guint16* widthCompat [] = {");
490 CSResult.WriteLine ("static readonly ushort [] widthCompatArr = new ushort [] {");
492 binary.Write (widthCompat.Length);
494 for (int i = 0; i < widthCompat.Length; i++) {
495 ushort value = widthCompat [i];
497 CSResult.Write ("{0},", value);
499 CSResult.Write ("0x{0:X02},", value);
500 CResult.Write ("{0},", value);
502 binary.Write (value);
504 if ((i & 0xF) == 0xF) {
505 CSResult.WriteLine ("// {0:X04}",
506 UUtil.WidthCompat.ToCodePoint (i - 0xF));
507 CResult.WriteLine ();
510 CResult.WriteLine ("0};");
511 CSResult.WriteLine ("};");
512 CSResult.WriteLine ();
516 using (FileStream fs = File.Create ("../collation.core.bin")) {
517 byte [] array = ms.ToArray ();
518 fs.Write (array, 0, array.Length);
523 SerializeCJK ("cjkCHS", cjkCHS, char.MaxValue);
524 SerializeCJK ("cjkCHT", cjkCHT, 0x9FB0);
525 SerializeCJK ("cjkJA", cjkJA, 0x9FB0);
526 SerializeCJK ("cjkKO", cjkKO, 0x9FB0);
527 SerializeCJK ("cjkKOlv2", cjkKOlv2, 0x9FB0);
// Writes one 16-bit CJK table in three forms: C declarations (CResult),
// C# source (CSResult) and a binary resource "../collation.<name>.bin".
// The table is emitted as bytes in two passes: first the high byte of every
// entry, then the low byte of every entry.
//   name       - table name used in the generated identifiers and file name.
//   cjk        - the (already compressed) table to dump.
//   max_unused - not read in the visible lines of this method.
// NOTE(review): both a decimal ("{0},") and a hex ("0x{0:X02},") write to
// CSResult appear for every value; the lines that select between them are
// not visible in this section -- confirm in the full file.
530 void SerializeCJK (string name, ushort [] cjk, int max_unused)
532 CResult.WriteLine ("static const int collation_table_collation_cjk_{0}_size [] = {1};", name, cjk.Length);
533 CSResult.WriteLine ("const int {0}ArrLength = {1};", name, cjk.Length);
535 CResult.WriteLine ("static const guint8* collation_table_collation_cjk_{0} [] = {{", name);
536 CSResult.WriteLine ("static byte [] {0}Arr = new byte [] {{", name);
// Binary layout: resource version, entry count, then the byte data.
538 MemoryStream ms = new MemoryStream ();
539 BinaryWriter binary = new BinaryWriter (ms);
540 binary.Write (UUtil.ResourceVersion);
541 binary.Write (cjk.Length); // the actual size is *2.
// Pass 1: high bytes.
544 for (int i = 0; i < cjk.Length; i++) {
547 byte value = (byte) (cjk [i] >> 8);
549 CSResult.Write ("{0},", value);
551 CSResult.Write ("0x{0:X02},", value);
552 CResult.Write ("{0},", value);
554 binary.Write (value);
// After every 16th value, end the row; the C# row gets the index of its
// first element as a trailing comment.
556 if ((i & 0xF) == 0xF) {
557 CSResult.WriteLine ("// {0:X04}", i - 0xF);
558 CResult.WriteLine ();
// Pass 2: low bytes.
563 for (int i = 0; i < cjk.Length; i++) {
566 byte value = (byte) (cjk [i] & 0xFF);
568 CSResult.Write ("{0},", value);
570 CSResult.Write ("0x{0:X02},", value);
571 CResult.Write ("{0},", value);
573 binary.Write (value);
575 if ((i & 0xF) == 0xF) {
576 CSResult.WriteLine ("// {0:X04}", i - 0xF);
577 CResult.WriteLine ();
581 CResult.WriteLine ("0};");
582 CSResult.WriteLine ("};");
583 CSResult.WriteLine ();
// Dump the accumulated binary image to the resource file.
585 using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) {
586 byte [] array = ms.ToArray ();
587 fs.Write (array, 0, array.Length);
// Writes one 8-bit CJK table in three forms: C declarations (CResult),
// C# source (CSResult) and a binary resource "../collation.<name>.bin".
//   name - table name used in the generated identifiers and file name.
//   cjk  - the (already compressed) table to dump.
//   max  - not read in the visible lines of this method.
// NOTE(review): in the visible lines only the resource version is written
// to the binary stream before the data (no length), unlike the ushort
// overload -- confirm in the full file whether a line is missing here.
592 void SerializeCJK (string name, byte [] cjk, int max)
594 CResult.WriteLine ("static const guint8* collation_table_collation_cjk_{0} [] = {{", name);
595 CSResult.WriteLine ("static byte [] {0}Arr = new byte [] {{", name);
597 MemoryStream ms = new MemoryStream ();
598 BinaryWriter binary = new BinaryWriter (ms);
599 binary.Write (UUtil.ResourceVersion);
601 for (int i = 0; i < cjk.Length; i++) {
604 byte value = cjk [i];
606 CSResult.Write ("{0},", value);
608 CSResult.Write ("0x{0:X02},", value);
609 CResult.Write ("{0},", value);
611 binary.Write (value);
// After every 16th value, end the row; the C# row gets the index of its
// first element as a trailing comment.
613 if ((i & 0xF) == 0xF) {
614 CSResult.WriteLine ("// {0:X04}", i - 0xF);
615 CResult.WriteLine ();
618 CResult.WriteLine ("0};");
619 CSResult.WriteLine ("};");
620 CSResult.WriteLine ();
// Dump the accumulated binary image to the resource file.
622 using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) {
623 byte [] array = ms.ToArray ();
624 fs.Write (array, 0, array.Length);
// Emits the culture-specific tailoring data in three forms: C declarations
// (CResult), C# source (CSResult) and "../collation.tailoring.bin".
// Pass 1 dumps the character data of every tailoring into one flat array and
// records, per LCID, the start index and the number of characters.
// Pass 2 emits one info record per tailoring (LCID, start index, count,
// French-sort flag); an alias tailoring reuses the index/count of its target.
// Throws Exception when a tailoring's target LCID has no recorded definition.
629 void SerializeTailorings ()
// LCID -> start index into the flat array / LCID -> number of characters.
631 Hashtable indexes = new Hashtable ();
632 Hashtable counts = new Hashtable ();
633 CResult.WriteLine ("static const guint16*collation_table_tailoring = {");
634 CSResult.WriteLine ("static char [] tailorings = new char [] {");
637 MemoryStream ms = new MemoryStream ();
638 BinaryWriter binary = new BinaryWriter (ms);
639 // Here we don't need to output resource version.
// "count" (declared on a line not visible in this section) is the running
// number of characters written so far, i.e. the next start index.
642 foreach (Tailoring t in tailorings) {
645 CResult.Write ("/*{0}*/", t.LCID);
646 CSResult.Write ("/*{0}*/", t.LCID);
647 indexes.Add (t.LCID, count);
648 char [] values = t.ItemToCharArray ();
649 counts.Add (t.LCID, values.Length);
650 foreach (char c in values) {
651 CSResult.Write ("'\\x{0:X}', ", (int) c);
652 CResult.Write ("{0},", (int) c);
// End the row after every 16th character.
653 if (++count % 16 == 0) {
654 CSResult.WriteLine (" // {0:X04}", count - 16);
655 CResult.WriteLine ();
658 binary.Write ((ushort) c);
662 CResult.WriteLine ("0};");
663 CSResult.WriteLine ("};");
665 CResult.WriteLine ("static const int collation_tailoring_count = {0};", tailorings.Count);
666 CResult.WriteLine ("static const int* collation_tailoring_infos = {");
667 CSResult.WriteLine ("static TailoringInfo [] tailoringInfos = new TailoringInfo [] {");
// Keep the raw character data aside and start a fresh stream: the final
// file holds the header and info records first, then the raw data.
669 byte [] rawdata = ms.ToArray ();
670 ms = new MemoryStream ();
671 binary = new BinaryWriter (ms);
672 binary.Write (UUtil.ResourceVersion);
673 binary.Write (tailorings.Count);
675 foreach (Tailoring t in tailorings) {
// A non-zero Alias redirects this LCID to another tailoring's data.
676 int target = t.Alias != 0 ? t.Alias : t.LCID;
677 if (!indexes.ContainsKey (target)) {
678 throw new Exception (String.Format ("WARNING: no corresponding definition for tailoring alias. From {0} to {1}", t.LCID, t.Alias));
681 int idx = (int) indexes [target];
682 int cnt = (int) counts [target];
683 bool french = t.FrenchSort;
// NOTE(review): this loop matches on t.LCID (not on target), so it looks
// like it re-reads the flag from t itself -- confirm whether the alias
// target was intended.
685 foreach (Tailoring t2 in tailorings)
686 if (t2.LCID == t.LCID)
687 french = t2.FrenchSort;
688 CSResult.WriteLine ("new TailoringInfo ({0}, 0x{1:X}, {2}, {3}), ", t.LCID, idx, cnt, french ? "true" : "false");
689 CResult.WriteLine ("{0},{1},{2},{3},", t.LCID, idx, cnt, french ? 1 : 0);
691 binary.Write (t.LCID);
694 binary.Write (french);
697 CResult.WriteLine ("0};");
698 CSResult.WriteLine ("};");
// Two 0xFF bytes are written between the info records and the raw data,
// followed by the number of 16-bit characters and the characters themselves.
700 binary.Write ((byte) 0xFF);
701 binary.Write ((byte) 0xFF);
702 binary.Write (rawdata.Length / 2);
703 binary.Write (rawdata, 0, rawdata.Length);
706 using (FileStream fs = File.Create ("../collation.tailoring.bin")) {
707 byte [] array = ms.ToArray ();
708 fs.Write (array, 0, array.Length);
// Builds the paths of all input data files under dirname and runs the
// individual parsers in the required order.
// NOTE(review): the declaration halves of several path variables are on
// lines not visible in this section.
715 void ParseSources (string dirname)
718 dirname + "/UnicodeData.txt";
719 string derivedCoreProps =
720 dirname + "/DerivedCoreProperties.txt";
722 dirname + "/Scripts.txt";
724 dirname + "/CP932.TXT";
726 dirname + "/DerivedAge.txt";
727 string chXML = dirname + "/common/collation/zh.xml";
728 string jaXML = dirname + "/common/collation/ja.xml";
729 string koXML = dirname + "/common/collation/ko.xml";
731 ParseDerivedAge (derivedAge);
// The JIS order must be parsed before UnicodeData.txt.
735 ParseJISOrder (cp932); // in prior to ParseUnidata()
736 ParseUnidata (unidata);
738 ParseDerivedCoreProperties (derivedCoreProps);
739 ParseScripts (scripts);
740 ParseCJK (chXML, jaXML, koXML);
// Unlike the files above, this path is not prefixed with dirname.
742 ParseTailorings ("mono-tailoring-source.txt");
// Reads the tailoring source file line by line and feeds each trimmed line
// to ProcessTailoringLine (). When an exception occurs, the current line
// number is reported on standard error.
// NOTE(review): the declarations of "t" and "line", and what happens after
// the error message (e.g. a rethrow), are not visible in this section.
745 void ParseTailorings (string filename)
749 using (StreamReader sr = new StreamReader (filename)) {
751 while (sr.Peek () >= 0) {
753 ProcessTailoringLine (ref t,
754 sr.ReadLine ().Trim ());
756 } catch (Exception) {
757 Console.Error.WriteLine ("ERROR at line {0}", line);
// Converts a tailoring source token into its string value, decoding
// backslash-u escapes followed by four hex digits.
// NOTE(review): the lines that append the decoded or plain characters to
// the builder are not visible in this section.
763 // For now this is enough.
764 string ParseTailoringSourceValue (string s)
766 StringBuilder sb = new StringBuilder ();
767 for (int i = 0; i < s.Length; i++) {
// i + 5 < s.Length guarantees that the four hex digits after the
// backslash and 'u' are inside the string.
768 if (i + 5 < s.Length &&
769 s [i] == '\\' && s [i + 1] == 'u') {
772 s.Substring (i + 2, 4),
773 NumberStyles.HexNumber),
780 return sb.ToString ();
// Interprets one line of the tailoring source file and updates the current
// Tailoring (t), or starts a new one. Line forms handled in the visible code:
//   - text after '#' is a comment and is dropped; empty lines are skipped;
//   - a header line creates a new Tailoring from one integer, or from two
//     integers separated by '=';
//   - "*FrenchSort";
//   - "*Diacritical XX -> YY" (hex bytes) adds a diacritical map;
//   - "source : b0 b1 b2 b3" (hex bytes) adds a sort key map;
//   - "source = replacement" adds a replacement map.
// NOTE(review): the conditions guarding several branches (and the returns
// between them) are on lines not visible in this section.
783 void ProcessTailoringLine (ref Tailoring t, string s)
// Strip a trailing comment.
785 int idx = s.IndexOf ('#');
787 s = s.Substring (0, idx).Trim ();
788 if (s.Length == 0 || s [0] == '#')
// Header line: Substring (1) skips the first character of the line; the
// test for that leading marker is not visible here.
791 idx = s.IndexOf ('=');
794 int.Parse (s.Substring (1, idx - 1)),
795 int.Parse (s.Substring (idx + 1)));
797 t = new Tailoring (int.Parse (s.Substring (1)));
801 if (s.StartsWith ("*FrenchSort")) {
805 string d = "*Diacritical";
806 if (s.StartsWith (d)) {
// Both sides of "->" are parsed as hexadecimal byte values.
807 idx = s.IndexOf ("->");
808 t.AddDiacriticalMap (
809 byte.Parse (s.Substring (d.Length, idx - d.Length).Trim (),
810 NumberStyles.HexNumber),
811 byte.Parse (s.Substring (idx + 2).Trim (),
812 NumberStyles.HexNumber));
// Sort key mapping: the text before ':' is the source, the text after it
// is split on spaces and up to four items are parsed as hex bytes.
815 idx = s.IndexOf (':');
817 string source = s.Substring (0, idx).Trim ();
818 string [] l = s.Substring (idx + 1).Trim ().Split (' ');
819 byte [] b = new byte [4];
820 for (int i = 0; i < 4; i++) {
824 b [i] = byte.Parse (l [i],
825 NumberStyles.HexNumber);
827 t.AddSortKeyMap (ParseTailoringSourceValue (source),
// Replacement mapping: both sides of '=' are decoded with
// ParseTailoringSourceValue ().
830 idx = s.IndexOf ('=');
832 t.AddReplacementMap (
833 ParseTailoringSourceValue (
834 s.Substring (0, idx).Trim ()),
835 ParseTailoringSourceValue (
836 s.Substring (idx + 1).Trim ()));
// Parses DerivedAge.txt. Each data line has the form
// "cp ; age" or "cpStart..cpEnd ; age", optionally followed by a '#' comment.
// Code points above char.MaxValue are tested for explicitly (the action
// taken is on a line not visible here).
// NOTE(review): the statement inside the final for loop is not visible in
// this section; presumably it stores v into unicodeAge [i] -- confirm.
839 void ParseDerivedAge (string filename)
841 using (StreamReader file =
842 new StreamReader (filename)) {
843 while (file.Peek () >= 0) {
844 string s = file.ReadLine ();
// Drop the trailing comment, then locate the field separator.
845 int idx = s.IndexOf ('#');
847 s = s.Substring (0, idx);
848 idx = s.IndexOf (';');
852 string cpspec = s.Substring (0, idx);
853 idx = cpspec.IndexOf ("..");
// AllowTrailingWhite: the code point field may be followed by blanks
// before the ';'.
854 NumberStyles nf = NumberStyles.HexNumber |
855 NumberStyles.AllowTrailingWhite;
856 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
857 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
858 string value = s.Substring (cpspec.Length + 1).Trim ();
861 if (cp > char.MaxValue)
// NOTE(review): double.Parse without a culture uses the current culture;
// a culture with ',' as decimal separator would misread "3.1" -- confirm.
864 double v = double.Parse (value);
865 for (int i = cp; i <= cpEnd; i++)
869 unicodeAge [0] = double.MaxValue; // never be supported
// Parses UnicodeData.txt line by line through ProcessUnidataLine (),
// collecting decomposition values in a temporary list that is finally
// stored in the decompValues field as an int array. When a line raises an
// exception, its 1-based line number is written to standard error.
// NOTE(review): what follows the error message (e.g. a rethrow) is not
// visible in this section.
872 void ParseUnidata (string filename)
874 ArrayList decompValues = new ArrayList ();
875 using (StreamReader unidata =
876 new StreamReader (filename)) {
877 for (int line = 1; unidata.Peek () >= 0; line++) {
879 ProcessUnidataLine (unidata.ReadLine (), decompValues);
880 } catch (Exception) {
881 Console.Error.WriteLine ("**** At line " + line);
// The local list shadows the field of the same name, hence "this.".
886 this.decompValues = (int [])
887 decompValues.ToArray (typeof (int));
// State carried across calls to ProcessUnidataLine ().
// previousLatinTarget: the most recent plain Latin letter seen; it is used
// as the target for following lines that yield no target of their own.
890 char previousLatinTarget = char.MinValue;
// diacriticalOffset: one counter per letter 'A'..'Z', incremented to derive
// distinct diacritical weights for variants mapped onto the same letter.
891 byte [] diacriticalOffset = new byte ['Z' - 'A' + 1];
893 void ProcessUnidataLine (string s, ArrayList decompValues)
895 int idx = s.IndexOf ('#');
897 s = s.Substring (0, idx);
898 idx = s.IndexOf (';');
901 int cp = int.Parse (s.Substring (0, idx), NumberStyles.HexNumber);
902 string [] values = s.Substring (idx + 1).Split (';');
905 if (cp > char.MaxValue)
907 if (IsIgnorable (cp))
910 string name = values [0];
912 // SPECIAL CASE: rename some characters for diacritical
913 // remapping. FIXME: why are they different?
914 // FIXME: it's still not working.
915 if (cp == 0x018B || cp == 0x018C)
916 name = name.Replace ("TOPBAR", "STROKE");
919 if (s.IndexOf ("SMALL CAPITAL") > 0)
920 isSmallCapital [cp] = true;
922 // latin mapping by character name
923 if (s.IndexOf ("LATIN") >= 0) {
924 int lidx = s.IndexOf ("LETTER DOTLESS ");
925 int offset = lidx + 15;
927 lidx = s.IndexOf ("LETTER TURNED ");
931 lidx = s.IndexOf ("LETTER CAPITAL ");
935 lidx = s.IndexOf ("LETTER SCRIPT ");
939 lidx = s.IndexOf ("LETTER ");
942 char c = lidx > 0 ? s [offset] : char.MinValue;
943 char n = s [offset + 1];
944 char target = char.MinValue;
945 if ('A' <= c && c <= 'Z' &&
946 (n == ' ') || n == ';') {
948 // FIXME: After 'Z', I cannot reset this state.
949 previousLatinTarget = c == 'Z' ? char.MinValue : c;
952 if (s.Substring (offset).StartsWith ("ALPHA"))
954 else if (s.Substring (offset).StartsWith ("TONE SIX"))
956 else if (s.Substring (offset).StartsWith ("OPEN O"))
958 else if (s.Substring (offset).StartsWith ("ETH"))
960 else if (s.Substring (offset).StartsWith ("SCHWA"))
962 else if (s.Substring (offset).StartsWith ("OI;")) // 01A2,01A3
964 else if (s.Substring (offset).StartsWith ("YR;")) // 01A2,01A3
966 else if (s.Substring (offset).StartsWith ("TONE TWO"))
968 else if (s.Substring (offset).StartsWith ("ESH"))
970 else if (s.Substring (offset).StartsWith ("OUNCE"))
973 // For remaining IPA chars, direct mapping is
976 case 0x0166: case 0x0167:
977 // Though they are 'T', they have different weight
978 target = char.MinValue; break;
979 case 0x0299: target = 'B'; break;
980 case 0x029A: target = 'E'; break;
981 case 0x029B: target = 'G'; break;
982 case 0x029C: target = 'H'; break;
983 case 0x029D: target = 'J'; break;
984 case 0x029E: target = 'K'; break;
985 case 0x029F: target = 'L'; break;
986 case 0x02A0: target = 'Q'; break;
987 case 0x02A7: target = 'T'; break;
988 case 0x02A8: target = 'T'; break;
991 if (target == char.MinValue)
992 target = previousLatinTarget;
994 if (target != char.MinValue) {
995 ArrayList entry = (ArrayList) latinMap [target];
997 entry = new ArrayList ();
998 latinMap [target] = entry;
1001 // FIXME: This secondary weight is hack.
1002 // They are here because they must not
1003 // be identical to the corresponding
1005 if (c != target && diacritical [cp] == 0) {
1006 diacriticalOffset [c - 'A']++;
1007 diacritical [cp] = (byte) (diacriticalOffset [c - 'A'] + 0x7C);
1013 if (0x2000 <= cp && cp < 0x3000) {
1015 // SPECIAL CASES. FIXME: why?
1017 case 0x21C5: value = -1; break; // E2
1018 case 0x261D: value = 1; break;
1019 case 0x27A6: value = 3; break;
1020 case 0x21B0: value = 7; break;
1021 case 0x21B1: value = 3; break;
1022 case 0x21B2: value = 7; break;
1023 case 0x21B4: value = 5; break;
1024 case 0x21B5: value = 7; break;
1025 case 0x21B9: value = -1; break; // E1
1026 case 0x21CF: value = 7; break;
1027 case 0x21D0: value = 3; break;
1029 string [] arrowTargets = new string [] {
1042 if (s.IndexOf ("RIGHTWARDS") >= 0 &&
1043 s.IndexOf ("LEFTWARDS") >= 0)
1044 value = 0xE1 - 0xD8;
1045 else if (s.IndexOf ("UPWARDS") >= 0 &&
1046 s.IndexOf ("DOWNWARDS") >= 0)
1047 value = 0xE2 - 0xD8;
1048 else if (s.IndexOf ("ARROW") >= 0 &&
1049 s.IndexOf ("COMBINING") < 0 &&
1050 s.IndexOf ("CLOCKWISE") >= 0)
1051 value = s.IndexOf ("ANTICLOCKWISE") >= 0 ? 0xE4 - 0xD8 : 0xE3 - 0xD8;
1053 for (int i = 1; value == 0 && i < arrowTargets.Length; i++)
1054 if (s.IndexOf (arrowTargets [i]) > 0 &&
1055 s.IndexOf ("BARB " + arrowTargets [i]) < 0 &&
1056 s.IndexOf (" OVER") < 0
1060 arrowValues.Add (new DictionaryEntry (
1065 if (0x2500 <= cp && cp < 0x2600) {
1066 int value = int.MinValue;
1068 // up:1 down:2 right:4 left:8 vert:16 horiz:32
1071 // [dr] [dl] [ur] [ul]
1072 // [vr,udr] [vl,vdl]
1073 // [hd,rld] [hu,rlu]
1074 // [hv,udrl,rlv,udh]
1075 ArrayList flags = new ArrayList (new int [] {
1078 4 + 2, 8 + 2, 4 + 1, 8 + 1,
1079 16 + 4, 1 + 2 + 4, 16 + 8, 1 + 2 + 8,
1080 32 + 2, 4 + 8 + 2, 32 + 1, 4 + 8 + 1,
1081 16 + 32, 1 + 2 + 4 + 8, 4 + 8 + 16, 1 + 2 + 32
1083 byte [] offsets = new byte [] {
1090 if (s.IndexOf ("BOX DRAWINGS ") >= 0) {
1092 if (s.IndexOf (" UP") >= 0)
1094 if (s.IndexOf (" DOWN") >= 0)
1096 if (s.IndexOf (" RIGHT") >= 0)
1098 if (s.IndexOf (" LEFT") >= 0)
1100 if (s.IndexOf (" VERTICAL") >= 0)
1102 if (s.IndexOf (" HORIZONTAL") >= 0)
1105 int fidx = flags.IndexOf (flag);
1107 value = offsets [fidx];
1108 } else if (s.IndexOf ("BLOCK") >= 0) {
1109 if (s.IndexOf ("ONE EIGHTH") >= 0)
1111 else if (s.IndexOf ("ONE QUARTER") >= 0)
1113 else if (s.IndexOf ("THREE EIGHTHS") >= 0)
1115 else if (s.IndexOf ("HALF") >= 0)
1117 else if (s.IndexOf ("FIVE EIGHTHS") >= 0)
1119 else if (s.IndexOf ("THREE QUARTERS") >= 0)
1121 else if (s.IndexOf ("SEVEN EIGHTHS") >= 0)
1126 else if (s.IndexOf ("SHADE") >= 0)
1128 else if (s.IndexOf ("SQUARE") >= 0)
1129 value = 0xBC - 0xE5;
1130 else if (s.IndexOf ("VERTICAL RECTANGLE") >= 0)
1131 value = 0xBE - 0xE5;
1132 else if (s.IndexOf ("RECTANGLE") >= 0)
1133 value = 0xBD - 0xE5;
1134 else if (s.IndexOf ("PARALLELOGRAM") >= 0)
1135 value = 0xBF - 0xE5;
1136 else if (s.IndexOf ("TRIANGLE") >= 0) {
1137 if (s.IndexOf ("UP-POINTING") >= 0)
1138 value = 0xC0 - 0xE5;
1139 else if (s.IndexOf ("RIGHT-POINTING") >= 0)
1140 value = 0xC1 - 0xE5;
1141 else if (s.IndexOf ("DOWN-POINTING") >= 0)
1142 value = 0xC2 - 0xE5;
1143 else if (s.IndexOf ("LEFT-POINTING") >= 0)
1144 value = 0xC3 - 0xE5;
1146 else if (s.IndexOf ("POINTER") >= 0) {
1147 if (s.IndexOf ("RIGHT-POINTING") >= 0)
1148 value = 0xC4 - 0xE5;
1149 else if (s.IndexOf ("LEFT-POINTING") >= 0)
1150 value = 0xC5 - 0xE5;
1152 else if (s.IndexOf ("DIAMOND") >= 0)
1153 value = 0xC6 - 0xE5;
1154 else if (s.IndexOf ("FISHEYE") >= 0)
1155 value = 0xC7 - 0xE5;
1156 else if (s.IndexOf ("LOZENGE") >= 0)
1157 value = 0xC8 - 0xE5;
1158 else if (s.IndexOf ("BULLSEYE") >= 0)
1159 value = 0xC9 - 0xE5;
1160 else if (s.IndexOf ("CIRCLE") >= 0) {
1161 if (cp == 0x25D6) // it could be IndexOf ("LEFT HALF BLACK CIRCLE")
1162 value = 0xCA - 0xE5;
1163 else if (cp == 0x25D7) // it could be IndexOf ("RIGHT HALF BLACK CIRCLE")
1164 value = 0xCB - 0xE5;
1166 value = 0xC9 - 0xE5;
1168 else if (s.IndexOf ("BULLET") >= 0)
1169 value = 0xCC - 0xE5;
1170 if (0x25DA <= cp && cp <= 0x25E5)
1171 value = 0xCD + cp - 0x25DA - 0xE5;
1173 // SPECIAL CASE: BOX DRAWING DIAGONAL patterns
1175 case 0x2571: value = 0xF; break;
1176 case 0x2572: value = 0x10; break;
1177 case 0x2573: value = 0x11; break;
1179 if (value != int.MinValue)
1180 boxValues.Add (new DictionaryEntry (
1184 // For some characters store the name and sort later
1185 // to determine sorting.
1186 if (0x2100 <= cp && cp <= 0x213F &&
1187 Char.IsSymbol ((char) cp))
1188 sortableCharNames.Add (
1189 new DictionaryEntry (cp, name));
1190 else if (0x3380 <= cp && cp <= 0x33DD)
1191 sortableCharNames.Add (new DictionaryEntry (
1192 cp, name.Substring (7)));
1194 if (Char.GetUnicodeCategory ((char) cp) ==
1195 UnicodeCategory.MathSymbol) {
1196 if (name.StartsWith ("CIRCLED "))
1197 diacritical [cp] = 0xEE;
1198 if (name.StartsWith ("SQUARED "))
1199 diacritical [cp] = 0xEF;
1202 // diacritical weights by character name
1203 if (diacritics.Length != diacriticWeights.Length)
1204 throw new Exception (String.Format ("Should not happen. weights are {0} while labels are {1}", diacriticWeights.Length, diacritics.Length));
1205 for (int d = diacritics.Length - 1; d >= 0; d--) {
1206 if (s.IndexOf (diacritics [d]) > 0) {
1207 diacritical [cp] += diacriticWeights [d];
1208 if (s.IndexOf ("COMBINING") >= 0)
1209 diacritical [cp] -= (byte) 2;
1212 // also process "COMBINING blah" here
1213 // For now it is limited to cp < 0x0370
1214 // if (cp < 0x0300 || cp >= 0x0370)
1216 string tmp = diacritics [d].TrimEnd (';');
1217 if (tmp.IndexOf ("WITH ") == 0)
1218 tmp = tmp.Substring (4);
1219 tmp = String.Concat ("COMBINING", (tmp [0] != ' ' ? " " : ""), tmp);
1221 diacritical [cp] = (byte) (diacriticWeights [d] - 2);
1225 //Console.Error.WriteLine ("======= {2:X04} : '{0}' / '{1}'", name, tmp, cp);
1227 // Two-step grep required for it.
1228 if (s.IndexOf ("FULL STOP") > 0 &&
1229 (s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0))
1230 diacritical [cp] |= 0xF4;
1231 if (s.StartsWith ("SCRIPT") || s.IndexOf (" SCRIPT ") > 0)
1232 diacritical [cp] = (byte) (s.IndexOf ("SMALL") > 0 ? 3 :
1233 s.IndexOf ("CAPITAL") > 0 ? 5 : 4);
1235 // Arabic letter name
1236 if (0x0621 <= cp && cp <= 0x064A &&
1237 Char.GetUnicodeCategory ((char) cp)
1238 == UnicodeCategory.OtherLetter) {
1239 byte value = (byte) (arabicNameMap.Count * 4 + 0x0B);
1244 // hamza, waw, yeh ... special cases.
1249 value = 0x77; // special cases.
1252 // Get primary letter name i.e.
1253 // XXX part of ARABIC LETTER XXX yyy
1254 // e.g. that of "TEH MARBUTA" is "TEH".
1257 // 0x0640 is special: it does
1258 // not start with ARABIC LETTER
1260 name.Substring (14);
1261 int tmpIdx = letterName.IndexOf (' ');
1262 letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
1263 //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
1264 if (arabicNameMap.ContainsKey (letterName))
1265 value = (byte) arabicLetterPrimaryValues [arabicNameMap [letterName]];
1267 arabicNameMap [letterName] = cp;
1270 arabicLetterPrimaryValues [cp] = value;
1273 // Japanese square letter
1274 if (0x3300 <= cp && cp <= 0x3357)
1275 if (!ExistsJIS (cp))
1276 nonJisJapanese.Add (new NonJISCharacter (cp, name));
1278 // normalizationType
1279 string decomp = values [4];
1280 idx = decomp.IndexOf ('<');
1282 switch (decomp.Substring (idx + 1, decomp.IndexOf ('>') - 1)) {
1284 decompType [cp] = DecompositionFull;
1287 decompType [cp] = DecompositionSub;
1290 decompType [cp] = DecompositionSuper;
1293 decompType [cp] = DecompositionSmall;
1296 decompType [cp] = DecompositionIsolated;
1299 decompType [cp] = DecompositionInitial;
1302 decompType [cp] = DecompositionFinal;
1305 decompType [cp] = DecompositionMedial;
1308 decompType [cp] = DecompositionNoBreak;
1311 decompType [cp] = DecompositionCompat;
1314 decompType [cp] = DecompositionFraction;
1317 decompType [cp] = DecompositionFont;
1320 decompType [cp] = DecompositionCircle;
1323 decompType [cp] = DecompositionSquare;
1326 decompType [cp] = DecompositionWide;
1329 decompType [cp] = DecompositionNarrow;
1332 decompType [cp] = DecompositionVertical;
1335 throw new Exception ("Support NFKD type : " + decomp);
1339 decompType [cp] = DecompositionCanonical;
1340 decomp = idx < 0 ? decomp : decomp.Substring (decomp.IndexOf ('>') + 2);
1341 if (decomp.Length > 0) {
1343 string [] velems = decomp.Split (' ');
1344 int didx = decompValues.Count;
1345 decompIndex [cp] = didx;
1346 foreach (string v in velems)
1347 decompValues.Add (int.Parse (v, NumberStyles.HexNumber));
1348 decompLength [cp] = velems.Length;
1350 // [decmpType] -> this_cp
1351 int targetCP = (int) decompValues [didx];
1352 // for "(x)" it specially maps to 'x' .
1353 // FIXME: check if it is sane
1354 if (velems.Length == 3 &&
1355 (int) decompValues [didx] == '(' &&
1356 (int) decompValues [didx + 2] == ')')
1357 targetCP = (int) decompValues [didx + 1];
1358 // special: 0x215F "1/"
1359 else if (cp == 0x215F)
1361 else if (velems.Length > 1 &&
1362 (targetCP < 0x4C00 || 0x9FBB < targetCP))
1363 // skip them, except for CJK ideograph compat
1366 if (targetCP != 0) {
1367 Hashtable entry = (Hashtable) nfkdMap [targetCP];
1368 if (entry == null) {
1369 entry = new Hashtable ();
1370 nfkdMap [targetCP] = entry;
1372 entry [(byte) decompType [cp]] = cp;
1376 if (values [5].Length > 0)
1377 decimalValue [cp] = decimal.Parse (values [5]);
1378 else if (values [6].Length > 0)
1379 decimalValue [cp] = decimal.Parse (values [6]);
1380 else if (values [7].Length > 0) {
1381 string decstr = values [7];
1382 idx = decstr.IndexOf ('/');
1383 if (cp == 0x215F) // special. "1/"
1384 decimalValue [cp] = 0x1;
1388 decimal.Parse (decstr.Substring (0, idx))
1389 / decimal.Parse (decstr.Substring (idx + 1));
1390 else if (decstr [0] == '(' &&
1391 decstr [decstr.Length - 1] == ')')
1394 decimal.Parse (decstr.Substring (1, decstr.Length - 2));
1395 else if (decstr [decstr.Length - 1] == '.')
1398 decimal.Parse (decstr.Substring (0, decstr.Length - 1));
1400 decimalValue [cp] = decimal.Parse (decstr);
// Reads the file named by `filename` line by line and passes every line to
// ProcessDerivedCorePropLine. `line` is a 1-based line counter that is only
// used for the diagnostic written when processing a line throws.
1404 void ParseDerivedCoreProperties (string filename)
1407 using (StreamReader file =
1408 new StreamReader (filename)) {
// Peek () returns a negative value once the end of the stream is reached.
1409 for (int line = 1; file.Peek () >= 0; line++) {
1411 ProcessDerivedCorePropLine (file.ReadLine ());
// An exception raised while processing a line is reported on stderr
// together with the line number.
// NOTE(review): whether the exception is rethrown afterwards is not
// visible in this excerpt - confirm.
1412 } catch (Exception) {
1413 Console.Error.WriteLine ("**** At line " + line);
// Parses one line `s` of the derived-core-properties input. The visible
// code expects the shape "<cp>[..<cpEnd>] ; <value> # comment", where the
// code points are written in hexadecimal.
1420 void ProcessDerivedCorePropLine (string s)
// Everything from '#' onwards is cut off.
// NOTE(review): the guard for lines that contain no '#' is not visible here.
1422 int idx = s.IndexOf ('#');
1424 s = s.Substring (0, idx);
1425 idx = s.IndexOf (';');
// The text before ';' is either a single code point or a "start..end" range.
1428 string cpspec = s.Substring (0, idx);
1429 idx = cpspec.IndexOf ("..");
// Hexadecimal parsing that tolerates whitespace after the digits.
1430 NumberStyles nf = NumberStyles.HexNumber |
1431 NumberStyles.AllowTrailingWhite;
1432 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
// Without ".." the range collapses to the single code point `cp`.
1433 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
// The trimmed remainder after the ';' separator.
1434 string value = s.Substring (cpspec.Length + 1).Trim ();
// NOTE(review): the statement controlled by this test is not visible;
// presumably code points above U+FFFF are skipped - confirm.
1437 if (cp > char.MaxValue)
// Flags every code point of the inclusive range [cp, cpEnd] as uppercase.
// NOTE(review): the test on `value` that selects this loop is not visible.
1442 for (int x = cp; x <= cpEnd; x++)
1443 isUppercase [x] = true;
// Reads the scripts data file `filename` and collects the non-ignorable
// characters into four lists (gurmukhi, gujarati, georgian, thaana). Each
// list is sorted with UCAComparer.Instance and stored in the matching
// ordered* field as a char array.
1448 void ParseScripts (string filename)
1450 ArrayList gurmukhi = new ArrayList ();
1451 ArrayList gujarati = new ArrayList ();
1452 ArrayList georgian = new ArrayList ();
1453 ArrayList thaana = new ArrayList ();
1455 using (StreamReader file =
1456 new StreamReader (filename)) {
// Peek () returns a negative value once the end of the stream is reached.
1457 while (file.Peek () >= 0) {
1458 string s = file.ReadLine ();
// Everything from '#' onwards is cut off.
// NOTE(review): the guard for lines that contain no '#' is not visible here.
1459 int idx = s.IndexOf ('#');
1461 s = s.Substring (0, idx);
1462 idx = s.IndexOf (';');
// The text before ';' is either a single code point or a "start..end" range.
1466 string cpspec = s.Substring (0, idx);
1467 idx = cpspec.IndexOf ("..");
// Hexadecimal parsing that tolerates whitespace after the digits.
1468 NumberStyles nf = NumberStyles.HexNumber |
1469 NumberStyles.AllowTrailingWhite;
1470 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
// Without ".." the range collapses to the single code point `cp`.
1471 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
// The trimmed remainder after the ';' separator.
1472 string value = s.Substring (cpspec.Length + 1).Trim ();
// NOTE(review): the statement controlled by this test is not visible;
// presumably code points above U+FFFF are skipped - confirm.
1475 if (cp > char.MaxValue)
// The four loops below each add the inclusive range [cp, cpEnd], minus
// ignorable code points, to one list.
// NOTE(review): the dispatch on `value` that chooses the list is not
// visible; presumably it matches the script name - confirm.
1480 for (int x = cp; x <= cpEnd; x++)
1481 if (!IsIgnorable (x))
1482 gurmukhi.Add ((char) x);
1485 for (int x = cp; x <= cpEnd; x++)
1486 if (!IsIgnorable (x))
1487 gujarati.Add ((char) x);
1490 for (int x = cp; x <= cpEnd; x++)
1491 if (!IsIgnorable (x))
1492 georgian.Add ((char) x);
1495 for (int x = cp; x <= cpEnd; x++)
1496 if (!IsIgnorable (x))
1497 thaana.Add ((char) x);
// Order every list with the shared UCAComparer instance, then publish it
// as a char [] in the corresponding field.
1502 gurmukhi.Sort (UCAComparer.Instance);
1503 gujarati.Sort (UCAComparer.Instance);
1504 georgian.Sort (UCAComparer.Instance);
1505 thaana.Sort (UCAComparer.Instance);
1506 orderedGurmukhi = (char []) gurmukhi.ToArray (typeof (char));
1507 orderedGujarati = (char []) gujarati.ToArray (typeof (char));
1508 orderedGeorgian = (char []) georgian.ToArray (typeof (char));
1509 orderedThaana = (char []) thaana.ToArray (typeof (char));
// Reads the JIS ordering file `filename` line by line and passes every
// line to ProcessJISOrderLine.
// NOTE(review): the declaration and initial value of `line` are not
// visible in this excerpt.
1512 void ParseJISOrder (string filename)
1516 using (StreamReader file =
1517 new StreamReader (filename)) {
// Peek () returns a negative value once the end of the stream is reached;
// `line` advances once per processed line.
1518 for (;file.Peek () >= 0; line++)
1519 ProcessJISOrderLine (file.ReadLine ());
// On failure the current value of `line` is reported on stderr.
// NOTE(review): whether the exception is rethrown afterwards is not
// visible in this excerpt - confirm.
1521 } catch (Exception) {
1522 Console.Error.WriteLine ("---- line {0}", line);
// Separator characters (tab and space) used to find the end of the first
// column of a JIS ordering line.
1527 char [] ws = new char [] {'\t', ' '};
// Parses one line `s` of the JIS ordering file and appends the resulting
// (code point, JIS code) pair to jisJapanese. The visible code expects two
// "0x"-prefixed hexadecimal columns: the JIS code first, then the code point.
1529 void ProcessJISOrderLine (string s)
// Everything from '#' onwards is cut off and the rest is trimmed.
// NOTE(review): the guard for lines that contain no '#' is not visible here.
1531 int idx = s.IndexOf ('#');
1533 s = s.Substring (0, idx).Trim ();
// Index of the first tab or space, i.e. the end of the first column.
1536 idx = s.IndexOfAny (ws);
1539 // They start with "0x" so cut them out.
// First column: the characters between the "0x" prefix and the separator.
1540 int jis = int.Parse (s.Substring (2, idx - 2), NumberStyles.HexNumber);
// Second column: the trimmed text after the separator, minus its "0x".
1541 int cp = int.Parse (s.Substring (idx).Trim ().Substring (2), NumberStyles.HexNumber);
1542 jisJapanese.Add (new JISCharacter (cp, jis));
// Fills the CJK weight tables. The visible code reads the 'pinyin' and
// 'stroke' rule lists of an LDML collation document, the JIS ordering
// collected in jisJapanese, and the 'reset' rules of a collation document
// for the Korean section.
// NOTE(review): the statements that load zhXML, jaXML and koXML into `doc`
// and that set up `arr`, `v`, `category`, `offset` and `s` for each section
// are not visible in this excerpt.
1545 void ParseCJK (string zhXML, string jaXML, string koXML)
1547 XmlDocument doc = new XmlDocument ();
// With a null resolver the document does not resolve external resources.
1548 doc.XmlResolver = null;
1555 // Chinese Simplified
1558 offset = 0;//char.MaxValue - arr.Length;
1560 s = doc.SelectSingleNode ("/ldml/collations/collation[@type='pinyin']/rules/pc").InnerText;
// Characters receive consecutive values of `v` in the order in which they
// appear in the rule text.
// NOTE(review): the condition that triggers the warning instead of the
// assignment is not visible here.
1562 foreach (char c in s) {
1564 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1566 arr [(int) c - offset] = (ushort) v++;
1572 // Chinese Traditional
1575 offset = 0;//char.MaxValue - arr.Length;
1576 s = doc.SelectSingleNode ("/ldml/collations/collation[@type='stroke']/rules/pc").InnerText;
// Same scheme as above, driven by the 'stroke' rule text.
1578 foreach (char c in s) {
1580 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1582 arr [(int) c - offset] = (ushort) v++;
1591 offset = 0;//char.MaxValue - arr.Length;
// Fixed values assigned up front for a handful of characters.
1594 arr [0x4EDD] = 0x8002; // Chinese repetition mark?
1595 arr [0x337B] = 0x8004; // Those 4 characters are Gengou
1596 arr [0x337E] = 0x8005;
1597 arr [0x337D] = 0x8006;
1598 arr [0x337C] = 0x8007;
// Walk the JIS-ordered characters and hand out consecutive values of `v`.
1601 foreach (JISCharacter jc in jisJapanese) {
// NOTE(review): the statement controlled by this test is not visible;
// presumably entries with a JIS code below 0x8800 are skipped - confirm.
1602 if (jc.JIS < 0x8800)
1604 char c = (char) jc.CP;
1607 // Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1610 arr [(int) c - offset] = (ushort) v++;
// NOTE(review): the statements controlled by the five tests below are not
// visible; the trailing comments name the related compatibility code point.
1615 if (c == '\u662D') // U+337C
1617 if (c == '\u5927') // U+337D
1619 if (c == '\u5E73') // U+337B
1621 if (c == '\u660E') // U+337E
1623 if (c == '\u9686') // U+F9DC
1626 // FIXME: there are still remaining
1627 // characters after U+FA0C.
1628 // for (int k = 0; k < char.MaxValue; k++) {
// Scan the code points below U+FA0D that have a decomposition and are not
// ignorable. A code point whose decomposition starts with `c`, or whose
// decomposition is three long with `c` in the middle, gets the next value.
// NOTE(review): the statement controlled by the first test is not visible;
// presumably it is a `continue` - confirm.
1629 for (int k = 0; k < '\uFA0D'; k++) {
1630 if (decompIndex [k] == 0 || IsIgnorable (k))
1632 if (decompValues [decompIndex [k]] == c /*&&
1633 decompLength [k] == 1*/ ||
1634 decompLength [k] == 3 &&
1635 decompValues [decompIndex [k] + 1] == c) {
1636 arr [k - offset] = (ushort) v++;
1645 // Korean weight is somewhat complex. It first shifts
1646 // Hangul category from 52-x to 80-x (they are anyways
1647 // computed). CJK ideographs are placed at secondary
1648 // weight, like XX YY 01 zz 01, where XX and YY are
1649 // corresponding "reset" value and zz is 41,43,45...
1651 // Unlike chs,cht and ja, Korean value is a combined
1652 // ushort which is computed as category
1656 offset = 0;//char.MaxValue - arr.Length;
1658 foreach (XmlElement reset in doc.SelectNodes ("/ldml/collations/collation/rules/reset")) {
// The element that directly follows the <reset> element.
1659 XmlElement sc = (XmlElement) reset.NextSibling;
1660 // compute "category" and "level 1" for the
1661 // target "reset" Hangul syllable
1662 char rc = reset.InnerText [0];
// 1-based distance of the reset character from U+AC00.
1663 int ri = ((int) rc - 0xAC00) + 1;
// NOTE(review): the start of this expression (presumably the assignment
// to `p` plus a category term) is not visible here.
1665 ((ri / 254) * 256 + (ri % 254) + 2);
1666 // Place the characters after the target.
// Every character of `s` shares the primary value `p`; `v` is stored as
// its level 2 value in cjkKOlv2.
// NOTE(review): how `s` is obtained and how `v` advances is not visible.
1669 foreach (char c in s) {
1670 arr [(int) c - offset] = p;
1671 cjkKOlv2 [(int) c - offset] = (byte) v;
// Computes the per-character ignorableFlags bit set for every UTF-16 code
// unit (0 through char.MaxValue inclusive):
//   bit 1 - IsIgnorable
//   bit 2 - IsIgnorableSymbol
//   bit 4 - IsIgnorableNonSpacing
1681 void FillIgnorables ()
1683 for (int i = 0; i <= char.MaxValue; i++) {
// NOTE(review): the statement controlled by this test is not visible;
// presumably unassigned code points are skipped - confirm.
1684 if (Char.GetUnicodeCategory ((char) i) ==
1685 UnicodeCategory.OtherNotAssigned)
// The three tests are independent, so several bits may be set at once.
1687 if (IsIgnorable (i))
1688 ignorableFlags [i] |= 1;
1689 if (IsIgnorableSymbol (i))
1690 ignorableFlags [i] |= 2;
1691 if (IsIgnorableNonSpacing (i))
1692 ignorableFlags [i] |= 4;
// Applies hand-written corrections to the tables parsed from the Unicode
// data: extra uppercase flags, removed or replaced decompositions and some
// fixed diacritical weights.
1696 void ModifyUnidata ()
// Work on a growable copy that shadows the decompValues field; the copy is
// written back to the field as an int [] at the end of the method.
1698 ArrayList decompValues = new ArrayList (this.decompValues);
1700 // Hebrew uppercase letters.
1701 foreach (int i in new int []
1702 {0x05DB, 0x05DE, 0x05E0, 0x05E4, 0x05E6})
1703 isUppercase [i] = true;
1706 // Modify some decomposition equivalence
// Clear the decomposition index and length of U+FE31..U+FE34.
// NOTE(review): one statement of this loop body is not visible; by analogy
// with U+037E below it presumably clears decompType - confirm.
1707 for (int i = 0xFE31; i <= 0xFE34; i++) {
1709 decompIndex [i] = 0;
1710 decompLength [i] = 0;
// U+037E loses its decomposition entirely (type, index and length).
1712 decompType [0x037E] = 0;
1713 decompIndex [0x037E] = 0;
1714 decompLength [0x037E] = 0;
// Fixed diacritical weight for U+3021..U+3029.
1717 for (int i = 0x3021; i <= 0x3029; i++)
1718 diacritical [i] = 0x4E;
1719 // Korean parens numbers
1720 for (int i = 0x3200; i <= 0x321C; i++)
1721 diacritical [i] = 0xA;
1722 for (int i = 0x3260; i <= 0x327B; i++)
1723 diacritical [i] = 0xC;
1725 // LAMESPEC: these remapping should not be done.
1726 // Windows have incorrect CJK compat mappings.
// Each assignment overwrites the first decomposition value of a code
// point; U+323B and U+3238 are also shortened to a one-element
// decomposition.
1727 decompValues [decompIndex [0x32A9]] = 0x91AB;
1728 decompLength [0x323B] = 1;
1729 decompValues [decompIndex [0x323B]] = 0x5B78;
1730 decompValues [decompIndex [0x32AB]] = 0x5B78;
1731 decompValues [decompIndex [0x32A2]] = 0x5BEB;
1732 decompLength [0x3238] = 1;
1733 decompValues [decompIndex [0x3238]] = 0x52DE;
1734 decompValues [decompIndex [0x3298]] = 0x52DE;
1736 // LAMESPEC: custom remappings (not bugs as such, but not standard-compliant either)
// U+FA0C gets a new one-element decomposition (U+5140) appended to the list.
1737 decompIndex [0xFA0C] = decompValues.Count;
1738 decompValues.Add ((int) 0x5140);
1739 decompLength [0xFA0C] = 1;
// U+F929 loses its decomposition index and length.
1740 decompIndex [0xF929] = decompLength [0xF929] = 0;
1742 decompValues [decompIndex [0xF92C]] = 0x90DE;
// U+2125 gets a new one-element <font> decomposition to U+005A.
1744 decompIndex [0x2125] = decompValues.Count;
1745 decompValues.Add ((int) 0x005A);
1746 decompLength [0x2125] = 1;
1747 decompType [0x2125] = DecompositionFont;
// Publish the edited list back to the field.
1749 this.decompValues = decompValues.ToArray (typeof (int)) as int [];
// Overrides individual entries of the diacritical weight table with fixed
// values, assigns per-range weights to number characters, and renames or
// removes some entries of sortableCharNames.
1752 void ModifyParsedValues ()
1754 // Sometimes STROKE does not work well
1755 diacritical [0xD8] = diacritical [0xF8] = 0x21;
1756 diacritical [0x141] = diacritical [0x142] = 0x1F;
1758 diacritical [0xAA] = diacritical [0xBA] = 3;
1759 diacritical [0xD0] = diacritical [0xF0] = 0x68;
1760 diacritical [0x131] = 3;
1761 diacritical [0x138] = 3;
1762 // TOPBAR does not work as an identifier for the weight
1763 diacritical [0x182] = diacritical [0x183] = 0x68; // B
1764 diacritical [0x18B] = diacritical [0x18C] = 0x1E; // D
1766 diacritical [0x1A7] = diacritical [0x1A8] = 0x87;
1768 diacritical [0x184] = diacritical [0x185] = 0x87;
1770 diacritical [0x190] = diacritical [0x25B] = 0x7B;
1771 // There are many letters w/ diacritical weight 0x7B
1772 diacritical [0x0192] = diacritical [0x0194] =
1773 diacritical [0x0195] = diacritical [0x0196] =
1774 diacritical [0x019C] = diacritical [0x019E] =
1775 diacritical [0x01A6] = diacritical [0x01B1] =
1776 diacritical [0x01B2] = diacritical [0x01BF] = 0x7B;
1777 // ... as well as 0x7C
1778 diacritical [0x01A2] = diacritical [0x01A3] = 0x7C;
1780 // <font> NFKD characters seem to have diacritical
1781 // weight as 3,4,5... but the order does not look
1782 // by codepoint and I have no idea how they are sorted.
1783 diacritical [0x210E] = 3;
1784 diacritical [0x210F] = 0x68;
1785 diacritical [0x2110] = 4;
1786 diacritical [0x2111] = 5;
1787 diacritical [0x2112] = 4;
1788 diacritical [0x2113] = 4;
1789 diacritical [0x211B] = 4;
1790 diacritical [0x211C] = 5;
1792 // some cyrillic diacritical weight. They seem to be
1793 // based on old character names, so it's quicker to
1794 // set them directly here.
1795 // FIXME: they are by mostly unknown reason
1796 diacritical [0x0496] = diacritical [0x0497] = 7;
1797 diacritical [0x0498] = diacritical [0x0499] = 0x1A;
1798 diacritical [0x049A] = diacritical [0x049B] = 0x17;
1799 diacritical [0x049C] = diacritical [0x049D] = 9;
1800 diacritical [0x049E] = diacritical [0x049F] = 4;
1801 diacritical [0x04A0] = diacritical [0x04A1] = 0xA;
1802 diacritical [0x04A2] = diacritical [0x04A3] = 7;
1803 diacritical [0x04A4] = diacritical [0x04A5] = 8;
1804 diacritical [0x04AA] = diacritical [0x04AB] = 0x1A; // ES CEDILLA?
1805 diacritical [0x04AC] = diacritical [0x04AD] = 7; // RIGHT DESCENDER? but U+4B2
1806 diacritical [0x04AE] = diacritical [0x04AF] = 0xB; // STRAIGHT U?
1807 diacritical [0x04B2] = diacritical [0x04B3] = 0x17; // RIGHT DESCENDER? but U+4AC
1808 diacritical [0x04B4] = diacritical [0x04B5] = 3;
1809 diacritical [0x04B6] = 8;
1810 diacritical [0x04B7] = 7;
1811 diacritical [0x04B8] = diacritical [0x04B9] = 9;
1812 diacritical [0x04BA] = diacritical [0x04BB] = 9;
1814 // number, secondary weights
// numarr is read as consecutive (start, end) pairs; `end` is exclusive.
// Number characters of one pair share one weight, and the weight is
// incremented from pair to pair.
// NOTE(review): the declaration and start value of `weight` are not
// visible in this excerpt.
1816 int [] numarr = numberSecondaryWeightBounds;
1817 for (int i = 0; i < numarr.Length; i += 2, weight++)
1818 for (int cp = numarr [i]; cp < numarr [i + 1]; cp++)
1819 if (Char.IsNumber ((char) cp))
1820 diacritical [cp] = weight;
1822 // Gurmukhi special letters' diacritical weight
1823 for (int i = 0x0A50; i < 0x0A60; i++)
1824 diacritical [i] = 4;
1825 // Oriya special letters' diacritical weight
1826 for (int i = 0x0B5C; i < 0x0B60; i++)
1827 diacritical [i] = 6;
1829 // Update name part of named characters
// Entries are (code point, name) pairs. An entry whose code point has a
// replacement below is overwritten in place with the new name.
// NOTE(review): the switch header, the cases that lead to RemoveAt and any
// index adjustment after the removal are not visible in this excerpt.
1830 for (int i = 0; i < sortableCharNames.Count; i++) {
1831 DictionaryEntry de =
1832 (DictionaryEntry) sortableCharNames [i];
1833 int cp = (int) de.Key;
1834 string renamed = null;
1836 case 0x2101: renamed = "A_1"; break;
1837 case 0x33C3: renamed = "A_2"; break;
1838 case 0x2105: renamed = "C_1"; break;
1839 case 0x2106: renamed = "C_2"; break;
1840 case 0x211E: renamed = "R1"; break;
1841 case 0x211F: renamed = "R2"; break;
1842 // Remove some of them!
1853 sortableCharNames.RemoveAt (i);
1857 if (renamed != null)
1858 sortableCharNames [i] =
1859 new DictionaryEntry (cp, renamed);
1863 void GenerateCore ()
1867 #region Specially ignored // 01
1868 // This will raise "Defined" flag up.
1869 // FIXME: Check If it is really fine. Actually for
1870 // Japanese voice marks this code does remapping.
1871 foreach (char c in specialIgnore)
1872 map [(int) c] = new CharMapEntry (0, 0, 0);
1875 #region Extenders (FF FF)
1876 fillIndex [0xFF] = 0xFF;
1877 char [] specialBiggest = new char [] {
1878 '\u3005', '\u3031', '\u3032', '\u309D',
1879 '\u309E', '\u30FC', '\u30FD', '\u30FE',
1880 '\uFE7C', '\uFE7D', '\uFF70'};
1881 foreach (char c in specialBiggest)
1882 AddCharMap (c, 0xFF, 0);
1885 #region Variable weights
1886 // Controls : 06 03 - 06 3D
1887 fillIndex [0x6] = 3;
1888 for (int i = 0; i < 65536; i++) {
1889 if (IsIgnorable (i))
1892 uc = Char.GetUnicodeCategory (c);
1893 // NEL is whitespace but not ignored here.
1894 if (uc == UnicodeCategory.Control &&
1895 !Char.IsWhiteSpace (c) || c == '\u0085')
1896 AddCharMap (c, 6, 1);
1900 fillIndex [0x6] = 0x80;
1901 AddCharMap ('\'', 6, 0);
1902 AddCharMap ('\uFF07', 6, 1);
1903 AddCharMap ('\uFE63', 6, 1);
1905 // SPECIAL CASE: fill FE32 here in prior to be added
1906 // at 2013. Windows does not always respect NFKD.
1907 map [0xFE32] = new CharMapEntry (6, 0x90, 0);
1909 // Hyphen/Dash : 06 81 - 06 90
1910 for (int i = 0; i < char.MaxValue; i++) {
1911 if (!IsIgnorable (i) &&
1912 Char.GetUnicodeCategory ((char) i) ==
1913 UnicodeCategory.DashPunctuation) {
1914 AddCharMapGroup2 ((char) i, 6, 1, 0);
1916 // SPECIAL: add 2027 and 2043
1917 // Maybe they are regarded the
1918 // same hyphens in "central"
1920 AddCharMap ('\u2027', 6, 1);
1921 AddCharMap ('\u2043', 6, 1);
1925 // They are regarded as primarily equivalent to '-'
1926 map [0x208B] = new CharMapEntry (6, 0x82, 0);
1927 map [0x207B] = new CharMapEntry (6, 0x82, 0);
1928 map [0xFF0D] = new CharMapEntry (6, 0x82, 0);
1930 // Arabic variable weight chars 06 A0 -
1931 fillIndex [6] = 0xA0;
1933 for (int i = 0x64B; i <= 0x650; i++)
1934 AddArabicCharMap ((char) i, 6, 1, 0);
1936 AddCharMapGroup ('\u0652', 6, 1, 0);
1938 AddCharMapGroup ('\u0651', 6, 1, 0);
1942 #region Nonspacing marks // 01
1943 // FIXME: 01 03 - 01 B6 ... annoyance :(
1945 // Combining diacritical marks: 01 DC -
1947 fillIndex [0x1] = 0x41;
1948 for (int i = 0x030E; i <= 0x0326; i++)
1949 if (!IsIgnorable (i))
1950 AddCharMap ((char) i, 0x1, 1);
1951 for (int i = 0x0329; i <= 0x0334; i++)
1952 if (!IsIgnorable (i))
1953 AddCharMap ((char) i, 0x1, 1);
1955 for (int i = 0x0339; i <= 0x0341; i++)
1956 if (!IsIgnorable (i))
1957 AddCharMap ((char) i, 0x1, 1);
1958 fillIndex [0x1] = 0x74;
1959 for (int i = 0x0346; i <= 0x0348; i++)
1960 if (!IsIgnorable (i))
1961 AddCharMap ((char) i, 0x1, 1);
1962 for (int i = 0x02BE; i <= 0x02BF; i++)
1963 if (!IsIgnorable (i))
1964 AddCharMap ((char) i, 0x1, 1);
1965 for (int i = 0x02C1; i <= 0x02C5; i++)
1966 if (!IsIgnorable (i))
1967 AddCharMap ((char) i, 0x1, 1);
1968 for (int i = 0x02CE; i <= 0x02CF; i++)
1969 if (!IsIgnorable (i))
1970 AddCharMap ((char) i, 0x1, 1);
1972 for (int i = 0x02D1; i <= 0x02D3; i++)
1973 if (!IsIgnorable (i))
1974 AddCharMap ((char) i, 0x1, 1);
1975 AddCharMap ('\u02DE', 0x1, 1);
1976 for (int i = 0x02E4; i <= 0x02E9; i++)
1977 if (!IsIgnorable (i))
1978 AddCharMap ((char) i, 0x1, 1);
1981 // FIXME: needs more love here (it should eliminate
1982 // all the hacky code above).
1983 for (int i = 0x0300; i < 0x0370; i++)
1984 if (!IsIgnorable (i) && diacritical [i] != 0
1985 && !map [i].Defined)
1986 map [i] = new CharMapEntry (
1987 0x1, 0x1, diacritical [i]);
1989 // Cyrillic and Armenian nonspacing mark
1990 fillIndex [0x1] = 0x94;
1991 for (int i = 0x400; i < 0x580; i++)
1992 if (!IsIgnorable (i) &&
1993 Char.GetUnicodeCategory ((char) i) ==
1994 UnicodeCategory.NonSpacingMark)
1995 AddCharMap ((char) i, 1, 1);
1997 fillIndex [0x1] = 0x8D;
1998 // syriac dotted nonspacing marks (1)
1999 AddCharMap ('\u0740', 0x1, 1);
2000 AddCharMap ('\u0741', 0x1, 1);
2001 AddCharMap ('\u0742', 0x1, 1);
2002 // syriac oblique nonspacing marks
2003 AddCharMap ('\u0747', 0x1, 1);
2004 AddCharMap ('\u0748', 0x1, 1);
2005 // syriac dotted nonspacing marks (2)
2006 fillIndex [0x1] = 0x94; // this reset is mandatory
2007 AddCharMap ('\u0732', 0x1, 1);
2008 AddCharMap ('\u0735', 0x1, 1);
2009 AddCharMap ('\u0738', 0x1, 1);
2010 AddCharMap ('\u0739', 0x1, 1);
2011 AddCharMap ('\u073C', 0x1, 1);
2012 // SPECIAL CASES: superscripts
2013 AddCharMap ('\u073F', 0x1, 1);
2014 AddCharMap ('\u0711', 0x1, 1);
2016 for (int i = 0x0743; i <= 0x0746; i++)
2017 AddCharMap ((char) i, 0x1, 1);
2018 for (int i = 0x0730; i <= 0x0780; i++)
2019 if (!map [i].Defined &&
2020 Char.GetUnicodeCategory ((char) i) ==
2021 UnicodeCategory.NonSpacingMark)
2022 AddCharMap ((char) i, 0x1, 1);
2024 // LAMESPEC: It should not stop at '\u20E1'. There are
2025 // a few more characters (that however results in
2026 // overflow of level 2 unless we start before 0xDD).
2027 fillIndex [0x1] = 0xDD;
2028 for (int i = 0x20D0; i <= 0x20DC; i++)
2029 AddCharMap ((char) i, 0x1, 1);
2030 fillIndex [0x1] = 0xEC;
2031 for (int i = 0x20DD; i <= 0x20E1; i++)
2032 AddCharMap ((char) i, 0x1, 1);
2033 fillIndex [0x1] = 0x4;
2034 AddCharMap ('\u0CD5', 0x1, 1);
2035 AddCharMap ('\u0CD6', 0x1, 1);
2036 AddCharMap ('\u093C', 0x1, 1);
2037 for (int i = 0x302A; i <= 0x302D; i++)
2038 AddCharMap ((char) i, 0x1, 1);
2039 AddCharMap ('\u0C55', 0x1, 1);
2040 AddCharMap ('\u0C56', 0x1, 1);
2042 fillIndex [0x1] = 0x50; // I wonder how they are sorted
2043 for (int i = 0x02D4; i <= 0x02D7; i++)
2044 AddCharMap ((char) i, 0x1, 1);
2046 // They are not part of Nonspacing marks, but have
2047 // only diacritical weight.
2048 for (int i = 0x3099; i <= 0x309C; i++)
2049 map [i] = new CharMapEntry (1, 1, 1);
2050 map [0xFF9E] = new CharMapEntry (1, 1, 1);
2051 map [0xFF9F] = new CharMapEntry (1, 1, 2);
2052 map [0x309D] = new CharMapEntry (0xFF, 0xFF, 1);
2053 map [0x309E] = new CharMapEntry (0xFF, 0xFF, 1);
2054 for (int i = 0x30FC; i <= 0x30FE; i++)
2055 map [i] = new CharMapEntry (0xFF, 0xFF, 1);
2057 fillIndex [0x1] = 0xA;
2058 for (int i = 0x0951; i <= 0x0954; i++)
2059 AddCharMap ((char) i, 0x1, 2);
2064 #region Whitespaces // 07 03 -
2065 fillIndex [0x7] = 0x2;
2066 AddCharMap (' ', 0x7, 2);
2067 AddCharMap ('\u00A0', 0x7, 1);
2068 for (int i = 9; i <= 0xD; i++)
2069 AddCharMap ((char) i, 0x7, 1);
2070 for (int i = 0x2000; i <= 0x200B; i++)
2071 AddCharMap ((char) i, 0x7, 1);
2073 fillIndex [0x7] = 0x17;
2074 AddCharMapGroup ('\u2028', 0x7, 1, 0);
2075 AddCharMapGroup ('\u2029', 0x7, 1, 0);
2077 // Characters which used to represent layout control.
2078 // LAMESPEC: Windows developers seem to have thought
2079 // that those characters are kind of whitespaces,
2080 // while they aren't.
2081 AddCharMap ('\u2422', 0x7, 1, 0); // blank symbol
2082 AddCharMap ('\u2423', 0x7, 1, 0); // open box
2086 // category 09 - continued symbols from 08
2087 fillIndex [0x9] = 2;
2089 for (int cp = 0x2300; cp <= 0x237A; cp++)
2090 AddCharMap ((char) cp, 0x9, 1, 0);
2093 byte [] arrowLv2 = new byte [] {0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
2094 foreach (DictionaryEntry de in arrowValues) {
2095 int idx = (int) de.Value;
2096 int cp = (int) de.Key;
2097 if (map [cp].Defined)
2099 fillIndex [0x9] = (byte) (0xD8 + idx);
2100 AddCharMapGroup ((char) cp, 0x9, 0, arrowLv2 [idx]);
2104 byte [] boxLv2 = new byte [128];
2105 // 0-63 will be used for those offsets are positive,
2106 // and 64-127 are for negative ones.
2107 for (int i = 0; i < boxLv2.Length; i++)
2109 foreach (DictionaryEntry de in boxValues) {
2110 int cp = (int) de.Key;
2111 int off = (int) de.Value;
2112 if (map [cp].Defined)
2115 fillIndex [0x9] = (byte) (0xE5 + off);
2116 AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [128 + off]++);
2119 fillIndex [0x9] = (byte) (0xE5 + off);
2120 AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [off]++);
2123 // Some special characters (slanted)
2124 fillIndex [0x9] = 0xF4;
2125 AddCharMap ('\u2571', 0x9, 3);
2126 AddCharMap ('\u2572', 0x9, 3);
2127 AddCharMap ('\u2573', 0x9, 3);
2129 // FIXME: implement 0A
2131 fillIndex [0xA] = 2;
2132 // byte currency symbols
2133 for (int cp = 0; cp < 0x100; cp++) {
2134 uc = Char.GetUnicodeCategory ((char) cp);
2135 if (!IsIgnorable (cp) &&
2136 uc == UnicodeCategory.CurrencySymbol &&
2138 AddCharMapGroup ((char) cp, 0xA, 1, 0);
2140 // byte other symbols
2141 for (int cp = 0; cp < 0x100; cp++) {
2143 continue; // SPECIAL: skip FIXME: why?
2144 uc = Char.GetUnicodeCategory ((char) cp);
2145 if (!IsIgnorable (cp) &&
2146 uc == UnicodeCategory.OtherSymbol ||
2147 cp == '\u00AC' || cp == '\u00B5' || cp == '\u00B7')
2148 AddCharMapGroup ((char) cp, 0xA, 1, 0);
2151 AddCharMapGroup ('\u30FB', 0xA, 1, 0);
2153 for (int cp = 0x2020; cp <= 0x2031; cp++)
2154 if (Char.IsPunctuation ((char) cp))
2155 AddCharMap ((char) cp, 0xA, 1, 0);
2156 // SPECIAL CASES: why?
2157 AddCharMap ('\u203B', 0xA, 1, 0);
2158 AddCharMap ('\u2040', 0xA, 1, 0);
2159 AddCharMap ('\u2041', 0xA, 1, 0);
2160 AddCharMap ('\u2042', 0xA, 1, 0);
2162 for (int cp = 0x20A0; cp <= 0x20AB; cp++)
2163 AddCharMap ((char) cp, 0xA, 1, 0);
2165 // 3004 is skipped at first...
2166 for (int cp = 0x3010; cp <= 0x3040; cp++)
2167 if (Char.IsSymbol ((char) cp))
2168 AddCharMap ((char) cp, 0xA, 1, 0);
2169 // SPECIAL CASES: added here
2170 AddCharMap ('\u3004', 0xA, 1, 0);
2171 AddCharMap ('\u327F', 0xA, 1, 0);
2173 for (int cp = 0x2600; cp <= 0x2613; cp++)
2174 AddCharMap ((char) cp, 0xA, 1, 0);
2176 for (int cp = 0x2620; cp <= 0x2770; cp++)
2177 if (Char.IsSymbol ((char) cp))
2178 AddCharMap ((char) cp, 0xA, 1, 0);
2180 for (int i = 0x2440; i < 0x2460; i++)
2181 AddCharMap ((char) i, 0xA, 1, 0);
2183 // SPECIAL CASES: why?
2184 AddCharMap ('\u0E3F', 0xA, 1, 0);
2185 AddCharMap ('\u2117', 0xA, 1, 0);
2186 AddCharMap ('\u20AC', 0xA, 1, 0);
2189 #region Numbers // 0C 02 - 0C E1
2190 fillIndex [0xC] = 2;
2192 // 9F8 : Bengali "one less than the denominator"
2193 AddCharMap ('\u09F8', 0xC, 1, 0x3C);
2195 ArrayList numbers = new ArrayList ();
2196 for (int i = 0; i < 65536; i++)
2197 if (!IsIgnorable (i) &&
2198 Char.IsNumber ((char) i) &&
2199 (i < 0x3190 || 0x32C0 < i)) // they are CJK characters
2202 ArrayList numberValues = new ArrayList ();
2203 foreach (int i in numbers)
2204 numberValues.Add (new DictionaryEntry (i, decimalValue [(char) i]));
2205 // SPECIAL CASE: Cyrillic Thousand sign
2206 numberValues.Add (new DictionaryEntry (0x0482, 1000m));
2207 numberValues.Sort (DecimalDictionaryValueComparer.Instance);
2209 //foreach (DictionaryEntry de in numberValues)
2210 //Console.Error.WriteLine ("****** number {0:X04} : {1} {2}", de.Key, de.Value, decompType [(int) de.Key]);
2212 // FIXME: fillIndex adjustment lines are too
2213 // complicated. It must be simpler.
2214 decimal prevValue = -1;
2215 foreach (DictionaryEntry de in numberValues) {
2216 int cp = (int) de.Key;
2217 decimal currValue = (decimal) de.Value;
2218 bool addnew = false;
2219 if (prevValue < currValue &&
2220 prevValue - (int) prevValue == 0 &&
2224 // Process Hangzhou and Roman numbers
2226 // There are some SPECIAL cases.
2227 if (currValue != 4) // no increment for 4
2231 if (currValue <= 13) {
2235 if (currValue == 11)
2236 AddCharMap ('\u0BF0', 0xC, 1);
2237 xcp = (int) prevValue + 0x2160 - 1;
2238 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2239 xcp = (int) prevValue + 0x2170 - 1;
2240 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2245 if (currValue <= 10) {
2246 xcp = (int) prevValue + 0x3021 - 1;
2247 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2251 if (prevValue < currValue)
2252 prevValue = currValue;
2253 if (map [cp].Defined)
2255 // HangZhou and Roman are add later
2257 if (0x3021 <= cp && cp < 0x302A
2258 || 0x2160 <= cp && cp < 0x216C
2259 || 0x2170 <= cp && cp < 0x217C)
2262 if (cp == 0x215B) // FIXME: why?
2263 fillIndex [0xC] += 2;
2264 else if (cp == 0x3021) // FIXME: why?
2266 if (addnew || cp <= '9') {
2267 int mod = (int) currValue - 1;
2269 if (1 <= currValue && currValue <= 11) {
2271 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2273 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2275 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2277 if (1 <= currValue && currValue <= 20) {
2279 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2281 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2283 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2286 if (addnew && currValue >= 10 && currValue < 13 || cp == 0x09F9)
2288 AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp], true);
2291 // Maybe Bengali digit numbers do not increase
2292 // indexes, but 0x09E6 does.
2293 case 0x09E7: case 0x09E8: case 0x09E9:
2296 case 0x0BF0: case 0x2180: case 0x2181:
2303 if (currValue < 11 || currValue == 1000)
2308 // Add special cases that are not regarded as
2309 // numbers in UnicodeCategory speak.
2312 AddCharMapGroup ('\u01BD', 0xC, 0, 0);
2313 AddCharMapGroup ('\u01BC', 0xC, 1, 0);
2315 else if (cp == '2' || cp == '6') // FIXME: why?
2320 fillIndex [0xC] = 0xFF;
2321 AddCharMap ('\u221E', 0xC, 1);
2324 #region Letters and NonSpacing Marks (general)
2326 // ASCII Latin alphabets
2327 for (int i = 0; i < alphabets.Length; i++)
2328 AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]);
2330 // non-ASCII Latin alphabets
2331 // FIXME: there is no such characters that are placed
2332 // *after* "alphabets" array items. This is nothing
2333 // more than a hack that creates dummy weight for
2334 // primary characters.
2335 for (int i = 0x0080; i < 0x0300; i++) {
2336 if (!Char.IsLetter ((char) i))
2338 // For those Latin Letters which has NFKD are
2339 // not added as independent primary character.
2340 if (decompIndex [i] != 0)
2343 // 1.some alphabets have primarily
2344 // equivalent ASCII alphabets.
2345 // 2.some have independent primary weights,
2346 // but inside a-to-z range.
2347 // 3.there are some expanded characters that
2348 // are not part of Unicode Standard NFKD.
2349 // 4. some characters are letter in IsLetter
2350 // but not in sortkeys (maybe unicode version
2351 // difference caused it).
2353 // 1. skipping them does not make sense
2354 // case 0xD0: case 0xF0: case 0x131: case 0x138:
2355 // case 0x184: case 0x185: case 0x186: case 0x189:
2356 // case 0x18D: case 0x18E: case 0x18F: case 0x190:
2357 // case 0x194: case 0x195: case 0x196: case 0x19A:
2358 // case 0x19B: case 0x19C:
2359 // 2. skipping them does not make sense
2360 // case 0x14A: // Ng
2361 // case 0x14B: // ng
2365 case 0xDE: // Icelandic Thorn
2366 case 0xFE: // Icelandic Thorn
2367 case 0xDF: // German ss
2368 case 0xFF: // German ss
2370 case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3:
2371 // not classified yet
2372 // case 0x1A6: case 0x1A7: case 0x1A8: case 0x1A9:
2373 // case 0x1AA: case 0x1B1: case 0x1B7: case 0x1B8:
2374 // case 0x1B9: case 0x1BA: case 0x1BB: case 0x1BF:
2378 AddCharMapGroup ((char) i, 0xE, 1, 0);
2382 fillIndex [0xF] = 2;
2383 for (int i = 0x0391; i < 0x03AA; i++)
2385 AddCharMap ((char) i, 0xF, 1);
2386 fillIndex [0xF] = 2;
2387 for (int i = 0x03B1; i < 0x03CA; i++)
2389 AddCharMap ((char) i, 0xF, 1);
2391 map [0x03C2] = new CharMapEntry (0xF,
2392 map [0x03C3].Level1, map [0x03C3].Level2);
2394 fillIndex [0xF] = 0x40;
2395 for (int i = 0x03DA; i < 0x03F0; i++)
2396 AddCharMap ((char) i, 0xF,
2397 (byte) (i % 2 == 0 ? 0 : 2));
2400 for (int i = 0x0386; i <= 0x0400; i++)
2401 FillLetterNFKD (i, true, true);
2404 // Cyrillic letters are sorted like Latin letters i.e.
2405 // containing culture-specific letters between the
2406 // standard Cyrillic sequence.
2408 // We can't use UCA here; it has different sorting.
2409 char [] orderedCyrillic = new char [] {
2410 '\u0430', '\u0431', '\u0432', '\u0433', '\u0434',
2411 '\u0452', // DJE for Serbocroatian
2413 '\u0454', // IE for Ukrainian
2417 '\u0456', // Byelorussian-Ukrainian I
2427 '\u043F', '\u0440', '\u0441', '\u0442',
2428 '\u045B', // TSHE for Serbocroatian
2430 '\u045E', // Short U for Byelorussian
2431 '\u04B1', // Straight U w/ stroke (diacritical!)
2432 '\u0444', '\u0445', '\u0446', '\u0447',
2434 '\u0448', '\u0449', '\u044A', '\u044B', '\u044C',
2435 '\u044D', '\u044E', '\u044F'};
2437 // Some characters map to basic Cyrillic letters; see the
2438 // character names in UnicodeData.txt for the sources.
2439 // Here I simply declare an equivalence array. Entry n is
2440 // the base letter for U+0490 + 2n (and its small letter,
2441 // U+0491 + 2n); the small letters themselves are skipped.
2442 char [] cymap_src = new char [] {
2443 '\u0433', '\u0433', '\u0433', '\u0436',
2444 '\u0437', '\u043A', '\u043A', '\u043A',
2445 '\u043A', '\u043D', '\u043D', '\u043F',
2446 '\u0445', '\u0441', '\u0442', '\u0443',
2447 '\u0443', '\u0445', '\u0446', '\u0447',
2448 '\u0447', '\u0432', '\u0435', '\u0435',
2449 '\u0406', '\u0436', '\u043A', '\u043D',
2450 '\u0447', '\u0435'};
2452 fillIndex [0x10] = 0x8D;
2453 for (int i = 0x0460; i < 0x0481; i++) {
2454 if (Char.IsLetter ((char) i)) {
2456 // U+476/477 have the same
2457 // primary weight as U+474/475.
2458 fillIndex [0x10] -= 3;
2459 AddLetterMap ((char) i, 0x10, 3);
2463 fillIndex [0x10] = 0x6;
2464 for (int i = 0; i < orderedCyrillic.Length; i++) {
2465 char c = Char.ToUpper (orderedCyrillic [i], CultureInfo.InvariantCulture);
2466 if (!IsIgnorable ((int) c) &&
2467 Char.IsLetter (c) &&
2469 AddLetterMap (c, 0x10, 0);
2470 fillIndex [0x10] += 3;
2475 for (int i = 0x0401; i <= 0x045F; i++)
2476 FillLetterNFKD (i, false, false);
2478 for (int i = 0; i < cymap_src.Length; i++) {
2479 char c = cymap_src [i];
2480 fillIndex [0x10] = map [c].Level1;
2481 int c2 = 0x0490 + i * 2;
2482 AddLetterMapCore ((char) c2, 0x10, 0, diacritical [c2], false);
2486 fillIndex [0x11] = 0x3;
2487 fillIndex [0x1] = 0x98;
2488 for (int i = 0x0531; i < 0x0586; i++) {
2489 if (i == 0x0559 || i == 0x55A)
2490 AddCharMap ((char) i, 1, 1);
2491 if (Char.IsLetter ((char) i))
2492 AddLetterMap ((char) i, 0x11, 1);
2497 fillIndex [0x12] = 0x2;
2498 for (int i = 0x05D0; i < 0x05FF; i++)
2499 if (Char.IsLetter ((char) i)) {
2500 if (isUppercase [i]) {
2502 AddLetterMap ((char) i, 0x12, 2);
2505 AddLetterMap ((char) i, 0x12, 1);
2508 fillIndex [0x1] = 0x3;
2509 for (int i = 0x0591; i <= 0x05C2; i++) {
2510 if (i == 0x05A3 || i == 0x05BB)
2513 AddCharMap ((char) i, 0x1, 1);
2517 fillIndex [0x1] = 0x8E;
2518 fillIndex [0x13] = 0x3;
2519 for (int i = 0x0621; i <= 0x064A; i++) {
2521 if (Char.GetUnicodeCategory ((char) i)
2522 != UnicodeCategory.OtherLetter) {
2523 // FIXME: arabic nonspacing marks are
2524 // in different order.
2525 AddCharMap ((char) i, 0x1, 1);
2528 // map [i] = new CharMapEntry (0x13,
2529 // (byte) arabicLetterPrimaryValues [i], 1);
2531 (byte) arabicLetterPrimaryValues [i];
2532 byte formDiacritical = 8; // default
2535 case 0x0622: formDiacritical = 9; break;
2536 case 0x0623: formDiacritical = 0xA; break;
2537 case 0x0624: formDiacritical = 5; break;
2538 case 0x0625: formDiacritical = 0xB; break;
2539 case 0x0626: formDiacritical = 7; break;
2540 case 0x0649: formDiacritical = 5; break;
2541 case 0x064A: formDiacritical = 7; break;
2543 // AddLetterMapCore ((char) i, 0x13, 1, formDiacritical, false);
2544 AddArabicCharMap ((char) i, 0x13, 1, formDiacritical);
2546 for (int i = 0x0670; i < 0x0673; i++)
2547 map [i] = new CharMapEntry (0x13, 0xB, (byte) (0xC + i - 0x670));
2548 fillIndex [0x13] = 0x84;
2549 for (int i = 0x0674; i < 0x06D6; i++)
2550 if (Char.IsLetter ((char) i))
2551 AddLetterMapCore ((char) i, 0x13, 1, 0, false);
2555 // FIXME: this could be fixed in more decent way
2556 for (int i = 0x0958; i <= 0x095F; i++)
2557 diacritical [i] = 8;
2559 // FIXME: it does seem straight codepoint mapping.
2560 fillIndex [0x14] = 04;
2561 for (int i = 0x0901; i < 0x0905; i++)
2562 if (!IsIgnorable (i))
2563 AddLetterMap ((char) i, 0x14, 2);
2564 fillIndex [0x14] = 0xB;
2565 for (int i = 0x0905; i < 0x093A; i++) {
2567 AddCharMap ('\u0929', 0x14, 0, 8);
2569 AddCharMap ('\u0931', 0x14, 0, 8);
2571 AddCharMap ('\u0934', 0x14, 0, 8);
2572 if (Char.IsLetter ((char) i))
2573 AddLetterMap ((char) i, 0x14, 4);
2575 AddCharMap ('\u0960', 0x14, 4);
2577 AddCharMap ('\u0961', 0x14, 4);
2579 fillIndex [0x14] = 0xDA;
2580 for (int i = 0x093E; i < 0x0945; i++)
2581 if (!IsIgnorable (i))
2582 AddLetterMap ((char) i, 0x14, 2);
2583 fillIndex [0x14] = 0xEC;
2584 for (int i = 0x0945; i < 0x094F; i++)
2585 if (!IsIgnorable (i))
2586 AddLetterMap ((char) i, 0x14, 2);
2590 fillIndex [0x15] = 02;
2591 for (int i = 0x0980; i < 0x9FF; i++) {
2592 if (IsIgnorable (i))
2595 fillIndex [0x15] = 0x3B;
2596 switch (Char.GetUnicodeCategory ((char) i)) {
2597 case UnicodeCategory.NonSpacingMark:
2598 case UnicodeCategory.DecimalDigitNumber:
2599 case UnicodeCategory.OtherNumber:
2602 AddLetterMap ((char) i, 0x15, 1);
2605 fillIndex [0x1] = 0x3;
2606 for (int i = 0x0981; i < 0x0A00; i++)
2607 if (Char.GetUnicodeCategory ((char) i) ==
2608 UnicodeCategory.NonSpacingMark)
2609 AddCharMap ((char) i, 0x1, 1);
2611 // Gurmukhi. orderedGurmukhi is from UCA
2612 // FIXME: it does not look equivalent to UCA.
2613 fillIndex [0x16] = 04;
2614 fillIndex [0x1] = 3;
2615 for (int i = 0; i < orderedGurmukhi.Length; i++) {
2616 char c = orderedGurmukhi [i];
2617 if (IsIgnorable ((int) c))
2619 if (IsIgnorableNonSpacing (c)) {
2620 AddLetterMap (c, 0x1, 1);
2623 if (c == '\u0A3C' || c == '\u0A4D' ||
2624 '\u0A66' <= c && c <= '\u0A71')
2629 case '\u0A33': case '\u0A36': case '\u0A16':
2630 case '\u0A17': case '\u0A5B': case '\u0A5E':
2634 if (c == '\u0A3E') // Skip
2635 fillIndex [0x16] = 0xC0;
2636 AddLetterMap (c, 0x16, shift);
2639 // Gujarati. orderedGujarati is from UCA
2640 fillIndex [0x17] = 0x4;
2642 map [0x0A4D] = new CharMapEntry (1, 0, 0x3);
2643 map [0x0ABD] = new CharMapEntry (1, 0, 0x3);
2644 map [0x0A3C] = new CharMapEntry (1, 0, 0x4);
2645 map [0x0A71] = new CharMapEntry (1, 0, 0x6);
2646 map [0x0ABC] = new CharMapEntry (1, 0, 0xB);
2647 map [0x0A70] = new CharMapEntry (1, 0, 0xE);
2648 // letters go first.
2649 for (int i = 0; i < orderedGujarati.Length; i++) {
2651 char c = orderedGujarati [i];
2652 if (Char.IsLetter (c)) {
2654 if (c == '\u0AB3' || c == '\u0A32')
2656 if (c == '\u0A33') {
2657 AddCharMap ('\u0A32', 0x17, 0);
2658 AddCharMap ('\u0A33', 0x17, 4, 4);
2662 AddCharMap ('\u0AE0', 0x17, 0, 5);
2663 AddCharMap (c, 0x17, 4);
2666 AddCharMap ('\u0AB3', 0x17, 6);
2670 byte gujaratiShift = 4;
2671 fillIndex [0x17] = 0xC0;
2672 for (int i = 0; i < orderedGujarati.Length; i++) {
2673 char c = orderedGujarati [i];
2674 if (fillIndex [0x17] == 0xCC)
2676 if (!Char.IsLetter (c)) {
2679 AddCharMap ('\u0A81', 0x17, 2);
2682 AddLetterMap (c, 0x17, gujaratiShift);
2687 fillIndex [0x1] = 03;
2688 fillIndex [0x18] = 02;
2689 for (int i = 0x0B00; i < 0x0B7F; i++) {
2690 switch (Char.GetUnicodeCategory ((char) i)) {
2691 case UnicodeCategory.NonSpacingMark:
2692 case UnicodeCategory.DecimalDigitNumber:
2693 AddLetterMap ((char) i, 0x1, 1);
2696 AddLetterMapCore ((char) i, 0x18, 1, 0, true);
2700 fillIndex [0x19] = 2;
2701 AddCharMap ('\u0BD7', 0x19, 0);
2702 fillIndex [0x19] = 0xA;
2704 for (int i = 0x0B82; i <= 0x0B94; i++)
2705 if (!IsIgnorable ((char) i))
2706 AddCharMap ((char) i, 0x19, 2);
2708 fillIndex [0x19] = 0x28;
2709 // The array for Tamil consonants is a constant.
2710 // Windows has a sequence almost identical to TAM from
2711 // tamilnet, but slightly different in Grantha.
2712 for (int i = 0; i < orderedTamilConsonants.Length; i++)
2713 AddLetterMap (orderedTamilConsonants [i], 0x19, 4);
2715 fillIndex [0x19] = 0x82;
2716 for (int i = 0x0BBE; i < 0x0BCD; i++)
2717 if (Char.GetUnicodeCategory ((char) i) ==
2718 UnicodeCategory.SpacingCombiningMark
2720 AddLetterMap ((char) i, 0x19, 2);
2723 fillIndex [0x1A] = 0x4;
2724 for (int i = 0x0C00; i < 0x0C62; i++) {
2725 if (i == 0x0C55 || i == 0x0C56)
2727 AddCharMap ((char) i, 0x1A, 3);
2728 char supp = (i == 0x0C0B) ? '\u0C60':
2729 i == 0x0C0C ? '\u0C61' : char.MinValue;
2730 if (supp == char.MinValue)
2732 AddCharMap (supp, 0x1A, 3);
2736 fillIndex [0x1B] = 4;
2737 for (int i = 0x0C80; i < 0x0CE5; i++) {
2738 if (i == 0x0CD5 || i == 0x0CD6)
2740 if (i == 0x0CB1 || i == 0x0CB3 || i == 0x0CDE)
2741 continue; // shift after 0xCB9
2742 AddCharMap ((char) i, 0x1B, 3);
2744 // SPECIAL CASES: but why?
2745 AddCharMap ('\u0CB1', 0x1B, 3); // RRA
2746 AddCharMap ('\u0CB3', 0x1B, 3); // LLA
2747 AddCharMap ('\u0CDE', 0x1B, 3); // FA
2750 AddCharMap ('\u0CE1', 0x1B, 3); // vocalic LL
2754 fillIndex [0x1C] = 2;
2755 fillIndex [0x1] = 3;
2756 for (int i = 0x0D02; i < 0x0D61; i++) {
2757 // FIXME: I avoided MSCompatUnicodeTable usage
2758 // here (it results in recursion). So check if
2759 // using NonSpacingMark makes sense or not.
2760 if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark)
2761 // if (!MSCompatUnicodeTable.IsIgnorable ((char) i))
2762 AddCharMap ((char) i, 0x1C, 1);
2763 else if (!IsIgnorable ((char) i))
2764 AddCharMap ((char) i, 1, 1);
2767 // Thai ... note that it breaks 0x1E wall after E2B!
2768 // Also, all Thai characters have level 2 value 3.
2769 fillIndex [0x1E] = 2;
2770 fillIndex [0x1] = 3;
2771 for (int i = 0xE40; i <= 0xE44; i++)
2772 AddCharMap ((char) i, 0x1E, 1, 3);
2773 for (int i = 0xE01; i < 0xE2B; i++)
2774 AddCharMap ((char) i, 0x1E, 6, 3);
2775 fillIndex [0x1F] = 5;
2776 for (int i = 0xE2B; i < 0xE30; i++)
2777 AddCharMap ((char) i, 0x1F, 6, 3);
2778 fillIndex [0x1F] = 0x1E;
2779 for (int i = 0xE30; i < 0xE3B; i++)
2780 AddCharMap ((char) i, 0x1F, 1, 3);
2781 // some Thai characters remain.
2782 char [] specialThai = new char [] {'\u0E45', '\u0E46',
2783 '\u0E4E', '\u0E4F', '\u0E5A', '\u0E5B'};
2784 foreach (char c in specialThai)
2785 AddCharMap (c, 0x1F, 1, 3);
2787 for (int i = 0xE00; i < 0xE80; i++)
2788 if (Char.GetUnicodeCategory ((char) i) ==
2789 UnicodeCategory.NonSpacingMark)
2790 AddCharMap ((char) i, 1, 1);
2793 fillIndex [0x1F] = 2;
2794 fillIndex [0x1] = 3;
2795 for (int i = 0xE80; i < 0xEDF; i++) {
2796 if (IsIgnorable ((char) i))
2798 else if (Char.IsLetter ((char) i))
2799 AddCharMap ((char) i, 0x1F, 1);
2800 else if (Char.GetUnicodeCategory ((char) i) ==
2801 UnicodeCategory.NonSpacingMark)
2802 AddCharMap ((char) i, 1, 1);
2805 // Georgian. orderedGeorgian is from UCA DUCET.
2806 fillIndex [0x21] = 5;
2807 for (int i = 0; i < orderedGeorgian.Length; i++) {
2808 char c = orderedGeorgian [i];
2809 if (map [(int) c].Defined)
2811 AddCharMap (c, 0x21, 0);
2813 AddCharMap ((char) (c - 0x30), 0x21, 0);
2814 fillIndex [0x21] += 5;
2818 fillIndex [0x22] = 2;
2819 int kanaOffset = 0x3041;
2820 byte [] kanaLines = new byte [] {2, 2, 2, 2, 1, 3, 1, 2, 1};
2822 for (int gyo = 0; gyo < 9; gyo++) {
2823 for (int dan = 0; dan < 5; dan++) {
2824 if (gyo == 7 && dan % 2 == 1) {
2827 kanaOffset -= 2; // There is no space for yi and ye.
2830 int cp = kanaOffset + dan * kanaLines [gyo];
2831 // small lines (a-gyo, ya-gyo)
2832 if (gyo == 0 || gyo == 7) {
2833 AddKanaMap (cp, 1); // small
2834 AddKanaMap (cp + 1, 1);
2837 AddKanaMap (cp, kanaLines [gyo]);
2841 // add small 'ka' (before normal one)
2842 AddKanaMap (0x30F5, 1);
2846 // add small 'ke' (before normal one)
2847 AddKanaMap (0x30F6, 1);
2851 // add small 'Tsu' (before normal one)
2852 AddKanaMap (0x3063, 1);
2856 fillIndex [0x22] += 3;
2857 kanaOffset += 5 * kanaLines [gyo];
2860 // Wa-gyo is almost special, so I just manually add.
2861 AddLetterMap ((char) 0x308E, 0x22, 0);
2862 AddLetterMap ((char) (0x308E + 0x60), 0x22, 0);
2863 AddLetterMap ((char) 0x308F, 0x22, 0);
2864 AddLetterMap ((char) (0x308F + 0x60), 0x22, 0);
2866 AddLetterMap ((char) 0x3090, 0x22, 0);
2867 AddLetterMap ((char) (0x3090 + 0x60), 0x22, 0);
2868 fillIndex [0x22] += 2;
2869 // no "Wu" in Japanese.
2870 AddLetterMap ((char) 0x3091, 0x22, 0);
2871 AddLetterMap ((char) (0x3091 + 0x60), 0x22, 0);
2873 AddLetterMap ((char) 0x3092, 0x22, 0);
2874 AddLetterMap ((char) (0x3092 + 0x60), 0x22, 0);
2876 fillIndex [0x22] = 0x80;
2877 AddLetterMap ((char) 0x3093, 0x22, 0);
2878 AddLetterMap ((char) (0x3093 + 0x60), 0x22, 0);
2880 map [0x3094] = new CharMapEntry (map [0x30A6].Category,
2881 map [0x30A6].Level1, 3);// voiced hiragana U
2882 map [0x30F4] = new CharMapEntry (map [0x30A6].Category,
2883 map [0x30A6].Level1, 3);// voiced katakana U
2885 map [0x30F5] = new CharMapEntry (map [0x30AB].Category,
2886 map [0x30AB].Level1, 0);// small katakana Ka
2887 map [0x30F6] = new CharMapEntry (map [0x30B1].Category,
2888 map [0x30B1].Level1, 0);// small katakana Ke
2890 for (int i = 0x30F7; i < 0x30FB; i++)
2891 map [i] = new CharMapEntry (map [i - 8].Category,
2895 // JIS Japanese square chars.
2896 fillIndex [0x22] = 0x97;
2897 jisJapanese.Sort (JISComparer.Instance);
2898 foreach (JISCharacter j in jisJapanese)
2899 if (0x3300 <= j.CP && j.CP <= 0x3357)
2900 AddCharMap ((char) j.CP, 0x22, 1);
2901 // non-JIS Japanese square chars.
2902 nonJisJapanese.Sort (NonJISComparer.Instance);
2903 foreach (NonJISCharacter j in nonJisJapanese)
2904 AddCharMap ((char) j.CP, 0x22, 1);
2907 fillIndex [0x23] = 0x02;
2908 for (int i = 0x3105; i <= 0x312C; i++)
2909 AddCharMap ((char) i, 0x23, 1);
2911 // Estrangela: ancient Syriac
2912 fillIndex [0x24] = 0x0B;
2913 // FIXME: is 0x71E really alternative form?
2914 ArrayList syriacAlternatives = new ArrayList (
2915 new int [] {0x714, 0x716, 0x71C, 0x71E, 0x724, 0x727});
2916 for (int i = 0x0710; i <= 0x072C; i++) {
2917 if (i == 0x0711) // NonSpacingMark
2919 if (syriacAlternatives.Contains (i))
2921 AddCharMap ((char) i, 0x24, 4);
2926 foreach (int cp in syriacAlternatives)
2927 map [cp] = new CharMapEntry (0x24,
2928 (byte) (map [cp - 1].Level1 + 2),
2930 // FIXME: Syriac NonSpacingMark should go here.
2933 // FIXME: it turned out that it does not look like UCA
2934 fillIndex [0x24] = 0x6E;
2935 fillIndex [0x1] = 0xAC;
2936 for (int i = 0; i < orderedThaana.Length; i++) {
2937 char c = orderedThaana [i];
2938 if (IsIgnorableNonSpacing ((int) c))
2939 AddCharMap (c, 1, 1);
2940 AddCharMap (c, 0x24, 2);
2941 if (c == '\u0782') // SPECIAL CASE: why?
2942 fillIndex [0x24] += 2;
2946 // FIXME: Add more culture-specific letters (that are
2947 // not supported in Windows collation) here.
2949 // Surrogate ... they are computed.
2954 // Unlike UCA Windows Hangul sequence mixes Jongseong
2955 // with Choseong sequence as well as Jungseong,
2956 // adjusted to have the same primary weight for the
2957 // same base character. So it is impossible to compute
2960 // Here I introduce an ordered sequence of mixed
2961 // 'commands' and 'characters' that is similar to
2963 // - ',' increases primary weight.
2964 // - [A B] means a range, increasing index
2965 // - {A B} means a range, without increasing index
2966 // - '=' is no operation (it means the characters
2967 // of both sides have the same weight).
2968 // - '>' inserts a Hangul Syllable block that
2969 // contains 0x251 characters.
2970 // - '<' decreases the index
2971 // - '0'-'9' means skip count
2972 // - whitespaces are ignored
2975 string hangulSequence =
2976 + "\u1100=\u11A8 > \u1101=\u11A9 >"
2977 + "\u11C3, \u11AA, \u11C4, \u1102=\u11AB >"
2978 + "<{\u1113 \u1116}, \u3165,"
2979 + "\u11C5, \u11C6=\u3166,, \u11C7, \u11C8,"
2980 + "\u11AC, \u11C9, \u11AD, \u1103=\u11AE >"
2981 + "<\u1117, \u11CA, \u1104, \u11CB > \u1105=\u11AF >"
2982 + "<{\u1118 \u111B}, \u11B0, [\u11CC \u11D0], \u11B1,"
2983 + "[\u11D1 \u11D2], \u11B2,"
2984 + "[\u11D3 \u11D5], \u11B3,"
2985 + "[\u11D6 \u11D7], \u11B4, \u11B5,"
2986 + "\u11B6=\u11D8, \u3140,, \u11D9, \u1106=\u11B7 >"
2987 + "<{\u111C \u111D}, [\u11DA \u11E2], \u1107=\u11B8 >"
2988 + "<{\u111E \u1120}, \u3172,, \u3173, \u11E3, \u1108 >"
2989 + "<{\u1121 \u112C}, \u3144 \u11B9, \u3174, \u3175,,,, "
2990 + "\u3176,, \u3177, [\u11E4 \u11E6] \u3178,"
2991 + "\u3179, \u1109=\u11BA,,, \u3214=\u3274 <>"
2992 + "<{\u112D \u1133}, \u11E7 \u317A, \u317B, \u317C "
2993 + "[\u11E8 \u11E9],, \u11EA \u317D,, \u110A=\u11BB,,, >"
2994 + "<{\u1134 \u1140}, \u317E,,,,,,, \u11EB,"
2995 + "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >"
2996 + "<{\u1141 \u114C}, \u3180=\u11EE, \u11EC, \u11ED,,,,, "
2997 + "\u11F1,, \u11F2,,,"
2998 + "\u11EF,,, \u3181=\u11F0, \u110C=\u11BD,, >"
2999 + "<\u114D, \u110D,, >"
3000 + "<{\u114E \u1151},, \u110E=\u11BE,, >"
3001 + "<{\u1152 \u1155},,, \u110F=\u11BF >"
3002 + "\u1110=\u11C0 > \u1111=\u11C1 >"
3003 + "<\u1156=\u1157, \u11F3, \u11F4, \u1112=\u11C2 >"
3004 + "<\u1158=\u1159=\u115F, \u3185, \u11F9,"
3008 byte hangulCat = 0x52;
3009 fillIndex [hangulCat] = 0x2;
3011 int syllableBlock = 0;
3012 for (int n = 0; n < hangulSequence.Length; n++) {
3013 char c = hangulSequence [n];
3015 if (Char.IsWhiteSpace (c))
3021 IncrementSequentialIndex (ref hangulCat);
3024 if (fillIndex [hangulCat] == 2)
3025 throw new Exception ("FIXME: handle it correctly (yes it is hacky, it is really unfortunate).");
3026 fillIndex [hangulCat]--;
3029 IncrementSequentialIndex (ref hangulCat);
3030 for (int l = 0; l < 0x15; l++)
3031 for (int v = 0; v < 0x1C; v++) {
3033 (char) (0xAC00 + syllableBlock * 0x1C * 0x15 + l * 0x1C + v), hangulCat, 0);
3034 IncrementSequentialIndex (ref hangulCat);
3039 start = hangulSequence [n + 1];
3040 end = hangulSequence [n + 3];
3041 for (int i = start; i <= end; i++) {
3042 AddCharMap ((char) i, hangulCat, 0);
3044 IncrementSequentialIndex (ref hangulCat);
3046 n += 4; // consumes 5 characters for this operation
3049 start = hangulSequence [n + 1];
3050 end = hangulSequence [n + 3];
3051 for (int i = start; i <= end; i++)
3052 AddCharMap ((char) i, hangulCat, 0);
3053 n += 4; // consumes 5 characters for this operation
3056 AddCharMap (c, hangulCat, 0);
3062 for (int i = 0x3200; i < 0x3300; i++) {
3063 if (IsIgnorable (i) || map [i].Defined)
3067 if (decompLength [i] == 4 &&
3068 decompValues [decompIndex [i]] == '(')
3069 ch = decompIndex [i] + 1;
3071 else if (decompLength [i] == 2 &&
3072 decompValues [decompIndex [i] + 1] == '\u1161')
3073 ch = decompIndex [i];
3074 else if (decompLength [i] == 1)
3075 ch = decompIndex [i];
3078 ch = decompValues [ch];
3079 if (ch < 0x1100 || 0x1200 < ch &&
3080 ch < 0xAC00 || 0xD800 < ch)
3084 int offset = i < 0x3260 ? 1 : 0;
3085 if (0x326E <= i && i <= 0x3273)
3088 map [i] = new CharMapEntry (map [ch].Category,
3089 (byte) (map [ch].Level1 + offset),
3091 // Console.Error.WriteLine ("Jamo {0:X04} -> {1:X04}", i, decompValues [decompIndex [i] + 1]);
3097 // Letterlike characters and CJK compatibility square
3098 sortableCharNames.Sort (StringDictionaryValueComparer.Instance);
3099 int [] counts = new int ['Z' - 'A' + 1];
3100 char [] namedChars = new char [sortableCharNames.Count];
3102 foreach (DictionaryEntry de in sortableCharNames) {
3103 counts [((string) de.Value) [0] - 'A']++;
3104 namedChars [nCharNames++] = (char) ((int) de.Key);
3106 nCharNames = 0; // reset
3107 for (int a = 0; a < counts.Length; a++) {
3108 fillIndex [0xE] = (byte) (alphaWeights [a + 1] - counts [a]);
3109 for (int i = 0; i < counts [a]; i++)
3110 //Console.Error.WriteLine ("---- {0:X04} : {1:x02} / {2} {3}", (int) namedChars [nCharNames], fillIndex [0xE], ((DictionaryEntry) sortableCharNames [nCharNames]).Value, Char.GetUnicodeCategory (namedChars [nCharNames]));
3111 AddCharMap (namedChars [nCharNames++], 0xE, 1);
3114 // CJK unified ideograph.
3116 fillIndex [cjkCat] = 0x2;
3117 for (int cp = 0x4E00; cp <= 0x9FBB; cp++)
3118 if (!IsIgnorable (cp))
3119 AddCharMapGroupCJK ((char) cp, ref cjkCat);
3120 // CJK Extensions go here.
3121 // LAMESPEC: With this Windows style CJK layout, it is
3122 // impossible to add more CJK ideograph i.e. 0x9FA6-
3123 // 0x9FBB can never be added w/o breaking compat.
3124 for (int cp = 0xF900; cp <= 0xFA2D; cp++)
3125 if (!IsIgnorable (cp))
3126 AddCharMapGroupCJK ((char) cp, ref cjkCat);
3128 // PrivateUse ... computed.
3129 // remaining Surrogate ... computed.
3131 #region 07 - ASCII non-alphanumeric + 3001, 3002 // 07
3132 // non-alphanumeric ASCII except for: + - < = > '
3133 for (int i = 0x21; i < 0x7F; i++) {
3134 // SPECIAL CASE: 02C6 looks regarded as
3135 // equivalent to '^', which does not conform
3136 // to Unicode standard character database.
3138 AddCharMap ('\u2045', 0x7, 0, 0x1C);
3140 AddCharMap ('\u2046', 0x7, 0, 0x1C);
3142 AddCharMap ('\u02C6', 0x7, 0, 3);
3144 AddCharMap ('\u02CB', 0x7, 0, 3);
3146 if (Char.IsLetterOrDigit ((char) i)
3147 || "+-<=>'".IndexOf ((char) i) >= 0)
3148 continue; // they are not added here.
3150 AddCharMapGroup2 ((char) i, 0x7, 1, 0);
3151 // Insert 3001 after ',' and 3002 after '.'
3153 AddCharMapGroup2 ('\u3001', 0x7, 1, 0);
3155 AddCharMapGroup2 ('\u3002', 0x7, 1, 0);
3157 AddCharMap ('\uFE30', 0x7, 1, 0);
3161 #region 07 - Punctuations and something else
3162 for (int i = 0xA0; i < char.MaxValue; i++) {
3163 if (IsIgnorable (i))
3166 // FIXME: actually those reset should not be
3167 // done but here I put for easy goal.
3171 fillIndex [0x7] = 0xE2;
3173 fillIndex [0x7] = 0x77;
3175 fillIndex [0x7] = 0x93;
3177 if (0x02C8 <= i && i <= 0x02CD)
3178 continue; // nonspacing marks
3180 // SPECIAL CASE: maybe they could be allocated
3181 // dummy NFKD mapping and no special processing
3182 // would be required here.
3184 AddCharMap ('\u02C9', 0x7, 0, 3);
3186 AddCharMap ('\u02CA', 0x7, 0, 3);
3188 AddCharMap ('\u02D8', 0x7, 0, 3);
3202 switch (Char.GetUnicodeCategory ((char) i)) {
3203 case UnicodeCategory.OtherPunctuation:
3204 case UnicodeCategory.ClosePunctuation:
3205 case UnicodeCategory.OpenPunctuation:
3206 case UnicodeCategory.ConnectorPunctuation:
3207 case UnicodeCategory.InitialQuotePunctuation:
3208 case UnicodeCategory.FinalQuotePunctuation:
3209 case UnicodeCategory.ModifierSymbol:
3210 // SPECIAL CASES: // 0xA
3211 if (0x2020 <= i && i <= 0x2031)
3213 if (i == 0x3003) // added later
3215 AddCharMapGroup2 ((char) i, 0x7, 1, 0);
3218 if (i == 0xA6 || i == 0x1C3 || i == 0x037A) // SPECIAL CASE. FIXME: why?
3219 goto case UnicodeCategory.OtherPunctuation;
3225 // FIXME: it should not need to reset level 1, but
3226 // it's for easy goal.
3227 fillIndex [0x7] = 0xB6;
3228 for (int i = 0x2400; i <= 0x2424; i++)
3229 AddCharMap ((char) i, 0x7, 1, 0);
3231 // FIXME: what are they?
3232 AddCharMap ('\u3003', 0x7, 1);
3233 AddCharMap ('\u3006', 0x7, 1);
3234 AddCharMap ('\u02D0', 0x7, 1);
3235 AddCharMap ('\u10FB', 0x7, 1);
3236 AddCharMap ('\u0950', 0x7, 1);
3237 AddCharMap ('\u093D', 0x7, 1);
3238 AddCharMap ('\u0964', 0x7, 1);
3239 AddCharMap ('\u0965', 0x7, 1);
3240 AddCharMap ('\u0970', 0x7, 1);
3244 #region category 08 - symbols
3245 fillIndex [0x8] = 2;
3246 // Here Windows mapping is not straightforward. It is
3247 // not based on computation but seems manual sorting.
3248 AddCharMapGroup ('+', 0x8, 1, 0); // plus
3249 AddCharMapGroup ('\u2212', 0x8, 1); // minus
3250 AddCharMapGroup ('\u229D', 0x8, 1); // minus
3251 AddCharMapGroup ('\u2297', 0x8, 1); // mul
3252 AddCharMapGroup ('\u2044', 0x8, 1); // div
3253 AddCharMapGroup ('\u2215', 0x8, 0); // div
3254 AddCharMapGroup ('\u2298', 0x8, 1); // div slash
3255 AddCharMapGroup ('\u2217', 0x8, 0); // mul
3256 AddCharMapGroup ('\u229B', 0x8, 1); // asterisk oper
3257 AddCharMapGroup ('\u2218', 0x8, 0); // ring
3258 AddCharMapGroup ('\u229A', 0x8, 1); // ring
3259 AddCharMapGroup ('\u2219', 0x8, 0); // bullet
3260 AddCharMapGroup ('\u2299', 0x8, 1); // dot oper
3261 AddCharMapGroup ('\u2213', 0x8, 1); // minus-or-plus
3262 AddCharMapGroup ('\u003C', 0x8, 1); // <
3263 AddCharMapGroup ('\u227A', 0x8, 1); // precedes relation
3264 AddCharMapGroup ('\u22B0', 0x8, 1); // precedes under relation
3266 for (int cp = 0; cp < 0x2300; cp++) {
3267 if (cp == 0xAC) // SPECIAL CASE: skip
3270 cp = 0x2200; // skip to 2200
3271 fillIndex [0x8] = 0x21;
3274 fillIndex [0x8] = 0x3;
3276 fillIndex [0x8] = 0xAB;
3278 fillIndex [0x8] = 0xB9;
3279 if (!map [cp].Defined &&
3280 // Char.GetUnicodeCategory ((char) cp) ==
3281 // UnicodeCategory.MathSymbol)
3282 Char.IsSymbol ((char) cp))
3283 AddCharMapGroup ((char) cp, 0x8, 1);
3284 // SPECIAL CASES: no idea why Windows sorts as such
3287 AddCharMap ('\u227B', 0x8, 1, 0);
3288 AddCharMap ('\u22B1', 0x8, 1, 0);
3291 AddCharMapGroup ('\u00AB', 0x8, 1);
3292 AddCharMapGroup ('\u226A', 0x8, 1);
3293 AddCharMapGroup ('\u00BB', 0x8, 1);
3294 AddCharMapGroup ('\u226B', 0x8, 1);
3297 AddCharMap ('\u01C0', 0x8, 1, 0);
3298 AddCharMap ('\u01C1', 0x8, 1, 0);
3299 AddCharMap ('\u01C2', 0x8, 1, 0);
3307 // Characters w/ diacritical marks (NFKD)
3308 for (int i = 0; i <= char.MaxValue; i++) {
3309 if (map [i].Defined || IsIgnorable (i))
3311 if (decompIndex [i] == 0)
3314 int start = decompIndex [i];
3315 int primaryChar = decompValues [start];
3316 int secondary = diacritical [i];
3318 int length = decompLength [i];
3319 // special processing for parenthesized ones.
3321 decompValues [start] == '(' &&
3322 decompValues [start + 2] == ')') {
3323 primaryChar = decompValues [start + 1];
3327 if (map [primaryChar].Level1 == 0)
3330 for (int l = 1; l < length; l++) {
3331 int c = decompValues [start + l];
3332 if (map [c].Level1 != 0)
3334 secondary += diacritical [c];
3338 map [i] = new CharMapEntry (
3339 map [primaryChar].Category,
3340 map [primaryChar].Level1,
3345 // Diacritical weight adjustment
3348 diacritical [0x624] = 0x5;
3349 diacritical [0x626] = 0x7;
3350 diacritical [0x622] = 0x9;
3351 diacritical [0x623] = 0xA;
3352 diacritical [0x625] = 0xB;
3353 diacritical [0x649] = 0x5; // 'alif maqs.uurah
3354 diacritical [0x64A] = 0x7; // Yaa'
3356 for (int i = 0; i < char.MaxValue; i++) {
3358 byte cat = map [i].Category;
3360 case 0xE: // Latin diacritics
3361 case 0x22: // Japanese: circled characters
3362 mod = diacritical [i];
3364 case 0x13: // Arabic
3367 if (diacritical [i] == 0 && decompLength [i] != 0)
3368 diacritical [i] = map [decompValues [decompIndex [i]]].Level2;
3369 if (diacritical [i] == 0 && i >= 0xFE8D)
3370 mod = 0x8; // default for arabic
3373 if (0x52 <= cat && cat <= 0x7F) // Hangul
3374 mod = diacritical [i];
3376 map [i] = new CharMapEntry (
3377 cat, map [i].Level1, mod);
3380 // FIXME: this is halfly hack but those NonSpacingMark
3381 // characters and still undefined are likely to
3383 for (int i = 0; i < char.MaxValue; i++) {
3384 if (map [i].Defined ||
3393 if (Char.GetUnicodeCategory ((char) i) !=
3394 UnicodeCategory.NonSpacingMark)
3398 if (diacritical [i] != 0)
3399 map [i] = new CharMapEntry (1, 1, diacritical [i]);
3401 AddCharMap ((char) i, 1, 1);
// Invariant-culture TextInfo; FillLetterNFKD uses it to upper-case a code point.
3407 TextInfo ti = CultureInfo.InvariantCulture.TextInfo;
// Fills map [i] for a letter by deriving the entry from other map entries
// (its upper-case form or the characters of its decomposition) rather than
// by allocating a new fillIndex slot.
//   i          - code point whose map entry is to be filled.
//   checkUpper - when true and the upper-case form of i is in category 0xF,
//                the upper-case form is filled first and map [i] is created
//                in category 0xF.
//   greekRemap - enables a special case for decomposition characters whose
//                Level2 is 0xC.
// NOTE(review): this excerpt omits several lines of the method (early
// exits, some branch bodies and the trailing constructor arguments), so the
// comments below describe only the statements that are visible.
3409 private void FillLetterNFKD (int i, bool checkUpper, bool greekRemap)
// Already-mapped characters are tested first; the guarded statement is not
// visible here (presumably an early return - confirm).
3411 if (map [i].Defined)
// Upper-case form of i according to the invariant culture.
3413 int up = (int) ti.ToUpper ((char) i);
3414 if (checkUpper && map [up].Category == 0xF) {
// Make sure the upper-case entry exists before map [i] is created from it.
3417 FillLetterNFKD (up, checkUpper, greekRemap);
3418 map [i] = new CharMapEntry (0xF,
3422 int idx = decompIndex [i];
// The first character of the decomposition is treated as the primary
// character and is filled recursively before it is read.
3425 int primary = decompValues [decompIndex [i]];
3426 FillLetterNFKD (primary, checkUpper, greekRemap);
3428 int lv2 = map [primary].Level2;
// Walk the remaining characters of the decomposition.
3430 for (int l = 1; l < decompLength [i]; l++) {
3431 int tmp = decompValues [idx + l];
// Characters outside category 1 are singled out here; what happens to them
// is not visible in this excerpt.
3432 if (map [tmp].Category != 1)
3434 if (greekRemap && map [tmp].Level2 == 0xC)
// Accumulate the Level2 value of the decomposition character.
3437 off += map [tmp].Level2;
// The new entry takes Category and Level1 from the primary character.
3444 map [i] = new CharMapEntry (
3445 map [primary].Category,
3446 map [primary].Level1,
// Advances the fill index of the category named by hangulCat by one.
// fillIndex is a byte array, so incrementing past 0xFF wraps to 0; in that
// case the index is restarted at 0x2.
// NOTE(review): the line between the overflow test and the reset is not
// visible in this excerpt. Since hangulCat is passed by ref, it presumably
// moves on to the next category there - confirm against the full file.
3451 private void IncrementSequentialIndex (ref byte hangulCat)
3453 fillIndex [hangulCat]++;
3454 if (fillIndex [hangulCat] == 0) { // wrapped around (overflowed)
3456 fillIndex [hangulCat] = 0x2;
3460 // Reset fillIndex to fixed value and call AddLetterMap().
// Sets fillIndex [category] to alphaWeight and maps c with an update count
// of 0. Every code point listed for c in latinMap is then mapped the same
// way, also with an update count of 0.
//   c           - base letter.
//   category    - category whose fill index is reset and used.
//   alphaWeight - value stored into fillIndex [category] before mapping.
3461 private void AddAlphaMap (char c, byte category, byte alphaWeight)
3463 fillIndex [category] = alphaWeight;
3464 AddLetterMap (c, category, 0);
// 'as' yields null when latinMap has no ArrayList for c. The handling of
// that case is not visible in this excerpt (presumably an early return).
3466 ArrayList al = latinMap [c] as ArrayList;
3470 foreach (int cp in al)
3471 AddLetterMap ((char) cp, category, 0);
// Maps `voices` consecutive code points starting at i into category 0x22,
// and for each of them also the code point 0x60 above it (presumably the
// katakana counterpart of a hiragana letter - confirm).
// The level 2 value passed on is 0 for the first code point and b + 2 for
// the b-th following one. All calls use an update count of 0, so the fill
// index is not advanced by updateCount here.
// NOTE(review): some lines of the loop body are not visible in this excerpt.
3474 private void AddKanaMap (int i, byte voices)
3476 for (byte b = 0; b < voices; b++) {
3477 char c = (char) (i + b);
3478 byte arg = (byte) (b > 0 ? b + 2 : 0);
3480 AddLetterMapCore (c, 0x22, 0, arg, false);
3482 AddLetterMapCore ((char) (c + 0x60), 0x22, 0, arg, false);
// Convenience overload: maps the letter c via AddLetterMapCore with a
// level 2 value of 0 and deferLevel2 set to true.
3486 private void AddLetterMap (char c, byte category, byte updateCount)
3488 AddLetterMapCore (c, category, updateCount, 0, true);
// Maps the letter c and its related forms into `category`.
// Visible steps, in order:
//   1. the small form of c (from ToSmallForm) is added as a group with
//      updateCount;
//   2. the invariant-culture lower-case form of c, when it differs from c
//      and is not yet defined, is added recursively with an update count
//      of 0;
//   3. c itself is added as a group with an update count of 0;
//   4. fillIndex [category] is advanced by updateCount.
// level2 and deferLevel2 are passed through unchanged to AddCharMapGroup.
// NOTE(review): the declaration of c2, the condition around the small
// form, and the places where doUpdate is changed or tested are not visible
// in this excerpt; steps 3 and 4 are presumably conditional - confirm.
3491 private void AddLetterMapCore (char c, byte category, byte updateCount, byte level2, bool deferLevel2)
3494 // <small> updates index
3495 c2 = ToSmallForm (c);
3497 AddCharMapGroup (c2, category, updateCount, level2, deferLevel2);
3498 c2 = Char.ToLower (c, CultureInfo.InvariantCulture);
3499 if (c2 != c && !map [(int) c2].Defined)
3500 AddLetterMapCore (c2, category, 0, level2, deferLevel2);
3501 bool doUpdate = true;
3502 if (IsIgnorable ((int) c) || map [(int) c].Defined)
3505 AddCharMapGroup (c, category, 0, level2, deferLevel2);
3507 fillIndex [category] += updateCount;
// Convenience overload: calls the four-argument AddCharMap with alt = 0
// and returns its result.
3510 private bool AddCharMap (char c, byte category, byte increment)
3512 return AddCharMap (c, category, increment, 0);
// Creates the map entry for c at the current fill index of `category`, then
// advances fillIndex [category] by `increment`.
// Returns false without touching anything when c is ignorable or already
// has an entry. The return on the success path is not visible in this
// excerpt.
//   alt - the other weight of the entry; see the category 1 note below.
3515 private bool AddCharMap (char c, byte category, byte increment, byte alt)
3517 if (IsIgnorable ((int) c) || map [(int) c].Defined)
3518 return false; // do nothing
// For category 1 the two weights swap places: alt becomes the second
// constructor argument and the fill index the third. For every other
// category the fill index is second and alt is third.
3519 map [(int) c] = new CharMapEntry (category,
3520 category == 1 ? alt : fillIndex [category],
3521 category == 1 ? fillIndex [category] : alt);
3522 fillIndex [category] += increment;
3527 // Adds characters to table in the order below
3528 // (+ increases weight):
3532 // <full> | <super> | <sub>
3533 // <circle> | <wide> (| <narrow>)
3537 // level2 is fixed (does not increase).
// Decomposition types that AddCharMapGroup iterates over; each variant
// found is added with an increment of 0.
// NOTE(review): only part of the initializer is visible in this excerpt.
3538 int [] sameWeightItems = new int [] {
3539 DecompositionFraction,
3543 DecompositionCircle,
3545 DecompositionNarrow,
// Convenience overload: level2 = 0 with deferLevel2 = true.
3547 private void AddCharMapGroup (char c, byte category, byte updateCount)
3549 AddCharMapGroup (c, category, updateCount, 0, true);
// Convenience overload: uses the given level2 with deferLevel2 = false.
3552 private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2)
3554 AddCharMapGroup (c, category, updateCount, level2, false);
// Adds c and the compatibility variants recorded for it in nfkdMap to
// `category`. Visible order of operations:
//   1. the <small> variant, if any, is added with updateCount;
//   2. c itself is added with an increment of 0;
//   3. each variant whose decomposition type is in sameWeightItems is added
//      with an increment of 0;
//   4. fillIndex [category] is advanced by updateCount;
//   5. the <vertical> variant, if any, is added with updateCount.
// When level2 is 0 and deferLevel2 is true, level2 is taken from the
// diacritical table for the small and vertical variants.
// NOTE(review): several guard lines (the exit when c is already defined,
// null checks on nfkd and on the looked-up values, and the conditions
// around the diacritical assignments for c and the same-weight variants)
// are not visible in this excerpt.
3557 private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2, bool deferLevel2)
3559 if (map [(int) c].Defined)
3563 level2 = diacritical [(int) c];
// char.MinValue marks "no such variant".
3565 char small = char.MinValue;
3566 char vertical = char.MinValue;
3567 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
// nfkd is keyed by decomposition type (as a byte); the values are code
// points stored as boxed ints.
3569 object smv = nfkd [(byte) DecompositionSmall];
3571 small = (char) ((int) smv);
3572 object vv = nfkd [(byte) DecompositionVertical];
3574 vertical = (char) ((int) vv);
3577 // <small> updates index
3578 if (small != char.MinValue) {
3579 if (level2 == 0 && deferLevel2)
3580 level2 = diacritical [small];
3581 AddCharMap (small, category, updateCount, level2);
3585 AddCharMap (c, category, 0, level2);
3588 foreach (int weight in sameWeightItems) {
3589 object wv = nfkd [(byte) weight];
3592 level2 = diacritical [(int) wv];
3593 AddCharMap ((char) ((int) wv), category, 0, level2);
3598 // update index here.
3599 fillIndex [category] += updateCount;
3601 if (vertical != char.MinValue) {
3602 if (level2 == 0 && deferLevel2)
3603 level2 = diacritical [vertical];
3604 AddCharMap (vertical, category, updateCount, level2);
// Adds c at the current fill index of `category` (increment 0, alt 0) and
// then steps the index with IncrementSequentialIndex. `category` is passed
// by ref all the way down to that helper.
3608 private void AddCharMapCJK (char c, ref byte category)
3610 AddCharMap (c, category, 0, 0);
3611 IncrementSequentialIndex (ref category);
3613 // Special. I wonder why but Windows skips 9E F9.
// Step once more so that category 0x9E never uses index 0xF9.
3614 if (category == 0x9E && fillIndex [category] == 0xF9)
3615 IncrementSequentialIndex (ref category);
// Adds the CJK ideograph c through AddCharMapCJK, then the code points
// that must follow it:
//   - hard-coded characters from the 0x32xx range that are placed directly
//     after particular ideographs (the LAMESPEC cases below);
//   - the variants recorded for c in nfkdMap, looked up for every
//     decomposition type from 0 to 0x12.
// NOTE(review): several lines (closing braces, the conditions guarding the
// 0x32A2 / 0x32A9 insertions, null checks, the declaration of w and the
// switch header) are not visible in this excerpt.
3618 private void AddCharMapGroupCJK (char c, ref byte category)
3620 AddCharMapCJK (c, ref category);
3622 // LAMESPEC: see below.
3623 if (c == '\u5B78') {
3624 AddCharMapCJK ('\u32AB', ref category);
3625 AddCharMapCJK ('\u323B', ref category);
3627 if (c == '\u52DE') {
3628 AddCharMapCJK ('\u3298', ref category);
3629 AddCharMapCJK ('\u3238', ref category);
3632 AddCharMapCJK ('\u32A2', ref category);
3634 // Especially this mapping order totally does
3635 // not make sense to me.
3636 AddCharMapCJK ('\u32A9', ref category);
3638 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
// 0x12 is DecompositionCanonical, the highest decomposition type constant.
3641 for (byte weight = 0; weight <= 0x12; weight++) {
3642 object wv = nfkd [weight];
3647 // Special: they are ignored in this area.
3648 // FIXME: check if it is sane
3649 if (0xF900 <= w && w <= 0xFAD9)
3651 // LAMESPEC: on Windows some of CJK characters
3652 // in 3200-32B0 are incorrectly mapped. They
3653 // mix Chinese and Japanese Kanji when
3654 // ordering those characters.
// These are the same six code points that are inserted explicitly above.
3656 case 0x32A2: case 0x3298: case 0x3238:
3657 case 0x32A9: case 0x323B: case 0x32AB:
3661 AddCharMapCJK ((char) w, ref category);
3665 // For now it is only for 0x7 category.
// Adds c and the not-yet-defined characters whose single-element decomposition
// is c. Three passes over the code points are made: first the <small> forms
// (which set updateWeight), then <sub>/<super>/<wide>/<narrow> forms, then c
// itself, and finally a pass listing all five kinds again.
// Each scan uses "c2 < char.MaxValue", so U+FFFF itself is never examined.
// NOTE(review): the arguments of the wrapped AddCharMap calls and the
// condition around the fillIndex update are not visible in this excerpt.
3666 private void AddCharMapGroup2 (char c, byte category, byte updateCount, byte level2)
// Tests whether c's map entry is already flagged as Defined.
3668 if (map [(int) c].Defined)
3671 bool updateWeight = false;
3672 // Process in advance (lower primary weight)
3673 for (int c2 = 0; c2 < char.MaxValue; c2++) {
// Candidate filter: c2 is unmapped and decomposes to exactly the one value c.
3674 if (!map [c2].Defined &&
3675 decompLength [c2] == 1 &&
3676 (int) (decompValues [decompIndex [c2]]) == (int) c) {
3677 switch (decompType [c2]) {
3678 case DecompositionSmall:
3679 updateWeight = true;
3680 AddCharMap ((char) c2, category,
// The sum is computed as int and cast back to byte, so it wraps modulo 256.
3687 fillIndex [category] = (byte)
3688 (fillIndex [category] + updateCount);
3691 for (int c2 = 0; c2 < char.MaxValue; c2++) {
3692 if (!map [c2].Defined &&
3693 decompLength [c2] == 1 &&
3694 (int) (decompValues [decompIndex [c2]]) == (int) c) {
3695 switch (decompType [c2]) {
3696 case DecompositionSub:
3697 case DecompositionSuper:
3698 case DecompositionWide:
3699 case DecompositionNarrow:
3700 AddCharMap ((char) c2, category,
// The base character itself is added here, between the scans.
3708 AddCharMap (c, category, updateCount, level2);
3710 // Since nfkdMap is problematic to have two or more
3711 // NFKD to an identical character, here I iterate all.
3712 for (int c2 = 0; c2 < char.MaxValue; c2++) {
3713 if (!map [c2].Defined &&
3714 decompLength [c2] == 1 &&
3715 (int) (decompValues [decompIndex [c2]]) == (int) c) {
3716 switch (decompType [c2]) {
3717 case DecompositionWide:
3718 case DecompositionNarrow:
3719 case DecompositionSmall:
3720 case DecompositionSub:
3721 case DecompositionSuper:
3724 AddCharMap ((char) c2, category, updateCount, level2);
// Adds the Arabic character c (update count 0) and then scans the code points
// for characters whose decomposition ends with c, adding those as well.
// fillIndex [category] is increased by updateCount afterwards.
// NOTE(review): the remaining arguments of the AddCharMap call inside the loop
// are not visible in this excerpt.
3731 private void AddArabicCharMap (char c, byte category, byte updateCount, byte level2)
3734 AddCharMap (c, category, 0, level2);
3736 // Since nfkdMap is problematic to have two or more
3737 // NFKD to an identical character, here I iterate all.
// The bound "c2 < char.MaxValue" means U+FFFF itself is never examined.
3738 for (int c2 = 0; c2 < char.MaxValue; c2++) {
// Tests for characters that have no decomposition data.
3739 if (decompLength [c2] == 0)
// idx addresses the last element of c2's decomposition sequence.
3741 int idx = decompIndex [c2] + decompLength [c2] - 1;
3742 if ((int) (decompValues [idx]) == (int) c)
3743 AddCharMap ((char) c2, category,
3746 fillIndex [category] += updateCount;
// Shorthand for ToDecomposed (c, DecompositionSmall, false).
3749 char ToSmallForm (char c)
3751 return ToDecomposed (c, DecompositionSmall, false);
// Returns an element of c's decomposition sequence read from decompValues.
// d is the decomposition kind that decompType [c] is compared against.
// NOTE(review): the statement taken when the kinds differ and the condition
// on "tail" are not visible in this excerpt; presumably tail selects the last
// element instead of the first - confirm.
3754 char ToDecomposed (char c, byte d, bool tail)
3756 if (decompType [(int) c] != d)
// Index of the first element of c's decomposition.
3758 int idx = decompIndex [(int) c];
// Moves idx to the last element of c's decomposition.
3760 idx += decompLength [(int) c] - 1;
3761 return (char) decompValues [idx];
// Iterates over the JISCharacter entries in jisJapanese for the code point cp.
// NOTE(review): the loop body and return statements are not visible in this
// excerpt; presumably it reports whether any entry has CP == cp - confirm.
3764 bool ExistsJIS (int cp)
3766 foreach (JISCharacter j in jisJapanese)
3774 #region Level 3 properties (Case/Width)
// Returns the level 3 weight of c as stored in the sort key: the raw weight
// from ComputeLevel3WeightRaw plus 2, except that a raw weight of 0 stays 0.
3776 private byte ComputeLevel3Weight (char c)
3778 byte b = ComputeLevel3WeightRaw (c);
3779 return b > 0 ? (byte) (b + 2) : b;
// Computes the unadjusted level 3 (case/width) weight of c from a sequence of
// code point range tests followed by decompType, isSmallCapital and
// isUppercase lookups. ComputeLevel3Weight adds 2 to non-zero results.
// NOTE(review): the value returned for each range test is not visible in this
// excerpt.
3782 private byte ComputeLevel3WeightRaw (char c) // add 2 for sortkey value
3785 if ('\u3192' <= c && c <= '\u319F')
3788 // They have <narrow> NFKD mapping, and on Windows
3789 // those narrow characters are regarded as "normal",
3790 // thus those characters themselves are regarded as
3791 // "wide". grep "<narrow>" and you can pick them up
3792 // (ignoring Kana, Hangul etc.)
3809 if ('\u11A8' <= c && c <= '\u11F9')
3811 if ('\uFFA0' <= c && c <= '\uFFDC')
3813 if ('\u3130' <= c && c <= '\u3164')
3815 if ('\u3165' <= c && c <= '\u318E')
3817 // Georgian Capital letters
3818 if ('\u10A0' <= c && c <= '\u10C5')
3821 if ('\u2776' <= c && c <= '\u277F')
3823 if ('\u2780' <= c && c <= '\u2789')
// NOTE(review): this range (2776-2793) overlaps both ranges tested just above.
3825 if ('\u2776' <= c && c <= '\u2793')
3827 if ('\u2160' <= c && c <= '\u216F')
3829 if ('\u2181' <= c && c <= '\u2182')
3832 if ('\u2135' <= c && c <= '\u2138')
3834 // I believe that Windows has a bug on setting level 3
3835 // weight here. NFKD results in different values.
// Both bounds are exclusive: U+FE80 and U+FF00 themselves do not match.
3836 if ('\uFE80' < c && c < '\uFF00') {
3837 // 2(Isolated)/8(Final)/0x18(Medial)
3838 switch (decompType [(int) c]) {
3839 case DecompositionIsolated:
3841 case DecompositionFinal:
3843 case DecompositionMedial:
3845 case DecompositionInitial:
3850 // I have no idea why those symbols have level 3 weight
3851 if (c == '\u2104' || c == '\u212B')
3853 if ('\u211E' <= c && c <= '\u212B')
3856 // actually I dunno the reason why they have weights.
3885 switch (decompType [(int) c]) {
3886 case DecompositionWide: // <wide>
3887 case DecompositionSub: // <sub>
3888 case DecompositionSuper: // <super>
// For these three kinds the decomposition type value itself is OR-ed into the
// result.
3889 ret |= decompType [(int) c];
3892 if (isSmallCapital [(int) c]) // grep "SMALL CAPITAL"
3894 if (isUppercase [(int) c]) // DerivedCoreProperties
// Variant of IsIgnorable that consults unicodeAge [i] and the character's
// UnicodeCategory (OtherNotAssigned / Format).
// NOTE(review): a second IsIgnorable (int) with the same signature follows in
// this file, so presumably only one of the two is compiled (the conditional
// compilation directives are not visible in this excerpt) - confirm.
3904 static bool IsIgnorable (int i)
3906 if (unicodeAge [i] >= 3.1)
3908 switch (char.GetUnicodeCategory ((char) i)) {
3909 case UnicodeCategory.OtherNotAssigned:
3910 case UnicodeCategory.Format:
// Decides whether code point i is treated as ignorable, using hard-coded code
// point lists and ranges followed by a check of the character's
// UnicodeCategory.
// NOTE(review): the return value attached to each case group and to the range
// expression is not visible in this excerpt - confirm against the full file.
3917 // FIXME: In the future use DerivedAge.txt to examine character
3918 // versions and set those ones that have higher version than
3919 // 1.0 as ignorable.
3920 static bool IsIgnorable (int i)
3924 // I guess, those characters are added between
3925 // Unicode 1.0 (LCMapString) and Unicode 3.1
3926 // (UnicodeCategory), so they used to be
3927 // something like OtherNotAssigned as of Unicode 1.1.
3928 case 0x2df: case 0x387:
3929 case 0x3d7: case 0x3d8: case 0x3d9:
3930 case 0x3f3: case 0x3f4: case 0x3f5: case 0x3f6:
3931 case 0x400: case 0x40d: case 0x450: case 0x45d:
3932 case 0x587: case 0x58a: case 0x5c4: case 0x640:
3933 case 0x653: case 0x654: case 0x655: case 0x66d:
3935 case 0x1e9b: case 0x202f: case 0x20ad:
3936 case 0x20ae: case 0x20af:
3937 case 0x20e2: case 0x20e3:
3938 case 0x2139: case 0x213a: case 0x2183:
3939 case 0x2425: case 0x2426: case 0x2619:
3940 case 0x2670: case 0x2671: case 0x3007:
3941 case 0x3190: case 0x3191:
3942 case 0xfffc: case 0xfffd:
3944 // exceptional characters filtered by the
3945 // following conditions. Originally those exceptional
3946 // ranges are incorrect (they should not be ignored)
3947 // and most of those characters are unfortunately in
3949 case 0x4d8: case 0x4d9:
3950 case 0x4e8: case 0x4e9:
3952 case 0x3036: case 0x303f:
3953 case 0x337b: case 0xfb1e:
3958 // The whole Sinhala characters.
3959 0x0D82 <= i && i <= 0x0DF4
3960 // The whole Tibetan characters.
3961 || 0x0F00 <= i && i <= 0x0FD1
3962 // The whole Myanmar characters.
3963 || 0x1000 <= i && i <= 0x1059
3964 // The whole Ethiopic, Cherokee,
3965 // Canadian Syllabics, Ogham, Runic,
3966 // Tagalog, Hanunoo, Philippine,
3967 // Buhid, Tagbanwa, Khmer and Mongolian
3969 || 0x1200 <= i && i <= 0x1DFF
3970 // Greek extension characters.
3971 || 0x1F00 <= i && i <= 0x1FFF
3972 // The whole Braille characters.
3973 || 0x2800 <= i && i <= 0x28FF
3974 // CJK radical characters.
3975 || 0x2E80 <= i && i <= 0x2EF3
3976 // Kangxi radical characters.
3977 || 0x2F00 <= i && i <= 0x2FD5
3978 // Ideographic description characters.
3979 || 0x2FF0 <= i && i <= 0x2FFB
3980 // Bopomofo letter and final
3981 || 0x31A0 <= i && i <= 0x31B7
3982 // White square with quadrant characters.
3983 || 0x25F0 <= i && i <= 0x25F7
3984 // Ideographic telegraph symbols.
3985 || 0x32C0 <= i && i <= 0x32CB
3986 || 0x3358 <= i && i <= 0x3370
3987 || 0x33E0 <= i && i <= 0x33FF
3988 // The whole YI characters.
3989 || 0xA000 <= i && i <= 0xA48C
3990 || 0xA490 <= i && i <= 0xA4C6
3991 // Armenian small ligatures
3992 || 0xFB13 <= i && i <= 0xFB17
3993 // hebrew, arabic, variation selector.
3994 || 0xFB1D <= i && i <= 0xFE2F
3995 // Arabic ligatures.
3996 || 0xFEF5 <= i && i <= 0xFEFC
3997 // FIXME: why are they excluded?
3998 || 0x01F6 <= i && i <= 0x01F9
3999 || 0x0218 <= i && i <= 0x0233
4000 || 0x02A9 <= i && i <= 0x02AD
4001 || 0x02EA <= i && i <= 0x02EE
4002 || 0x0349 <= i && i <= 0x036F
4003 || 0x0488 <= i && i <= 0x048F
4004 || 0x04D0 <= i && i <= 0x04FF
4005 || 0x0500 <= i && i <= 0x050F // actually it matters only for 2.0
4006 || 0x06D6 <= i && i <= 0x06ED
4007 || 0x06FA <= i && i <= 0x06FE
4008 || 0x2048 <= i && i <= 0x204D
4009 || 0x20e4 <= i && i <= 0x20ea
4010 || 0x213C <= i && i <= 0x214B
4011 || 0x21EB <= i && i <= 0x21FF
4012 || 0x22F2 <= i && i <= 0x22FF
4013 || 0x237B <= i && i <= 0x239A
4014 || 0x239B <= i && i <= 0x23CF
4015 || 0x24EB <= i && i <= 0x24FF
4016 || 0x2596 <= i && i <= 0x259F
4017 || 0x25F8 <= i && i <= 0x25FF
4018 || 0x2672 <= i && i <= 0x2689
4019 || 0x2768 <= i && i <= 0x2775
4020 || 0x27d0 <= i && i <= 0x27ff
4021 || 0x2900 <= i && i <= 0x2aff
4022 || 0x3033 <= i && i <= 0x303F
4023 || 0x31F0 <= i && i <= 0x31FF
4024 || 0x3250 <= i && i <= 0x325F
4025 || 0x32B1 <= i && i <= 0x32BF
4026 || 0x3371 <= i && i <= 0x337B
4027 || 0xFA30 <= i && i <= 0xFA6A
// Remaining decisions are made from the character's general category.
4031 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
4033 case UnicodeCategory.PrivateUse:
4034 case UnicodeCategory.Surrogate:
4036 // ignored by nature
4037 case UnicodeCategory.Format:
4038 case UnicodeCategory.OtherNotAssigned:
4045 // To check IsIgnorable sanity, try the driver below under MS.NET.
// Driver entry point: evaluates IsIgnorable for every value 0..char.MaxValue
// and passes each result to Dump.
4048 public static void Main ()
4050 for (int i = 0; i <= char.MaxValue; i++)
4051 Dump (i, IsIgnorable (i));
// Checks one prediction against CompareInfo.Compare on the invariant culture
// and prints a line for the character.
// NOTE(review): the definition of s1 and the statement guarded by the
// "(ret == 0) == ignore" test are not visible in this excerpt; presumably
// only mismatches are printed - confirm.
4054 static void Dump (int i, bool ignore)
4056 switch (Char.GetUnicodeCategory ((char) i)) {
4057 case UnicodeCategory.PrivateUse:
4058 case UnicodeCategory.Surrogate:
4059 return; // check nothing
// s2 consists of ten copies of the character under test.
4063 string s2 = new string ((char) i, 10);
4064 int ret = CultureInfo.InvariantCulture.CompareInfo.Compare (s1, s2, CompareOptions.IgnoreCase);
4065 if ((ret == 0) == ignore)
// Output format: "o" or "x" for the predicted flag, the code point in hex and
// its Unicode category.
4067 Console.WriteLine ("{0} : {1:x} {2}", ignore ? "o" : "x", i, Char.GetUnicodeCategory ((char) i));
4070 #endregion // IsIgnorable
4072 #region IsIgnorableSymbol
// Decides whether code point i counts as an ignorable symbol. It starts by
// consulting IsIgnorable (i), then applies hard-coded code point lists and
// ranges, the character's UnicodeCategory and finally the Char.IsLetterOrDigit
// / IsNumber / IsControl / IsSeparator / IsPunctuation / IsSymbol tests.
// The sanity driver that follows this method compares the result against
// CompareOptions.IgnoreSymbols.
// NOTE(review): the return value attached to each case group and test is not
// visible in this excerpt - confirm against the full file.
4073 static bool IsIgnorableSymbol (int i)
4075 if (IsIgnorable (i))
4080 case 0x00b5: case 0x01C0: case 0x01C1:
4081 case 0x01C2: case 0x01C3: case 0x01F6:
4082 case 0x01F7: case 0x01F8: case 0x01F9:
4083 case 0x02D0: case 0x02EE: case 0x037A:
4084 case 0x03D7: case 0x03F3:
4085 case 0x0400: case 0x040d:
4086 case 0x0450: case 0x045d:
4087 case 0x048C: case 0x048D:
4088 case 0x048E: case 0x048F:
4089 case 0x0587: case 0x0640: case 0x06E5:
4090 case 0x06E6: case 0x06FA: case 0x06FB:
4091 case 0x06FC: case 0x093D: case 0x0950:
4092 case 0x1E9B: case 0x2139: case 0x3006:
4093 case 0x3033: case 0x3034: case 0x3035:
4094 case 0xFE7E: case 0xFE7F:
4096 case 0x16EE: case 0x16EF: case 0x16F0:
4098 case 0x2183: // ROMAN NUMERAL REVERSED ONE HUNDRED
4099 case 0x3007: // IDEOGRAPHIC NUMBER ZERO
4100 case 0x3038: // HANGZHOU NUMERAL TEN
4101 case 0x3039: // HANGZHOU NUMERAL TWENTY
4102 case 0x303a: // HANGZHOU NUMERAL THIRTY
4108 case 0x02B9: case 0x02BA: case 0x02C2:
4109 case 0x02C3: case 0x02C4: case 0x02C5:
4110 case 0x02C8: case 0x02CC: case 0x02CD:
4111 case 0x02CE: case 0x02CF: case 0x02D2:
4112 case 0x02D3: case 0x02D4: case 0x02D5:
4113 case 0x02D6: case 0x02D7: case 0x02DE:
4114 case 0x02E5: case 0x02E6: case 0x02E7:
4115 case 0x02E8: case 0x02E9:
4116 case 0x309B: case 0x309C:
4118 case 0x055A: // Armenian Apostrophe
4119 case 0x05C0: // Hebrew Punct
4120 case 0x0E4F: // Thai FONGMAN
4121 case 0x0E5A: // Thai ANGKHANKHU
4122 case 0x0E5B: // Thai KHOMUT
4124 case 0x09F2: // Bengali Rupee Mark
4125 case 0x09F3: // Bengali Rupee Sign
4127 case 0x221e: // INF.
// Note the mixed bounds: the first and third ranges exclude their upper limit.
4136 if (0xFE70 <= i && i < 0xFE7C // ARABIC LIGATURES B
4138 || 0x0501 <= i && i <= 0x0510 // CYRILLIC KOMI
4139 || 0xFA30 <= i && i < 0xFA70 // CJK COMPAT
4144 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
4146 case UnicodeCategory.Surrogate:
4147 return false; // inconsistent
4149 case UnicodeCategory.SpacingCombiningMark:
4150 case UnicodeCategory.EnclosingMark:
4151 case UnicodeCategory.NonSpacingMark:
4152 case UnicodeCategory.PrivateUse:
4154 if (0x064B <= i && i <= 0x0652) // Arabic
4158 case UnicodeCategory.Format:
4159 case UnicodeCategory.OtherNotAssigned:
4166 // latin in a circle
4167 0x249A <= i && i <= 0x24E9
4168 || 0x2100 <= i && i <= 0x2132
4170 || 0x3196 <= i && i <= 0x31A0
4172 || 0x3200 <= i && i <= 0x321C
4174 || 0x322A <= i && i <= 0x3243
4176 || 0x3260 <= i && i <= 0x32B0
4177 || 0x32D0 <= i && i <= 0x3357
4178 || 0x337B <= i && i <= 0x33DD
4180 use = !Char.IsLetterOrDigit ((char) i);
4184 // This "Digit" rule is a mystery.
4185 // It filters some symbols out.
4186 if (Char.IsLetterOrDigit ((char) i))
4188 if (Char.IsNumber ((char) i))
4190 if (Char.IsControl ((char) i)
4191 || Char.IsSeparator ((char) i)
4192 || Char.IsPunctuation ((char) i))
4194 if (Char.IsSymbol ((char) i))
4197 // FIXME: should check more
4202 // To check IsIgnorableSymbol sanity, try the driver below under MS.NET.
// Driver entry point: for every value 0..char.MaxValue it compares the
// IsIgnorableSymbol prediction with the result of comparing "TEST " against
// "TEST " + the character under CompareOptions.IgnoreSymbols, and prints a
// line when the two disagree.
// NOTE(review): the statement taken for surrogates and the remaining
// arguments of the WriteLine call are not visible in this excerpt.
4204 public static void Main ()
4206 CompareInfo ci = CultureInfo.InvariantCulture.CompareInfo;
4207 for (int i = 0; i <= char.MaxValue; i++) {
4208 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
4209 if (uc == UnicodeCategory.Surrogate)
4212 bool ret = IsIgnorableSymbol (i);
4214 string s1 = "TEST ";
4215 string s2 = "TEST " + (char) i;
4217 int result = ci.Compare (s1, s2, CompareOptions.IgnoreSymbols);
// A compare result of 0 means the appended character was ignored.
4219 if (ret != (result == 0))
4220 Console.WriteLine ("{0} : {1:x}[{2}]({3})",
4221 ret ? "should not ignore" :
// Decides whether code point i counts as an ignorable non-spacing character.
// It starts by consulting IsIgnorable (i), then applies hard-coded code point
// lists and ranges; the last visible statement reports whether the character's
// UnicodeCategory is NonSpacingMark.
// NOTE(review): the return value attached to each case group and range test
// is not visible in this excerpt - confirm against the full file.
4230 static bool IsIgnorableNonSpacing (int i)
4232 if (IsIgnorable (i))
4236 case 0x02C8: case 0x02DE: case 0x0559: case 0x055A:
4237 case 0x05C0: case 0x0ABD: case 0x0CD5: case 0x0CD6:
4238 case 0x309B: case 0x309C: case 0xFF9E: case 0xFF9F:
4240 case 0x02D0: case 0x0670: case 0x0901: case 0x0902:
4241 case 0x094D: case 0x0962: case 0x0963: case 0x0A41:
4242 case 0x0A42: case 0x0A47: case 0x0A48: case 0x0A4B:
4243 case 0x0A4C: case 0x0A81: case 0x0A82: case 0x0B82:
4244 case 0x0BC0: case 0x0CBF: case 0x0CC6: case 0x0CCC:
4245 case 0x0CCD: case 0x0E4E:
4249 if (0x02b9 <= i && i <= 0x02c5
4250 || 0x02cc <= i && i <= 0x02d7
4251 || 0x02e4 <= i && i <= 0x02ef
4252 || 0x20DD <= i && i <= 0x20E0
// The literal 0x00652 below has an extra leading zero; its value is 0x0652.
4256 if (0x064B <= i && i <= 0x00652
4257 || 0x0941 <= i && i <= 0x0948
4258 || 0x0AC1 <= i && i <= 0x0ACD
4259 || 0x0C3E <= i && i <= 0x0C4F
4260 || 0x0E31 <= i && i <= 0x0E3F
4264 return Char.GetUnicodeCategory ((char) i) ==
4265 UnicodeCategory.NonSpacingMark;
4268 // We can reuse IsIgnorableSymbol testcode
4269 // for IsIgnorableNonSpacing.
// Members of the per-character map entry (type CharMapEntry, stored in the
// "map" array indexed by code point).
// NOTE(review): the type header, a level 1 member and the rest of the
// constructor body are not visible in this excerpt.
4275 public byte Category;
4277 public byte Level2; // It is always single byte.
// Tested by AddCharMapGroup / AddCharMapGroup2 before adding a character.
4278 public bool Defined;
// Builds an entry from its category, level 1 and level 2 values.
4280 public CharMapEntry (byte category, byte level1, byte level2)
4282 Category = category;
// Members of JISCharacter: a pair of integer code values, CP and JIS.
// JISComparer orders instances by the JIS member.
// NOTE(review): the type header and constructor body are not visible in this
// excerpt; presumably CP is the Unicode code point and JIS the JIS code -
// confirm.
4291 public readonly int CP;
4292 public readonly int JIS;
4294 public JISCharacter (int cp, int cpJIS)
// IComparer that orders JISCharacter objects by ascending JIS value.
4301 class JISComparer : IComparer
// Shared instance of the comparer.
4303 public static readonly JISComparer Instance =
// Both arguments must be JISCharacter; the casts throw otherwise.
4306 public int Compare (object o1, object o2)
4308 JISCharacter j1 = (JISCharacter) o1;
4309 JISCharacter j2 = (JISCharacter) o2;
// Negative, zero or positive according to the difference of the JIS values.
4310 return j1.JIS - j2.JIS;
// Pairs an integer code value (CP) with a name string. NonJISComparer orders
// instances by Name.
// NOTE(review): the constructor body is not visible in this excerpt.
4314 class NonJISCharacter
4316 public readonly int CP;
4317 public readonly string Name;
4319 public NonJISCharacter (int cp, string name)
// IComparer that orders NonJISCharacter objects by ordinal comparison of
// their Name strings.
4326 class NonJISComparer : IComparer
// Shared instance of the comparer.
4328 public static readonly NonJISComparer Instance =
4329 new NonJISComparer ();
// Both arguments must be NonJISCharacter; the casts throw otherwise.
4331 public int Compare (object o1, object o2)
4333 NonJISCharacter j1 = (NonJISCharacter) o1;
4334 NonJISCharacter j2 = (NonJISCharacter) o2;
4335 return string.CompareOrdinal (j1.Name, j2.Name);
// IComparer for DictionaryEntry items whose Value is a boxed decimal and whose
// Key is a boxed int. Values are compared with Decimal.Compare.
// NOTE(review): how "ret" and the two keys are combined into the final result
// is not visible in this excerpt; presumably the keys break ties - confirm.
4339 class DecimalDictionaryValueComparer : IComparer
// Shared instance; the constructor is private.
4341 public static readonly DecimalDictionaryValueComparer Instance
4342 = new DecimalDictionaryValueComparer ();
4344 private DecimalDictionaryValueComparer ()
4348 public int Compare (object o1, object o2)
4350 DictionaryEntry e1 = (DictionaryEntry) o1;
4351 DictionaryEntry e2 = (DictionaryEntry) o2;
4352 // FIXME: in case of 0, compare decomposition categories
4353 int ret = Decimal.Compare ((decimal) e1.Value, (decimal) e2.Value);
4356 int i1 = (int) e1.Key;
4357 int i2 = (int) e2.Key;
// IComparer for DictionaryEntry items whose Value is a string and whose Key is
// a boxed int. Values are compared with String.Compare (string, string), which
// is a culture-sensitive comparison.
// NOTE(review): how "ret" and the two keys are combined into the final result
// is not visible in this excerpt; presumably the keys break ties - confirm.
4362 class StringDictionaryValueComparer : IComparer
// Shared instance; the constructor is private.
4364 public static readonly StringDictionaryValueComparer Instance
4365 = new StringDictionaryValueComparer ();
4367 private StringDictionaryValueComparer ()
4371 public int Compare (object o1, object o2)
4373 DictionaryEntry e1 = (DictionaryEntry) o1;
4374 DictionaryEntry e2 = (DictionaryEntry) o2;
4375 int ret = String.Compare ((string) e1.Value, (string) e2.Value);
4378 int i1 = (int) e1.Key;
4379 int i2 = (int) e2.Key;
// IComparer for boxed char values that compares the sort keys returned by
// CollationElementTable. For each index shared by both characters it computes
// the difference of the Primary, Secondary, Thirtiary and Quarternary members
// in that order.
// NOTE(review): the tests on "v" between the four differences and the handling
// of differing key counts are not visible in this excerpt.
4384 class UCAComparer : IComparer
// Shared instance; the constructor is private.
4386 public static readonly UCAComparer Instance
4387 = new UCAComparer ();
4389 private UCAComparer ()
// Both arguments must be boxed chars; the casts throw otherwise.
4393 public int Compare (object o1, object o2)
4395 char i1 = (char) o1;
4396 char i2 = (char) o2;
4398 int l1 = CollationElementTable.GetSortKeyCount (i1);
4399 int l2 = CollationElementTable.GetSortKeyCount (i2);
// l is the smaller of the two sort key counts.
4400 int l = l1 > l2 ? l2 : l1;
4402 for (int i = 0; i < l; i++) {
4403 SortKeyValue k1 = CollationElementTable.GetSortKey (i1, i);
4404 SortKeyValue k2 = CollationElementTable.GetSortKey (i2, i);
4405 int v = k1.Primary - k2.Primary;
4408 v = k1.Secondary - k2.Secondary;
4411 v = k1.Thirtiary - k2.Thirtiary;
4414 v = k1.Quarternary - k2.Quarternary;
// Members of Tailoring: a list of tailoring map items plus lcid, alias and
// FrenchSort settings.
// NOTE(review): the type header, the field declarations for lcid / alias /
// frenchSort, the constructor bodies and the property headers of the two
// getters below are not visible in this excerpt.
// Holds the ITailoringMap items in the order they were added.
4427 ArrayList items = new ArrayList ();
4429 public Tailoring (int lcid)
4434 public Tailoring (int lcid, int alias)
4441 get { return lcid; }
4445 get { return alias; }
// Read/write access to the frenchSort flag.
4448 public bool FrenchSort {
4449 get { return frenchSort; }
4450 set { frenchSort = value; }
// Appends a DiacriticalMap item built from target and replace.
4453 public void AddDiacriticalMap (byte target, byte replace)
4455 items.Add (new DiacriticalMap (target, replace));
// Appends a SortKeyMap item built from source and sortkey.
4458 public void AddSortKeyMap (string source, byte [] sortkey)
4460 items.Add (new SortKeyMap (source, sortkey));
// Appends a ReplacementMap item built from source and replace.
4463 public void AddReplacementMap (string source, string replace)
4465 items.Add (new ReplacementMap (source, replace));
// Concatenates the ToCharArray () output of every item, in insertion order,
// into a single char array.
4468 public char [] ItemToCharArray ()
4470 ArrayList al = new ArrayList ();
4471 foreach (ITailoringMap m in items)
4472 al.AddRange (m.ToCharArray ());
4473 return al.ToArray (typeof (char)) as char [];
// Implemented by the tailoring items (DiacriticalMap, SortKeyMap,
// ReplacementMap) collected by Tailoring.
4476 interface ITailoringMap
// Returns the item as a char array; implementations put a kind marker in
// element 0.
4478 char [] ToCharArray ();
// Tailoring item that pairs a Target byte with a Replace byte.
// NOTE(review): the constructor body and the return statement of ToCharArray
// are not visible in this excerpt.
4481 class DiacriticalMap : ITailoringMap
4483 public readonly byte Target;
4484 public readonly byte Replace;
4486 public DiacriticalMap (byte target, byte replace)
// Fills a three-element array: kind marker 2, then Target, then Replace.
4492 public char [] ToCharArray ()
4494 char [] ret = new char [3];
4495 ret [0] = (char) 02; // kind:DiacriticalMap
4496 ret [1] = (char) Target;
4497 ret [2] = (char) Replace;
// Tailoring item that pairs a Source string with SortKey bytes.
// NOTE(review): the constructor body and the return statement of ToCharArray
// are not visible in this excerpt.
4502 class SortKeyMap : ITailoringMap
4504 public readonly string Source;
4505 public readonly byte [] SortKey;
4507 public SortKeyMap (string source, byte [] sortkey)
// Fills an array of Source.Length + 7 chars:
//   [0]                                   kind marker 1
//   [1 .. Source.Length]                  the characters of Source
//   [Source.Length + 2 .. Source.Length + 5]  SortKey [0] .. SortKey [3]
// The visible statements do not write elements Source.Length + 1 and
// Source.Length + 6, and only the first four SortKey bytes are read.
4513 public char [] ToCharArray ()
4515 char [] ret = new char [Source.Length + 7];
4516 ret [0] = (char) 01; // kind:SortKeyMap
4517 for (int i = 0; i < Source.Length; i++)
4518 ret [i + 1] = Source [i];
4520 for (int i = 0; i < 4; i++)
4521 ret [i + Source.Length + 2] = (char) SortKey [i];
4526 class ReplacementMap : ITailoringMap
4528 public readonly string Source;
4529 public readonly string Replace;
4531 public ReplacementMap (string source, string replace)
4537 public char [] ToCharArray ()
4539 char [] ret = new char [Source.Length + Replace.Length + 3];
4540 ret [0] = (char) 03; // kind:ReplaceMap
4542 for (int i = 0; i < Source.Length; i++)
4543 ret [pos++] = Source [i];
4546 for (int i = 0; i < Replace.Length; i++)
4547 ret [pos++] = Replace [i];