3 // There are two kinds of sort keys: those which are computed and those which are laid out
4 // as an indexed array. Computed sort keys are:
9 // Also, for composite characters it should prepare a different index table.
11 // Though it is possible to "compute" level 3 weights, they are still dumped
12 // to an array to avoid execution cost.
16 // * sortkey getter signature
18 // int GetSortKey (string s, int index, SortKeyBuffer buf)
19 // Stores sort key for corresponding character element into buf and
20 // returns the length of the consumed _source_ character element in s.
22 // * character length to consume
24 // If there are characters whose primary weight is 0, they are consumed
25 // and considered as a part of the character element.
31 using System.Collections;
32 using System.Globalization;
36 using UUtil = Mono.Globalization.Unicode.MSCompatUnicodeTableUtil;
38 namespace Mono.Globalization.Unicode
40 internal class MSCompatSortKeyTableGenerator
// Program entry point: creates a generator instance and passes the
// command-line arguments straight to Run.
42 public static void Main (string [] args)
44 new MSCompatSortKeyTableGenerator ().Run (args);
// Numeric codes for the decomposition kinds recognised by this tool.
// ProcessUnidataLine stores one of these per code point in decompType, and
// the serializer switches on Narrow/Wide/Super/Sub to build widthCompat.
// Note: Full (0xE) and Narrow (0xD) are declared out of numeric order.
47 const int DecompositionWide = 1; // fixed
48 const int DecompositionSub = 2; // fixed
49 const int DecompositionSmall = 3;
50 const int DecompositionIsolated = 4;
51 const int DecompositionInitial = 5;
52 const int DecompositionFinal = 6;
53 const int DecompositionMedial = 7;
54 const int DecompositionNoBreak = 8;
55 const int DecompositionVertical = 9;
56 const int DecompositionFraction = 0xA;
57 const int DecompositionFont = 0xB;
58 const int DecompositionSuper = 0xC; // fixed
59 const int DecompositionFull = 0xE;
60 const int DecompositionNarrow = 0xD;
61 const int DecompositionCircle = 0xF;
62 const int DecompositionSquare = 0x10;
63 const int DecompositionCompat = 0x11;
// Assigned by ProcessUnidataLine on the path that follows the tag switch
// (the exact condition is not visible in this excerpt).
64 const int DecompositionCanonical = 0x12;
// Sink for the generated C# source text; defaults to standard output.
66 TextWriter Result = Console.Out;
// 256-slot counter table, indexed by category.
68 byte [] fillIndex = new byte [256]; // by category
// One entry per UTF-16 code unit value (0..char.MaxValue); the serializer
// reads Category, Level1 and Level2 from each entry.
69 CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1];
// Characters singled out for special "ignore" handling (their use is outside
// this excerpt).
71 char [] specialIgnore = new char [] {
72 '\u3099', '\u309A', '\u309B', '\u309C', '\u0BCD',
73 '\u0E47', '\u0E4C', '\uFF9E', '\uFF9F'
76 // FIXME: need more love (as always)
// alphabets (26 Latin capitals plus 3 extra letters) and alphaWeights below
// both hold 29 entries; presumably they are parallel arrays - verify against
// the code that consumes them.
77 char [] alphabets = new char [] {'A', 'B', 'C', 'D', 'E', 'F',
78 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
79 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
80 '\u0292', '\u01BE', '\u0298'};
81 byte [] alphaWeights = new byte [] {
82 2, 9, 0xA, 0x1A, 0x21,
83 0x23, 0x25, 0x2C, 0x32, 0x35,
84 0x36, 0x48, 0x51, 0x70, 0x7C,
85 0x7E, 0x89, 0x8A, 0x91, 0x99,
86 0x9F, 0xA2, 0xA4, 0xA6, 0xA7,
87 0xA9, 0xAA, 0xB3, 0xB4};
// The tables below are all indexed directly by code point (0..char.MaxValue).
// Set by ProcessUnidataLine when the data line contains "SMALL CAPITAL".
89 bool [] isSmallCapital = new bool [char.MaxValue + 1];
// Set by ProcessDerivedCorePropLine for every code point in a matched range.
90 bool [] isUppercase = new bool [char.MaxValue + 1];
// Decomposition kind (one of the Decomposition* constants) per code point.
92 byte [] decompType = new byte [char.MaxValue + 1];
// Start index and element count of each code point's decomposition inside
// the shared decompValues array.
93 int [] decompIndex = new int [char.MaxValue + 1];
94 int [] decompLength = new int [char.MaxValue + 1];
// Numeric value parsed from the numeric fields of the Unicode data line.
96 decimal [] decimalValue = new decimal [char.MaxValue + 1];
// Diacritical weight accumulated per code point by ProcessUnidataLine.
98 byte [] diacritical = new byte [char.MaxValue + 1];
// Character-name fragments that identify a diacritic (or an enclosing form
// such as CIRCLED / PARENTHESIZED). ProcessUnidataLine searches each data
// line for these fragments and adds the weight found at the same index in
// diacriticWeights. The two arrays must have identical lengths;
// ProcessUnidataLine throws if they differ.
// Order matters: more specific labels are listed before the labels they
// contain (see the comments inside diacriticWeights).
// NOTE: several rows of both arrays are not visible in this excerpt.
100 string [] diacritics = new string [] {
101 "DOUBLE VERTICAL LINE ABOVE",
102 "ABKHASIAN CHE WITH DESCENDER",
103 // LATIN, CYRILLIC etc.
104 "VERTICAL LINE ABOVE", "UPTURN", "DOUBLE-STRUCK",
106 "MIDDLE HOOK", "WITH VERTICAL LINE ABOVE;", "WITH TONOS",
107 "WITH ACUTE ACCENT;", "WITH GRAVE ACCENT;",
108 "WITH ACUTE;", "WITH GRAVE;",
110 "WITH DOT ABOVE;", " MIDDLE DOT;",
111 "WITH CIRCUMFLEX ACCENT;", "WITH CIRCUMFLEX;",
113 "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;",
114 "DIALYTIKA TONOS", "DIALYTIKA AND TONOS", "WITH MACRON;", "WITH TILDE;", "WITH RING ABOVE;",
115 "WITH OGONEK;", "WITH CEDILLA;",
117 " DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;",
118 "WITH STROKE;", " CIRCUMFLEX AND ACUTE;",
120 " DIAERESIS AND ACUTE;", "WITH CIRCUMFLEX AND GRAVE;", " L SLASH;",
121 " DIAERESIS AND GRAVE;",
123 " CARON AND DOT ABOVE;", " BREVE AND GRAVE;",
124 " MACRON AND ACUTE;",
125 " MACRON AND GRAVE;",
127 " DIAERESIS AND CARON", " DOT ABOVE AND MACRON", " TILDE AND ACUTE",
128 " RING ABOVE AND ACUTE",
129 " DIAERESIS AND MACRON", " CEDILLA AND ACUTE", " MACRON AND DIAERESIS",
130 " CIRCUMFLEX AND TILDE",
131 " TILDE AND DIAERESIS",
134 " CEDILLA AND BREVE",
135 " OGONEK AND MACRON",
138 "WITH HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;",
142 " PRECEDED BY APOSTROPHE",
144 " LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE",
147 " RETROFLEX;", "DIAERESIS BELOW",
150 " CIRCUMFLEX BELOW", "HORN AND ACUTE",
151 " BREVE BELOW;", " HORN AND GRAVE",
154 " DOT BELOW AND DOT ABOVE",
155 " RIGHT HALF RING", " HORN AND TILDE",
156 " CIRCUMFLEX AND DOT BELOW",
157 " BREVE AND DOT BELOW",
158 " DOT BELOW AND MACRON",
160 " HORN AND HOOK ABOVE",
162 // CIRCLED, PARENTHESIZED and so on
163 "CIRCLED DIGIT", "CIRCLED NUMBER", "CIRCLED LATIN",
164 "CIRCLED KATAKANA", "CIRCLED SANS-SERIF",
165 "PARENTHESIZED DIGIT", "PARENTHESIZED NUMBER", "PARENTHESIZED LATIN",
// Weight for the label at the same position in diacritics.
167 byte [] diacriticWeights = new byte [] {
168 // this is to pick U+30E (DOUBLE VERTICAL LINE ABOVE)
169 // before being picked as VERTICAL LINE ABOVE
171 // this is to pick ABKHASIAN CHE WITH DESCENDER before
172 // being picked as ABKHASIAN
179 0x10, 0x11, 0x12, 0x12, 0x13, 0x13, 0x14, 0x15, 0x16,
180 0x16, 0x17, 0x19, 0x1A, 0x1B, 0x1C,
182 0x1D, 0x1D, 0x1E, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F,
183 0x20, 0x21, 0x22, 0x22, 0x23, 0x24,
185 0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28,
186 0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30,
188 0x40, 0x43, 0x43, 0x43, 0x44, 0x46, 0x47, 0x48,
189 0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x5A,
191 0x60, 0x60, 0x61, 0x61, 0x63, 0x68, 0x68,
192 0x69, 0x69, 0x6A, 0x6D, 0x6E,
194 // CIRCLED, PARENTHESIZED and so on.
195 0xEE, 0xEE, 0xEE, 0xEE, 0xEE,
// Code point boundaries used for the secondary weight of digits.
// NOTE(review): the values look like consecutive (start, end) pairs - confirm
// against the consumer, which is outside this excerpt.
199 int [] numberSecondaryWeightBounds = new int [] {
200 0x660, 0x680, 0x6F0, 0x700, 0x960, 0x970,
201 0x9E0, 0x9F0, 0x9F4, 0xA00, 0xA60, 0xA70,
202 0xAE0, 0xAF0, 0xB60, 0xB70, 0xBE0, 0xC00,
203 0xC60, 0xC70, 0xCE0, 0xCF0, 0xD60, 0xD70,
204 0xE50, 0xE60, 0xED0, 0xEE0
// Per-script character orderings; not initialised at declaration.
207 char [] orderedGurmukhi;
208 char [] orderedGujarati;
209 char [] orderedGeorgian;
210 char [] orderedThaana;
212 static readonly char [] orderedTamilConsonants = new char [] {
213 // based on traditional Tamil consonants, except for
214 // Grantha (where Microsoft breaks traditionalism).
215 // http://www.angelfire.com/empire/thamizh/padanGaL
216 '\u0B95', '\u0B99', '\u0B9A', '\u0B9E', '\u0B9F',
217 '\u0BA3', '\u0BA4', '\u0BA8', '\u0BAA', '\u0BAE',
218 '\u0BAF', '\u0BB0', '\u0BB2', '\u0BB5', '\u0BB4',
219 '\u0BB3', '\u0BB1', '\u0BA9', '\u0B9C', '\u0BB8',
// The three lists below hold DictionaryEntry (code point, value) pairs added
// by ProcessUnidataLine.
222 // cp -> character name (only for some characters)
223 ArrayList sortableCharNames = new ArrayList ();
225 // cp -> arrow value (int)
226 ArrayList arrowValues = new ArrayList ();
228 // cp -> box value (int)
229 ArrayList boxValues = new ArrayList ();
231 // cp -> level1 value
232 Hashtable arabicLetterPrimaryValues = new Hashtable ();
// Arabic letter name -> code point that registered that name
// (written by ProcessUnidataLine).
235 Hashtable arabicNameMap = new Hashtable ();
237 // cp -> Hashtable [decompType] -> cp
238 Hashtable nfkdMap = new Hashtable ();
240 // Latin letter -> ArrayList [int]
241 Hashtable latinMap = new Hashtable ();
243 ArrayList jisJapanese = new ArrayList ();
// NonJISCharacter entries for U+3300..U+3357 squares for which ExistsJIS is false.
244 ArrayList nonJisJapanese = new ArrayList ();
// CJK weight tables. They are currently allocated for the whole BMP; the
// trailing comments show the smaller offset-based sizes that were considered.
246 ushort [] cjkJA = new ushort [char.MaxValue +1];// - 0x4E00];
247 ushort [] cjkCHS = new ushort [char.MaxValue +1];// - 0x3100];
248 ushort [] cjkCHT = new ushort [char.MaxValue +1];// - 0x4E00];
249 ushort [] cjkKO = new ushort [char.MaxValue +1];// - 0x4E00];
250 byte [] cjkKOlv2 = new byte [char.MaxValue +1];// - 0x4E00];
252 byte [] ignorableFlags = new byte [char.MaxValue + 1];
// Unicode version in which each code point was introduced; filled by
// ParseDerivedAge. This is the only static table in this group.
254 static double [] unicodeAge = new double [char.MaxValue + 1];
// Tailoring objects collected by the tailoring parser, later written out by
// SerializeTailorings.
256 ArrayList tailorings = new ArrayList ();
// Drives the whole tool: parses the Unicode source files, post-processes the
// parsed values, and finally writes a diagnostic report to "agelog.txt".
//   args [0] - directory holding the downloaded data files; defaults to
//              "downloaded" when no argument is given.
// Progress messages go to standard error. The calls that sit between the
// progress messages are not all visible in this excerpt.
258 void Run (string [] args)
260 string dirname = args.Length == 0 ? "downloaded" : args [0];
261 ParseSources (dirname);
262 Console.Error.WriteLine ("parse done.");
264 ModifyParsedValues ();
266 Console.Error.WriteLine ("generation done.");
268 Console.Error.WriteLine ("serialization done.");
// Diagnostic dump: one line per code point comparing IsIgnorable with what
// the Unicode category suggests (Format / OtherNotAssigned).
// NOTE(review): no Close/Dispose of this writer is visible in this excerpt -
// confirm the file is flushed and closed.
270 StreamWriter sw = new StreamWriter ("agelog.txt");
// NOTE(review): the bound is "< char.MaxValue", so U+FFFF itself is never
// reported even though the tables hold char.MaxValue + 1 entries.
271 for (int i = 0; i < char.MaxValue; i++) {
272 bool shouldBe = false;
273 switch (Char.GetUnicodeCategory ((char) i)) {
274 case UnicodeCategory.Format: case UnicodeCategory.OtherNotAssigned:
275 shouldBe = true; break;
// Code points introduced in Unicode 3.1 or later get special treatment here
// (the statement guarded by this test is not visible in this excerpt).
277 if (unicodeAge [i] >= 3.1)
279 //if (IsIgnorable (i) != shouldBe)
// The last column is '!' when IsIgnorable disagrees with shouldBe.
280 sw.WriteLine ("{1} {2} {3} {0:X04} {4} {5}", i, unicodeAge [i], IsIgnorable (i), IsIgnorableSymbol (i), char.GetUnicodeCategory ((char) i), IsIgnorable (i) != shouldBe ? '!' : ' ');
// Typed convenience wrapper: forwards a byte table to
// CodePointIndexer.CompressArray with the given indexer and casts the
// result back to byte [].
286 byte [] CompressArray (byte [] source, CodePointIndexer i)
288 return (byte []) CodePointIndexer.CompressArray (
289 source, typeof (byte), i);
// Typed convenience wrapper: forwards a ushort table to
// CodePointIndexer.CompressArray with the given indexer and casts the
// result back to ushort [].
292 ushort [] CompressArray (ushort [] source, CodePointIndexer i)
294 return (ushort []) CodePointIndexer.CompressArray (
295 source, typeof (ushort), i);
// Body of the core serialization routine (its signature line is not part of
// this excerpt). It writes the tailorings, flattens the per-code-point map
// into separate tables, compresses them, and emits each table both as C#
// array source text on Result and as binary data that ends up in
// "../collation.core.bin". Finally it emits the CJK tables.
301 SerializeTailorings ();
// Flatten map [] into one array per sort-key component.
303 byte [] categories = new byte [map.Length];
304 byte [] level1 = new byte [map.Length];
305 byte [] level2 = new byte [map.Length];
306 byte [] level3 = new byte [map.Length];
307 ushort [] widthCompat = new ushort [map.Length];
308 for (int i = 0; i < map.Length; i++) {
309 categories [i] = map [i].Category;
310 level1 [i] = map [i].Level1;
311 level2 [i] = map [i].Level2;
312 level3 [i] = ComputeLevel3Weight ((char) i);
313 // For Japanese Half-width characters, don't
314 // map widthCompat. It is IgnoreKanaType that
315 // handles those width differences.
316 if (0xFF6D <= i && i <= 0xFF9D)
318 switch (decompType [i]) {
319 case DecompositionNarrow:
320 case DecompositionWide:
321 case DecompositionSuper:
322 case DecompositionSub:
323 // they are always 1 char
324 widthCompat [i] = (ushort) decompValues [decompIndex [i]];
// Compress every table with the indexer that matches it.
330 ignorableFlags = CompressArray (ignorableFlags,
332 categories = CompressArray (categories, UUtil.Category);
333 level1 = CompressArray (level1, UUtil.Level1);
334 level2 = CompressArray (level2, UUtil.Level2);
335 level3 = CompressArray (level3, UUtil.Level3);
// NOTE(review): this calls CodePointIndexer.CompressArray directly, although
// the ushort CompressArray overload of this class does exactly the same.
336 widthCompat = (ushort []) CodePointIndexer.CompressArray (
337 widthCompat, typeof (ushort), UUtil.WidthCompat);
338 cjkCHS = CompressArray (cjkCHS, UUtil.CjkCHS);
339 cjkCHT = CompressArray (cjkCHT,UUtil.Cjk);
340 cjkJA = CompressArray (cjkJA, UUtil.Cjk);
341 cjkKO = CompressArray (cjkKO, UUtil.Cjk);
342 cjkKOlv2 = CompressArray (cjkKOlv2, UUtil.Cjk);
// Each table below follows the same pattern: write the declaration, write the
// element count to the binary stream, then write every element to both
// outputs. After every 16th element a trailing comment gives the code point
// of the first element of that row.
345 Result.WriteLine ("internal static readonly byte [] ignorableFlags = new byte [] {");
347 MemoryStream ms = new MemoryStream ();
348 BinaryWriter binary = new BinaryWriter (ms);
349 binary.Write (ignorableFlags.Length);
351 for (int i = 0; i < ignorableFlags.Length; i++) {
352 byte value = ignorableFlags [i];
// The condition choosing between the decimal and the hex form is not visible
// in this excerpt.
354 Result.Write ("{0},", value);
356 Result.Write ("0x{0:X02},", value);
358 binary.Write (value);
360 if ((i & 0xF) == 0xF)
361 Result.WriteLine ("// {0:X04}",
362 UUtil.Ignorable.ToCodePoint (i - 0xF));
364 Result.WriteLine ("};");
368 Result.WriteLine ("internal static readonly byte [] categories = new byte [] {");
370 binary.Write (categories.Length);
372 for (int i = 0; i < categories.Length; i++) {
373 byte value = categories [i];
375 Result.Write ("{0},", value);
377 Result.Write ("0x{0:X02},", value);
379 binary.Write (value);
381 if ((i & 0xF) == 0xF)
382 Result.WriteLine ("// {0:X04}",
383 UUtil.Category.ToCodePoint (i - 0xF));
385 Result.WriteLine ("};");
388 // Primary weight value
389 Result.WriteLine ("internal static readonly byte [] level1 = new byte [] {");
391 binary.Write (level1.Length);
393 for (int i = 0; i < level1.Length; i++) {
394 byte value = level1 [i];
396 Result.Write ("{0},", value);
398 Result.Write ("0x{0:X02},", value);
400 binary.Write (value);
402 if ((i & 0xF) == 0xF)
403 Result.WriteLine ("// {0:X04}",
404 UUtil.Level1.ToCodePoint (i - 0xF));
406 Result.WriteLine ("};");
410 Result.WriteLine ("internal static readonly byte [] level2 = new byte [] {");
412 binary.Write (level2.Length);
414 for (int i = 0; i < level2.Length; i++) {
415 byte value = level2 [i];
417 Result.Write ("{0},", value);
419 Result.Write ("0x{0:X02},", value);
421 binary.Write (value);
423 if ((i & 0xF) == 0xF)
424 Result.WriteLine ("// {0:X04}",
425 UUtil.Level2.ToCodePoint (i - 0xF));
427 Result.WriteLine ("};");
431 Result.WriteLine ("internal static readonly byte [] level3 = new byte [] {");
433 binary.Write (level3.Length);
435 for (int i = 0; i < level3.Length; i++) {
436 byte value = level3 [i];
438 Result.Write ("{0},", value);
440 Result.Write ("0x{0:X02},", value);
442 binary.Write (value);
444 if ((i & 0xF) == 0xF)
445 Result.WriteLine ("// {0:X04}",
446 UUtil.Level3.ToCodePoint (i - 0xF));
448 Result.WriteLine ("};");
451 // Width insensitivity mappings
452 // (for now it is more lightweight than dumping the
453 // entire NFKD table).
454 Result.WriteLine ("internal static readonly ushort [] widthCompat = new ushort [] {");
456 binary.Write (widthCompat.Length);
458 for (int i = 0; i < widthCompat.Length; i++) {
459 ushort value = widthCompat [i];
461 Result.Write ("{0},", value);
// NOTE(review): the values are ushort, but the hex format is two digits
// (X02) here, while the ushort SerializeCJK overload uses X04. The text is
// still valid; only the column alignment differs.
463 Result.Write ("0x{0:X02},", value);
465 binary.Write (value);
467 if ((i & 0xF) == 0xF)
468 Result.WriteLine ("// {0:X04}",
469 UUtil.WidthCompat.ToCodePoint (i - 0xF));
471 Result.WriteLine ("};");
// Flush everything accumulated in the memory stream into the core resource.
474 using (FileStream fs = File.Create ("../collation.core.bin")) {
475 byte [] array = ms.ToArray ();
476 fs.Write (array, 0, array.Length);
// CJK tables: the last argument is the index at which SerializeCJK stops
// treating entries normally (see the "i + offset == max" test there).
481 SerializeCJK ("cjkCHS", cjkCHS, char.MaxValue);
482 SerializeCJK ("cjkCHT", cjkCHT, 0x9FB0);
483 SerializeCJK ("cjkJA", cjkJA, 0x9FB0);
484 SerializeCJK ("cjkKO", cjkKO, 0x9FB0);
485 SerializeCJK ("cjkKOlv2", cjkKOlv2, 0x9FB0);
// Writes one ushort CJK table both as C# array source text on Result and as
// binary data to "../collation.<name>.bin".
//   name - identifier of the generated array; also part of the file name.
//   cjk  - the (already compressed) table to write.
//   max  - index compared against i + offset inside the loop; the statement
//          it guards is not visible in this excerpt (presumably it ends the
//          dump - TODO confirm).
// The binary stream starts with the element count, followed by the values.
488 void SerializeCJK (string name, ushort [] cjk, int max)
// offset is always 0 now; the trailing comment shows the earlier formula.
490 int offset = 0;//char.MaxValue - cjk.Length;
491 Result.WriteLine ("static ushort [] {0} = new ushort [] {{", name);
493 MemoryStream ms = new MemoryStream ();
494 BinaryWriter binary = new BinaryWriter (ms);
495 binary.Write (cjk.Length);
497 for (int i = 0; i < cjk.Length; i++) {
498 if (i + offset == max)
500 ushort value = cjk [i];
502 Result.Write ("{0},", value);
504 Result.Write ("0x{0:X04},", value);
506 binary.Write (value);
// After every 16th element, annotate the row with its starting index.
508 if ((i & 0xF) == 0xF)
509 Result.WriteLine ("// {0:X04}", i - 0xF + offset);
511 Result.WriteLine ("};");
514 using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) {
515 byte [] array = ms.ToArray ();
516 fs.Write (array, 0, array.Length);
// Byte-table counterpart of the ushort SerializeCJK overload: writes the
// table as C# array source text on Result and as binary data to
// "../collation.<name>.bin".
//   name - identifier of the generated array; also part of the file name.
//   cjk  - the (already compressed) table to write.
//   max  - index compared against i + offset inside the loop; the statement
//          it guards is not visible in this excerpt.
// NOTE(review): unlike the ushort overload, no write of cjk.Length to the
// binary stream is visible here - confirm that the reader of this file does
// not expect a length prefix.
521 void SerializeCJK (string name, byte [] cjk, int max)
// offset is always 0 now; the trailing comment shows the earlier formula.
523 int offset = 0;//char.MaxValue - cjk.Length;
524 Result.WriteLine ("static byte [] {0} = new byte [] {{", name);
526 MemoryStream ms = new MemoryStream ();
527 BinaryWriter binary = new BinaryWriter (ms);
529 for (int i = 0; i < cjk.Length; i++) {
530 if (i + offset == max)
532 byte value = cjk [i];
534 Result.Write ("{0},", value);
536 Result.Write ("0x{0:X02},", value);
538 binary.Write (value);
// After every 16th element, annotate the row with its starting index.
540 if ((i & 0xF) == 0xF)
541 Result.WriteLine ("// {0:X04}", i - 0xF + offset);
543 Result.WriteLine ("};");
546 using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) {
547 byte [] array = ms.ToArray ();
548 fs.Write (array, 0, array.Length);
// Writes all collected tailorings in two forms: as C# source text on Result
// (a flat char array plus a TailoringInfo index table) and as binary data in
// "../collation.tailoring.bin".
// Throws Exception when a tailoring refers to an alias LCID that has no
// definition of its own.
553 void SerializeTailorings ()
// LCID -> start position inside the flat char array.
555 Hashtable indexes = new Hashtable ();
// LCID -> number of chars belonging to that tailoring.
556 Hashtable counts = new Hashtable ();
557 Result.WriteLine ("static char [] tailorings = new char [] {");
560 MemoryStream ms = new MemoryStream ();
561 BinaryWriter binary = new BinaryWriter (ms);
// First pass: dump every tailoring's chars and remember where each starts.
// (count, the running char position, is declared on a line that is not
// visible in this excerpt.)
563 foreach (Tailoring t in tailorings) {
566 Result.Write ("/*{0}*/", t.LCID);
567 indexes.Add (t.LCID, count);
568 char [] values = t.ItemToCharArray ();
569 counts.Add (t.LCID, values.Length);
570 foreach (char c in values) {
571 Result.Write ("'\\x{0:X}', ", (int) c);
572 if (++count % 16 == 0)
573 Result.WriteLine (" // {0:X04}", count - 16);
// Each char is stored as a 16-bit value in the binary form.
575 binary.Write ((ushort) c);
579 Result.WriteLine ("};");
581 Result.WriteLine ("static TailoringInfo [] tailoringInfos = new TailoringInfo [] {");
// Keep the raw char data aside and start a fresh stream, so that the final
// file begins with the index table and the char data follows it.
583 byte [] rawdata = ms.ToArray ();
584 ms = new MemoryStream ();
585 binary = new BinaryWriter (ms);
586 binary.Write (tailorings.Count);
// Second pass: one index record per tailoring. An aliased tailoring reuses
// the start position and length of the LCID it points to.
588 foreach (Tailoring t in tailorings) {
589 int target = t.Alias != 0 ? t.Alias : t.LCID;
590 if (!indexes.ContainsKey (target)) {
591 throw new Exception (String.Format ("WARNING: no corresponding definition for tailoring alias. From {0} to {1}", t.LCID, t.Alias));
594 int idx = (int) indexes [target];
595 int cnt = (int) counts [target];
596 bool french = t.FrenchSort;
// NOTE(review): this scan re-reads FrenchSort from the entries whose LCID
// equals t.LCID, which includes t itself. It only changes the result if
// several entries share one LCID - confirm whether t.Alias was meant.
598 foreach (Tailoring t2 in tailorings)
599 if (t2.LCID == t.LCID)
600 french = t2.FrenchSort;
601 Result.WriteLine ("new TailoringInfo ({0}, 0x{1:X}, {2}, {3}), ", t.LCID, idx, cnt, french ? "true" : "false");
// Other fields of the binary record are written on lines that are not
// visible in this excerpt.
603 binary.Write (t.LCID);
606 binary.Write (french);
609 Result.WriteLine ("};");
// Two 0xFF bytes separate the index table from the char data, which is
// prefixed with its length in chars (two bytes per char).
611 binary.Write ((byte) 0xFF);
612 binary.Write ((byte) 0xFF);
613 binary.Write (rawdata.Length / 2);
614 binary.Write (rawdata, 0, rawdata.Length);
617 using (FileStream fs = File.Create ("../collation.tailoring.bin")) {
618 byte [] array = ms.ToArray ();
619 fs.Write (array, 0, array.Length);
// Builds the path of every input file under dirname and runs the individual
// parsers in dependency order.
//   dirname - directory that contains the Unicode data files and the
//             common/collation XML files.
// Several local declarations are only partly visible in this excerpt; the
// file names themselves are shown on the lines below.
626 void ParseSources (string dirname)
629 dirname + "/UnicodeData.txt";
630 string derivedCoreProps =
631 dirname + "/DerivedCoreProperties.txt";
633 dirname + "/Scripts.txt";
635 dirname + "/CP932.TXT";
637 dirname + "/DerivedAge.txt";
638 string chXML = dirname + "/common/collation/zh.xml";
639 string jaXML = dirname + "/common/collation/ja.xml";
640 string koXML = dirname + "/common/collation/ko.xml";
642 ParseDerivedAge (derivedAge);
// The JIS data must be loaded first because the Unicode data pass calls
// ExistsJIS while it runs.
646 ParseJISOrder (cp932); // must run before ParseUnidata()
647 ParseUnidata (unidata);
649 ParseDerivedCoreProperties (derivedCoreProps);
650 ParseScripts (scripts);
651 ParseCJK (chXML, jaXML, koXML);
// NOTE: unlike the other inputs, this file name is not prefixed with dirname,
// so it is resolved against the current working directory.
653 ParseTailorings ("mono-tailoring-source.txt");
// Reads the tailoring source file line by line and feeds each trimmed line to
// ProcessTailoringLine, which updates the current Tailoring (t) by reference.
//   filename - path of the tailoring source file.
// When a line fails, the current line number is reported on standard error.
// (The statements following the message are not visible in this excerpt.)
656 void ParseTailorings (string filename)
660 using (StreamReader sr = new StreamReader (filename)) {
662 while (sr.Peek () >= 0) {
664 ProcessTailoringLine (ref t,
665 sr.ReadLine ().Trim ());
667 } catch (Exception) {
668 Console.Error.WriteLine ("ERROR at line {0}", line);
// Converts a tailoring source value into the string it denotes by decoding
// "\uXXXX" escapes (a backslash, 'u' and four hex digits) into characters.
//   s - the raw value taken from the tailoring source line.
// Returns the decoded text accumulated in a StringBuilder.
674 // For now this is enough.
675 string ParseTailoringSourceValue (string s)
677 StringBuilder sb = new StringBuilder ();
678 for (int i = 0; i < s.Length; i++) {
// An escape occupies six chars (i .. i + 5), hence the "i + 5" bound check.
679 if (i + 5 < s.Length &&
680 s [i] == '\\' && s [i + 1] == 'u') {
683 s.Substring (i + 2, 4),
684 NumberStyles.HexNumber),
691 return sb.ToString ();
// Interprets one line of the tailoring source file.
//   t - the tailoring currently being filled; replaced when the line starts
//       a new tailoring definition.
//   s - the line, already trimmed by the caller.
// Line forms handled by the visible code:
//   - text after '#' is a comment; empty lines are ignored;
//   - a header line holding an LCID after its first char, optionally
//     followed by "=<alias LCID>", starts a new Tailoring;
//   - "*FrenchSort" and "*Diacritical <hex> -> <hex>" directives;
//   - "<source> : <hex> <hex> <hex> <hex>" adds a sort key mapping;
//   - "<source> = <replacement>" adds a replacement mapping.
// The tests that select between these forms are mostly on lines that are not
// visible in this excerpt.
694 void ProcessTailoringLine (ref Tailoring t, string s)
696 int idx = s.IndexOf ('#');
698 s = s.Substring (0, idx).Trim ();
699 if (s.Length == 0 || s [0] == '#')
702 idx = s.IndexOf ('=');
// Header with alias: the LCID sits between the first char and '=', and the
// alias LCID follows '='.
705 int.Parse (s.Substring (1, idx - 1)),
706 int.Parse (s.Substring (idx + 1)));
// Header without alias: everything after the first char is the LCID.
708 t = new Tailoring (int.Parse (s.Substring (1)));
712 if (s.StartsWith ("*FrenchSort")) {
716 string d = "*Diacritical";
717 if (s.StartsWith (d)) {
// Both sides of "->" are parsed as hexadecimal bytes.
718 idx = s.IndexOf ("->");
719 t.AddDiacriticalMap (
720 byte.Parse (s.Substring (d.Length, idx - d.Length).Trim (),
721 NumberStyles.HexNumber),
722 byte.Parse (s.Substring (idx + 2).Trim (),
723 NumberStyles.HexNumber));
726 idx = s.IndexOf (':');
728 string source = s.Substring (0, idx).Trim ();
// The right-hand side is split on single spaces; up to four hex bytes are read.
729 string [] l = s.Substring (idx + 1).Trim ().Split (' ');
730 byte [] b = new byte [4];
731 for (int i = 0; i < 4; i++) {
735 b [i] = byte.Parse (l [i],
736 NumberStyles.HexNumber);
738 t.AddSortKeyMap (ParseTailoringSourceValue (source),
741 idx = s.IndexOf ('=');
743 t.AddReplacementMap (
744 ParseTailoringSourceValue (
745 s.Substring (0, idx).Trim ()),
746 ParseTailoringSourceValue (
747 s.Substring (idx + 1).Trim ()));
// Reads DerivedAge.txt and records in unicodeAge, for every BMP code point
// listed there, the Unicode version in which the code point was introduced.
//   filename - path of DerivedAge.txt.
// Each data line is expected to look like "XXXX[..YYYY] ; <version>",
// optionally followed by a '#' comment. Lines without a ';' are skipped, and
// so are entries that start above U+FFFF, because the table only covers the
// BMP. Numbers are parsed with the invariant culture so that a version such
// as "3.1" is read correctly whatever the culture of the build machine is.
void ParseDerivedAge (string filename)
{
	NumberStyles nf = NumberStyles.HexNumber |
		NumberStyles.AllowTrailingWhite;
	using (StreamReader file =
		new StreamReader (filename)) {
		while (file.Peek () >= 0) {
			string s = file.ReadLine ();
			// strip the trailing comment, if any
			int idx = s.IndexOf ('#');
			if (idx >= 0)
				s = s.Substring (0, idx);
			idx = s.IndexOf (';');
			if (idx < 0)
				continue;

			// "XXXX" or "XXXX..YYYY"
			string cpspec = s.Substring (0, idx);
			idx = cpspec.IndexOf ("..");
			int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf, CultureInfo.InvariantCulture);
			int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf, CultureInfo.InvariantCulture);
			string value = s.Substring (cpspec.Length + 1).Trim ();

			// FIXME: use index
			if (cp > char.MaxValue)
				continue;
			// A range may start inside the BMP and end beyond it;
			// never write past the end of the table.
			if (cpEnd > char.MaxValue)
				cpEnd = char.MaxValue;

			double v = double.Parse (value, CultureInfo.InvariantCulture);
			for (int i = cp; i <= cpEnd; i++)
				unicodeAge [i] = v;
		}
	}
	unicodeAge [0] = double.MaxValue; // never be supported
}
// Reads the Unicode character data file line by line and hands every line to
// ProcessUnidataLine. Decomposition values are collected in a local list and
// stored in this.decompValues as an int array once the whole file is read.
//   filename - path of UnicodeData.txt.
// When a line fails, its 1-based line number is reported on standard error.
// (The statements following the message are not visible in this excerpt.)
783 void ParseUnidata (string filename)
785 ArrayList decompValues = new ArrayList ();
786 using (StreamReader unidata =
787 new StreamReader (filename)) {
788 for (int line = 1; unidata.Peek () >= 0; line++) {
790 ProcessUnidataLine (unidata.ReadLine (), decompValues);
791 } catch (Exception) {
792 Console.Error.WriteLine ("**** At line " + line);
797 this.decompValues = (int [])
798 decompValues.ToArray (typeof (int));
// State carried across successive ProcessUnidataLine calls.
// previousLatinTarget: the Latin base letter seen most recently; used as the
// fallback target when a line yields no target of its own.
801 char previousLatinTarget = char.MinValue;
// One counter per letter 'A'..'Z'; used to derive distinct secondary weights
// for characters mapped onto the same Latin base letter.
802 byte [] diacriticalOffset = new byte ['Z' - 'A' + 1];
// Processes one line of the Unicode character data file.
//   s            - the raw line; fields are separated by ';' and the first
//                  field is the code point in hex.
//   decompValues - shared list that receives the decomposition code points
//                  of every character (see decompIndex / decompLength).
// For a single code point this method fills, where applicable: the small
// capital flag, the Latin base-letter map, arrow and box-drawing values,
// names to be sorted later, diacritical weights, Arabic primary values,
// non-JIS Japanese square characters, decomposition data and the numeric
// value. Many guarding statements (return / break / case labels) are on
// lines that are not visible in this excerpt.
804 void ProcessUnidataLine (string s, ArrayList decompValues)
806 int idx = s.IndexOf ('#');
808 s = s.Substring (0, idx);
809 idx = s.IndexOf (';');
812 int cp = int.Parse (s.Substring (0, idx), NumberStyles.HexNumber);
// values [0] is the character name; later code also reads values [4]
// (decomposition) and values [5..7] (numeric fields).
813 string [] values = s.Substring (idx + 1).Split (';');
// Only the BMP is handled, and ignorable characters get special treatment
// (the guarded statements are not visible in this excerpt).
816 if (cp > char.MaxValue)
818 if (IsIgnorable (cp))
821 string name = values [0];
823 // SPECIAL CASE: rename some characters for diacritical
824 // remapping. FIXME: why are they different?
825 // FIXME: it's still not working.
826 if (cp == 0x018B || cp == 0x018C)
827 name = name.Replace ("TOPBAR", "STROKE");
830 if (s.IndexOf ("SMALL CAPITAL") > 0)
831 isSmallCapital [cp] = true;
833 // latin mapping by character name
834 if (s.IndexOf ("LATIN") >= 0) {
// Find the base letter: try the more specific "LETTER xxx " prefixes first
// and fall back to plain "LETTER ".
835 int lidx = s.IndexOf ("LETTER DOTLESS ");
836 int offset = lidx + 15;
838 lidx = s.IndexOf ("LETTER TURNED ");
842 lidx = s.IndexOf ("LETTER CAPITAL ");
846 lidx = s.IndexOf ("LETTER SCRIPT ");
850 lidx = s.IndexOf ("LETTER ");
// c is the candidate base letter, n the char that follows it.
853 char c = lidx > 0 ? s [offset] : char.MinValue;
854 char n = s [offset + 1];
855 char target = char.MinValue;
// NOTE(review): && binds tighter than ||, so this reads as
// ('A' <= c && c <= 'Z' && n == ' ') || n == ';'. A name whose second char
// is followed by ';' therefore passes even when c is not in 'A'..'Z'.
// The intended test is probably (n == ' ' || n == ';') - confirm.
856 if ('A' <= c && c <= 'Z' &&
857 (n == ' ') || n == ';') {
859 // FIXME: After 'Z', I cannot reset this state.
860 previousLatinTarget = c == 'Z' ? char.MinValue : c;
// Letters whose names do not start with their base letter are mapped by name
// (the assigned targets are on lines not visible in this excerpt).
863 if (s.Substring (offset).StartsWith ("ALPHA"))
865 else if (s.Substring (offset).StartsWith ("TONE SIX"))
867 else if (s.Substring (offset).StartsWith ("OPEN O"))
869 else if (s.Substring (offset).StartsWith ("SCHWA"))
871 else if (s.Substring (offset).StartsWith ("ENG"))
873 else if (s.Substring (offset).StartsWith ("OI;")) // 01A2,01A3
875 else if (s.Substring (offset).StartsWith ("YR;")) // 01A2,01A3
877 else if (s.Substring (offset).StartsWith ("TONE TWO"))
879 else if (s.Substring (offset).StartsWith ("ESH"))
882 // For remaining IPA chars, direct mapping is
885 case 0x0299: target = 'B'; break;
886 case 0x029A: target = 'E'; break;
887 case 0x029B: target = 'G'; break;
888 case 0x029C: target = 'H'; break;
889 case 0x029D: target = 'J'; break;
890 case 0x029E: target = 'K'; break;
891 case 0x029F: target = 'L'; break;
892 case 0x02A0: target = 'Q'; break;
893 case 0x02A7: target = 'T'; break;
894 case 0x02A8: target = 'T'; break;
// No target found: fall back to the last Latin base letter seen.
897 if (target == char.MinValue)
898 target = previousLatinTarget;
900 if (target != char.MinValue) {
901 ArrayList entry = (ArrayList) latinMap [target];
903 entry = new ArrayList ();
904 latinMap [target] = entry;
907 // FIXME: This secondary weight is hack.
908 // They are here because they must not
909 // be identical to the corresponding
// NOTE(review): diacriticalOffset has 26 slots and is indexed with c - 'A';
// no check that c lies in 'A'..'Z' is visible here - confirm that c cannot
// be char.MinValue or another non-letter on this path.
911 if (c != target && diacritical [cp] == 0) {
912 diacriticalOffset [c - 'A']++;
913 diacritical [cp] = (byte) (diacriticalOffset [c - 'A'] + 0x7C);
// Arrow values for symbols in U+2000..U+2FFF.
919 if (0x2000 <= cp && cp < 0x3000) {
921 // SPECIAL CASES. FIXME: why?
923 case 0x21C5: value = -1; break; // E2
924 case 0x261D: value = 1; break;
925 case 0x27A6: value = 3; break;
926 case 0x21B0: value = 7; break;
927 case 0x21B1: value = 3; break;
928 case 0x21B2: value = 7; break;
929 case 0x21B4: value = 5; break;
930 case 0x21B5: value = 7; break;
931 case 0x21B9: value = -1; break; // E1
932 case 0x21CF: value = 7; break;
933 case 0x21D0: value = 3; break;
935 string [] arrowTargets = new string [] {
948 if (s.IndexOf ("RIGHTWARDS") >= 0 &&
949 s.IndexOf ("LEFTWARDS") >= 0)
951 else if (s.IndexOf ("UPWARDS") >= 0 &&
952 s.IndexOf ("DOWNWARDS") >= 0)
954 else if (s.IndexOf ("ARROW") >= 0 &&
955 s.IndexOf ("COMBINING") < 0 &&
956 s.IndexOf ("CLOCKWISE") >= 0)
957 value = s.IndexOf ("ANTICLOCKWISE") >= 0 ? 0xE4 - 0xD8 : 0xE3 - 0xD8;
// Otherwise take the first direction word found in the name, skipping the
// "BARB <direction>" and " OVER" variants. The scan starts at index 1.
959 for (int i = 1; value == 0 && i < arrowTargets.Length; i++)
960 if (s.IndexOf (arrowTargets [i]) > 0 &&
961 s.IndexOf ("BARB " + arrowTargets [i]) < 0 &&
962 s.IndexOf (" OVER") < 0
966 arrowValues.Add (new DictionaryEntry (
// Box drawing, block element and geometric shape values for U+2500..U+25FF.
// int.MinValue means "no value assigned".
971 if (0x2500 <= cp && cp < 0x2600) {
972 int value = int.MinValue;
974 // up:1 down:2 right:4 left:8 vert:16 horiz:32
977 // [dr] [dl] [ur] [ul]
// flags lists the known direction combinations; the position of a
// combination in this list selects the entry of offsets to use.
981 ArrayList flags = new ArrayList (new int [] {
984 4 + 2, 8 + 2, 4 + 1, 8 + 1,
985 16 + 4, 1 + 2 + 4, 16 + 8, 1 + 2 + 8,
986 32 + 2, 4 + 8 + 2, 32 + 1, 4 + 8 + 1,
987 16 + 32, 1 + 2 + 4 + 8, 4 + 8 + 16, 1 + 2 + 32
989 byte [] offsets = new byte [] {
996 if (s.IndexOf ("BOX DRAWINGS ") >= 0) {
998 if (s.IndexOf (" UP") >= 0)
1000 if (s.IndexOf (" DOWN") >= 0)
1002 if (s.IndexOf (" RIGHT") >= 0)
1004 if (s.IndexOf (" LEFT") >= 0)
1006 if (s.IndexOf (" VERTICAL") >= 0)
1008 if (s.IndexOf (" HORIZONTAL") >= 0)
1011 int fidx = flags.IndexOf (flag);
1013 value = offsets [fidx];
1014 } else if (s.IndexOf ("BLOCK") >= 0) {
1015 if (s.IndexOf ("ONE EIGHTH") >= 0)
1017 else if (s.IndexOf ("ONE QUARTER") >= 0)
1019 else if (s.IndexOf ("THREE EIGHTHS") >= 0)
1021 else if (s.IndexOf ("HALF") >= 0)
1023 else if (s.IndexOf ("FIVE EIGHTHS") >= 0)
1025 else if (s.IndexOf ("THREE QUARTERS") >= 0)
1027 else if (s.IndexOf ("SEVEN EIGHTHS") >= 0)
1032 else if (s.IndexOf ("SHADE") >= 0)
// The shape values below are written as offsets relative to 0xE5, so they
// are negative numbers.
1034 else if (s.IndexOf ("SQUARE") >= 0)
1035 value = 0xBC - 0xE5;
// "VERTICAL RECTANGLE" must be tested before the plain "RECTANGLE" it contains.
1036 else if (s.IndexOf ("VERTICAL RECTANGLE") >= 0)
1037 value = 0xBE - 0xE5;
1038 else if (s.IndexOf ("RECTANGLE") >= 0)
1039 value = 0xBD - 0xE5;
1040 else if (s.IndexOf ("PARALLELOGRAM") >= 0)
1041 value = 0xBF - 0xE5;
1042 else if (s.IndexOf ("TRIANGLE") >= 0) {
1043 if (s.IndexOf ("UP-POINTING") >= 0)
1044 value = 0xC0 - 0xE5;
1045 else if (s.IndexOf ("RIGHT-POINTING") >= 0)
1046 value = 0xC1 - 0xE5;
1047 else if (s.IndexOf ("DOWN-POINTING") >= 0)
1048 value = 0xC2 - 0xE5;
1049 else if (s.IndexOf ("LEFT-POINTING") >= 0)
1050 value = 0xC3 - 0xE5;
1052 else if (s.IndexOf ("POINTER") >= 0) {
1053 if (s.IndexOf ("RIGHT-POINTING") >= 0)
1054 value = 0xC4 - 0xE5;
1055 else if (s.IndexOf ("LEFT-POINTING") >= 0)
1056 value = 0xC5 - 0xE5;
1058 else if (s.IndexOf ("DIAMOND") >= 0)
1059 value = 0xC6 - 0xE5;
1060 else if (s.IndexOf ("FISHEYE") >= 0)
1061 value = 0xC7 - 0xE5;
1062 else if (s.IndexOf ("LOZENGE") >= 0)
1063 value = 0xC8 - 0xE5;
1064 else if (s.IndexOf ("BULLSEYE") >= 0)
1065 value = 0xC9 - 0xE5;
1066 else if (s.IndexOf ("CIRCLE") >= 0) {
1067 if (cp == 0x25D6) // it could be IndexOf ("LEFT HALF BLACK CIRCLE")
1068 value = 0xCA - 0xE5;
1069 else if (cp == 0x25D7) // it could be IndexOf ("RIGHT HALF BLACK CIRCLE")
1070 value = 0xCB - 0xE5;
1072 value = 0xC9 - 0xE5;
1074 else if (s.IndexOf ("BULLET") >= 0)
1075 value = 0xCC - 0xE5;
// This is a plain "if", not part of the chain above: for U+25DA..U+25E5 it
// overrides whatever the name-based tests assigned.
1076 if (0x25DA <= cp && cp <= 0x25E5)
1077 value = 0xCD + cp - 0x25DA - 0xE5;
1079 // SPECIAL CASE: BOX DRAWING DIAGONAL patterns
1081 case 0x2571: value = 0xF; break;
1082 case 0x2572: value = 0x10; break;
1083 case 0x2573: value = 0x11; break;
1085 if (value != int.MinValue)
1086 boxValues.Add (new DictionaryEntry (
1090 // For some characters store the name and sort later
1091 // to determine sorting.
1092 if (0x2100 <= cp && cp <= 0x213F &&
1093 Char.IsSymbol ((char) cp))
1094 sortableCharNames.Add (
1095 new DictionaryEntry (cp, name));
// For U+3380..U+33DD the first seven chars of the name are dropped before
// the name is stored.
1096 else if (0x3380 <= cp && cp <= 0x33DD)
1097 sortableCharNames.Add (new DictionaryEntry (
1098 cp, name.Substring (7)));
1100 if (Char.GetUnicodeCategory ((char) cp) ==
1101 UnicodeCategory.MathSymbol) {
1102 if (name.StartsWith ("CIRCLED "))
1103 diacritical [cp] = 0xEE;
1104 if (name.StartsWith ("SQUARED "))
1105 diacritical [cp] = 0xEF;
1108 // diacritical weights by character name
// NOTE(review): this invariant check is repeated for every input line and
// raises a plain Exception.
1109 if (diacritics.Length != diacriticWeights.Length)
1110 throw new Exception (String.Format ("Should not happen. weights are {0} while labels are {1}", diacriticWeights.Length, diacritics.Length));
1111 for (int d = 0; d < diacritics.Length; d++) {
// "> 0": the label must occur after the start of the line.
1112 if (s.IndexOf (diacritics [d]) > 0) {
// byte arithmetic: the weight is added to whatever is already stored.
1113 diacritical [cp] += diacriticWeights [d];
1114 if (s.IndexOf ("COMBINING") >= 0)
1115 diacritical [cp] -= (byte) 2;
1118 // also process "COMBINING blah" here
1119 // For now it is limited to cp < 0x0370
1120 // if (cp < 0x0300 || cp >= 0x0370)
// Build the matching "COMBINING xxx" name from the label: drop a trailing
// ';' and a leading "WITH", then prefix "COMBINING".
1122 string tmp = diacritics [d].TrimEnd (';');
1123 if (tmp.IndexOf ("WITH ") == 0)
1124 tmp = tmp.Substring (4);
1125 tmp = String.Concat ("COMBINING", (tmp [0] != ' ' ? " " : ""), tmp);
1127 diacritical [cp] = (byte) (diacriticWeights [d] - 2);
1131 //Console.Error.WriteLine ("======= {2:X04} : '{0}' / '{1}'", name, tmp, cp);
1133 // Two-step grep required for it.
1134 if (s.IndexOf ("FULL STOP") > 0 &&
1135 (s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0))
1136 diacritical [cp] |= 0xF4;
1137 if (s.StartsWith ("SCRIPT") || s.IndexOf (" SCRIPT ") > 0)
1138 diacritical [cp] = (byte) (s.IndexOf ("SMALL") > 0 ? 3 :
1139 s.IndexOf ("CAPITAL") > 0 ? 5 : 4);
1141 // Arabic letter name
1142 if (0x0621 <= cp && cp <= 0x064A &&
1143 Char.GetUnicodeCategory ((char) cp)
1144 == UnicodeCategory.OtherLetter) {
// Default primary value: derived from the number of distinct letter names
// registered so far.
1145 byte value = (byte) (arabicNameMap.Count * 4 + 0x0B);
1150 // hamza, waw, yeh ... special cases.
1155 value = 0x77; // special cases.
1158 // Get primary letter name i.e.
1159 // XXX part of ARABIC LETTER XXX yyy
1160 // e.g. that of "TEH MARBUTA" is "TEH".
1163 // 0x0640 is special: it does
1164 // not start with ARABIC LETTER
1166 name.Substring (14);
1167 int tmpIdx = letterName.IndexOf (' ');
1168 letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
1169 //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
// A letter name seen before reuses the primary value of the code point
// that first registered it.
1170 if (arabicNameMap.ContainsKey (letterName))
1171 value = (byte) arabicLetterPrimaryValues [arabicNameMap [letterName]];
1173 arabicNameMap [letterName] = cp;
1176 arabicLetterPrimaryValues [cp] = value;
1179 // Japanese square letter
1180 if (0x3300 <= cp && cp <= 0x3357)
1181 if (!ExistsJIS (cp))
1182 nonJisJapanese.Add (new NonJISCharacter (cp, name));
1184 // normalizationType
1185 string decomp = values [4];
1186 idx = decomp.IndexOf ('<');
// NOTE(review): the length argument IndexOf ('>') - 1 yields exactly the text
// between '<' and '>' only when '<' is the first char of the field (idx == 0).
1188 switch (decomp.Substring (idx + 1, decomp.IndexOf ('>') - 1)) {
1190 decompType [cp] = DecompositionFull;
1193 decompType [cp] = DecompositionSub;
1196 decompType [cp] = DecompositionSuper;
1199 decompType [cp] = DecompositionSmall;
1202 decompType [cp] = DecompositionIsolated;
1205 decompType [cp] = DecompositionInitial;
1208 decompType [cp] = DecompositionFinal;
1211 decompType [cp] = DecompositionMedial;
1214 decompType [cp] = DecompositionNoBreak;
1217 decompType [cp] = DecompositionCompat;
1220 decompType [cp] = DecompositionFraction;
1223 decompType [cp] = DecompositionFont;
1226 decompType [cp] = DecompositionCircle;
1229 decompType [cp] = DecompositionSquare;
1232 decompType [cp] = DecompositionWide;
1235 decompType [cp] = DecompositionNarrow;
1238 decompType [cp] = DecompositionVertical;
1241 throw new Exception ("Support NFKD type : " + decomp);
1245 decompType [cp] = DecompositionCanonical;
// Drop the "<tag> " prefix (the '>' and the space after it), if there is one.
1246 decomp = idx < 0 ? decomp : decomp.Substring (decomp.IndexOf ('>') + 2);
1247 if (decomp.Length > 0) {
// Append this character's decomposition to the shared list and remember
// where it starts and how long it is.
1249 string [] velems = decomp.Split (' ');
1250 int didx = decompValues.Count;
1251 decompIndex [cp] = didx;
1252 foreach (string v in velems)
1253 decompValues.Add (int.Parse (v, NumberStyles.HexNumber));
1254 decompLength [cp] = velems.Length;
1256 // [decmpType] -> this_cp
1257 int targetCP = (int) decompValues [didx];
1258 // for "(x)" it specially maps to 'x' .
1259 // FIXME: check if it is sane
1260 if (velems.Length == 3 &&
1261 (int) decompValues [didx] == '(' &&
1262 (int) decompValues [didx + 2] == ')')
1263 targetCP = (int) decompValues [didx + 1];
1264 // special: 0x215F "1/"
1265 else if (cp == 0x215F)
// NOTE(review): the lower bound here is 0x4C00, while the CJK table size
// comments elsewhere in this file use 0x4E00 - confirm it is intended.
1267 else if (velems.Length > 1 &&
1268 (targetCP < 0x4C00 || 0x9FBB < targetCP))
1269 // skip them, except for CJK ideograph compat
1272 if (targetCP != 0) {
1273 Hashtable entry = (Hashtable) nfkdMap [targetCP];
1274 if (entry == null) {
1275 entry = new Hashtable ();
1276 nfkdMap [targetCP] = entry;
1278 entry [(byte) decompType [cp]] = cp;
// Numeric value: the first non-empty field among values [5], [6] and [7] wins.
// NOTE(review): decimal.Parse is called without a format provider, so the
// result depends on the current culture of the machine running the tool.
1282 if (values [5].Length > 0)
1283 decimalValue [cp] = decimal.Parse (values [5]);
1284 else if (values [6].Length > 0)
1285 decimalValue [cp] = decimal.Parse (values [6]);
1286 else if (values [7].Length > 0) {
1287 string decstr = values [7];
1288 idx = decstr.IndexOf ('/');
1289 if (cp == 0x215F) // special. "1/"
1290 decimalValue [cp] = 0x1;
// "a/b" is stored as the quotient; "(n)" and "n." are stored as n.
1294 decimal.Parse (decstr.Substring (0, idx))
1295 / decimal.Parse (decstr.Substring (idx + 1));
1296 else if (decstr [0] == '(' &&
1297 decstr [decstr.Length - 1] == ')')
1300 decimal.Parse (decstr.Substring (1, decstr.Length - 2));
1301 else if (decstr [decstr.Length - 1] == '.')
1304 decimal.Parse (decstr.Substring (0, decstr.Length - 1));
1306 decimalValue [cp] = decimal.Parse (decstr);
// Reads the data file named by `filename` one line at a time and passes
// each line to ProcessDerivedCorePropLine. `line` counts the lines read,
// starting at 1, so that a failure can be reported with its position.
1310 void ParseDerivedCoreProperties (string filename)
1313 using (StreamReader file =
1314 new StreamReader (filename)) {
1315 for (int line = 1; file.Peek () >= 0; line++) {
1317 ProcessDerivedCorePropLine (file.ReadLine ());
1318 } catch (Exception) {
// Report on stderr which input line could not be processed.
1319 Console.Error.WriteLine ("**** At line " + line);
// Parses a single line `s` of the derived-core-properties data file.
// The text from '#' onward is cut off, the field before ';' is read as a
// hexadecimal code point or a "start..end" range, and the trimmed text
// after that field is kept in `value`.
1326 void ProcessDerivedCorePropLine (string s)
// Cut the line at the '#' comment marker.
1328 int idx = s.IndexOf ('#');
1330 s = s.Substring (0, idx);
// The first ';' terminates the code point specification.
1331 idx = s.IndexOf (';');
1334 string cpspec = s.Substring (0, idx);
// ".." inside the specification separates the two ends of a range.
1335 idx = cpspec.IndexOf ("..");
// Hexadecimal digits, with trailing whitespace tolerated.
1336 NumberStyles nf = NumberStyles.HexNumber |
1337 NumberStyles.AllowTrailingWhite;
// Without "..", cp and cpEnd are the same single code point.
1338 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1339 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
1340 string value = s.Substring (cpspec.Length + 1).Trim ();
// Code points above char.MaxValue (outside the 16-bit char range) are
// singled out here.
1343 if (cp > char.MaxValue)
// Flag every code point of the inclusive range cp..cpEnd in isUppercase.
1348 for (int x = cp; x <= cpEnd; x++)
1349 isUppercase [x] = true;
// Reads the script data file named by `filename` and collects code points
// into four per-script lists (Gurmukhi, Gujarati, Georgian, Thaana),
// leaving out those for which IsIgnorable returns true. Each list is then
// sorted with UCAComparer.Instance and stored as a char array in
// orderedGurmukhi, orderedGujarati, orderedGeorgian and orderedThaana.
// NOTE(review): which list a range goes to is presumably chosen from
// `value` (the script name) - confirm.
1354 void ParseScripts (string filename)
1356 ArrayList gurmukhi = new ArrayList ();
1357 ArrayList gujarati = new ArrayList ();
1358 ArrayList georgian = new ArrayList ();
1359 ArrayList thaana = new ArrayList ();
1361 using (StreamReader file =
1362 new StreamReader (filename)) {
1363 while (file.Peek () >= 0) {
1364 string s = file.ReadLine ();
// Cut the line at the '#' comment marker.
1365 int idx = s.IndexOf ('#');
1367 s = s.Substring (0, idx);
// The first ';' terminates the code point specification.
1368 idx = s.IndexOf (';');
1372 string cpspec = s.Substring (0, idx);
// ".." inside the specification separates the two ends of a range.
1373 idx = cpspec.IndexOf ("..");
// Hexadecimal digits, with trailing whitespace tolerated.
1374 NumberStyles nf = NumberStyles.HexNumber |
1375 NumberStyles.AllowTrailingWhite;
// Without "..", cp and cpEnd are the same single code point.
1376 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1377 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
1378 string value = s.Substring (cpspec.Length + 1).Trim ();
// Code points above char.MaxValue (outside the 16-bit char range) are
// singled out here.
1381 if (cp > char.MaxValue)
// Each loop below walks the inclusive range cp..cpEnd and appends the
// non-ignorable code points, as chars, to one script's list.
1386 for (int x = cp; x <= cpEnd; x++)
1387 if (!IsIgnorable (x))
1388 gurmukhi.Add ((char) x);
1391 for (int x = cp; x <= cpEnd; x++)
1392 if (!IsIgnorable (x))
1393 gujarati.Add ((char) x);
1396 for (int x = cp; x <= cpEnd; x++)
1397 if (!IsIgnorable (x))
1398 georgian.Add ((char) x);
1401 for (int x = cp; x <= cpEnd; x++)
1402 if (!IsIgnorable (x))
1403 thaana.Add ((char) x);
// Order every list with the UCA comparer, then freeze it into an array.
1408 gurmukhi.Sort (UCAComparer.Instance);
1409 gujarati.Sort (UCAComparer.Instance);
1410 georgian.Sort (UCAComparer.Instance);
1411 thaana.Sort (UCAComparer.Instance);
1412 orderedGurmukhi = (char []) gurmukhi.ToArray (typeof (char));
1413 orderedGujarati = (char []) gujarati.ToArray (typeof (char));
1414 orderedGeorgian = (char []) georgian.ToArray (typeof (char));
1415 orderedThaana = (char []) thaana.ToArray (typeof (char));
// Reads the JIS ordering file named by `filename` and passes every line
// to ProcessJISOrderLine. `line` is advanced once per line read and is
// written to stderr when an exception is caught.
1418 void ParseJISOrder (string filename)
1422 using (StreamReader file =
1423 new StreamReader (filename)) {
1424 for (;file.Peek () >= 0; line++)
1425 ProcessJISOrderLine (file.ReadLine ());
1427 } catch (Exception) {
// Report on stderr the line counter at the point of failure.
1428 Console.Error.WriteLine ("---- line {0}", line);
// Column separators (tab and space); ProcessJISOrderLine passes this to
// IndexOfAny to find the end of the first column.
1433 char [] ws = new char [] {'\t', ' '};
// Parses a single line `s` of the JIS ordering file and appends the
// resulting JISCharacter (Unicode code point, JIS code) to jisJapanese.
// The line holds two "0x"-prefixed hexadecimal columns separated by a tab
// or space: the JIS code first, then the code point.
1435 void ProcessJISOrderLine (string s)
// Cut the line at the '#' comment marker and trim what is left.
1437 int idx = s.IndexOf ('#');
1439 s = s.Substring (0, idx).Trim ();
// The first tab or space ends the JIS column.
1442 idx = s.IndexOfAny (ws);
1445 // They start with "0x" so cut them out.
1446 int jis = int.Parse (s.Substring (2, idx - 2), NumberStyles.HexNumber);
1447 int cp = int.Parse (s.Substring (idx).Trim ().Substring (2), NumberStyles.HexNumber);
1448 jisJapanese.Add (new JISCharacter (cp, jis));
// Fills the per-culture CJK weight tables. The Chinese Simplified and
// Chinese Traditional sections take the character sequence from the
// 'pinyin' and 'stroke' collation rules of an LDML document and assign
// consecutive weights in that order. The Japanese section walks
// jisJapanese, and the Korean section places characters relative to the
// "reset" elements of the collation rules.
// NOTE(review): zhXML, jaXML and koXML are presumably the paths of the
// LDML documents loaded into `doc` - confirm.
1451 void ParseCJK (string zhXML, string jaXML, string koXML)
1453 XmlDocument doc = new XmlDocument ();
// No resolver: external resources referenced by the XML are not fetched.
1454 doc.XmlResolver = null;
1461 // Chinese Simplified
1464 offset = 0;//char.MaxValue - arr.Length;
1466 s = doc.SelectSingleNode ("/ldml/collations/collation[@type='pinyin']/rules/pc").InnerText;
// Characters receive increasing weights in the order they appear in s.
1468 foreach (char c in s) {
1470 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1472 arr [(int) c - offset] = (ushort) v++;
1478 // Chinese Traditional
1481 offset = 0;//char.MaxValue - arr.Length;
1482 s = doc.SelectSingleNode ("/ldml/collations/collation[@type='stroke']/rules/pc").InnerText;
// Characters receive increasing weights in the order they appear in s.
1484 foreach (char c in s) {
1486 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1488 arr [(int) c - offset] = (ushort) v++;
1497 offset = 0;//char.MaxValue - arr.Length;
// Fixed weights for a few characters, set before the JIS-ordered loop.
1500 arr [0x4EDD] = 0x8002; // Chinese repetition mark?
1501 arr [0x337B] = 0x8004; // Those 4 characters are Gengou
1502 arr [0x337E] = 0x8005;
1503 arr [0x337D] = 0x8006;
1504 arr [0x337C] = 0x8007;
// Walk the characters collected by ProcessJISOrderLine.
1507 foreach (JISCharacter jc in jisJapanese) {
// Entries whose JIS code is below 0x8800 are singled out here.
1508 if (jc.JIS < 0x8800)
1510 char c = (char) jc.CP;
1513 // Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1516 arr [(int) c - offset] = (ushort) v++;
// Special handling for specific ideographs; the trailing comments name
// the related compatibility code point.
1521 if (c == '\u662D') // U+337C
1523 if (c == '\u5927') // U+337D
1525 if (c == '\u5E73') // U+337B
1527 if (c == '\u660E') // U+337E
1529 if (c == '\u9686') // U+F9DC
1532 // FIXME: there are still remaining
1533 // characters after U+FA0C.
1534 // for (int k = 0; k < char.MaxValue; k++) {
// Look for code points below U+FA0D whose decomposition starts with c,
// or has length 3 with c as its second value, and give each of them
// the next weight as well.
1535 for (int k = 0; k < '\uFA0D'; k++) {
1536 if (decompIndex [k] == 0 || IsIgnorable (k))
1538 if (decompValues [decompIndex [k]] == c /*&&
1539 decompLength [k] == 1*/ ||
1540 decompLength [k] == 3 &&
1541 decompValues [decompIndex [k] + 1] == c) {
1542 arr [k - offset] = (ushort) v++;
1551 // Korean weight is somewhat complex. It first shifts
1552 // Hangul category from 52-x to 80-x (they are anyways
1553 // computed). CJK ideographs are placed at secondary
1554 // weight, like XX YY 01 zz 01, where XX and YY are
1555 // corresponding "reset" value and zz is 41,43,45...
1557 // Unlike chs,cht and ja, Korean value is a combined
1558 // ushort which is computed as category
1562 offset = 0;//char.MaxValue - arr.Length;
1564 foreach (XmlElement reset in doc.SelectNodes ("/ldml/collations/collation/rules/reset")) {
// The element right after the reset carries the characters to place.
1565 XmlElement sc = (XmlElement) reset.NextSibling;
1566 // compute "category" and "level 1" for the
1567 // target "reset" Hangul syllable
1568 char rc = reset.InnerText [0];
// 1-based position of the reset character counted from U+AC00.
1569 int ri = ((int) rc - 0xAC00) + 1;
1571 ((ri / 254) * 256 + (ri % 254) + 2);
1572 // Place the characters after the target.
1575 foreach (char c in s) {
// Primary part goes into arr, secondary part into cjkKOlv2.
1576 arr [(int) c - offset] = p;
1577 cjkKOlv2 [(int) c - offset] = (byte) v;
// Fills ignorableFlags for every code point from 0 through char.MaxValue.
// Each entry is a bit set: 1 when IsIgnorable is true, 2 when
// IsIgnorableSymbol is true, 4 when IsIgnorableNonSpacing is true.
1587 void FillIgnorables ()
1589 for (int i = 0; i <= char.MaxValue; i++) {
// Unassigned code points are singled out before the flags are computed.
1590 if (Char.GetUnicodeCategory ((char) i) ==
1591 UnicodeCategory.OtherNotAssigned)
1593 if (IsIgnorable (i))
1594 ignorableFlags [i] |= 1;
1595 if (IsIgnorableSymbol (i))
1596 ignorableFlags [i] |= 2;
1597 if (IsIgnorableNonSpacing (i))
1598 ignorableFlags [i] |= 4;
// Patches the parsed Unicode data in place: clears the decomposition of a
// few code points, assigns fixed diacritical weights to some numeric
// ranges, and overwrites selected decomposition values.
1602 void ModifyUnidata ()
1604 // Modify some decomposition equivalence
// U+FE31..U+FE34: reset decomposition index and length to 0.
1605 for (int i = 0xFE31; i <= 0xFE34; i++) {
1607 decompIndex [i] = 0;
1608 decompLength [i] = 0;
// U+037E: reset decomposition type, index and length to 0.
1610 decompType [0x037E] = 0;
1611 decompIndex [0x037E] = 0;
1612 decompLength [0x037E] = 0;
// U+3021..U+3029 all get diacritical weight 0x4E.
1615 for (int i = 0x3021; i <= 0x3029; i++)
1616 diacritical [i] = 0x4E;
1617 // Korean parens numbers
1618 for (int i = 0x3200; i <= 0x321C; i++)
1619 diacritical [i] = 0xA;
1620 for (int i = 0x3260; i <= 0x327B; i++)
1621 diacritical [i] = 0xC;
1623 // LAMESPEC: these remappings should not be done.
1624 // Windows has incorrect CJK compat mappings.
// Each assignment overwrites the first decomposition value of the code
// point; where decompLength is set, the decomposition is cut to one value.
1625 decompValues [decompIndex [0x32A9]] = 0x91AB;
1626 decompLength [0x323B] = 1;
1627 decompValues [decompIndex [0x323B]] = 0x5B78;
1628 decompValues [decompIndex [0x32AB]] = 0x5B78;
1629 decompValues [decompIndex [0x32A2]] = 0x5BEB;
1630 decompLength [0x3238] = 1;
1631 decompValues [decompIndex [0x3238]] = 0x52DE;
1632 decompValues [decompIndex [0x3298]] = 0x52DE;
1634 // LAMESPEC: custom remapping (these are not bugs, but they are not standard-compliant either)
// U+FA0C takes over the decompValues slot of U+F929, is given the single
// value 0x5140, and U+F929 is then left without a decomposition.
1635 decompIndex [0xFA0C] = decompIndex [0xF929]; // borrow U+F929 room (being empty)
1636 decompValues [decompIndex [0xFA0C]] = 0x5140;
1637 decompLength [0xFA0C] = 1;
1638 decompIndex [0xF929] = decompLength [0xF929] = 0;
1640 decompValues [decompIndex [0xF92C]] = 0x90DE;
// Adjusts values produced by the parsing stage: sets fixed diacritical
// weights for some Cyrillic letters, assigns secondary weights to numbers
// from numberSecondaryWeightBounds, and renames or removes entries of
// sortableCharNames.
1643 void ModifyParsedValues ()
1645 // some cyrillic diacritical weight. They seem to be
1646 // based on old character names, so it's quicker to
1647 // set them directly here.
// Each pair of adjacent code points shares one weight.
1648 diacritical [0x0496] = diacritical [0x0497] = 7;
1649 diacritical [0x0498] = diacritical [0x0499] = 0x1A;
1650 diacritical [0x049A] = diacritical [0x049B] = 0x17;
1651 diacritical [0x049C] = diacritical [0x049D] = 9;
1652 diacritical [0x049E] = diacritical [0x049F] = 4;
1653 diacritical [0x04A0] = diacritical [0x04A1] = 0xA;
1654 diacritical [0x04A2] = diacritical [0x04A3] = 7;
1655 diacritical [0x04A4] = diacritical [0x04A5] = 8;
1657 // number, secondary weights
// numarr is read as consecutive (start, end) pairs; the end bound is
// exclusive. `weight` goes up by one per pair and is given to every
// code point in the pair's range for which Char.IsNumber is true.
1659 int [] numarr = numberSecondaryWeightBounds;
1660 for (int i = 0; i < numarr.Length; i += 2, weight++)
1661 for (int cp = numarr [i]; cp < numarr [i + 1]; cp++)
1662 if (Char.IsNumber ((char) cp))
1663 diacritical [cp] = weight;
1665 // Update name part of named characters
1666 for (int i = 0; i < sortableCharNames.Count; i++) {
1667 DictionaryEntry de =
1668 (DictionaryEntry) sortableCharNames [i];
1669 int cp = (int) de.Key;
// Stays null unless one of the cases below picks a replacement name.
1670 string renamed = null;
1672 case 0x2101: renamed = "A_1"; break;
1673 case 0x33C3: renamed = "A_2"; break;
1674 case 0x2105: renamed = "C_1"; break;
1675 case 0x2106: renamed = "C_2"; break;
1676 case 0x211E: renamed = "R1"; break;
1677 case 0x211F: renamed = "R2"; break;
1678 // Remove some of them!
1689 sortableCharNames.RemoveAt (i);
// Replace the entry with one carrying the same key and the new name.
1693 if (renamed != null)
1694 sortableCharNames [i] =
1695 new DictionaryEntry (cp, renamed);
1699 void GenerateCore ()
1703 #region Specially ignored // 01
1704 // This will raise "Defined" flag up.
1705 // FIXME: Check If it is really fine. Actually for
1706 // Japanese voice marks this code does remapping.
1707 foreach (char c in specialIgnore)
1708 map [(int) c] = new CharMapEntry (0, 0, 0);
1711 #region Extenders (FF FF)
1712 fillIndex [0xFF] = 0xFF;
1713 char [] specialBiggest = new char [] {
1714 '\u3005', '\u3031', '\u3032', '\u309D',
1715 '\u309E', '\u30FC', '\u30FD', '\u30FE',
1716 '\uFE7C', '\uFE7D', '\uFF70'};
1717 foreach (char c in specialBiggest)
1718 AddCharMap (c, 0xFF, 0);
1721 #region Variable weights
1722 // Controls : 06 03 - 06 3D
1723 fillIndex [0x6] = 3;
1724 for (int i = 0; i < 65536; i++) {
1725 if (IsIgnorable (i))
1728 uc = Char.GetUnicodeCategory (c);
1729 // NEL is whitespace but not ignored here.
1730 if (uc == UnicodeCategory.Control &&
1731 !Char.IsWhiteSpace (c) || c == '\u0085')
1732 AddCharMap (c, 6, 1);
1736 fillIndex [0x6] = 0x80;
1737 AddCharMap ('\'', 6, 0);
1738 AddCharMap ('\uFF07', 6, 1);
1739 AddCharMap ('\uFE63', 6, 1);
1741 // SPECIAL CASE: fill FE32 here in prior to be added
1742 // at 2013. Windows does not always respect NFKD.
1743 map [0xFE32] = new CharMapEntry (6, 0x90, 0);
1745 // Hyphen/Dash : 06 81 - 06 90
1746 for (int i = 0; i < char.MaxValue; i++) {
1747 if (!IsIgnorable (i) &&
1748 Char.GetUnicodeCategory ((char) i) ==
1749 UnicodeCategory.DashPunctuation) {
1750 AddCharMapGroup2 ((char) i, 6, 1, 0);
1752 // SPECIAL: add 2027 and 2043
1753 // Maybe they are regarded the
1754 // same hyphens in "central"
1756 AddCharMap ('\u2027', 6, 1);
1757 AddCharMap ('\u2043', 6, 1);
1761 // They are regarded as primarily equivalent to '-'
1762 map [0x208B] = new CharMapEntry (6, 0x82, 0);
1763 map [0x207B] = new CharMapEntry (6, 0x82, 0);
1764 map [0xFF0D] = new CharMapEntry (6, 0x82, 0);
1766 // Arabic variable weight chars 06 A0 -
1767 fillIndex [6] = 0xA0;
1769 for (int i = 0x64B; i <= 0x650; i++)
1770 AddArabicCharMap ((char) i);
1772 AddCharMapGroup ('\u0652', 6, 1, 0);
1774 AddCharMapGroup ('\u0651', 6, 1, 0);
1778 #region Nonspacing marks // 01
1779 // FIXME: 01 03 - 01 B6 ... annoyance :(
1781 // Combining diacritical marks: 01 DC -
1783 fillIndex [0x1] = 0x41;
1784 for (int i = 0x030E; i <= 0x0326; i++)
1785 if (!IsIgnorable (i))
1786 AddCharMap ((char) i, 0x1, 1);
1787 for (int i = 0x0329; i <= 0x0334; i++)
1788 if (!IsIgnorable (i))
1789 AddCharMap ((char) i, 0x1, 1);
1791 for (int i = 0x0339; i <= 0x0341; i++)
1792 if (!IsIgnorable (i))
1793 AddCharMap ((char) i, 0x1, 1);
1794 fillIndex [0x1] = 0x74;
1795 for (int i = 0x0346; i <= 0x0348; i++)
1796 if (!IsIgnorable (i))
1797 AddCharMap ((char) i, 0x1, 1);
1798 for (int i = 0x02BE; i <= 0x02BF; i++)
1799 if (!IsIgnorable (i))
1800 AddCharMap ((char) i, 0x1, 1);
1801 for (int i = 0x02C1; i <= 0x02C5; i++)
1802 if (!IsIgnorable (i))
1803 AddCharMap ((char) i, 0x1, 1);
1804 for (int i = 0x02CE; i <= 0x02CF; i++)
1805 if (!IsIgnorable (i))
1806 AddCharMap ((char) i, 0x1, 1);
1808 for (int i = 0x02D1; i <= 0x02D3; i++)
1809 if (!IsIgnorable (i))
1810 AddCharMap ((char) i, 0x1, 1);
1811 AddCharMap ('\u02DE', 0x1, 1);
1812 for (int i = 0x02E4; i <= 0x02E9; i++)
1813 if (!IsIgnorable (i))
1814 AddCharMap ((char) i, 0x1, 1);
1816 // FIXME: needs more love here (it should eliminate
1817 // all the hacky code above).
1818 for (int i = 0x0300; i < 0x0370; i++)
1819 if (!IsIgnorable (i) && diacritical [i] != 0
1820 /* especiall here*/ && !map [i].Defined)
1821 map [i] = new CharMapEntry (
1822 0x1, 0x1, diacritical [i]);
1824 // Cyrillic and Armenian nonspacing mark
1825 fillIndex [0x1] = 0x94;
1826 for (int i = 0x400; i < 0x580; i++)
1827 if (!IsIgnorable (i) &&
1828 Char.GetUnicodeCategory ((char) i) ==
1829 UnicodeCategory.NonSpacingMark)
1830 AddCharMap ((char) i, 1, 1);
1832 fillIndex [0x1] = 0x8D;
1833 // syriac dotted nonspacing marks (1)
1834 AddCharMap ('\u0740', 0x1, 1);
1835 AddCharMap ('\u0741', 0x1, 1);
1836 AddCharMap ('\u0742', 0x1, 1);
1837 // syriac oblique nonspacing marks
1838 AddCharMap ('\u0747', 0x1, 1);
1839 AddCharMap ('\u0748', 0x1, 1);
1840 // syriac dotted nonspacing marks (2)
1841 fillIndex [0x1] = 0x94; // this reset is mandatory
1842 AddCharMap ('\u0732', 0x1, 1);
1843 AddCharMap ('\u0735', 0x1, 1);
1844 AddCharMap ('\u0738', 0x1, 1);
1845 AddCharMap ('\u0739', 0x1, 1);
1846 AddCharMap ('\u073C', 0x1, 1);
1847 // SPECIAL CASES: superscripts
1848 AddCharMap ('\u073F', 0x1, 1);
1849 AddCharMap ('\u0711', 0x1, 1);
1851 for (int i = 0x0743; i <= 0x0746; i++)
1852 AddCharMap ((char) i, 0x1, 1);
1853 for (int i = 0x0730; i <= 0x0780; i++)
1854 if (!map [i].Defined &&
1855 Char.GetUnicodeCategory ((char) i) ==
1856 UnicodeCategory.NonSpacingMark)
1857 AddCharMap ((char) i, 0x1, 1);
1859 // LAMESPEC: It should not stop at '\u20E1'. There are
1860 // a few more characters (that however results in
1861 // overflow of level 2 unless we start before 0xDD).
1862 fillIndex [0x1] = 0xDD;
1863 for (int i = 0x20D0; i <= 0x20DC; i++)
1864 AddCharMap ((char) i, 0x1, 1);
1865 fillIndex [0x1] = 0xEC;
1866 for (int i = 0x20DD; i <= 0x20E1; i++)
1867 AddCharMap ((char) i, 0x1, 1);
1868 fillIndex [0x1] = 0x7;
1869 for (int i = 0x302A; i <= 0x302D; i++)
1870 AddCharMap ((char) i, 0x1, 1);
1871 fillIndex [0x1] = 0x50; // I wonder how they are sorted
1872 for (int i = 0x02D4; i <= 0x02D7; i++)
1873 AddCharMap ((char) i, 0x1, 1);
1875 // They are not part of Nonspacing marks, but have
1876 // only diacritical weight.
1877 for (int i = 0x3099; i <= 0x309C; i++)
1878 map [i] = new CharMapEntry (1, 1, 1);
1879 map [0xFF9E] = new CharMapEntry (1, 1, 1);
1880 map [0xFF9F] = new CharMapEntry (1, 1, 2);
1881 map [0x309D] = new CharMapEntry (0xFF, 0xFF, 1);
1882 map [0x309E] = new CharMapEntry (0xFF, 0xFF, 1);
1883 for (int i = 0x30FC; i <= 0x30FE; i++)
1884 map [i] = new CharMapEntry (0xFF, 0xFF, 1);
1886 fillIndex [0x1] = 0xA;
1887 for (int i = 0x0951; i <= 0x0954; i++)
1888 AddCharMap ((char) i, 0x1, 2);
1893 #region Whitespaces // 07 03 -
1894 fillIndex [0x7] = 0x2;
1895 AddCharMap (' ', 0x7, 2);
1896 AddCharMap ('\u00A0', 0x7, 1);
1897 for (int i = 9; i <= 0xD; i++)
1898 AddCharMap ((char) i, 0x7, 1);
1899 for (int i = 0x2000; i <= 0x200B; i++)
1900 AddCharMap ((char) i, 0x7, 1);
1902 fillIndex [0x7] = 0x17;
1903 AddCharMapGroup ('\u2028', 0x7, 1, 0);
1904 AddCharMapGroup ('\u2029', 0x7, 1, 0);
1906 // Characters which used to represent layout control.
1907 // LAMESPEC: Windows developers seem to have thought
1908 // that those characters are kind of whitespaces,
1909 // while they aren't.
1910 AddCharMap ('\u2422', 0x7, 1, 0); // blank symbol
1911 AddCharMap ('\u2423', 0x7, 1, 0); // open box
1915 // category 09 - continued symbols from 08
1916 fillIndex [0x9] = 2;
1918 for (int cp = 0x2300; cp <= 0x237A; cp++)
1919 AddCharMap ((char) cp, 0x9, 1, 0);
1922 byte [] arrowLv2 = new byte [] {0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
1923 foreach (DictionaryEntry de in arrowValues) {
1924 int idx = (int) de.Value;
1925 int cp = (int) de.Key;
1926 if (map [cp].Defined)
1928 fillIndex [0x9] = (byte) (0xD8 + idx);
1929 AddCharMapGroup ((char) cp, 0x9, 0, arrowLv2 [idx]);
1933 byte [] boxLv2 = new byte [128];
1934 // 0-63 will be used for those whose offsets are positive,
1935 // and 64-127 are for negative ones.
1936 for (int i = 0; i < boxLv2.Length; i++)
1938 foreach (DictionaryEntry de in boxValues) {
1939 int cp = (int) de.Key;
1940 int off = (int) de.Value;
1941 if (map [cp].Defined)
1944 fillIndex [0x9] = (byte) (0xE5 + off);
1945 AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [128 + off]++);
1948 fillIndex [0x9] = (byte) (0xE5 + off);
1949 AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [off]++);
1952 // Some special characters (slanted)
1953 fillIndex [0x9] = 0xF4;
1954 AddCharMap ('\u2571', 0x9, 3);
1955 AddCharMap ('\u2572', 0x9, 3);
1956 AddCharMap ('\u2573', 0x9, 3);
1958 // FIXME: implement 0A
1960 fillIndex [0xA] = 2;
1961 // byte currency symbols
1962 for (int cp = 0; cp < 0x100; cp++) {
1963 uc = Char.GetUnicodeCategory ((char) cp);
1964 if (!IsIgnorable (cp) &&
1965 uc == UnicodeCategory.CurrencySymbol &&
1967 AddCharMapGroup ((char) cp, 0xA, 1, 0);
1969 // byte other symbols
1970 for (int cp = 0; cp < 0x100; cp++) {
1972 continue; // SPECIAL: skip FIXME: why?
1973 uc = Char.GetUnicodeCategory ((char) cp);
1974 if (!IsIgnorable (cp) &&
1975 uc == UnicodeCategory.OtherSymbol ||
1976 cp == '\u00AC' || cp == '\u00B5' || cp == '\u00B7')
1977 AddCharMapGroup ((char) cp, 0xA, 1, 0);
1980 AddCharMapGroup ('\u30FB', 0xA, 1, 0);
1982 for (int cp = 0x2020; cp <= 0x2031; cp++)
1983 if (Char.IsPunctuation ((char) cp))
1984 AddCharMap ((char) cp, 0xA, 1, 0);
1985 // SPECIAL CASES: why?
1986 AddCharMap ('\u203B', 0xA, 1, 0);
1987 AddCharMap ('\u2040', 0xA, 1, 0);
1988 AddCharMap ('\u2041', 0xA, 1, 0);
1989 AddCharMap ('\u2042', 0xA, 1, 0);
1991 for (int cp = 0x20A0; cp <= 0x20AB; cp++)
1992 AddCharMap ((char) cp, 0xA, 1, 0);
1994 // 3004 is skipped at first...
1995 for (int cp = 0x3010; cp <= 0x3040; cp++)
1996 if (Char.IsSymbol ((char) cp))
1997 AddCharMap ((char) cp, 0xA, 1, 0);
1998 // SPECIAL CASES: added here
1999 AddCharMap ('\u3004', 0xA, 1, 0);
2000 AddCharMap ('\u327F', 0xA, 1, 0);
2002 for (int cp = 0x2600; cp <= 0x2613; cp++)
2003 AddCharMap ((char) cp, 0xA, 1, 0);
2005 for (int cp = 0x2620; cp <= 0x2770; cp++)
2006 if (Char.IsSymbol ((char) cp))
2007 AddCharMap ((char) cp, 0xA, 1, 0);
2009 for (int i = 0x2440; i < 0x2460; i++)
2010 AddCharMap ((char) i, 0xA, 1, 0);
2012 // SPECIAL CASES: why?
2013 AddCharMap ('\u0E3F', 0xA, 1, 0);
2014 AddCharMap ('\u2117', 0xA, 1, 0);
2015 AddCharMap ('\u20AC', 0xA, 1, 0);
2018 #region Numbers // 0C 02 - 0C E1
2019 fillIndex [0xC] = 2;
2021 // 9F8 : Bengali "one less than the denominator"
2022 AddCharMap ('\u09F8', 0xC, 1, 0x3C);
2024 ArrayList numbers = new ArrayList ();
2025 for (int i = 0; i < 65536; i++)
2026 if (!IsIgnorable (i) &&
2027 Char.IsNumber ((char) i) &&
2028 (i < 0x3190 || 0x32C0 < i)) // they are CJK characters
2031 ArrayList numberValues = new ArrayList ();
2032 foreach (int i in numbers)
2033 numberValues.Add (new DictionaryEntry (i, decimalValue [(char) i]));
2034 // SPECIAL CASE: Cyrillic Thousand sign
2035 numberValues.Add (new DictionaryEntry (0x0482, 1000m));
2036 numberValues.Sort (DecimalDictionaryValueComparer.Instance);
2038 //foreach (DictionaryEntry de in numberValues)
2039 //Console.Error.WriteLine ("****** number {0:X04} : {1} {2}", de.Key, de.Value, decompType [(int) de.Key]);
2041 // FIXME: fillIndex adjustment lines are too
2042 // complicated. It must be simpler.
2043 decimal prevValue = -1;
2044 foreach (DictionaryEntry de in numberValues) {
2045 int cp = (int) de.Key;
2046 decimal currValue = (decimal) de.Value;
2047 bool addnew = false;
2048 if (prevValue < currValue &&
2049 prevValue - (int) prevValue == 0 &&
2053 // Process Hangzhou and Roman numbers
2055 // There are some SPECIAL cases.
2056 if (currValue != 4) // no increment for 4
2060 if (currValue <= 13) {
2064 if (currValue == 11)
2065 AddCharMap ('\u0BF0', 0xC, 1);
2066 xcp = (int) prevValue + 0x2160 - 1;
2067 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2068 xcp = (int) prevValue + 0x2170 - 1;
2069 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2074 if (currValue <= 10) {
2075 xcp = (int) prevValue + 0x3021 - 1;
2076 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2080 if (prevValue < currValue)
2081 prevValue = currValue;
2082 if (map [cp].Defined)
2084 // Hangzhou and Roman numerals are added later
2086 if (0x3021 <= cp && cp < 0x302A
2087 || 0x2160 <= cp && cp < 0x216C
2088 || 0x2170 <= cp && cp < 0x217C)
2091 if (cp == 0x215B) // FIXME: why?
2092 fillIndex [0xC] += 2;
2093 else if (cp == 0x3021) // FIXME: why?
2095 if (addnew || cp <= '9') {
2096 int mod = (int) currValue - 1;
2098 if (1 <= currValue && currValue <= 11) {
2100 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2102 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2104 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2106 if (1 <= currValue && currValue <= 20) {
2108 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2110 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2112 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2115 if (addnew && currValue >= 10 && currValue < 13 || cp == 0x09F9)
2117 AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp], true);
2120 // Maybe Bengali digit numbers do not increase
2121 // indexes, but 0x09E6 does.
2122 case 0x09E7: case 0x09E8: case 0x09E9:
2125 case 0x0BF0: case 0x2180: case 0x2181:
2132 if (currValue < 11 || currValue == 1000)
2137 // Add special cases that are not regarded as
2138 // numbers in UnicodeCategory speak.
2141 AddCharMapGroup ('\u01BD', 0xC, 0, 0);
2142 AddCharMapGroup ('\u01BC', 0xC, 1, 0);
2144 else if (cp == '2' || cp == '6') // FIXME: why?
2149 fillIndex [0xC] = 0xFF;
2150 AddCharMap ('\u221E', 0xC, 1);
2153 #region Letters and NonSpacing Marks (general)
2155 // ASCII Latin alphabets
2156 for (int i = 0; i < alphabets.Length; i++)
2157 AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]);
2159 // non-ASCII Latin alphabets
2160 // FIXME: there are no such characters that are placed
2161 // *after* "alphabets" array items. This is nothing
2162 // more than a hack that creates dummy weight for
2163 // primary characters.
2164 for (int i = 0x0080; i < 0x0300; i++) {
2165 if (!Char.IsLetter ((char) i))
2167 // Latin letters that have an NFKD decomposition are
2168 // not added as independent primary characters.
2169 if (decompIndex [i] != 0)
2172 // 1.some alphabets have primarily
2173 // equivalent ASCII alphabets.
2174 // 2.some have independent primary weights,
2175 // but inside a-to-z range.
2176 // 3.there are some expanded characters that
2177 // are not part of Unicode Standard NFKD.
2178 // 4. some characters are letter in IsLetter
2179 // but not in sortkeys (maybe unicode version
2180 // difference caused it).
2182 // 1. skipping them does not make sense
2183 // case 0xD0: case 0xF0: case 0x131: case 0x138:
2184 // case 0x184: case 0x185: case 0x186: case 0x189:
2185 // case 0x18D: case 0x18E: case 0x18F: case 0x190:
2186 // case 0x194: case 0x195: case 0x196: case 0x19A:
2187 // case 0x19B: case 0x19C:
2188 // 2. skipping them does not make sense
2189 // case 0x14A: // Ng
2190 // case 0x14B: // ng
2194 case 0xDE: // Icelandic Thorn
2195 case 0xFE: // Icelandic Thorn
2196 case 0xDF: // German ss
2197 case 0xFF: // German ss
2199 case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3:
2200 // not classified yet
2201 // case 0x1A6: case 0x1A7: case 0x1A8: case 0x1A9:
2202 // case 0x1AA: case 0x1B1: case 0x1B7: case 0x1B8:
2203 // case 0x1B9: case 0x1BA: case 0x1BB: case 0x1BF:
2207 AddCharMapGroup ((char) i, 0xE, 1, 0);
2211 fillIndex [0xF] = 02;
2212 for (int i = 0x0380; i < 0x0390; i++)
2213 if (Char.IsLetter ((char) i))
2214 AddLetterMap ((char) i, 0xF, 1);
2215 fillIndex [0xF] = 02;
2216 for (int i = 0x0391; i < 0x03CF; i++)
2217 if (Char.IsLetter ((char) i))
2218 AddLetterMap ((char) i, 0xF, 1);
2219 fillIndex [0xF] = 0x40;
2220 for (int i = 0x03D0; i < 0x0400; i++)
2221 if (Char.IsLetter ((char) i))
2222 AddLetterMap ((char) i, 0xF, 1);
2225 // Cyrillic letters are sorted like Latin letters i.e.
2226 // containing culture-specific letters between the
2227 // standard Cyrillic sequence.
2229 // We can't use UCA here; it has different sorting.
2230 char [] orderedCyrillic = new char [] {
2231 '\u0430', '\u0431', '\u0432', '\u0433', '\u0434',
2232 '\u0452', // DJE for Serbocroatian
2234 '\u0454', // IE for Ukrainian
2238 '\u0456', // Byelorussian-Ukrainian I
2248 '\u043F', '\u0440', '\u0441', '\u0442',
2249 '\u045B', // TSHE for Serbocroatian
2251 '\u045E', // Short U for Byelorussian
2252 '\u04B1', // Straight U w/ stroke (diacritical!)
2253 '\u0444', '\u0445', '\u0446', '\u0447',
2255 '\u0448', '\u0449', '\u044A', '\u044B', '\u044C',
2256 '\u044D', '\u044E', '\u044F'};
2258 // For some characters here is a map to basic cyrillic
2259 // letters. See UnicodeData.txt character names for
2260 // the sources. Here I simply declare an equiv. array.
2261 // The content characters are map from U+490(,491),
2262 // skipping small letters.
2263 char [] cymap_src = new char [] {
2264 '\u0433', '\u0433', '\u0433', '\u0436',
2265 '\u0437', '\u043A', '\u043A', '\u043A',
2266 '\u043A', '\u043D', '\u043D', '\u043F',
2267 '\u0445', '\u0441', '\u0442', '\u0443',
2268 '\u0443', '\u0445', '\u0446', '\u0447',
2269 '\u0447', '\u0432', '\u0435', '\u0435',
2270 '\u0406', '\u0436', '\u043A', '\u043D',
2271 '\u0447', '\u0435'};
2273 fillIndex [0x10] = 0x8D;
2274 for (int i = 0x0460; i < 0x0481; i++) {
2275 if (Char.IsLetter ((char) i)) {
2277 // U+476/477 have the same
2278 // primary weight as U+474/475.
2279 fillIndex [0x10] -= 3;
2280 AddLetterMap ((char) i, 0x10, 3);
2284 fillIndex [0x10] = 0x6;
2285 for (int i = 0; i < orderedCyrillic.Length; i++) {
2286 char c = Char.ToUpper (orderedCyrillic [i], CultureInfo.InvariantCulture);
2287 if (!IsIgnorable ((int) c) &&
2288 Char.IsLetter (c) &&
2290 AddLetterMap (c, 0x10, 0);
2291 fillIndex [0x10] += 3;
2295 for (int i = 0; i < cymap_src.Length; i++) {
2296 char c = cymap_src [i];
2297 fillIndex [0x10] = map [c].Level1;
2298 int c2 = 0x0490 + i * 2;
2299 AddLetterMapCore ((char) c2, 0x10, 0, diacritical [c2], false);
2303 fillIndex [0x11] = 0x3;
2304 fillIndex [0x1] = 0x98;
2305 for (int i = 0x0531; i < 0x0586; i++) {
2306 if (i == 0x0559 || i == 0x55A)
2307 AddCharMap ((char) i, 1, 1);
2308 if (Char.IsLetter ((char) i))
2309 AddLetterMap ((char) i, 0x11, 1);
2314 fillIndex [0x12] = 0x2;
2315 for (int i = 0x05D0; i < 0x05FF; i++)
2316 if (Char.IsLetter ((char) i))
2317 AddLetterMap ((char) i, 0x12, 1);
2319 fillIndex [0x1] = 0x3;
2320 for (int i = 0x0591; i <= 0x05C2; i++) {
2321 if (i == 0x05A3 || i == 0x05BB)
2324 AddCharMap ((char) i, 0x1, 1);
2328 fillIndex [0x1] = 0x8E;
2329 fillIndex [0x13] = 0x3;
2330 for (int i = 0x0621; i <= 0x064A; i++) {
2332 if (Char.GetUnicodeCategory ((char) i)
2333 != UnicodeCategory.OtherLetter) {
2334 // FIXME: arabic nonspacing marks are
2335 // in different order.
2336 AddCharMap ((char) i, 0x1, 1);
2339 // map [i] = new CharMapEntry (0x13,
2340 // (byte) arabicLetterPrimaryValues [i], 1);
2342 (byte) arabicLetterPrimaryValues [i];
2343 byte formDiacritical = 8; // default
2346 case 0x0622: formDiacritical = 9; break;
2347 case 0x0623: formDiacritical = 0xA; break;
2348 case 0x0624: formDiacritical = 5; break;
2349 case 0x0625: formDiacritical = 0xB; break;
2350 case 0x0626: formDiacritical = 7; break;
2351 case 0x0649: formDiacritical = 5; break;
2352 case 0x064A: formDiacritical = 7; break;
2354 AddLetterMapCore ((char) i, 0x13, 1, formDiacritical, false);
2356 for (int i = 0x0670; i < 0x0673; i++)
2357 map [i] = new CharMapEntry (0x13, 0xB, (byte) (0xC + i - 0x670));
2358 fillIndex [0x13] = 0x84;
2359 for (int i = 0x0674; i < 0x06D6; i++)
2360 if (Char.IsLetter ((char) i))
2361 AddLetterMapCore ((char) i, 0x13, 1, 0, false);
2365 // FIXME: this could be fixed in more decent way
2366 for (int i = 0x0958; i <= 0x095F; i++)
2367 diacritical [i] = 8;
2369 // FIXME: it does seem straight codepoint mapping.
2370 fillIndex [0x14] = 04;
2371 for (int i = 0x0901; i < 0x0905; i++)
2372 if (!IsIgnorable (i))
2373 AddLetterMap ((char) i, 0x14, 2);
2374 fillIndex [0x14] = 0xB;
2375 for (int i = 0x0905; i < 0x093A; i++) {
2377 AddCharMap ('\u0929', 0x14, 0, 8);
2379 AddCharMap ('\u0931', 0x14, 0, 8);
2381 AddCharMap ('\u0934', 0x14, 0, 8);
2382 if (Char.IsLetter ((char) i))
2383 AddLetterMap ((char) i, 0x14, 4);
2385 AddCharMap ('\u0960', 0x14, 4);
2387 AddCharMap ('\u0961', 0x14, 4);
2389 fillIndex [0x14] = 0xDA;
2390 for (int i = 0x093E; i < 0x0945; i++)
2391 if (!IsIgnorable (i))
2392 AddLetterMap ((char) i, 0x14, 2);
2393 fillIndex [0x14] = 0xEC;
2394 for (int i = 0x0945; i < 0x094F; i++)
2395 if (!IsIgnorable (i))
2396 AddLetterMap ((char) i, 0x14, 2);
2400 fillIndex [0x15] = 02;
2401 for (int i = 0x0980; i < 0x9FF; i++) {
2402 if (IsIgnorable (i))
2405 fillIndex [0x15] = 0x3B;
2406 switch (Char.GetUnicodeCategory ((char) i)) {
2407 case UnicodeCategory.NonSpacingMark:
2408 case UnicodeCategory.DecimalDigitNumber:
2409 case UnicodeCategory.OtherNumber:
2412 AddLetterMap ((char) i, 0x15, 1);
2415 fillIndex [0x1] = 0x3;
2416 for (int i = 0x0981; i < 0x0A00; i++)
2417 if (Char.GetUnicodeCategory ((char) i) ==
2418 UnicodeCategory.NonSpacingMark)
2419 AddCharMap ((char) i, 0x1, 1);
2421 // Gurmukhi. orderedGurmukhi is from UCA
2422 // FIXME: it does not look equivalent to UCA.
2423 fillIndex [0x16] = 04;
2424 fillIndex [0x1] = 3;
2425 for (int i = 0; i < orderedGurmukhi.Length; i++) {
2426 char c = orderedGurmukhi [i];
2427 if (IsIgnorable ((int) c))
2429 if (IsIgnorableNonSpacing (c)) {
2430 AddLetterMap (c, 0x1, 1);
2433 if (c == '\u0A3C' || c == '\u0A4D' ||
2434 '\u0A66' <= c && c <= '\u0A71')
2439 case '\u0A33': case '\u0A36': case '\u0A16':
2440 case '\u0A17': case '\u0A5B': case '\u0A5E':
2444 if (c == '\u0A3E') // Skip
2445 fillIndex [0x16] = 0xC0;
2446 AddLetterMap (c, 0x16, shift);
2449 // Gujarati. orderedGujarati is from UCA
2450 fillIndex [0x17] = 0x4;
2452 map [0x0A4D] = new CharMapEntry (1, 0, 0x3);
2453 map [0x0ABD] = new CharMapEntry (1, 0, 0x3);
2454 map [0x0A3C] = new CharMapEntry (1, 0, 0x4);
2455 map [0x0A71] = new CharMapEntry (1, 0, 0x6);
2456 map [0x0ABC] = new CharMapEntry (1, 0, 0xB);
2457 map [0x0A70] = new CharMapEntry (1, 0, 0xE);
2458 // letters go first.
2459 for (int i = 0; i < orderedGujarati.Length; i++) {
2461 char c = orderedGujarati [i];
2462 if (Char.IsLetter (c)) {
2464 if (c == '\u0AB3' || c == '\u0A32')
2466 if (c == '\u0A33') {
2467 AddCharMap ('\u0A32', 0x17, 0);
2468 AddCharMap ('\u0A33', 0x17, 4, 4);
2472 AddCharMap ('\u0AE0', 0x17, 0, 5);
2473 AddCharMap (c, 0x17, 4);
2476 AddCharMap ('\u0AB3', 0x17, 6);
2480 byte gujaratiShift = 4;
2481 fillIndex [0x17] = 0xC0;
2482 for (int i = 0; i < orderedGujarati.Length; i++) {
2483 char c = orderedGujarati [i];
2484 if (fillIndex [0x17] == 0xCC)
2486 if (!Char.IsLetter (c)) {
2489 AddCharMap ('\u0A81', 0x17, 2);
2492 AddLetterMap (c, 0x17, gujaratiShift);
2497 fillIndex [0x1] = 03;
2498 fillIndex [0x18] = 02;
2499 for (int i = 0x0B00; i < 0x0B7F; i++) {
2500 switch (Char.GetUnicodeCategory ((char) i)) {
2501 case UnicodeCategory.NonSpacingMark:
2502 case UnicodeCategory.DecimalDigitNumber:
2503 AddLetterMap ((char) i, 0x1, 1);
2506 AddLetterMap ((char) i, 0x18, 1);
2510 fillIndex [0x19] = 2;
2511 AddCharMap ('\u0BD7', 0x19, 0);
2512 fillIndex [0x19] = 0xA;
2514 for (int i = 0x0B82; i <= 0x0B94; i++)
2515 if (!IsIgnorable ((char) i))
2516 AddCharMap ((char) i, 0x19, 2);
2518 fillIndex [0x19] = 0x28;
2519 // The array for Tamil consonants is a constant.
2520 // Windows has almost the same sequence as TAM from
2521 // tamilnet, but it differs slightly in Grantha.
2522 for (int i = 0; i < orderedTamilConsonants.Length; i++)
2523 AddLetterMap (orderedTamilConsonants [i], 0x19, 4);
2525 fillIndex [0x19] = 0x82;
2526 for (int i = 0x0BBE; i < 0x0BCD; i++)
2527 if (Char.GetUnicodeCategory ((char) i) ==
2528 UnicodeCategory.SpacingCombiningMark
2530 AddLetterMap ((char) i, 0x19, 2);
2533 fillIndex [0x1A] = 0x4;
2534 for (int i = 0x0C00; i < 0x0C62; i++) {
2535 if (i == 0x0C55 || i == 0x0C56)
2537 AddCharMap ((char) i, 0x1A, 3);
2538 char supp = (i == 0x0C0B) ? '\u0C60':
2539 i == 0x0C0C ? '\u0C61' : char.MinValue;
2540 if (supp == char.MinValue)
2542 AddCharMap (supp, 0x1A, 3);
2546 fillIndex [0x1B] = 4;
2547 for (int i = 0x0C80; i < 0x0CE5; i++) {
2548 if (i == 0x0CD5 || i == 0x0CD6)
2550 if (i == 0x0CB1 || i == 0x0CB3 || i == 0x0CDE)
2551 continue; // shift after 0xCB9
2552 AddCharMap ((char) i, 0x1B, 3);
2554 // SPECIAL CASES: but why?
2555 AddCharMap ('\u0CB1', 0x1B, 3); // RRA
2556 AddCharMap ('\u0CB3', 0x1B, 3); // LLA
2557 AddCharMap ('\u0CDE', 0x1B, 3); // FA
2560 AddCharMap ('\u0CE1', 0x1B, 3); // vocalic LL
2564 fillIndex [0x1C] = 2;
2565 fillIndex [0x1] = 3;
2566 for (int i = 0x0D02; i < 0x0D61; i++) {
2567 // FIXME: I avoided MSCompatUnicodeTable usage
2568 // here (it results in recursion). So check if
2569 // using NonSpacingMark makes sense or not.
2570 if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark)
2571 // if (!MSCompatUnicodeTable.IsIgnorable ((char) i))
2572 AddCharMap ((char) i, 0x1C, 1);
2573 else if (!IsIgnorable ((char) i))
2574 AddCharMap ((char) i, 1, 1);
2577 // Thai ... note that it breaks 0x1E wall after E2B!
2578 // Also, all Thai characters have level 2 value 3.
2579 fillIndex [0x1E] = 2;
2580 fillIndex [0x1] = 3;
2581 for (int i = 0xE40; i <= 0xE44; i++)
2582 AddCharMap ((char) i, 0x1E, 1, 3);
2583 for (int i = 0xE01; i < 0xE2B; i++)
2584 AddCharMap ((char) i, 0x1E, 6, 3);
2585 fillIndex [0x1F] = 5;
2586 for (int i = 0xE2B; i < 0xE30; i++)
2587 AddCharMap ((char) i, 0x1F, 6, 3);
2588 fillIndex [0x1F] = 0x1E;
2589 for (int i = 0xE30; i < 0xE3B; i++)
2590 AddCharMap ((char) i, 0x1F, 1, 3);
2591 // Some Thai characters remain.
2592 char [] specialThai = new char [] {'\u0E45', '\u0E46',
2593 '\u0E4E', '\u0E4F', '\u0E5A', '\u0E5B'};
2594 foreach (char c in specialThai)
2595 AddCharMap (c, 0x1F, 1, 3);
2597 for (int i = 0xE00; i < 0xE80; i++)
2598 if (Char.GetUnicodeCategory ((char) i) ==
2599 UnicodeCategory.NonSpacingMark)
2600 AddCharMap ((char) i, 1, 1);
2603 fillIndex [0x1F] = 2;
2604 fillIndex [0x1] = 3;
2605 for (int i = 0xE80; i < 0xEDF; i++) {
2606 if (IsIgnorable ((char) i))
2608 else if (Char.IsLetter ((char) i))
2609 AddCharMap ((char) i, 0x1F, 1);
2610 else if (Char.GetUnicodeCategory ((char) i) ==
2611 UnicodeCategory.NonSpacingMark)
2612 AddCharMap ((char) i, 1, 1);
2615 // Georgian. orderedGeorgian is from UCA DUCET.
2616 fillIndex [0x21] = 5;
2617 for (int i = 0; i < orderedGeorgian.Length; i++) {
2618 char c = orderedGeorgian [i];
2619 if (map [(int) c].Defined)
2621 AddCharMap (c, 0x21, 0);
2623 AddCharMap ((char) (c - 0x30), 0x21, 0);
2624 fillIndex [0x21] += 5;
2628 fillIndex [0x22] = 2;
2629 int kanaOffset = 0x3041;
2630 byte [] kanaLines = new byte [] {2, 2, 2, 2, 1, 3, 1, 2, 1};
2632 for (int gyo = 0; gyo < 9; gyo++) {
2633 for (int dan = 0; dan < 5; dan++) {
2634 if (gyo == 7 && dan % 2 == 1) {
2637 kanaOffset -= 2; // There is no space for yi and ye.
2640 int cp = kanaOffset + dan * kanaLines [gyo];
2641 // small lines (a-gyo, ya-gyo)
2642 if (gyo == 0 || gyo == 7) {
2643 AddKanaMap (cp, 1); // small
2644 AddKanaMap (cp + 1, 1);
2647 AddKanaMap (cp, kanaLines [gyo]);
2651 // add small 'ka' (before normal one)
2652 AddKanaMap (0x30F5, 1);
2656 // add small 'ke' (before normal one)
2657 AddKanaMap (0x30F6, 1);
2661 // add small 'Tsu' (before normal one)
2662 AddKanaMap (0x3063, 1);
2666 fillIndex [0x22] += 3;
2667 kanaOffset += 5 * kanaLines [gyo];
2670 // Wa-gyo is almost special, so I just manually add.
2671 AddLetterMap ((char) 0x308E, 0x22, 0);
2672 AddLetterMap ((char) (0x308E + 0x60), 0x22, 0);
2673 AddLetterMap ((char) 0x308F, 0x22, 0);
2674 AddLetterMap ((char) (0x308F + 0x60), 0x22, 0);
2676 AddLetterMap ((char) 0x3090, 0x22, 0);
2677 AddLetterMap ((char) (0x3090 + 0x60), 0x22, 0);
2678 fillIndex [0x22] += 2;
2679 // no "Wu" in Japanese.
2680 AddLetterMap ((char) 0x3091, 0x22, 0);
2681 AddLetterMap ((char) (0x3091 + 0x60), 0x22, 0);
2683 AddLetterMap ((char) 0x3092, 0x22, 0);
2684 AddLetterMap ((char) (0x3092 + 0x60), 0x22, 0);
2686 fillIndex [0x22] = 0x80;
2687 AddLetterMap ((char) 0x3093, 0x22, 0);
2688 AddLetterMap ((char) (0x3093 + 0x60), 0x22, 0);
2690 map [0x3094] = new CharMapEntry (map [0x30A6].Category,
2691 map [0x30A6].Level1, 3);// voiced hiragana U
2692 map [0x30F4] = new CharMapEntry (map [0x30A6].Category,
2693 map [0x30A6].Level1, 3);// voiced katakana U
2695 map [0x30F5] = new CharMapEntry (map [0x30AB].Category,
2696 map [0x30AB].Level1, 0);// small katakana Ka
2697 map [0x30F6] = new CharMapEntry (map [0x30B1].Category,
2698 map [0x30B1].Level1, 0);// small katakana Ke
2700 for (int i = 0x30F7; i < 0x30FB; i++)
2701 map [i] = new CharMapEntry (map [i - 8].Category,
2705 // JIS Japanese square chars.
2706 fillIndex [0x22] = 0x97;
2707 jisJapanese.Sort (JISComparer.Instance);
2708 foreach (JISCharacter j in jisJapanese)
2709 if (0x3300 <= j.CP && j.CP <= 0x3357)
2710 AddCharMap ((char) j.CP, 0x22, 1);
2711 // non-JIS Japanese square chars.
2712 nonJisJapanese.Sort (NonJISComparer.Instance);
2713 foreach (NonJISCharacter j in nonJisJapanese)
2714 AddCharMap ((char) j.CP, 0x22, 1);
2717 fillIndex [0x23] = 0x02;
2718 for (int i = 0x3105; i <= 0x312C; i++)
2719 AddCharMap ((char) i, 0x23, 1);
2721 // Estrangela: ancient Syriac
2722 fillIndex [0x24] = 0x0B;
2723 // FIXME: is 0x71E really alternative form?
2724 ArrayList syriacAlternatives = new ArrayList (
2725 new int [] {0x714, 0x716, 0x71C, 0x71E, 0x724, 0x727});
2726 for (int i = 0x0710; i <= 0x072C; i++) {
2727 if (i == 0x0711) // NonSpacingMark
2729 if (syriacAlternatives.Contains (i))
2731 AddCharMap ((char) i, 0x24, 4);
2736 foreach (int cp in syriacAlternatives)
2737 map [cp] = new CharMapEntry (0x24,
2738 (byte) (map [cp - 1].Level1 + 2),
2740 // FIXME: Syriac NonSpacingMark should go here.
2743 // FIXME: it turned out that it does not look like UCA
2744 fillIndex [0x24] = 0x6E;
2745 fillIndex [0x1] = 0xAC;
2746 for (int i = 0; i < orderedThaana.Length; i++) {
2747 char c = orderedThaana [i];
2748 if (IsIgnorableNonSpacing ((int) c))
2749 AddCharMap (c, 1, 1);
2750 AddCharMap (c, 0x24, 2);
2751 if (c == '\u0782') // SPECIAL CASE: why?
2752 fillIndex [0x24] += 2;
2756 // FIXME: Add more culture-specific letters (that are
2757 // not supported in Windows collation) here.
2759 // Surrogate ... they are computed.
2764 // Unlike UCA Windows Hangul sequence mixes Jongseong
2765 // with Choseong sequence as well as Jungseong,
2766 // adjusted to have the same primary weight for the
2767 // same base character. So it is impossible to compute
2770 // Here I introduce an ordered sequence of mixed
2771 // 'commands' and 'characters' that is similar to
2773 // - ',' increases primary weight.
2774 // - [A B] means a range, increasing index
2775 // - {A B} means a range, without increasing index
2776 // - '=' is no operation (it means the characters
2777 // of both sides have the same weight).
2778 // - '>' inserts a Hangul Syllable block that
2779 // contains 0x251 characters.
2780 // - '<' decreases the index
2781 // - '0'-'9' means skip count
2782 // - whitespaces are ignored
2785 string hangulSequence =
2786 + "\u1100=\u11A8 > \u1101=\u11A9 >"
2787 + "\u11C3, \u11AA, \u11C4, \u1102=\u11AB >"
2788 + "<{\u1113 \u1116}, \u3165,"
2789 + "\u11C5, \u11C6=\u3166,, \u11C7, \u11C8,"
2790 + "\u11AC, \u11C9, \u11AD, \u1103=\u11AE >"
2791 + "<\u1117, \u11CA, \u1104, \u11CB > \u1105=\u11AF >"
2792 + "<{\u1118 \u111B}, \u11B0, [\u11CC \u11D0], \u11B1,"
2793 + "[\u11D1 \u11D2], \u11B2,"
2794 + "[\u11D3 \u11D5], \u11B3,"
2795 + "[\u11D6 \u11D7], \u11B4, \u11B5,"
2796 + "\u11B6=\u11D8, \u3140,, \u11D9, \u1106=\u11B7 >"
2797 + "<{\u111C \u111D}, [\u11DA \u11E2], \u1107=\u11B8 >"
2798 + "<{\u111E \u1120}, \u3172,, \u3173, \u11E3, \u1108 >"
2799 + "<{\u1121 \u112C}, \u3144 \u11B9, \u3174, \u3175,,,, "
2800 + "\u3176,, \u3177, [\u11E4 \u11E6] \u3178,"
2801 + "\u3179, \u1109=\u11BA,,, \u3214=\u3274 <>"
2802 + "<{\u112D \u1133}, \u11E7 \u317A, \u317B, \u317C "
2803 + "[\u11E8 \u11E9],, \u11EA \u317D,, \u110A=\u11BB,,, >"
2804 + "<{\u1134 \u1140}, \u317E,,,,,,, \u11EB,"
2805 + "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >"
2806 + "<{\u1141 \u114C}, \u3180=\u11EE, \u11EC, \u11ED,,,,, "
2807 + "\u11F1,, \u11F2,,,"
2808 + "\u11EF,,, \u3181=\u11F0, \u110C=\u11BD,, >"
2809 + "<\u114D, \u110D,, >"
2810 + "<{\u114E \u1151},, \u110E=\u11BE,, >"
2811 + "<{\u1152 \u1155},,, \u110F=\u11BF >"
2812 + "\u1110=\u11C0 > \u1111=\u11C1 >"
2813 + "<\u1156=\u1157, \u11F3, \u11F4, \u1112=\u11C2 >"
2814 + "<\u1158=\u1159=\u115F, \u3185, \u11F9,"
2818 byte hangulCat = 0x52;
2819 fillIndex [hangulCat] = 0x2;
2821 int syllableBlock = 0;
2822 for (int n = 0; n < hangulSequence.Length; n++) {
2823 char c = hangulSequence [n];
2825 if (Char.IsWhiteSpace (c))
2831 IncrementSequentialIndex (ref hangulCat);
2834 if (fillIndex [hangulCat] == 2)
2835 throw new Exception ("FIXME: handle it correctly (yes it is hacky, it is really unfortunate).");
2836 fillIndex [hangulCat]--;
2839 IncrementSequentialIndex (ref hangulCat);
2840 for (int l = 0; l < 0x15; l++)
2841 for (int v = 0; v < 0x1C; v++) {
2843 (char) (0xAC00 + syllableBlock * 0x1C * 0x15 + l * 0x1C + v), hangulCat, 0);
2844 IncrementSequentialIndex (ref hangulCat);
2849 start = hangulSequence [n + 1];
2850 end = hangulSequence [n + 3];
2851 for (int i = start; i <= end; i++) {
2852 AddCharMap ((char) i, hangulCat, 0);
2854 IncrementSequentialIndex (ref hangulCat);
2856 n += 4; // consumes 5 characters for this operation
2859 start = hangulSequence [n + 1];
2860 end = hangulSequence [n + 3];
2861 for (int i = start; i <= end; i++)
2862 AddCharMap ((char) i, hangulCat, 0);
2863 n += 4; // consumes 5 characters for this operation
2866 AddCharMap (c, hangulCat, 0);
2872 for (int i = 0x3200; i < 0x3300; i++) {
2873 if (IsIgnorable (i) || map [i].Defined)
2877 if (decompLength [i] == 4 &&
2878 decompValues [decompIndex [i]] == '(')
2879 ch = decompIndex [i] + 1;
2881 else if (decompLength [i] == 2 &&
2882 decompValues [decompIndex [i] + 1] == '\u1161')
2883 ch = decompIndex [i];
2884 else if (decompLength [i] == 1)
2885 ch = decompIndex [i];
2888 ch = decompValues [ch];
2889 if (ch < 0x1100 || 0x1200 < ch &&
2890 ch < 0xAC00 || 0xD800 < ch)
2894 int offset = i < 0x3260 ? 1 : 0;
2895 if (0x326E <= i && i <= 0x3273)
2898 map [i] = new CharMapEntry (map [ch].Category,
2899 (byte) (map [ch].Level1 + offset),
2901 // Console.Error.WriteLine ("Jamo {0:X04} -> {1:X04}", i, decompValues [decompIndex [i] + 1]);
2907 // Letterlike characters and CJK compatibility square
2908 sortableCharNames.Sort (StringDictionaryValueComparer.Instance);
2909 int [] counts = new int ['Z' - 'A' + 1];
2910 char [] namedChars = new char [sortableCharNames.Count];
2912 foreach (DictionaryEntry de in sortableCharNames) {
2913 counts [((string) de.Value) [0] - 'A']++;
2914 namedChars [nCharNames++] = (char) ((int) de.Key);
2916 nCharNames = 0; // reset
2917 for (int a = 0; a < counts.Length; a++) {
2918 fillIndex [0xE] = (byte) (alphaWeights [a + 1] - counts [a]);
2919 for (int i = 0; i < counts [a]; i++)
2920 //Console.Error.WriteLine ("---- {0:X04} : {1:x02} / {2} {3}", (int) namedChars [nCharNames], fillIndex [0xE], ((DictionaryEntry) sortableCharNames [nCharNames]).Value, Char.GetUnicodeCategory (namedChars [nCharNames]));
2921 AddCharMap (namedChars [nCharNames++], 0xE, 1);
2924 // CJK unified ideograph.
2926 fillIndex [cjkCat] = 0x2;
2927 for (int cp = 0x4E00; cp <= 0x9FBB; cp++)
2928 if (!IsIgnorable (cp))
2929 AddCharMapGroupCJK ((char) cp, ref cjkCat);
2930 // CJK Extensions go here.
2931 // LAMESPEC: With this Windows style CJK layout, it is
2932 // impossible to add more CJK ideograph i.e. 0x9FA6-
2933 // 0x9FBB can never be added w/o breaking compat.
2934 for (int cp = 0xF900; cp <= 0xFA2D; cp++)
2935 if (!IsIgnorable (cp))
2936 AddCharMapGroupCJK ((char) cp, ref cjkCat);
2938 // PrivateUse ... computed.
2939 // remaining Surrogate ... computed.
2941 #region 07 - ASCII non-alphanumeric + 3001, 3002 // 07
2942 // non-alphanumeric ASCII except for: + - < = > '
2943 for (int i = 0x21; i < 0x7F; i++) {
2944 // SPECIAL CASE: 02C6 appears to be treated as
2945 // equivalent to '^', which does not conform
2946 // to the Unicode Character Database.
2948 AddCharMap ('\u2045', 0x7, 0, 0x1C);
2950 AddCharMap ('\u2046', 0x7, 0, 0x1C);
2952 AddCharMap ('\u02C6', 0x7, 0, 3);
2954 AddCharMap ('\u02CB', 0x7, 0, 3);
2956 if (Char.IsLetterOrDigit ((char) i)
2957 || "+-<=>'".IndexOf ((char) i) >= 0)
2958 continue; // they are not added here.
2960 AddCharMapGroup2 ((char) i, 0x7, 1, 0);
2961 // Insert 3001 after ',' and 3002 after '.'
2963 AddCharMapGroup2 ('\u3001', 0x7, 1, 0);
2965 AddCharMapGroup2 ('\u3002', 0x7, 1, 0);
2967 AddCharMap ('\uFE30', 0x7, 1, 0);
2971 #region 07 - Punctuations and something else
2972 for (int i = 0xA0; i < char.MaxValue; i++) {
2973 if (IsIgnorable (i))
2976 // FIXME: actually those reset should not be
2977 // done but here I put for easy goal.
2981 fillIndex [0x7] = 0xE2;
2983 fillIndex [0x7] = 0x77;
2985 fillIndex [0x7] = 0x93;
2987 if (0x02C8 <= i && i <= 0x02CD)
2988 continue; // nonspacing marks
2990 // SPECIAL CASE: maybe they could be allocated
2991 // dummy NFKD mapping and no special processing
2992 // would be required here.
2994 AddCharMap ('\u02C9', 0x7, 0, 3);
2996 AddCharMap ('\u02CA', 0x7, 0, 3);
2998 AddCharMap ('\u02D8', 0x7, 0, 3);
3012 switch (Char.GetUnicodeCategory ((char) i)) {
3013 case UnicodeCategory.OtherPunctuation:
3014 case UnicodeCategory.ClosePunctuation:
3015 case UnicodeCategory.OpenPunctuation:
3016 case UnicodeCategory.ConnectorPunctuation:
3017 case UnicodeCategory.InitialQuotePunctuation:
3018 case UnicodeCategory.FinalQuotePunctuation:
3019 case UnicodeCategory.ModifierSymbol:
3020 // SPECIAL CASES: // 0xA
3021 if (0x2020 <= i && i <= 0x2031)
3023 if (i == 0x3003) // added later
3025 AddCharMapGroup2 ((char) i, 0x7, 1, 0);
3028 if (i == 0xA6 || i == 0x1C3 || i == 0x037A) // SPECIAL CASE. FIXME: why?
3029 goto case UnicodeCategory.OtherPunctuation;
3035 // FIXME: it should not need to reset level 1, but
3036 // it's for easy goal.
3037 fillIndex [0x7] = 0xB6;
3038 for (int i = 0x2400; i <= 0x2424; i++)
3039 AddCharMap ((char) i, 0x7, 1, 0);
3041 // FIXME: what are they?
3042 AddCharMap ('\u3003', 0x7, 1);
3043 AddCharMap ('\u3006', 0x7, 1);
3044 AddCharMap ('\u02D0', 0x7, 1);
3045 AddCharMap ('\u10FB', 0x7, 1);
3046 AddCharMap ('\u0950', 0x7, 1);
3047 AddCharMap ('\u093D', 0x7, 1);
3048 AddCharMap ('\u0964', 0x7, 1);
3049 AddCharMap ('\u0965', 0x7, 1);
3050 AddCharMap ('\u0970', 0x7, 1);
3054 #region category 08 - symbols
3055 fillIndex [0x8] = 2;
3056 // Here the Windows mapping is not straightforward. It
3057 // is not computed; it appears to be sorted manually.
3058 AddCharMapGroup ('+', 0x8, 1, 0); // plus
3059 AddCharMapGroup ('\u2212', 0x8, 1, 0); // minus
3060 AddCharMapGroup ('\u229D', 0x8, 1, 0); // minus
3061 AddCharMapGroup ('\u2297', 0x8, 1, 0); // mul
3062 AddCharMapGroup ('\u2044', 0x8, 1, 0); // div
3063 AddCharMapGroup ('\u2215', 0x8, 1, 0); // div
3064 AddCharMapGroup ('\u2217', 0x8, 1, 0); // mul
3065 AddCharMapGroup ('\u2218', 0x8, 1, 0); // ring
3066 AddCharMapGroup ('\u2219', 0x8, 1, 0); // bullet
3067 AddCharMapGroup ('\u2213', 0x8, 1, 0); // minus-or-plus
3068 AddCharMapGroup ('\u003C', 0x8, 1, 0); // <
3069 AddCharMapGroup ('\u227A', 0x8, 1, 0); // precedes relation
3070 AddCharMapGroup ('\u22B0', 0x8, 1, 0); // precedes under relation
3072 for (int cp = 0; cp < 0x2300; cp++) {
3073 if (cp == 0xAC) // SPECIAL CASE: skip
3076 cp = 0x2200; // skip to 2200
3077 fillIndex [0x8] = 0x21;
3080 fillIndex [0x8] = 0x3;
3082 fillIndex [0x8] = 0xAB;
3084 fillIndex [0x8] = 0xB9;
3085 if (!map [cp].Defined &&
3086 // Char.GetUnicodeCategory ((char) cp) ==
3087 // UnicodeCategory.MathSymbol)
3088 Char.IsSymbol ((char) cp))
3089 AddCharMapGroup ((char) cp, 0x8, 1, diacritical [cp]);
3090 // SPECIAL CASES: no idea why Windows sorts as such
3093 AddCharMap ('\u227B', 0x8, 1, 0);
3094 AddCharMap ('\u22B1', 0x8, 1, 0);
3097 AddCharMapGroup ('\u00AB', 0x8, 1, 0);
3098 AddCharMapGroup ('\u226A', 0x8, 1, 0);
3099 AddCharMapGroup ('\u00BB', 0x8, 1, 0);
3100 AddCharMapGroup ('\u226B', 0x8, 1, 0);
3103 AddCharMap ('\u01C0', 0x8, 1, 0);
3104 AddCharMap ('\u01C1', 0x8, 1, 0);
3105 AddCharMap ('\u01C2', 0x8, 1, 0);
3113 // Characters w/ diacritical marks (NFKD)
3114 for (int i = 0; i <= char.MaxValue; i++) {
3115 if (map [i].Defined || IsIgnorable (i))
3117 if (decompIndex [i] == 0)
3120 int start = decompIndex [i];
3121 int primaryChar = decompValues [start];
3122 int secondary = diacritical [i];
3124 int length = decompLength [i];
3125 // special processing for parenthesized ones.
3127 decompValues [start] == '(' &&
3128 decompValues [start + 2] == ')') {
3129 primaryChar = decompValues [start + 1];
3133 if (map [primaryChar].Level1 == 0)
3136 for (int l = 1; l < length; l++) {
3137 int c = decompValues [start + l];
3138 if (map [c].Level1 != 0)
3140 secondary += diacritical [c];
3144 map [i] = new CharMapEntry (
3145 map [primaryChar].Category,
3146 map [primaryChar].Level1,
3151 // Diacritical weight adjustment
3154 diacritical [0x624] = 0x5;
3155 diacritical [0x626] = 0x7;
3156 diacritical [0x622] = 0x9;
3157 diacritical [0x623] = 0xA;
3158 diacritical [0x625] = 0xB;
3159 diacritical [0x649] = 0x5; // 'alif maqs.uurah
3160 diacritical [0x64A] = 0x7; // Yaa'
3162 for (int i = 0; i < char.MaxValue; i++) {
3164 byte cat = map [i].Category;
3166 case 0xE: // Latin diacritics
3167 case 0x22: // Japanese: circled characters
3168 mod = diacritical [i];
3170 case 0x13: // Arabic
3171 if (diacritical [i] == 0 && i >= 0xFE8D)
3172 mod = 0x8; // default for arabic
3175 if (0x52 <= cat && cat <= 0x7F) // Hangul
3176 mod = diacritical [i];
3178 map [i] = new CharMapEntry (
3179 cat, map [i].Level1, mod);
3182 // FIXME: this is halfly hack but those NonSpacingMark
3183 // characters and still undefined are likely to
3185 for (int i = 0; i < char.MaxValue; i++) {
3186 if (map [i].Defined ||
3195 if (Char.GetUnicodeCategory ((char) i) !=
3196 UnicodeCategory.NonSpacingMark)
3200 if (diacritical [i] != 0)
3201 map [i] = new CharMapEntry (1, 1, diacritical [i]);
3203 AddCharMap ((char) i, 1, 1);
// Advances the fill index (the next weight to hand out) of the category
// referenced by hangulCat by one.
// fillIndex holds bytes, so stepping past 0xFF wraps to 0; when that is
// detected the index is restarted at 0x2.
// NOTE(review): hangulCat is passed by ref, so the wrap-around branch
// presumably also moves on to the next category -- that statement is not
// visible in this excerpt; confirm against the full file.
3209 private void IncrementSequentialIndex (ref byte hangulCat)
3211 fillIndex [hangulCat]++;
3212 if (fillIndex [hangulCat] == 0) { // wrapped around (byte overflow)
3214 fillIndex [hangulCat] = 0x2;
3218 // Reset fillIndex to fixed value and call AddLetterMap().
// c           : the letter to map.
// category    : category whose fill index is overwritten.
// alphaWeight : the value stored into fillIndex [category] before mapping.
// After c itself, every code point listed in latinMap [c] is mapped with
// the same call (update count 0, so the fill index is not advanced here).
3219 private void AddAlphaMap (char c, byte category, byte alphaWeight)
3221 fillIndex [category] = alphaWeight;
3222 AddLetterMap (c, category, 0);
// latinMap [c] is read as an ArrayList of int code points.
// NOTE(review): "as" yields null when there is no entry; the guard is
// presumably on a line not visible in this excerpt -- confirm.
3224 ArrayList al = latinMap [c] as ArrayList;
3228 foreach (int cp in al)
3229 AddLetterMap ((char) cp, category, 0);
// Maps "voices" consecutive code points starting at i into category 0x22,
// and for each of them also the code point 0x60 above it.
// NOTE(review): the 0x60 offset presumably selects the katakana
// counterpart of a hiragana code point -- confirm against callers.
// The fill index is not advanced here (update count 0 is passed).
3232 private void AddKanaMap (int i, byte voices)
3234 for (byte b = 0; b < voices; b++) {
3235 char c = (char) (i + b);
// The first code point of the run gets level2 argument 0; the b-th
// following one gets b + 2.
3236 byte arg = (byte) (b > 0 ? b + 2 : 0);
3238 AddLetterMapCore (c, 0x22, 0, arg, false);
3240 AddLetterMapCore ((char) (c + 0x60), 0x22, 0, arg, false);
// Convenience overload of AddLetterMapCore(): passes level2 = 0 and
// deferLevel2 = true.
3244 private void AddLetterMap (char c, byte category, byte updateCount)
3246 AddLetterMapCore (c, category, updateCount, 0, true);
// Maps a letter together with its related forms into "category":
//   1. the result of ToSmallForm (c), via AddCharMapGroup() with updateCount;
//   2. the invariant-culture lowercase of c (recursively, update count 0)
//      when it differs from c and is not yet defined in map;
//   3. c itself, via AddCharMapGroup() with update count 0.
// Finally fillIndex [category] is advanced by updateCount.
// NOTE(review): several lines (declaration of c2, the conditions around
// steps 1 and 3, and the use of doUpdate) are not visible in this excerpt,
// so which of these steps are conditional cannot be stated from here.
3249 private void AddLetterMapCore (char c, byte category, byte updateCount, byte level2, bool deferLevel2)
3252 // <small> updates index
3253 c2 = ToSmallForm (c);
3255 AddCharMapGroup (c2, category, updateCount, level2, deferLevel2);
3256 c2 = Char.ToLower (c, CultureInfo.InvariantCulture);
3257 if (c2 != c && !map [(int) c2].Defined)
3258 AddLetterMapCore (c2, category, 0, level2, deferLevel2);
3259 bool doUpdate = true;
// c is either ignorable or already mapped.
3260 if (IsIgnorable ((int) c) || map [(int) c].Defined)
3263 AddCharMapGroup (c, category, 0, level2, deferLevel2);
3265 fillIndex [category] += updateCount;
// Convenience overload: same as AddCharMap (c, category, increment, 0).
3268 private bool AddCharMap (char c, byte category, byte increment)
3270 return AddCharMap (c, category, increment, 0);
// Assigns a map entry to c unless c is ignorable or already defined, then
// advances fillIndex [category] by "increment".
// Returns false when nothing was done.
// NOTE(review): the success return is on a line not visible in this
// excerpt; presumably true.
// For category 1 the roles of the two remaining constructor arguments are
// swapped: "alt" is passed second and the fill index third. For every
// other category the fill index is passed second and "alt" third.
3273 private bool AddCharMap (char c, byte category, byte increment, byte alt)
3275 if (IsIgnorable ((int) c) || map [(int) c].Defined)
3276 return false; // do nothing
3277 map [(int) c] = new CharMapEntry (category,
3278 category == 1 ? alt : fillIndex [category],
3279 category == 1 ? fillIndex [category] : alt);
3280 fillIndex [category] += increment;
3285 // Adds characters to table in the order below
3286 // (+ increases weight):
3290 // <full> | <super> | <sub>
3291 // <circle> | <wide> (| <narrow>)
3295 // level2 is fixed (does not increase).
// Decomposition types iterated by AddCharMapGroup(): the character found
// in the nfkd table for each of these types is added with increment 0,
// i.e. without advancing the fill index.
// NOTE(review): some initializer entries are not visible in this excerpt.
3296 int [] sameWeightItems = new int [] {
3297 DecompositionFraction,
3301 DecompositionCircle,
3303 DecompositionNarrow,
// Convenience overload: calls the five-argument AddCharMapGroup() with
// deferLevel2 = false.
3305 private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2)
3307 AddCharMapGroup (c, category, updateCount, level2, false);
// Maps c and the characters related to it through the nfkdMap table, in
// this order:
//   - the DecompositionSmall form (added with updateCount),
//   - c itself (increment 0),
//   - the forms listed in sameWeightItems (increment 0),
//   - then fillIndex [category] is advanced by updateCount,
//   - the DecompositionVertical form (added with updateCount).
// When deferLevel2 is set and level2 is still 0, the level2 value is
// taken from the diacritical table of the related character instead.
// NOTE(review): null checks on the table lookups and several branch
// bodies are on lines not visible in this excerpt.
3310 private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2, bool deferLevel2)
// Tests whether c is already mapped.
3312 if (map [(int) c].Defined)
3316 level2 = diacritical [(int) c];
// char.MinValue serves as "no such form" below.
3318 char small = char.MinValue;
3319 char vertical = char.MinValue;
// Per-character table keyed by decomposition type (as a byte).
3320 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
3322 object smv = nfkd [(byte) DecompositionSmall];
3324 small = (char) ((int) smv);
3325 object vv = nfkd [(byte) DecompositionVertical];
3327 vertical = (char) ((int) vv);
3330 // <small> updates index
3331 if (small != char.MinValue) {
3332 if (level2 == 0 && deferLevel2)
3333 level2 = diacritical [small];
3334 AddCharMap (small, category, updateCount, level2);
3338 AddCharMap (c, category, 0, level2);
3341 foreach (int weight in sameWeightItems) {
3342 object wv = nfkd [(byte) weight];
3345 level2 = diacritical [(int) wv];
3346 AddCharMap ((char) ((int) wv), category, 0, level2);
3351 // update index here.
3352 fillIndex [category] += updateCount;
3354 if (vertical != char.MinValue) {
3355 if (level2 == 0 && deferLevel2)
3356 level2 = diacritical [vertical];
3357 AddCharMap (vertical, category, updateCount, level2);
// Maps one CJK character at the current position of "category" (increment
// 0, alt 0) and then steps the position with IncrementSequentialIndex(),
// which receives the category by ref.
3361 private void AddCharMapCJK (char c, ref byte category)
3363 AddCharMap (c, category, 0, 0);
3364 IncrementSequentialIndex (ref category);
3366 // Special. I wonder why but Windows skips 9E F9.
// Steps once more when the position is category 0x9E, index 0xF9.
3367 if (category == 0x9E && fillIndex [category] == 0xF9)
3368 IncrementSequentialIndex (ref category);
// Maps a CJK character with AddCharMapCJK(), followed by hard-coded
// companions for specific ideographs and by the characters found in its
// nfkdMap table.
// NOTE(review): several conditions and the declaration of "w" are on
// lines not visible in this excerpt.
3371 private void AddCharMapGroupCJK (char c, ref byte category)
3373 AddCharMapCJK (c, ref category);
3375 // LAMESPEC: see below.
3376 if (c == '\u5B78') {
3377 AddCharMapCJK ('\u32AB', ref category);
3378 AddCharMapCJK ('\u323B', ref category);
3380 if (c == '\u52DE') {
3381 AddCharMapCJK ('\u3298', ref category);
3382 AddCharMapCJK ('\u3238', ref category);
3385 AddCharMapCJK ('\u32A2', ref category);
3387 // Especially this mapping order totally does
3388 // not make sense to me.
3389 AddCharMapCJK ('\u32A9', ref category);
3391 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
// Walks every decomposition type value from 0 up to 0x12
// (DecompositionCanonical).
3394 for (byte weight = 0; weight <= 0x12; weight++) {
3395 object wv = nfkd [weight];
3400 // Special: they are ignored in this area.
3401 // FIXME: check if it is sane
3402 if (0xF900 <= w && w <= 0xFAD9)
3404 // LAMESPEC: on Windows some of CJK characters
3405 // in 3200-32B0 are incorrectly mapped. They
3406 // mix Chinese and Japanese Kanji when
3407 // ordering those characters.
// These are the same six code points that are added explicitly above.
3409 case 0x32A2: case 0x3298: case 0x3238:
3410 case 0x32A9: case 0x323B: case 0x32AB:
3414 AddCharMapCJK ((char) w, ref category);
3418 // For now it is only for 0x7 category.
// Maps c and every still-undefined character whose decomposition is
// exactly the single character c. Three full scans of the code point
// range are made:
//   1. DecompositionSmall forms (sets updateWeight),
//   2. Sub / Super / Wide / Narrow forms,
//   3. after c itself has been added: the scan lists Wide, Narrow, Small,
//      Sub and Super as cases and adds the character with updateCount.
// NOTE(review): the argument tails of the first two AddCharMap calls, the
// case bodies of the third switch and the use of updateWeight are on
// lines not visible in this excerpt.
// NOTE(review): all three scans stop at c2 < char.MaxValue, so U+FFFF is
// never examined -- confirm this is intended.
3419 private void AddCharMapGroup2 (char c, byte category, byte updateCount, byte level2)
3421 if (map [(int) c].Defined)
3424 bool updateWeight = false;
3425 // Process in advance (lower primary weight)
3426 for (int c2 = 0; c2 < char.MaxValue; c2++) {
3427 if (!map [c2].Defined &&
3428 decompLength [c2] == 1 &&
3429 (int) (decompValues [decompIndex [c2]]) == (int) c) {
3430 switch (decompType [c2]) {
3431 case DecompositionSmall:
3432 updateWeight = true;
3433 AddCharMap ((char) c2, category,
// The explicit byte cast is needed because byte + byte yields int.
3440 fillIndex [category] = (byte)
3441 (fillIndex [category] + updateCount);
3444 for (int c2 = 0; c2 < char.MaxValue; c2++) {
3445 if (!map [c2].Defined &&
3446 decompLength [c2] == 1 &&
3447 (int) (decompValues [decompIndex [c2]]) == (int) c) {
3448 switch (decompType [c2]) {
3449 case DecompositionSub:
3450 case DecompositionSuper:
3451 case DecompositionWide:
3452 case DecompositionNarrow:
3453 AddCharMap ((char) c2, category,
// The base character itself.
3461 AddCharMap (c, category, updateCount, level2);
3463 // Since nfkdMap is problematic to have two or more
3464 // NFKD to an identical character, here I iterate all.
3465 for (int c2 = 0; c2 < char.MaxValue; c2++) {
3466 if (!map [c2].Defined &&
3467 decompLength [c2] == 1 &&
3468 (int) (decompValues [decompIndex [c2]]) == (int) c) {
3469 switch (decompType [c2]) {
3470 case DecompositionWide:
3471 case DecompositionNarrow:
3472 case DecompositionSmall:
3473 case DecompositionSub:
3474 case DecompositionSuper:
3477 AddCharMap ((char) c2, category, updateCount, level2);
// Maps c, then every code point whose decomposition ends with c, and
// finally advances fillIndex [category] by updateCount (1).
// NOTE(review): the declarations of "category" and "level2", and the
// remaining arguments of the inner AddCharMap call, are on lines not
// visible in this excerpt.
3484 private void AddArabicCharMap (char c)
3487 byte updateCount = 1;
3491 AddCharMap (c, category, 0, level2);
3493 // Since nfkdMap is problematic to have two or more
3494 // NFKD to an identical character, here I iterate all.
// NOTE(review): the scan stops before U+FFFF (c2 < char.MaxValue).
3495 for (int c2 = 0; c2 < char.MaxValue; c2++) {
// Tests for "no decomposition".
3496 if (decompLength [c2] == 0)
// Index of the last character of c2's decomposition.
3498 int idx = decompIndex [c2] + decompLength [c2] - 1;
3499 if ((int) (decompValues [idx]) == (int) c)
3500 AddCharMap ((char) c2, category,
3503 fillIndex [category] += updateCount;
// Shorthand for ToDecomposed (c, DecompositionSmall, false).
3506 char ToSmallForm (char c)
3508 return ToDecomposed (c, DecompositionSmall, false);
// Returns one character of c's decomposition, provided the decomposition
// type of c is d.
// c    : character to look up.
// d    : required decomposition type (one of the Decomposition* constants).
// tail : NOTE(review): presumably selects the last decomposition
//        character instead of the first; the condition guarding the
//        index adjustment is not visible in this excerpt.
// NOTE(review): the value returned when the type does not match is on a
// line not visible here.
3511 char ToDecomposed (char c, byte d, bool tail)
3513 if (decompType [(int) c] != d)
3515 int idx = decompIndex [(int) c];
// Moves idx to the last character of the decomposition.
3517 idx += decompLength [(int) c] - 1;
3518 return (char) decompValues [idx];
// Scans the jisJapanese list.
// NOTE(review): presumably reports whether an entry for code point cp
// exists; the comparison and return statements are not visible in this
// excerpt.
3521 bool ExistsJIS (int cp)
3523 foreach (JISCharacter j in jisJapanese)
3531 #region Level 3 properties (Case/Width)
// Returns the raw level 3 weight of c shifted up by 2, except that a raw
// weight of 0 is returned unchanged.
3533 private byte ComputeLevel3Weight (char c)
3535 byte b = ComputeLevel3WeightRaw (c);
3536 return b > 0 ? (byte) (b + 2) : b;
// Computes the raw level 3 weight of c from hard-coded code point ranges,
// the decomposition type of c and the isSmallCapital / isUppercase tables.
// NOTE(review): the values returned by each test are on lines not visible
// in this excerpt, so only the conditions are documented here.
3539 private byte ComputeLevel3WeightRaw (char c) // add 2 for sortkey value
3542 if ('\u3192' <= c && c <= '\u319F')
3545 // They have <narrow> NFKD mapping, and on Windows
3546 // those narrow characters are regarded as "normal",
3547 // thus those characters themselves are regarded as
3548 // "wide". grep "<narrow>" and you can pick them up
3549 // (ignoring Kana, Hangul etc.)
3566 if ('\u11A8' <= c && c <= '\u11F9')
3568 if ('\uFFA0' <= c && c <= '\uFFDC')
3570 if ('\u3130' <= c && c <= '\u3164')
3572 if ('\u3165' <= c && c <= '\u318E')
3574 // Georgian Capital letters
3575 if ('\u10A0' <= c && c <= '\u10C5')
3578 if ('\u2776' <= c && c <= '\u277F')
3580 if ('\u2780' <= c && c <= '\u2789')
// NOTE(review): this range overlaps the two tests above; only
// U+278A..U+2793 can still reach it if those tests return. Verify whether
// the lower bound was meant to be U+278A.
3582 if ('\u2776' <= c && c <= '\u2793')
3584 if ('\u2160' <= c && c <= '\u216F')
3586 if ('\u2181' <= c && c <= '\u2182')
3589 if ('\u2135' <= c && c <= '\u2138')
// For these two ranges the weight is looked up by the code point
// value modulo 4.
3591 byte [] arabicTmp = new byte [] {0x18, 0, 0x8, 0x10};
3592 if ('\uFEB5' <= c && c < '\uFEED' ||
3593 '\uFEF1' <= c && c < '\uFEF5')
3594 return arabicTmp [c % 4];
3595 if ('\uFE80' <= c && c < '\uFF00') {
3596 // 2(Isolated)/8(Final)/0x18(Medial)
3597 switch (decompType [(int) c]) {
3598 case DecompositionIsolated:
3600 case DecompositionFinal:
3602 case DecompositionMedial:
3607 // actually I dunno the reason why they have weights.
// For <wide>, <sub> and <super> the decomposition type value itself is
// OR-ed into the result.
3637 switch (decompType [(int) c]) {
3638 case DecompositionWide: // <wide>
3639 case DecompositionSub: // <sub>
3640 case DecompositionSuper: // <super>
3641 ret |= decompType [(int) c];
3644 if (isSmallCapital [(int) c]) // grep "SMALL CAPITAL"
3646 if (isUppercase [(int) c]) // DerivedCoreProperties
// Variant of IsIgnorable() that consults the unicodeAge table (compared
// against 3.1) and the Unicode category of i.
// NOTE(review): a second IsIgnorable (int) with the same signature follows
// in this file, so one of the two is presumably excluded from compilation
// (conditional directive or comment) on lines not visible in this excerpt;
// the return statements are not visible either.
3656 static bool IsIgnorable (int i)
3658 if (unicodeAge [i] >= 3.1)
3660 switch (char.GetUnicodeCategory ((char) i)) {
3661 case UnicodeCategory.OtherNotAssigned:
3662 case UnicodeCategory.Format:
3669 // FIXME: In the future use DerivedAge.txt to examine character
3670 // versions and set those ones that have higher version than
3671 // 1.0 as ignorable.
// Classifies code point i using, in order: a list of individual code
// points, a second list described as exceptions to the range test, a long
// list of hard-coded ranges, and finally the Unicode category of i.
// NOTE(review): the statements that produce the result for each group are
// on lines not visible in this excerpt; confirm the returned values
// against the full file.
3672 static bool IsIgnorable (int i)
3676 // I guess, those characters were added between
3677 // Unicode 1.0 (LCMapString) and Unicode 3.1
3678 // (UnicodeCategory), so they used to be
3679 // something like OtherNotAssigned as of Unicode 1.1.
3680 case 0x2df: case 0x387:
3681 case 0x3d7: case 0x3d8: case 0x3d9:
3682 case 0x3f3: case 0x3f4: case 0x3f5: case 0x3f6:
3683 case 0x400: case 0x40d: case 0x450: case 0x45d:
3684 case 0x587: case 0x58a: case 0x5c4: case 0x640:
3685 case 0x653: case 0x654: case 0x655: case 0x66d:
3687 case 0x1e9b: case 0x202f: case 0x20ad:
3688 case 0x20ae: case 0x20af:
3689 case 0x20e2: case 0x20e3:
3690 case 0x2139: case 0x213a: case 0x2183:
3691 case 0x2425: case 0x2426: case 0x2619:
3692 case 0x2670: case 0x2671: case 0x3007:
3693 case 0x3190: case 0x3191:
3694 case 0xfffc: case 0xfffd:
3696 // exceptional characters filtered by the
3697 // following conditions. Originally those exceptional
3698 // ranges are incorrect (they should not be ignored)
3699 // and most of those characters are unfortunately in
3701 case 0x4d8: case 0x4d9:
3702 case 0x4e8: case 0x4e9:
3704 case 0x3036: case 0x303f:
3705 case 0x337b: case 0xfb1e:
3710 // The whole Sinhala characters.
3711 0x0D82 <= i && i <= 0x0DF4
3712 // The whole Tibetan characters.
3713 || 0x0F00 <= i && i <= 0x0FD1
3714 // The whole Myanmar characters.
3715 || 0x1000 <= i && i <= 0x1059
3716 // The whole Ethiopic, Cherokee,
3717 // Canadian Syllabic, Ogham, Runic,
3718 // Tagalog, Hanunoo, Philippine,
3719 // Buhid, Tagbanwa, Khmer and Mongolian
3721 || 0x1200 <= i && i <= 0x1DFF
3722 // Greek extension characters.
3723 || 0x1F00 <= i && i <= 0x1FFF
3724 // The whole Braille characters.
3725 || 0x2800 <= i && i <= 0x28FF
3726 // CJK radical characters.
3727 || 0x2E80 <= i && i <= 0x2EF3
3728 // Kangxi radical characters.
3729 || 0x2F00 <= i && i <= 0x2FD5
3730 // Ideographic description characters.
3731 || 0x2FF0 <= i && i <= 0x2FFB
3732 // Bopomofo letter and final
3733 || 0x31A0 <= i && i <= 0x31B7
3734 // White square with quadrant characters.
3735 || 0x25F0 <= i && i <= 0x25F7
3736 // Ideographic telegraph symbols.
3737 || 0x32C0 <= i && i <= 0x32CB
3738 || 0x3358 <= i && i <= 0x3370
3739 || 0x33E0 <= i && i <= 0x33FF
3740 // The whole YI characters.
3741 || 0xA000 <= i && i <= 0xA48C
3742 || 0xA490 <= i && i <= 0xA4C6
3743 // Armenian small ligatures
3744 || 0xFB13 <= i && i <= 0xFB17
3745 // hebrew, arabic, variation selector.
3746 || 0xFB1D <= i && i <= 0xFE2F
3747 // Arabic ligatures.
3748 || 0xFEF5 <= i && i <= 0xFEFC
3749 // FIXME: why are they excluded?
3750 || 0x01F6 <= i && i <= 0x01F9
3751 || 0x0218 <= i && i <= 0x0233
3752 || 0x02A9 <= i && i <= 0x02AD
3753 || 0x02EA <= i && i <= 0x02EE
3754 || 0x0349 <= i && i <= 0x036F
3755 || 0x0488 <= i && i <= 0x048F
3756 || 0x04D0 <= i && i <= 0x04FF
3757 || 0x0500 <= i && i <= 0x050F // actually it matters only for 2.0
3758 || 0x06D6 <= i && i <= 0x06ED
3759 || 0x06FA <= i && i <= 0x06FE
3760 || 0x2048 <= i && i <= 0x204D
3761 || 0x20e4 <= i && i <= 0x20ea
3762 || 0x213C <= i && i <= 0x214B
3763 || 0x21EB <= i && i <= 0x21FF
3764 || 0x22F2 <= i && i <= 0x22FF
3765 || 0x237B <= i && i <= 0x239A
3766 || 0x239B <= i && i <= 0x23CF
3767 || 0x24EB <= i && i <= 0x24FF
3768 || 0x2596 <= i && i <= 0x259F
3769 || 0x25F8 <= i && i <= 0x25FF
3770 || 0x2672 <= i && i <= 0x2689
3771 || 0x2768 <= i && i <= 0x2775
3772 || 0x27d0 <= i && i <= 0x27ff
3773 || 0x2900 <= i && i <= 0x2aff
3774 || 0x3033 <= i && i <= 0x303F
3775 || 0x31F0 <= i && i <= 0x31FF
3776 || 0x3250 <= i && i <= 0x325F
3777 || 0x32B1 <= i && i <= 0x32BF
3778 || 0x3371 <= i && i <= 0x337B
3779 || 0xFA30 <= i && i <= 0xFA6A
// Last resort: decide by the Unicode category reported by the runtime.
3783 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3785 case UnicodeCategory.PrivateUse:
3786 case UnicodeCategory.Surrogate:
3788 // ignored by nature
3789 case UnicodeCategory.Format:
3790 case UnicodeCategory.OtherNotAssigned:
3797 // To check IsIgnorable sanity, try the driver below under MS.NET.
3800 public static void Main ()
3802 for (int i = 0; i <= char.MaxValue; i++)
3803 Dump (i, IsIgnorable (i));
3806 static void Dump (int i, bool ignore)
3808 switch (Char.GetUnicodeCategory ((char) i)) {
3809 case UnicodeCategory.PrivateUse:
3810 case UnicodeCategory.Surrogate:
3811 return; // check nothing
3815 string s2 = new string ((char) i, 10);
3816 int ret = CultureInfo.InvariantCulture.CompareInfo.Compare (s1, s2, CompareOptions.IgnoreCase);
3817 if ((ret == 0) == ignore)
3819 Console.WriteLine ("{0} : {1:x} {2}", ignore ? "o" : "x", i, Char.GetUnicodeCategory ((char) i));
3822 #endregion // IsIgnorable
3824 #region IsIgnorableSymbol
// Classifies the UTF-16 code unit i for the "ignorable symbol" check.
// The sanity driver that follows this method compares the result against
// CompareInfo.Compare (..., CompareOptions.IgnoreSymbols) under MS.NET.
// i is cast to char throughout, so it is expected to be in 0..char.MaxValue.
3825 static bool IsIgnorableSymbol (int i)
// Code points accepted by IsIgnorable are tested before anything else.
3827 if (IsIgnorable (i))
// Explicit per-code-point exceptions, listed in several groups.
// NOTE(review): confirm which result each group yields.
3832 case 0x00b5: case 0x01C0: case 0x01C1:
3833 case 0x01C2: case 0x01C3: case 0x01F6:
3834 case 0x01F7: case 0x01F8: case 0x01F9:
3835 case 0x02D0: case 0x02EE: case 0x037A:
3836 case 0x03D7: case 0x03F3:
3837 case 0x0400: case 0x040d:
3838 case 0x0450: case 0x045d:
3839 case 0x048C: case 0x048D:
3840 case 0x048E: case 0x048F:
3841 case 0x0587: case 0x0640: case 0x06E5:
3842 case 0x06E6: case 0x06FA: case 0x06FB:
3843 case 0x06FC: case 0x093D: case 0x0950:
3844 case 0x1E9B: case 0x2139: case 0x3006:
3845 case 0x3033: case 0x3034: case 0x3035:
3846 case 0xFE7E: case 0xFE7F:
// U+16EE..U+16F0 lie in the Runic block.
3848 case 0x16EE: case 0x16EF: case 0x16F0:
3850 case 0x2183: // ROMAN NUMERAL REVERSED ONE HUNDRED
3851 case 0x3007: // IDEOGRAPHIC NUMBER ZERO
3852 case 0x3038: // HANGZHOU NUMERAL TEN
3853 case 0x3039: // HANGZHOU NUMERAL TWENTY
3854 case 0x303a: // HANGZHOU NUMERAL THIRTY
// Modifier letters and related spacing marks (U+02B9..U+02E9), plus the
// Japanese voiced / semi-voiced sound marks U+309B and U+309C.
3860 case 0x02B9: case 0x02BA: case 0x02C2:
3861 case 0x02C3: case 0x02C4: case 0x02C5:
3862 case 0x02C8: case 0x02CC: case 0x02CD:
3863 case 0x02CE: case 0x02CF: case 0x02D2:
3864 case 0x02D3: case 0x02D4: case 0x02D5:
3865 case 0x02D6: case 0x02D7: case 0x02DE:
3866 case 0x02E5: case 0x02E6: case 0x02E7:
3867 case 0x02E8: case 0x02E9:
3868 case 0x309B: case 0x309C:
3870 case 0x055A: // Armenian Apostrophe
3871 case 0x05C0: // Hebrew Punct
3872 case 0x0E4F: // Thai FONGMAN
3873 case 0x0E5A: // Thai ANGKHANKHU
3874 case 0x0E5B: // Thai KHOMUT
3876 case 0x09F2: // Bengali Rupee Mark
3877 case 0x09F3: // Bengali Rupee Sign
3879 case 0x221e: // INF.
// Range-based exceptions. Note the mixed bounds: the first and last
// ranges are half-open (upper bound excluded), the middle one is closed.
3888 if (0xFE70 <= i && i < 0xFE7C // ARABIC LIGATURES B
3890 || 0x0501 <= i && i <= 0x0510 // CYRILLIC KOMI
3891 || 0xFA30 <= i && i < 0xFA70 // CJK COMPAT
// From here on the decision is driven by the Unicode general category.
3896 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3898 case UnicodeCategory.Surrogate:
3899 return false; // inconsistent
// Combining / enclosing marks and private-use characters share one branch;
// the Arabic marks U+064B..U+0652 are singled out inside it.
3901 case UnicodeCategory.SpacingCombiningMark:
3902 case UnicodeCategory.EnclosingMark:
3903 case UnicodeCategory.NonSpacingMark:
3904 case UnicodeCategory.PrivateUse:
3906 if (0x064B <= i && i <= 0x0652) // Arabic
3910 case UnicodeCategory.Format:
3911 case UnicodeCategory.OtherNotAssigned:
// Ranges of enclosed / letterlike / CJK compatibility symbols. Within
// them, "use" is set to true only for code points that are NOT a letter
// or digit according to Char.IsLetterOrDigit.
3918 // latin in a circle
3919 0x249A <= i && i <= 0x24E9
3920 || 0x2100 <= i && i <= 0x2132
3922 || 0x3196 <= i && i <= 0x31A0
3924 || 0x3200 <= i && i <= 0x321C
3926 || 0x322A <= i && i <= 0x3243
3928 || 0x3260 <= i && i <= 0x32B0
3929 || 0x32D0 <= i && i <= 0x3357
3930 || 0x337B <= i && i <= 0x33DD
3932 use = !Char.IsLetterOrDigit ((char) i);
3936 // This "Digit" rule is a mystery.
3937 // It filters some symbols out.
// Final chain of System.Char classification checks, evaluated in this
// order: letter-or-digit, number, control / separator / punctuation, symbol.
3938 if (Char.IsLetterOrDigit ((char) i))
3940 if (Char.IsNumber ((char) i))
3942 if (Char.IsControl ((char) i)
3943 || Char.IsSeparator ((char) i)
3944 || Char.IsPunctuation ((char) i))
3946 if (Char.IsSymbol ((char) i))
3949 // FIXME: should check more
3954 // To check IsIgnorableSymbol sanity, try the driver below under MS.NET.
3956 public static void Main ()
3958 CompareInfo ci = CultureInfo.InvariantCulture.CompareInfo;
3959 for (int i = 0; i <= char.MaxValue; i++) {
3960 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3961 if (uc == UnicodeCategory.Surrogate)
3964 bool ret = IsIgnorableSymbol (i);
3966 string s1 = "TEST ";
3967 string s2 = "TEST " + (char) i;
3969 int result = ci.Compare (s1, s2, CompareOptions.IgnoreSymbols);
3971 if (ret != (result == 0))
3972 Console.WriteLine ("{0} : {1:x}[{2}]({3})",
3973 ret ? "should not ignore" :
// Classifies the UTF-16 code unit i for the "ignorable non-spacing" check.
// Explicit code points and ranges are examined first; anything that reaches
// the final statement is judged purely by its Unicode general category.
3982 static bool IsIgnorableNonSpacing (int i)
// Code points accepted by IsIgnorable are tested before anything else.
3984 if (IsIgnorable (i))
// First group of explicit code points.
// NOTE(review): confirm which result this group yields.
3988 case 0x02C8: case 0x02DE: case 0x0559: case 0x055A:
3989 case 0x05C0: case 0x0ABD: case 0x0CD5: case 0x0CD6:
3990 case 0x309B: case 0x309C: case 0xFF9E: case 0xFF9F:
// Second group of explicit code points (mostly Indic vowel signs and
// similar marks). NOTE(review): confirm which result this group yields.
3992 case 0x02D0: case 0x0670: case 0x0901: case 0x0902:
3993 case 0x094D: case 0x0962: case 0x0963: case 0x0A41:
3994 case 0x0A42: case 0x0A47: case 0x0A48: case 0x0A4B:
3995 case 0x0A4C: case 0x0A81: case 0x0A82: case 0x0B82:
3996 case 0x0BC0: case 0x0CBF: case 0x0CC6: case 0x0CCC:
3997 case 0x0CCD: case 0x0E4E:
// First range list: spacing modifier letters and U+20DD..U+20E0.
4001 if (0x02b9 <= i && i <= 0x02c5
4002 || 0x02cc <= i && i <= 0x02d7
4003 || 0x02e4 <= i && i <= 0x02ef
4004 || 0x20DD <= i && i <= 0x20E0
// Second range list. The literal 0x00652 is simply 0x0652; the extra
// leading zero does not change its value.
4008 if (0x064B <= i && i <= 0x00652
4009 || 0x0941 <= i && i <= 0x0948
4010 || 0x0AC1 <= i && i <= 0x0ACD
4011 || 0x0C3E <= i && i <= 0x0C4F
4012 || 0x0E31 <= i && i <= 0x0E3F
// Default: the result is whether the character's general category is
// NonSpacingMark.
4016 return Char.GetUnicodeCategory ((char) i) ==
4017 UnicodeCategory.NonSpacingMark;
4020 // We can reuse IsIgnorableSymbol testcode
4021 // for IsIgnorableNonSpacing.
// Per-character map entry members: a category byte, a level 2 byte and a
// "defined" flag, plus the constructor taking category / level1 / level2.
4027 public byte Category;
4029 public byte Level2; // It is always single byte.
// NOTE(review): nothing visible here shows where Defined is assigned;
// confirm against the constructor body.
4030 public bool Defined;
// Stores the category argument in Category.
4032 public CharMapEntry (byte category, byte level1, byte level2)
4034 Category = category;
// Immutable pair of integers: CP and JIS. JISComparer orders instances by
// the JIS field.
4043 public readonly int CP;
4044 public readonly int JIS;
// Presumably cp initializes CP and cpJIS initializes JIS — TODO confirm
// against the constructor body.
4046 public JISCharacter (int cp, int cpJIS)
// Non-generic IComparer that orders JISCharacter values by their JIS field.
4053 class JISComparer : IComparer
// Shared comparer instance.
4055 public static readonly JISComparer Instance =
// Both arguments are cast to JISCharacter, so passing any other type fails
// at run time.
4058 public int Compare (object o1, object o2)
4060 JISCharacter j1 = (JISCharacter) o1;
4061 JISCharacter j2 = (JISCharacter) o2;
// Negative, zero or positive as j1.JIS is less than, equal to or greater
// than j2.JIS. Plain subtraction is only correct while the difference
// cannot overflow int.
4062 return j1.JIS - j2.JIS;
// Immutable pair of a code point (CP) and a name string. NonJISComparer
// orders instances by Name.
4066 class NonJISCharacter
4068 public readonly int CP;
4069 public readonly string Name;
// Presumably cp initializes CP and name initializes Name — TODO confirm
// against the constructor body.
4071 public NonJISCharacter (int cp, string name)
// Non-generic IComparer that orders NonJISCharacter values by Name using
// an ordinal (culture-independent) string comparison.
4078 class NonJISComparer : IComparer
// Shared comparer instance.
4080 public static readonly NonJISComparer Instance =
4081 new NonJISComparer ();
// Both arguments are cast to NonJISCharacter, so passing any other type
// fails at run time.
4083 public int Compare (object o1, object o2)
4085 NonJISCharacter j1 = (NonJISCharacter) o1;
4086 NonJISCharacter j2 = (NonJISCharacter) o2;
4087 return string.CompareOrdinal (j1.Name, j2.Name);
// Non-generic IComparer over DictionaryEntry items whose Value is a boxed
// decimal and whose Key is a boxed int.
4091 class DecimalDictionaryValueComparer : IComparer
// Shared instance; the private constructor prevents creating others from
// outside the class.
4093 public static readonly DecimalDictionaryValueComparer Instance
4094 = new DecimalDictionaryValueComparer ();
4096 private DecimalDictionaryValueComparer ()
// Both arguments are unboxed as DictionaryEntry. The values are compared
// first with Decimal.Compare.
4100 public int Compare (object o1, object o2)
4102 DictionaryEntry e1 = (DictionaryEntry) o1;
4103 DictionaryEntry e2 = (DictionaryEntry) o2;
4104 // FIXME: in case of 0, compare decomposition categories
4105 int ret = Decimal.Compare ((decimal) e1.Value, (decimal) e2.Value);
// The int keys are read next; presumably they break ties when the values
// compare equal — TODO confirm.
4108 int i1 = (int) e1.Key;
4109 int i2 = (int) e2.Key;
// Non-generic IComparer over DictionaryEntry items whose Value is a string
// and whose Key is a boxed int.
4114 class StringDictionaryValueComparer : IComparer
// Shared instance; the private constructor prevents creating others from
// outside the class.
4116 public static readonly StringDictionaryValueComparer Instance
4117 = new StringDictionaryValueComparer ();
4119 private StringDictionaryValueComparer ()
// Both arguments are unboxed as DictionaryEntry. The values are compared
// first as strings.
4123 public int Compare (object o1, object o2)
4125 DictionaryEntry e1 = (DictionaryEntry) o1;
4126 DictionaryEntry e2 = (DictionaryEntry) o2;
// NOTE(review): String.Compare (string, string) is culture-sensitive
// (it uses the current culture), so the resulting order may depend on the
// culture the generator runs under — confirm this is intended.
4127 int ret = String.Compare ((string) e1.Value, (string) e2.Value);
// The int keys are read next; presumably they break ties when the values
// compare equal — TODO confirm.
4130 int i1 = (int) e1.Key;
4131 int i2 = (int) e2.Key;
// Non-generic IComparer over boxed char values that compares the sort
// keys CollationElementTable reports for each character.
4136 class UCAComparer : IComparer
// Shared instance; the private constructor prevents creating others from
// outside the class.
4138 public static readonly UCAComparer Instance
4139 = new UCAComparer ();
4141 private UCAComparer ()
// Both arguments must be boxed chars; the unboxing casts fail otherwise.
4145 public int Compare (object o1, object o2)
4147 char i1 = (char) o1;
4148 char i2 = (char) o2;
// Only the sort keys both characters have in common are walked:
// l is the smaller of the two key counts.
4150 int l1 = CollationElementTable.GetSortKeyCount (i1);
4151 int l2 = CollationElementTable.GetSortKeyCount (i2);
4152 int l = l1 > l2 ? l2 : l1;
// For each shared key index, the differences are computed level by level
// in the order Primary, Secondary, Thirtiary, Quarternary.
// NOTE(review): presumably the first non-zero difference decides the
// result — TODO confirm.
4154 for (int i = 0; i < l; i++) {
4155 SortKeyValue k1 = CollationElementTable.GetSortKey (i1, i);
4156 SortKeyValue k2 = CollationElementTable.GetSortKey (i2, i);
4157 int v = k1.Primary - k2.Primary;
4160 v = k1.Secondary - k2.Secondary;
4163 v = k1.Thirtiary - k2.Thirtiary;
4166 v = k1.Quarternary - k2.Quarternary;
// Ordered list of tailoring map entries; every element added below
// implements ITailoringMap.
4179 ArrayList items = new ArrayList ();
// Two constructors: one taking only an lcid, one taking an lcid and an
// alias.
4181 public Tailoring (int lcid)
4186 public Tailoring (int lcid, int alias)
// Read-only accessors returning the lcid and alias fields.
4193 get { return lcid; }
4197 get { return alias; }
// Read/write flag backed by the frenchSort field.
4200 public bool FrenchSort {
4201 get { return frenchSort; }
4202 set { frenchSort = value; }
// Appends a DiacriticalMap (target, replace) entry to items.
4205 public void AddDiacriticalMap (byte target, byte replace)
4207 items.Add (new DiacriticalMap (target, replace));
// Appends a SortKeyMap (source, sortkey) entry to items.
4210 public void AddSortKeyMap (string source, byte [] sortkey)
4212 items.Add (new SortKeyMap (source, sortkey));
// Appends a ReplacementMap (source, replace) entry to items.
4215 public void AddReplacementMap (string source, string replace)
4217 items.Add (new ReplacementMap (source, replace));
// Serializes every entry, in insertion order, by concatenating the char
// arrays returned by each entry's ToCharArray () into one char [].
4220 public char [] ItemToCharArray ()
4222 ArrayList al = new ArrayList ();
4223 foreach (ITailoringMap m in items)
4224 al.AddRange (m.ToCharArray ());
4225 return al.ToArray (typeof (char)) as char [];
// Contract shared by the tailoring entries (DiacriticalMap, SortKeyMap,
// ReplacementMap): each can serialize itself to a char array.
4228 interface ITailoringMap
// Returns the entry's serialized form; the implementations in this file
// put a kind marker in element 0.
4230 char [] ToCharArray ();
// Tailoring entry holding a pair of bytes: Target and Replace.
4233 class DiacriticalMap : ITailoringMap
4235 public readonly byte Target;
4236 public readonly byte Replace;
// Presumably target / replace initialize Target / Replace — TODO confirm
// against the constructor body.
4238 public DiacriticalMap (byte target, byte replace)
// Serializes the entry as exactly three chars:
// [0] kind marker 2, [1] Target, [2] Replace.
4244 public char [] ToCharArray ()
4246 char [] ret = new char [3];
4247 ret [0] = (char) 02; // kind:DiacriticalMap
4248 ret [1] = (char) Target;
4249 ret [2] = (char) Replace;
// Tailoring entry that associates a source string with sort key bytes.
4254 class SortKeyMap : ITailoringMap
4256 public readonly string Source;
4257 public readonly byte [] SortKey;
// Presumably source / sortkey initialize Source / SortKey — TODO confirm
// against the constructor body.
4259 public SortKeyMap (string source, byte [] sortkey)
// Serializes the entry into Source.Length + 7 chars:
// [0] kind marker 1, [1 .. Source.Length] the chars of Source, and from
// [Source.Length + 2] the first four SortKey bytes.
4265 public char [] ToCharArray ()
4267 char [] ret = new char [Source.Length + 7];
4268 ret [0] = (char) 01; // kind:SortKeyMap
4269 for (int i = 0; i < Source.Length; i++)
4270 ret [i + 1] = Source [i];
// Element Source.Length + 1 is not written by the two loops, so it keeps
// the default '\0' unless assigned elsewhere (presumably a terminator for
// Source — TODO confirm).
// Exactly four bytes are copied, so SortKey must have at least four
// elements or the indexer throws.
4272 for (int i = 0; i < 4; i++)
4273 ret [i + Source.Length + 2] = (char) SortKey [i];
4278 class ReplacementMap : ITailoringMap
4280 public readonly string Source;
4281 public readonly string Replace;
4283 public ReplacementMap (string source, string replace)
4289 public char [] ToCharArray ()
4291 char [] ret = new char [Source.Length + Replace.Length + 3];
4292 ret [0] = (char) 03; // kind:ReplaceMap
4294 for (int i = 0; i < Source.Length; i++)
4295 ret [pos++] = Source [i];
4298 for (int i = 0; i < Replace.Length; i++)
4299 ret [pos++] = Replace [i];