3 // There are two kinds of sort keys: those that are computed and those that are laid out
4 // as an indexed array. Computed sort keys are:
9 // Also, for composite characters a different index table should be prepared.
11 // Though it is possible to "compute" level 3 weights, they are still dumped
12 // to an array to avoid execution cost.
16 // * sortkey getter signature
18 // int GetSortKey (string s, int index, SortKeyBuffer buf)
19 // Stores sort key for corresponding character element into buf and
20 // returns the length of the consumed _source_ character element in s.
22 // * character length to consume
24 // If there are characters whose primary weight is 0, they are consumed
25 // and considered as a part of the character element.
31 using System.Collections;
32 using System.Globalization;
36 namespace Mono.Globalization.Unicode
38 internal class MSCompatSortKeyTableGenerator
// Entry point: creates a generator instance and passes the command-line
// arguments straight through to Run.
40 public static void Main (string [] args)
42 new MSCompatSortKeyTableGenerator ().Run (args);
// Numeric codes identifying the kind of decomposition recorded for a
// code point. They are stored in decompType [] while UnicodeData lines are
// processed and are compared against in switch statements in this class.
// The "// fixed" markers are original annotations; presumably they mean the
// value must not be changed -- TODO confirm.
45 const int DecompositionWide = 1; // fixed
46 const int DecompositionSub = 2; // fixed
47 const int DecompositionSmall = 3;
48 const int DecompositionIsolated = 4;
49 const int DecompositionInitial = 5;
50 const int DecompositionFinal = 6;
51 const int DecompositionMedial = 7;
52 const int DecompositionNoBreak = 8;
53 const int DecompositionVertical = 9;
54 const int DecompositionFraction = 0xA;
55 const int DecompositionFont = 0xB;
56 const int DecompositionSuper = 0xC; // fixed
// NOTE(review): Full (0xE) is declared before Narrow (0xD); the values are
// distinct, only the declaration order is out of numeric sequence.
57 const int DecompositionFull = 0xE;
58 const int DecompositionNarrow = 0xD;
59 const int DecompositionCircle = 0xF;
60 const int DecompositionSquare = 0x10;
61 const int DecompositionCompat = 0x11;
62 const int DecompositionCanonical = 0x12;
// Destination for the generated C# table text (standard output by default).
64 TextWriter Result = Console.Out;
// 256 byte slots indexed by category; how the slots are advanced is not
// visible in this excerpt.
66 byte [] fillIndex = new byte [256]; // by category
// One entry per UTF-16 code unit. Category, Level1 and Level2 are read
// from these entries when the core tables are serialized.
67 CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1];
// Fixed list of code points. The name suggests they receive special
// "ignorable" handling, but the consumer is not visible in this excerpt.
69 char [] specialIgnore = new char [] {
70 '\u3099', '\u309A', '\u309B', '\u309C', '\u0BCD',
71 '\u0E47', '\u0E4C', '\uFF9E', '\uFF9F'
74 // FIXME: need more love (as always)
// 29 letters (A-Z plus three extra code points) ...
75 char [] alphabets = new char [] {'A', 'B', 'C', 'D', 'E', 'F',
76 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
77 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
78 '\u0292', '\u01BE', '\u0298'};
// ... and 29 weights. Presumably alphaWeights [i] is the weight assigned to
// alphabets [i] -- TODO confirm against the code that consumes them.
79 byte [] alphaWeights = new byte [] {
80 2, 9, 0xA, 0x1A, 0x21,
81 0x23, 0x25, 0x2C, 0x32, 0x35,
82 0x36, 0x48, 0x51, 0x70, 0x7C,
83 0x7E, 0x89, 0x8A, 0x91, 0x99,
84 0x9F, 0xA2, 0xA4, 0xA6, 0xA7,
85 0xA9, 0xAA, 0xB3, 0xB4};
// Per-code-point flags. isSmallCapital is set while processing UnicodeData
// lines that contain "SMALL CAPITAL"; isUppercase is set while processing
// DerivedCoreProperties lines.
87 bool [] isSmallCapital = new bool [char.MaxValue + 1];
88 bool [] isUppercase = new bool [char.MaxValue + 1];
// Decomposition info per code point: the Decomposition* code, the start
// index into the shared decomposition value list, and the number of values.
90 byte [] decompType = new byte [char.MaxValue + 1];
91 int [] decompIndex = new int [char.MaxValue + 1];
92 int [] decompLength = new int [char.MaxValue + 1];
// Numeric value per code point, parsed from the UnicodeData numeric fields.
94 decimal [] decimalValue = new decimal [char.MaxValue + 1];
// Diacritical weight per code point, accumulated by name matching.
96 byte [] diacritical = new byte [char.MaxValue + 1];
// Name fragments searched for in every UnicodeData line. When the line
// contains diacritics [d] (at an index greater than 0), diacriticWeights [d]
// is added to diacritical [cp]. The two arrays are index-aligned and
// ProcessUnidataLine throws if their lengths differ.
// Fragments ending in ';' only match where the fragment is immediately
// followed by the ';' field separator of the data line.
// NOTE(review): some lines of both initializers are not visible in this
// excerpt, so the entries cannot be paired up by eye here.
98 string [] diacritics = new string [] {
99 // LATIN, CYRILLIC etc.
100 "UPTURN", "DOUBLE-STRUCK",
101 "MIDDLE HOOK", "WITH VERTICAL LINE ABOVE;",
102 "WITH GRAVE ACCENT;", "WITH ACUTE ACCENT;", "WITH CIRCUMFLEX ACCENT;",
103 "WITH ACUTE;", "WITH GRAVE;", "WITH DOT ABOVE;", " MIDDLE DOT;",
104 "WITH CIRCUMFLEX;", "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;",
105 " DIALYTIKA AND TONOS;", "WITH MACRON;", "WITH TILDE;", "WITH RING ABOVE;",
106 "WITH OGONEK;", "WITH CEDILLA;",
108 " DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;",
109 "WITH STROKE;", " CIRCUMFLEX AND ACUTE;",
111 " DIAERESIS AND ACUTE;", "WITH CIRCUMFLEX AND GRAVE;", " L SLASH;",
112 " DIAERESIS AND GRAVE;",
114 " CARON AND DOT ABOVE;", " BREVE AND GRAVE;",
115 " MACRON AND ACUTE;",
116 " MACRON AND GRAVE;",
118 " DIAERESIS AND CARON", " DOT ABOVE AND MACRON", " TILDE AND ACUTE",
119 " RING ABOVE AND ACUTE",
120 " DIAERESIS AND MACRON", " CEDILLA AND ACUTE", " MACRON AND DIAERESIS",
121 " CIRCUMFLEX AND TILDE",
122 " TILDE AND DIAERESIS",
125 " CEDILLA AND BREVE",
126 " OGONEK AND MACRON",
129 "WITH HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;",
133 " PRECEDED BY APOSTROPHE",
135 " LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE",
138 " RETROFLEX;", "DIAERESIS BELOW",
141 " CIRCUMFLEX BELOW", "HORN AND ACUTE",
142 " BREVE BELOW;", " HORN AND GRAVE",
145 " DOT BELOW AND DOT ABOVE",
146 " RIGHT HALF RING", " HORN AND TILDE",
147 " CIRCUMFLEX AND DOT BELOW",
148 " BREVE AND DOT BELOW",
149 " DOT BELOW AND MACRON",
151 " HORN AND HOOK ABOVE",
153 // CIRCLED, PARENTHESIZED and so on
154 "CIRCLED DIGIT", "CIRCLED NUMBER", "CIRCLED LATIN",
155 "CIRCLED KATAKANA", "CIRCLED SANS-SERIF",
156 "PARENTHESIZED DIGIT", "PARENTHESIZED NUMBER", "PARENTHESIZED LATIN",
// Weights index-aligned with diacritics [] above.
158 byte [] diacriticWeights = new byte [] {
162 0xE, 0xF, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
163 0x17, 0x19, 0x1A, 0x1B, 0x1C,
165 0x1D, 0x1D, 0x1E, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F,
166 0x20, 0x21, 0x22, 0x22, 0x23, 0x24,
168 0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28,
169 0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30,
171 0x40, 0x43, 0x43, 0x43, 0x44, 0x46, 0x47, 0x48,
172 0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x5A,
174 0x60, 0x60, 0x61, 0x61, 0x63, 0x68, 0x68,
175 0x69, 0x69, 0x6A, 0x6D, 0x6E,
177 // CIRCLED, PARENTHESIZED and so on.
178 0xEE, 0xEE, 0xEE, 0xEE, 0xEE,
// Code point boundaries used for the secondary weight of numbers.
// Presumably consecutive values form (start, end) range pairs -- TODO
// confirm against the consumer, which is not visible in this excerpt.
182 int [] numberSecondaryWeightBounds = new int [] {
183 0x660, 0x680, 0x6F0, 0x700, 0x960, 0x970,
184 0x9E0, 0x9F0, 0x9F4, 0xA00, 0xA60, 0xA70,
185 0xAE0, 0xAF0, 0xB60, 0xB70, 0xBE0, 0xC00,
186 0xC60, 0xC70, 0xCE0, 0xCF0, 0xD60, 0xD70,
187 0xE50, 0xE60, 0xED0, 0xEE0
// Per-script character lists. ParseScripts fills them after sorting the
// collected characters with UCAComparer.Instance.
190 char [] orderedGurmukhi;
191 char [] orderedGujarati;
192 char [] orderedGeorgian;
193 char [] orderedThaana;
// Hand-maintained ordering of Tamil consonants (see the original notes below).
195 static readonly char [] orderedTamilConsonants = new char [] {
196 // based on traditional Tamil consonants, except for
197 // Grantha (where Microsoft breaks traditionalism).
198 // http://www.angelfire.com/empire/thamizh/padanGaL
199 '\u0B95', '\u0B99', '\u0B9A', '\u0B9E', '\u0B9F',
200 '\u0BA3', '\u0BA4', '\u0BA8', '\u0BAA', '\u0BAE',
201 '\u0BAF', '\u0BB0', '\u0BB2', '\u0BB5', '\u0BB4',
202 '\u0BB3', '\u0BB1', '\u0BA9', '\u0B9C', '\u0BB8',
// Working collections filled while the source data files are parsed.
205 // cp -> character name (only for some characters)
// Holds DictionaryEntry (cp, name) items added by ProcessUnidataLine.
206 ArrayList sortableCharNames = new ArrayList ();
208 // cp -> arrow value (int)
209 ArrayList arrowValues = new ArrayList ();
211 // cp -> box value (int)
212 ArrayList boxValues = new ArrayList ();
214 // cp -> level1 value
215 Hashtable arabicLetterPrimaryValues = new Hashtable ();
// Arabic primary letter name -> code point that was stored for that name.
218 Hashtable arabicNameMap = new Hashtable ();
220 // cp -> Hashtable [decompType] -> cp
221 Hashtable nfkdMap = new Hashtable ();
223 // Latin letter -> ArrayList [int]
224 Hashtable latinMap = new Hashtable ();
// Japanese characters split by JIS membership. nonJisJapanese receives
// NonJISCharacter items for square letters for which ExistsJIS is false.
226 ArrayList jisJapanese = new ArrayList ();
227 ArrayList nonJisJapanese = new ArrayList ();
// CJK weight tables, one slot per UTF-16 code unit. The trailing comments
// are original leftovers of an earlier, offset-based sizing.
229 ushort [] cjkJA = new ushort [char.MaxValue +1];// - 0x4E00];
230 ushort [] cjkCHS = new ushort [char.MaxValue +1];// - 0x3100];
231 ushort [] cjkCHT = new ushort [char.MaxValue +1];// - 0x4E00];
232 ushort [] cjkKO = new ushort [char.MaxValue +1];// - 0x4E00];
233 byte [] cjkKOlv2 = new byte [char.MaxValue +1];// - 0x4E00];
// Per-code-point flag byte; compressed and dumped with the core tables.
235 byte [] ignorableFlags = new byte [char.MaxValue + 1];
// Unicode version in which each code point first appeared, filled from
// DerivedAge.txt by ParseDerivedAge. Static, unlike the other tables.
237 static double [] unicodeAge = new double [char.MaxValue + 1];
// Tailoring objects; iterated when the tailoring tables are serialized.
239 ArrayList tailorings = new ArrayList ();
// Drives the generator: parses the source data, post-processes it, and
// writes a diagnostic log (agelog.txt) of per-code-point state.
// args [0], when present, is the directory holding the source data files;
// otherwise "downloaded" is used.
// NOTE(review): the calls between the progress messages are not all visible
// in this excerpt.
241 void Run (string [] args)
243 string dirname = args.Length == 0 ? "downloaded" : args [0];
244 ParseSources (dirname);
245 Console.Error.WriteLine ("parse done.");
247 ModifyParsedValues ();
249 Console.Error.WriteLine ("generation done.");
251 Console.Error.WriteLine ("serialization done.");
// NOTE(review): no Close/Dispose/using for sw is visible in this excerpt --
// confirm the writer is flushed and closed.
253 StreamWriter sw = new StreamWriter ("agelog.txt");
// NOTE(review): `i < char.MaxValue` stops before U+FFFF, while the tables
// have char.MaxValue + 1 slots.
254 for (int i = 0; i < char.MaxValue; i++) {
// shouldBe is the ignorable state expected from the Unicode category:
// true for Format and OtherNotAssigned.
255 bool shouldBe = false;
256 switch (Char.GetUnicodeCategory ((char) i)) {
257 case UnicodeCategory.Format: case UnicodeCategory.OtherNotAssigned:
258 shouldBe = true; break;
// The statement guarded by this age check is not visible in this excerpt.
260 if (unicodeAge [i] >= 3.1)
262 //if (IsIgnorable (i) != shouldBe)
// Columns: age, IsIgnorable, IsIgnorableSymbol, code point (hex), category,
// and '!' when IsIgnorable disagrees with shouldBe.
263 sw.WriteLine ("{1} {2} {3} {0:X04} {4} {5}", i, unicodeAge [i], IsIgnorable (i), IsIgnorableSymbol (i), char.GetUnicodeCategory ((char) i), IsIgnorable (i) != shouldBe ? '!' : ' ');
// Typed convenience wrapper: forwards to CodePointIndexer.CompressArray with
// element type byte and casts the result back to byte [].
// source: the full table; i: the indexer passed through to CompressArray.
269 byte [] CompressArray (byte [] source, CodePointIndexer i)
271 return (byte []) CodePointIndexer.CompressArray (
272 source, typeof (byte), i);
// Typed convenience wrapper: forwards to CodePointIndexer.CompressArray with
// element type ushort and casts the result back to ushort [].
// source: the full table; i: the indexer passed through to CompressArray.
275 ushort [] CompressArray (ushort [] source, CodePointIndexer i)
277 return (ushort []) CodePointIndexer.CompressArray (
278 source, typeof (ushort), i);
// NOTE(review): the method header for this body is not visible in this
// excerpt. The statements below emit the tailorings, then the core collation
// tables, both as C# array source text (to Result) and as a binary file, and
// finally the CJK tables.
284 SerializeTailorings ();
// Flatten map [] into one array per weight level, one slot per code unit.
286 byte [] categories = new byte [map.Length];
287 byte [] level1 = new byte [map.Length];
288 byte [] level2 = new byte [map.Length];
289 byte [] level3 = new byte [map.Length];
290 ushort [] widthCompat = new ushort [map.Length];
291 for (int i = 0; i < map.Length; i++) {
292 categories [i] = map [i].Category;
293 level1 [i] = map [i].Level1;
294 level2 [i] = map [i].Level2;
// Level 3 is computed per character rather than read from map [].
295 level3 [i] = ComputeLevel3Weight ((char) i);
296 // For Japanese Half-width characters, don't
297 // map widthCompat. It is IgnoreKanaType that
298 // handles those width differences.
// The statement guarded by this range check is not visible in this excerpt.
299 if (0xFF6D <= i && i <= 0xFF9D)
// Only narrow/wide/super/sub decompositions contribute a widthCompat entry:
// the first decomposition value of the character.
301 switch (decompType [i]) {
302 case DecompositionNarrow:
303 case DecompositionWide:
304 case DecompositionSuper:
305 case DecompositionSub:
306 // they are always 1 char
307 widthCompat [i] = (ushort) decompValues [decompIndex [i]];
// Run every table through CodePointIndexer.CompressArray with its matching
// indexer. cjkCHS has its own indexer; the other CJK tables share
// MSCompatUnicodeTableUtil.Cjk.
313 ignorableFlags = CompressArray (ignorableFlags,
314 MSCompatUnicodeTableUtil.Ignorable);
315 categories = CompressArray (categories,
316 MSCompatUnicodeTableUtil.Category);
317 level1 = CompressArray (level1,
318 MSCompatUnicodeTableUtil.Level1);
319 level2 = CompressArray (level2,
320 MSCompatUnicodeTableUtil.Level2);
321 level3 = CompressArray (level3,
322 MSCompatUnicodeTableUtil.Level3);
323 widthCompat = (ushort []) CodePointIndexer.CompressArray (
324 widthCompat, typeof (ushort),
325 MSCompatUnicodeTableUtil.WidthCompat);
326 cjkCHS = CompressArray (cjkCHS,
327 MSCompatUnicodeTableUtil.CjkCHS);
328 cjkCHT = CompressArray (cjkCHT,
329 MSCompatUnicodeTableUtil.Cjk);
330 cjkJA = CompressArray (cjkJA,
331 MSCompatUnicodeTableUtil.Cjk);
332 cjkKO = CompressArray (cjkKO,
333 MSCompatUnicodeTableUtil.Cjk);
334 cjkKOlv2 = CompressArray (cjkKOlv2,
335 MSCompatUnicodeTableUtil.Cjk);
// Each table section below follows the same pattern: write the element
// count and then every value to `binary`, and write the same values as C#
// array-literal text to Result. After every 16th element the text row is
// ended with a comment giving the index of the row's first element.
// Two text formats appear ("{0}," and "0x{0:X02},"); the condition that
// chooses between them is on lines not visible in this excerpt.
338 Result.WriteLine ("internal static readonly byte [] ignorableFlags = new byte [] {");
// A single MemoryStream collects all core tables back to back.
340 MemoryStream ms = new MemoryStream ();
341 BinaryWriter binary = new BinaryWriter (ms);
342 binary.Write (ignorableFlags.Length);
344 for (int i = 0; i < ignorableFlags.Length; i++) {
345 byte value = ignorableFlags [i];
347 Result.Write ("{0},", value);
349 Result.Write ("0x{0:X02},", value);
351 binary.Write (value);
353 if ((i & 0xF) == 0xF)
354 Result.WriteLine ("// {0:X04}", i - 0xF);
356 Result.WriteLine ("};");
// Category table.
360 Result.WriteLine ("internal static readonly byte [] categories = new byte [] {");
362 binary.Write (categories.Length);
364 for (int i = 0; i < categories.Length; i++) {
365 byte value = categories [i];
367 Result.Write ("{0},", value);
369 Result.Write ("0x{0:X02},", value);
371 binary.Write (value);
373 if ((i & 0xF) == 0xF)
374 Result.WriteLine ("// {0:X04}", i - 0xF);
376 Result.WriteLine ("};");
379 // Primary weight value
380 Result.WriteLine ("internal static readonly byte [] level1 = new byte [] {");
382 binary.Write (level1.Length);
384 for (int i = 0; i < level1.Length; i++) {
385 byte value = level1 [i];
387 Result.Write ("{0},", value);
389 Result.Write ("0x{0:X02},", value);
391 binary.Write (value);
393 if ((i & 0xF) == 0xF)
394 Result.WriteLine ("// {0:X04}", i - 0xF);
396 Result.WriteLine ("};");
// Level 2 table.
400 Result.WriteLine ("internal static readonly byte [] level2 = new byte [] {");
402 binary.Write (level2.Length);
404 for (int i = 0; i < level2.Length; i++) {
405 byte value = level2 [i];
407 Result.Write ("{0},", value);
409 Result.Write ("0x{0:X02},", value);
411 binary.Write (value);
413 if ((i & 0xF) == 0xF)
414 Result.WriteLine ("// {0:X04}", i - 0xF);
416 Result.WriteLine ("};");
// Level 3 table.
420 Result.WriteLine ("internal static readonly byte [] level3 = new byte [] {");
422 binary.Write (level3.Length);
424 for (int i = 0; i < level3.Length; i++) {
425 byte value = level3 [i];
427 Result.Write ("{0},", value);
429 Result.Write ("0x{0:X02},", value);
431 binary.Write (value);
433 if ((i & 0xF) == 0xF)
434 Result.WriteLine ("// {0:X04}", i - 0xF);
436 Result.WriteLine ("};");
439 // Width insensitivity mappings
440 // (for now it is more lightweight than dumping the
441 // entire NFKD table).
442 Result.WriteLine ("internal static readonly ushort [] widthCompat = new ushort [] {");
444 binary.Write (widthCompat.Length);
446 for (int i = 0; i < widthCompat.Length; i++) {
447 ushort value = widthCompat [i];
449 Result.Write ("{0},", value);
// NOTE(review): values are ushort but the format is X02 (SerializeCJK uses
// X04 for ushort). X02 is only a minimum width, so the text is still valid.
451 Result.Write ("0x{0:X02},", value);
453 binary.Write (value);
455 if ((i & 0xF) == 0xF)
456 Result.WriteLine ("// {0:X04}", i - 0xF);
458 Result.WriteLine ("};");
// Persist the accumulated binary form of all core tables.
461 using (FileStream fs = File.Create ("../collation.core.bin")) {
462 byte [] array = ms.ToArray ();
463 fs.Write (array, 0, array.Length);
// CJK tables go to their own files; the last argument is the index at which
// SerializeCJK applies its `max` check.
468 SerializeCJK ("cjkCHS", cjkCHS, char.MaxValue);
469 SerializeCJK ("cjkCHT", cjkCHT, 0x9FB0);
470 SerializeCJK ("cjkJA", cjkJA, 0x9FB0);
471 SerializeCJK ("cjkKO", cjkKO, 0x9FB0);
472 SerializeCJK ("cjkKOlv2", cjkKOlv2, 0x9FB0);
// Dumps one ushort CJK table as C# array source text (to Result) and as a
// binary file named "../collation.<name>.bin".
// name: identifier used in the generated source and in the file name.
// cjk:  the table to dump.
// max:  index at which the loop takes a special action; that action is on a
//       line not visible in this excerpt (presumably it stops the dump --
//       TODO confirm).
475 void SerializeCJK (string name, ushort [] cjk, int max)
// offset is always 0 now; the trailing comment is a leftover of an earlier
// offset-based layout.
477 int offset = 0;//char.MaxValue - cjk.Length;
478 Result.WriteLine ("static ushort [] {0} = new ushort [] {{", name);
480 MemoryStream ms = new MemoryStream ();
481 BinaryWriter binary = new BinaryWriter (ms);
// The binary form starts with the element count.
482 binary.Write (cjk.Length);
484 for (int i = 0; i < cjk.Length; i++) {
485 if (i + offset == max)
487 ushort value = cjk [i];
// Two text formats appear; the condition choosing between them is not
// visible in this excerpt.
489 Result.Write ("{0},", value);
491 Result.Write ("0x{0:X04},", value);
493 binary.Write (value);
// End each row of 16 values with a comment giving the row's first index.
495 if ((i & 0xF) == 0xF)
496 Result.WriteLine ("// {0:X04}", i - 0xF + offset);
498 Result.WriteLine ("};");
501 using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) {
502 byte [] array = ms.ToArray ();
503 fs.Write (array, 0, array.Length);
// Byte-table counterpart of the ushort overload: dumps the table as C#
// array source text (to Result) and as "../collation.<name>.bin".
// name: identifier used in the generated source and in the file name.
// cjk:  the table to dump.
// max:  index at which the loop takes a special action; that action is on a
//       line not visible in this excerpt.
508 void SerializeCJK (string name, byte [] cjk, int max)
510 int offset = 0;//char.MaxValue - cjk.Length;
511 Result.WriteLine ("static byte [] {0} = new byte [] {{", name);
513 MemoryStream ms = new MemoryStream ();
514 BinaryWriter binary = new BinaryWriter (ms);
// NOTE(review): unlike the ushort overload, no `binary.Write (cjk.Length)`
// is visible here; it may be on a line missing from this excerpt -- confirm
// that both binary formats carry the length prefix.
516 for (int i = 0; i < cjk.Length; i++) {
517 if (i + offset == max)
519 byte value = cjk [i];
521 Result.Write ("{0},", value);
523 Result.Write ("0x{0:X02},", value);
525 binary.Write (value);
// End each row of 16 values with a comment giving the row's first index.
527 if ((i & 0xF) == 0xF)
528 Result.WriteLine ("// {0:X04}", i - 0xF + offset);
530 Result.WriteLine ("};");
533 using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) {
534 byte [] array = ms.ToArray ();
535 fs.Write (array, 0, array.Length);
// Emits all tailorings in two parts, each as C# source text (to Result) and
// in binary form:
//   1. one flat char array holding the items of every tailoring, and
//   2. a TailoringInfo table giving, per LCID, the start index and length
//      inside that array plus the French-sort flag.
// The binary result is written to "../collation.tailoring.bin".
// Throws Exception when a tailoring's target LCID has no recorded index.
540 void SerializeTailorings ()
// LCID -> start index in the flat array, and LCID -> number of chars.
542 Hashtable indexes = new Hashtable ();
543 Hashtable counts = new Hashtable ();
544 Result.WriteLine ("static char [] tailorings = new char [] {");
547 MemoryStream ms = new MemoryStream ();
548 BinaryWriter binary = new BinaryWriter (ms);
// `count` (the running char index) is declared on a line not visible here.
550 foreach (Tailoring t in tailorings) {
553 Result.Write ("/*{0}*/", t.LCID);
554 indexes.Add (t.LCID, count);
555 char [] values = t.ItemToCharArray ();
556 counts.Add (t.LCID, values.Length);
557 foreach (char c in values) {
558 Result.Write ("'\\x{0:X}', ", (int) c);
559 if (++count % 16 == 0)
560 Result.WriteLine (" // {0:X04}", count - 16);
// Each char is stored as a 2-byte ushort in the binary form.
562 binary.Write ((ushort) c);
566 Result.WriteLine ("};");
568 Result.WriteLine ("static TailoringInfo [] tailoringInfos = new TailoringInfo [] {");
// Keep the char data aside and start a fresh stream: the final file holds
// the info table first and the char data after it.
570 byte [] rawdata = ms.ToArray ();
571 ms = new MemoryStream ();
572 binary = new BinaryWriter (ms);
573 binary.Write (tailorings.Count);
575 foreach (Tailoring t in tailorings) {
// An alias tailoring reuses the data of the LCID it points to.
576 int target = t.Alias != 0 ? t.Alias : t.LCID;
577 if (!indexes.ContainsKey (target)) {
// NOTE(review): the message says "WARNING" although this aborts by throwing.
578 throw new Exception (String.Format ("WARNING: no corresponding definition for tailoring alias. From {0} to {1}", t.LCID, t.Alias));
581 int idx = (int) indexes [target];
582 int cnt = (int) counts [target];
583 bool french = t.FrenchSort;
// NOTE(review): as visible here the lookup matches on t.LCID rather than
// `target`, so it would find the tailoring itself; a guarding line may be
// missing from this excerpt -- confirm the intended alias behavior.
585 foreach (Tailoring t2 in tailorings)
586 if (t2.LCID == t.LCID)
587 french = t2.FrenchSort;
588 Result.WriteLine ("new TailoringInfo ({0}, 0x{1:X}, {2}, {3}), ", t.LCID, idx, cnt, french ? "true" : "false");
// Binary record: LCID ... french flag (the writes between these two are not
// visible in this excerpt).
590 binary.Write (t.LCID);
593 binary.Write (french);
596 Result.WriteLine ("};");
// Two 0xFF bytes, then the char count (byte length / 2, since every char
// was written as a ushort), then the char data itself.
598 binary.Write ((byte) 0xFF);
599 binary.Write ((byte) 0xFF);
600 binary.Write (rawdata.Length / 2);
601 binary.Write (rawdata, 0, rawdata.Length);
604 using (FileStream fs = File.Create ("../collation.tailoring.bin")) {
605 byte [] array = ms.ToArray ();
606 fs.Write (array, 0, array.Length);
// Builds the paths of all input files below `dirname` and runs the
// individual parsers in a fixed order.
// dirname: directory that contains the Unicode data files and the
//          common/collation XML files.
// NOTE(review): several `string x =` declaration lines are not visible in
// this excerpt; only their right-hand sides are shown.
613 void ParseSources (string dirname)
616 dirname + "/UnicodeData.txt";
617 string derivedCoreProps =
618 dirname + "/DerivedCoreProperties.txt";
620 dirname + "/Scripts.txt";
622 dirname + "/CP932.TXT";
624 dirname + "/DerivedAge.txt";
625 string chXML = dirname + "/common/collation/zh.xml";
626 string jaXML = dirname + "/common/collation/ja.xml";
627 string koXML = dirname + "/common/collation/ko.xml";
// Age data is parsed first; the JIS order must be known before UnicodeData
// is processed (see the original note on the next call).
629 ParseDerivedAge (derivedAge);
633 ParseJISOrder (cp932); // in prior to ParseUnidata()
634 ParseUnidata (unidata);
636 ParseDerivedCoreProperties (derivedCoreProps);
637 ParseScripts (scripts);
638 ParseCJK (chXML, jaXML, koXML);
// The tailoring source is read relative to the current directory, not
// relative to dirname.
640 ParseTailorings ("mono-tailoring-source.txt");
// Reads the tailoring source file line by line and feeds every trimmed line
// to ProcessTailoringLine, which updates the current tailoring `t`.
// filename: path of the tailoring source file.
// On an exception the line number is reported on stderr; whether the
// exception is rethrown is not visible in this excerpt.
643 void ParseTailorings (string filename)
647 using (StreamReader sr = new StreamReader (filename)) {
649 while (sr.Peek () >= 0) {
651 ProcessTailoringLine (ref t,
652 sr.ReadLine ().Trim ());
654 } catch (Exception) {
655 Console.Error.WriteLine ("ERROR at line {0}", line);
// Converts a tailoring source value into a string, turning "\uXXXX" escapes
// (four hex digits) into the corresponding character.
// s: the raw value text. Returns the decoded string.
661 // For now this is enough.
662 string ParseTailoringSourceValue (string s)
664 StringBuilder sb = new StringBuilder ();
665 for (int i = 0; i < s.Length; i++) {
// NOTE(review): in the visible lines both the escape test and the
// Substring (2, 4) are anchored at the start of `s`, not at index i. Lines
// missing from this excerpt may adjust `s` or `i`; confirm that escapes
// after the first character are handled.
666 if (s.StartsWith ("\\u")) {
667 sb.Append ((char) int.Parse (
668 s.Substring (2, 4), NumberStyles.HexNumber),
675 return sb.ToString ();
// Interprets one line of the tailoring source and applies it to `t`.
// t: the tailoring currently being built; replaced when a line starts a new
//    tailoring.
// s: the (already trimmed) source line.
// Line kinds recognizable from the visible code:
//   - a header line that creates a new Tailoring, either from one number or
//     from two numbers separated by '=' (the first character is skipped);
//   - "*FrenchSort";
//   - "*Diacritical XX -> YY" with hex byte values;
//   - "source : b0 b1 b2 b3", a sort key map with hex bytes;
//   - "source = replacement", a replacement map.
// NOTE(review): the conditions separating these cases are partly on lines
// not visible in this excerpt.
678 void ProcessTailoringLine (ref Tailoring t, string s)
// Strip a trailing '#' comment and ignore empty lines.
680 int idx = s.IndexOf ('#');
682 s = s.Substring (0, idx).Trim ();
683 if (s.Length == 0 || s [0] == '#')
686 idx = s.IndexOf ('=');
689 int.Parse (s.Substring (1, idx - 1)),
690 int.Parse (s.Substring (idx + 1)));
692 t = new Tailoring (int.Parse (s.Substring (1)));
696 if (s.StartsWith ("*FrenchSort")) {
700 string d = "*Diacritical";
701 if (s.StartsWith (d)) {
// Text between the keyword and "->" is the source weight; text after "->"
// is the replacement weight. Both are parsed as hex bytes.
702 idx = s.IndexOf ("->");
703 t.AddDiacriticalMap (
704 byte.Parse (s.Substring (d.Length, idx - d.Length).Trim (),
705 NumberStyles.HexNumber),
706 byte.Parse (s.Substring (idx + 2).Trim (),
707 NumberStyles.HexNumber));
710 idx = s.IndexOf (':');
// Left of ':' is the source text, right of it up to four space-separated
// hex bytes.
712 string source = s.Substring (0, idx).Trim ();
713 string [] l = s.Substring (idx + 1).Trim ().Split (' ');
714 byte [] b = new byte [4];
715 for (int i = 0; i < 4; i++) {
719 b [i] = byte.Parse (l [i],
720 NumberStyles.HexNumber);
722 t.AddSortKeyMap (ParseTailoringSourceValue (source),
725 idx = s.IndexOf ('=');
727 t.AddReplacementMap (
728 ParseTailoringSourceValue (
729 s.Substring (0, idx).Trim ()),
730 ParseTailoringSourceValue (
731 s.Substring (idx + 1).Trim ()));
// Reads DerivedAge.txt-style data and records a version number for code
// points up to char.MaxValue in unicodeAge [].
// filename: path of the age data file.
// Each data line has the form "<cp>[..<cpEnd>] ; <value>" with an optional
// trailing '#' comment.
734 void ParseDerivedAge (string filename)
736 using (StreamReader file =
737 new StreamReader (filename)) {
738 while (file.Peek () >= 0) {
739 string s = file.ReadLine ();
// Cut off the comment part, then split at the first ';'.
740 int idx = s.IndexOf ('#');
742 s = s.Substring (0, idx);
743 idx = s.IndexOf (';');
// cpspec is either a single hex code point or a "start..end" range.
747 string cpspec = s.Substring (0, idx);
748 idx = cpspec.IndexOf ("..");
749 NumberStyles nf = NumberStyles.HexNumber |
750 NumberStyles.AllowTrailingWhite;
751 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
752 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
753 string value = s.Substring (cpspec.Length + 1).Trim ();
// Code points above char.MaxValue get special handling; the guarded
// statement is not visible in this excerpt.
756 if (cp > char.MaxValue)
// NOTE(review): double.Parse without an explicit culture uses the current
// culture; values such as "3.1" can misparse where ',' is the decimal
// separator.
759 double v = double.Parse (value);
// The loop body is not visible in this excerpt (presumably it stores v into
// unicodeAge [i] -- TODO confirm).
760 for (int i = cp; i <= cpEnd; i++)
764 unicodeAge [0] = double.MaxValue; // never be supported
// Reads the UnicodeData file line by line through ProcessUnidataLine and
// afterwards stores the collected decomposition values in this.decompValues
// as an int array.
// filename: path of UnicodeData.txt.
// On an exception the failing line number is written to stderr; whether the
// exception is rethrown is not visible in this excerpt.
767 void ParseUnidata (string filename)
// Shared list that every processed line appends its decomposition code
// points to; decompIndex []/decompLength [] refer into it.
769 ArrayList decompValues = new ArrayList ();
770 using (StreamReader unidata =
771 new StreamReader (filename)) {
772 for (int line = 1; unidata.Peek () >= 0; line++) {
774 ProcessUnidataLine (unidata.ReadLine (), decompValues);
775 } catch (Exception) {
776 Console.Error.WriteLine ("**** At line " + line);
781 this.decompValues = (int [])
782 decompValues.ToArray (typeof (int));
// State carried between successive ProcessUnidataLine calls.
// previousLatinTarget: the most recent plain Latin letter seen, used as the
// fallback mapping target for following Latin lines.
785 char previousLatinTarget = char.MinValue;
// One counter per letter 'A'..'Z', incremented for each variant that needs a
// distinct diacritical weight.
786 byte [] diacriticalOffset = new byte ['Z' - 'A' + 1];
// Processes one line of UnicodeData.txt and updates the per-code-point
// tables of this class: small-capital flags, the Latin letter map, arrow and
// box values, sortable names, diacritical weights, Arabic letter values,
// non-JIS Japanese square letters, decomposition info and numeric values.
// s:            the raw data line (';'-separated fields, optional '#' comment).
// decompValues: shared list that receives this line's decomposition values.
// Throws Exception for an unknown decomposition tag or when diacritics []
// and diacriticWeights [] differ in length.
// NOTE(review): many lines of this method are not visible in this excerpt;
// comments below describe only what the visible statements show.
788 void ProcessUnidataLine (string s, ArrayList decompValues)
// Strip the comment, then split off the code point (first field).
790 int idx = s.IndexOf ('#');
792 s = s.Substring (0, idx);
793 idx = s.IndexOf (';');
796 int cp = int.Parse (s.Substring (0, idx), NumberStyles.HexNumber);
// values [0] is the character name; later fields are addressed by index.
797 string [] values = s.Substring (idx + 1).Split (';');
// Code points above char.MaxValue and ignorable code points get special
// handling; the guarded statements are not visible in this excerpt.
800 if (cp > char.MaxValue)
802 if (IsIgnorable (cp))
805 string name = values [0];
807 // SPECIAL CASE: rename some characters for diacritical
808 // remapping. FIXME: why are they different?
809 // FIXME: it's still not working.
810 if (cp == 0x018B || cp == 0x018C)
811 name = name.Replace ("TOPBAR", "STROKE");
814 if (s.IndexOf ("SMALL CAPITAL") > 0)
815 isSmallCapital [cp] = true;
817 // latin mapping by character name
818 if (s.IndexOf ("LATIN") >= 0) {
// Try progressively more general "LETTER ..." prefixes to find where the
// letter name starts; `offset` is the index of the letter itself.
819 int lidx = s.IndexOf ("LETTER DOTLESS ");
820 int offset = lidx + 15;
822 lidx = s.IndexOf ("LETTER TURNED ");
826 lidx = s.IndexOf ("LETTER CAPITAL ");
830 lidx = s.IndexOf ("LETTER SCRIPT ");
834 lidx = s.IndexOf ("LETTER ");
// c is the candidate letter, n the character following it.
837 char c = lidx > 0 ? s [offset] : char.MinValue;
838 char n = s [offset + 1];
839 char target = char.MinValue;
// NOTE(review): && binds tighter than ||, so this reads
// ('A' <= c && c <= 'Z' && n == ' ') || n == ';'. A ';' after any character
// therefore satisfies the test -- confirm whether
// `(n == ' ' || n == ';')` was intended.
840 if ('A' <= c && c <= 'Z' &&
841 (n == ' ') || n == ';') {
843 // FIXME: After 'Z', I cannot reset this state.
844 previousLatinTarget = c == 'Z' ? char.MinValue : c;
// Named letters that do not start with a plain A-Z letter; the target
// assigned in each branch is not visible in this excerpt.
847 if (s.Substring (offset).StartsWith ("ALPHA"))
849 else if (s.Substring (offset).StartsWith ("TONE SIX"))
851 else if (s.Substring (offset).StartsWith ("OPEN O"))
853 else if (s.Substring (offset).StartsWith ("SCHWA"))
855 else if (s.Substring (offset).StartsWith ("ENG"))
857 else if (s.Substring (offset).StartsWith ("OI;")) // 01A2,01A3
859 else if (s.Substring (offset).StartsWith ("YR;")) // 01A2,01A3
861 else if (s.Substring (offset).StartsWith ("TONE TWO"))
863 else if (s.Substring (offset).StartsWith ("ESH"))
// Fall back to the last plain Latin letter seen.
866 if (target == char.MinValue)
867 target = previousLatinTarget;
869 if (target != char.MinValue) {
// latinMap: target letter -> list of related code points.
870 ArrayList entry = (ArrayList) latinMap [target];
872 entry = new ArrayList ();
873 latinMap [target] = entry;
876 // FIXME: This secondary weight is hack.
877 // They are here because they must not
878 // be identical to the corresponding
880 if (c != target && diacritical [cp] == 0) {
881 diacriticalOffset [c - 'A']++;
882 diacritical [cp] = (byte) (diacriticalOffset [c - 'A'] + 0x7C);
// Arrow values for code points in 0x2000..0x2FFF.
888 if (0x2000 <= cp && cp < 0x3000) {
890 // SPECIAL CASES. FIXME: why?
892 case 0x21C5: value = -1; break; // E2
893 case 0x261D: value = 1; break;
894 case 0x27A6: value = 3; break;
895 case 0x21B0: value = 7; break;
896 case 0x21B1: value = 3; break;
897 case 0x21B2: value = 7; break;
898 case 0x21B4: value = 5; break;
899 case 0x21B5: value = 7; break;
900 case 0x21B9: value = -1; break; // E1
901 case 0x21CF: value = 7; break;
902 case 0x21D0: value = 3; break;
904 string [] arrowTargets = new string [] {
// Search starts at index 1 and only runs while no special case set `value`;
// "BARB <target>" and " OVER" names are excluded.
916 for (int i = 1; value == 0 && i < arrowTargets.Length; i++)
917 if (s.IndexOf (arrowTargets [i]) > 0 &&
918 s.IndexOf ("BARB " + arrowTargets [i]) < 0 &&
919 s.IndexOf (" OVER") < 0
923 arrowValues.Add (new DictionaryEntry (
// Box drawing, block and geometric shape values for 0x2500..0x25FF.
928 if (0x2500 <= cp && cp < 0x2600) {
931 // up:1 down:2 right:4 left:8 vert:16 horiz:32
934 // [dr] [dl] [ur] [ul]
// Known combinations of the direction bits above; the index found in this
// list selects the weight from offsets [].
938 ArrayList flags = new ArrayList (new int [] {
941 4 + 2, 8 + 2, 4 + 1, 8 + 1,
942 16 + 4, 1 + 2 + 4, 16 + 8, 1 + 2 + 8,
943 32 + 2, 4 + 8 + 2, 32 + 1, 4 + 8 + 1,
944 16 + 32, 1 + 2 + 4 + 8, 4 + 8 + 16, 1 + 2 + 32
946 byte [] offsets = new byte [] {
953 if (s.IndexOf ("BOX DRAWINGS ") >= 0) {
// Each direction word found in the name contributes to `flag`; the
// statements doing so are not visible in this excerpt.
955 if (s.IndexOf (" UP") >= 0)
957 if (s.IndexOf (" DOWN") >= 0)
959 if (s.IndexOf (" RIGHT") >= 0)
961 if (s.IndexOf (" LEFT") >= 0)
963 if (s.IndexOf (" VERTICAL") >= 0)
965 if (s.IndexOf (" HORIZONTAL") >= 0)
// An unknown combination leaves value negative (IndexOf returns -1).
968 int fidx = flags.IndexOf (flag);
969 value = fidx < 0 ? fidx : offsets [fidx];
970 } else if (s.IndexOf ("BLOCK") >= 0) {
// Block elements by fill fraction; assigned values are not visible here.
971 if (s.IndexOf ("ONE EIGHTH") >= 0)
973 else if (s.IndexOf ("ONE QUARTER") >= 0)
975 else if (s.IndexOf ("THREE EIGHTHS") >= 0)
977 else if (s.IndexOf ("HALF") >= 0)
979 else if (s.IndexOf ("FIVE EIGHTHS") >= 0)
981 else if (s.IndexOf ("THREE QUARTERS") >= 0)
983 else if (s.IndexOf ("SEVEN EIGHTHS") >= 0)
// Geometric shapes, classified by the first matching keyword. Note that
// "VERTICAL RECTANGLE" is tested before the more general "RECTANGLE".
988 else if (s.IndexOf ("SHADE") >= 0)
990 else if (s.IndexOf ("SQUARE") >= 0)
992 else if (s.IndexOf ("VERTICAL RECTANGLE") >= 0)
994 else if (s.IndexOf ("RECTANGLE") >= 0)
996 else if (s.IndexOf ("PARALLELOGRAM") >= 0)
// From here on values are written as (weight - 0xE5).
998 else if (s.IndexOf ("TRIANGLE") >= 0) {
999 if (s.IndexOf ("UP-POINTING") >= 0)
1000 value = 0xC0 - 0xE5;
1001 else if (s.IndexOf ("RIGHT-POINTING") >= 0)
1002 value = 0xC1 - 0xE5;
1003 else if (s.IndexOf ("DOWN-POINTING") >= 0)
1004 value = 0xC2 - 0xE5;
1005 else if (s.IndexOf ("LEFT-POINTING") >= 0)
1006 value = 0xC3 - 0xE5;
1008 else if (s.IndexOf ("POINTER") >= 0) {
1009 if (s.IndexOf ("RIGHT-POINTING") >= 0)
1010 value = 0xC4 - 0xE5;
1011 else if (s.IndexOf ("LEFT-POINTING") >= 0)
1012 value = 0xC5 - 0xE5;
1014 else if (s.IndexOf ("DIAMOND") >= 0)
1015 value = 0xC6 - 0xE5;
1016 else if (s.IndexOf ("FISHEYE") >= 0)
1017 value = 0xC7 - 0xE5;
1018 else if (s.IndexOf ("LOZENGE") >= 0)
1019 value = 0xC8 - 0xE5;
1020 else if (s.IndexOf ("BULLSEYE") >= 0)
1021 value = 0xC9 - 0xE5;
1022 else if (s.IndexOf ("CIRCLE") >= 0) {
1023 if (cp == 0x25D6) // it could be IndexOf ("LEFT HALF BLACK CIRCLE")
1024 value = 0xCA - 0xE5;
1025 else if (cp == 0x25D7) // it could be IndexOf ("RIGHT HALF BLACK CIRCLE")
1026 value = 0xCB - 0xE5;
// Any other circle shares the value also used for BULLSEYE.
1028 value = 0xC9 - 0xE5;
// 0x25DA..0x25E5 get consecutive values starting at 0xCD - 0xE5.
1030 if (0x25DA <= cp && cp <= 0x25E5)
1031 value = 0xCD + cp - 0x25DA - 0xE5;
1033 // SPECIAL CASE: BOX DRAWING DIAGONAL patterns
1035 case 0x2571: value = 0xF; break;
1036 case 0x2572: value = 0x10; break;
1037 case 0x2573: value = 0x11; break;
1040 boxValues.Add (new DictionaryEntry (
1044 // For some characters store the name and sort later
1045 // to determine sorting.
1046 if (0x2100 <= cp && cp <= 0x213F &&
1047 Char.IsSymbol ((char) cp))
1048 sortableCharNames.Add (
1049 new DictionaryEntry (cp, name));
// For 0x3380..0x33DD the first 7 characters of the name are dropped.
1050 else if (0x3380 <= cp && cp <= 0x33DD)
1051 sortableCharNames.Add (new DictionaryEntry (
1052 cp, name.Substring (7)));
// Math symbols whose names start with CIRCLED/SQUARED get fixed weights.
1054 if (Char.GetUnicodeCategory ((char) cp) ==
1055 UnicodeCategory.MathSymbol) {
1056 if (name.StartsWith ("CIRCLED "))
1057 diacritical [cp] = 0xEE;
1058 if (name.StartsWith ("SQUARED "))
1059 diacritical [cp] = 0xEF;
1062 // diacritical weights by character name
1063 if (diacritics.Length != diacriticWeights.Length)
1064 throw new Exception (String.Format ("Should not happen. weights are {0} while labels are {1}", diacriticWeights.Length, diacritics.Length));
1065 for (int d = 0; d < diacritics.Length; d++) {
// Weights are added, so several matching fragments accumulate; COMBINING
// characters get 2 less per match.
1066 if (s.IndexOf (diacritics [d]) > 0) {
1067 diacritical [cp] += diacriticWeights [d];
1068 if (s.IndexOf ("COMBINING") >= 0)
1069 diacritical [cp] -= (byte) 2;
1072 // also process "COMBINING blah" here
1073 // For now it is limited to cp < 0x0370
1074 // if (cp < 0x0300 || cp >= 0x0370)
// Build "COMBINING <fragment>" from the entry: drop a trailing ';', turn a
// leading "WITH " into " " and avoid a doubled space.
1076 string tmp = diacritics [d].TrimEnd (';');
1077 if (tmp.IndexOf ("WITH ") == 0)
1078 tmp = tmp.Substring (4);
1079 tmp = String.Concat ("COMBINING", (tmp [0] != ' ' ? " " : ""), tmp);
// The condition guarding this assignment is not visible in this excerpt.
1081 diacritical [cp] = (byte) (diacriticWeights [d] - 2);
1083 //Console.Error.WriteLine ("======= {2:X04} : '{0}' / '{1}'", name, tmp, cp);
1085 // Two-step grep required for it.
1086 if (s.IndexOf ("FULL STOP") > 0 &&
1087 (s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0))
1088 diacritical [cp] |= 0xF4;
1090 // Arabic letter name
1091 if (0x0621 <= cp && cp <= 0x064A &&
1092 Char.GetUnicodeCategory ((char) cp)
1093 == UnicodeCategory.OtherLetter) {
// Default primary value: 4 per distinct letter name seen so far, from 0x0B.
1094 byte value = (byte) (arabicNameMap.Count * 4 + 0x0B);
1099 // hamza, waw, yeh ... special cases.
1104 value = 0x77; // special cases.
1107 // Get primary letter name i.e.
1108 // XXX part of ARABIC LETTER XXX yyy
1109 // e.g. that of "TEH MARBUTA" is "TEH".
1112 // 0x0640 is special: it does
1113 // not start with ARABIC LETTER
1115 name.Substring (14);
1116 int tmpIdx = letterName.IndexOf (' ');
1117 letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
1118 //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
// A letter name seen before reuses the primary value of its first code point.
1119 if (arabicNameMap.ContainsKey (letterName))
1120 value = (byte) arabicLetterPrimaryValues [arabicNameMap [letterName]];
1122 arabicNameMap [letterName] = cp;
1125 arabicLetterPrimaryValues [cp] = value;
1128 // Japanese square letter
1129 if (0x3300 <= cp && cp <= 0x3357)
1130 if (!ExistsJIS (cp))
1131 nonJisJapanese.Add (new NonJISCharacter (cp, name));
1133 // normalizationType
// values [4] is the decomposition field: an optional "<tag>" followed by
// space-separated hex code points.
1134 string decomp = values [4];
1135 idx = decomp.IndexOf ('<');
// NOTE(review): the length `IndexOf ('>') - 1` is only the tag length when
// '<' is at index 0 -- confirm the tag always starts the field.
1137 switch (decomp.Substring (idx + 1, decomp.IndexOf ('>') - 1)) {
// The case labels (tag names) are not visible in this excerpt.
1139 decompType [cp] = DecompositionFull;
1142 decompType [cp] = DecompositionSub;
1145 decompType [cp] = DecompositionSuper;
1148 decompType [cp] = DecompositionSmall;
1151 decompType [cp] = DecompositionIsolated;
1154 decompType [cp] = DecompositionInitial;
1157 decompType [cp] = DecompositionFinal;
1160 decompType [cp] = DecompositionMedial;
1163 decompType [cp] = DecompositionNoBreak;
1166 decompType [cp] = DecompositionCompat;
1169 decompType [cp] = DecompositionFraction;
1172 decompType [cp] = DecompositionFont;
1175 decompType [cp] = DecompositionCircle;
1178 decompType [cp] = DecompositionSquare;
1181 decompType [cp] = DecompositionWide;
1184 decompType [cp] = DecompositionNarrow;
1187 decompType [cp] = DecompositionVertical;
1190 throw new Exception ("Support NFKD type : " + decomp);
// The canonical type applies on a branch whose condition is not visible here.
1194 decompType [cp] = DecompositionCanonical;
// Drop the "<tag> " prefix (the '>' plus the following space) when present.
1195 decomp = idx < 0 ? decomp : decomp.Substring (decomp.IndexOf ('>') + 2);
1196 if (decomp.Length > 0) {
// Append this character's decomposition to the shared list and remember
// where it starts and how long it is.
1198 string [] velems = decomp.Split (' ');
1199 int didx = decompValues.Count;
1200 decompIndex [cp] = didx;
1201 foreach (string v in velems)
1202 decompValues.Add (int.Parse (v, NumberStyles.HexNumber));
1203 decompLength [cp] = velems.Length;
1205 // [decmpType] -> this_cp
1206 int targetCP = (int) decompValues [didx];
1207 // for "(x)" it specially maps to 'x' .
1208 // FIXME: check if it is sane
1209 if (velems.Length == 3 &&
1210 (int) decompValues [didx] == '(' &&
1211 (int) decompValues [didx + 2] == ')')
1212 targetCP = (int) decompValues [didx + 1];
1213 // special: 0x215F "1/"
1214 else if (cp == 0x215F)
1216 else if (velems.Length > 1 &&
1217 (targetCP < 0x4C00 || 0x9FBB < targetCP))
1218 // skip them, except for CJK ideograph compat
// targetCP == 0 means "do not register"; otherwise record the reverse
// mapping target -> (decomposition type -> this code point).
1221 if (targetCP != 0) {
1222 Hashtable entry = (Hashtable) nfkdMap [targetCP];
1223 if (entry == null) {
1224 entry = new Hashtable ();
1225 nfkdMap [targetCP] = entry;
1227 entry [(byte) decompType [cp]] = cp;
// Numeric value: the first non-empty one of fields 5, 6 and 7 is used.
// NOTE(review): decimal.Parse without an explicit culture uses the current
// culture.
1231 if (values [5].Length > 0)
1232 decimalValue [cp] = decimal.Parse (values [5]);
1233 else if (values [6].Length > 0)
1234 decimalValue [cp] = decimal.Parse (values [6]);
1235 else if (values [7].Length > 0) {
// Field 7 may be a fraction "a/b", a parenthesized "(n)", a value with a
// trailing '.', or a plain number.
1236 string decstr = values [7];
1237 idx = decstr.IndexOf ('/');
1238 if (cp == 0x215F) // special. "1/"
1239 decimalValue [cp] = 0x1;
1243 decimal.Parse (decstr.Substring (0, idx))
1244 / decimal.Parse (decstr.Substring (idx + 1));
1245 else if (decstr [0] == '(' &&
1246 decstr [decstr.Length - 1] == ')')
1249 decimal.Parse (decstr.Substring (1, decstr.Length - 2));
1250 else if (decstr [decstr.Length - 1] == '.')
1253 decimal.Parse (decstr.Substring (0, decstr.Length - 1));
1255 decimalValue [cp] = decimal.Parse (decstr);
// Reads the DerivedCoreProperties file line by line and hands every line to
// ProcessDerivedCorePropLine.
// filename: path of DerivedCoreProperties.txt.
// On an exception the failing line number is written to stderr; whether the
// exception is rethrown is not visible in this excerpt.
1259 void ParseDerivedCoreProperties (string filename)
1262 using (StreamReader file =
1263 new StreamReader (filename)) {
1264 for (int line = 1; file.Peek () >= 0; line++) {
1266 ProcessDerivedCorePropLine (file.ReadLine ());
1267 } catch (Exception) {
1268 Console.Error.WriteLine ("**** At line " + line);
// Processes one DerivedCoreProperties line of the form
// "<cp>[..<cpEnd>] ; <property>" and marks the code points of the matching
// lines in isUppercase [].
// s: the raw data line, optionally with a trailing '#' comment.
// NOTE(review): the test on `value` that selects which property sets
// isUppercase is not visible in this excerpt.
1275 void ProcessDerivedCorePropLine (string s)
// Cut off the comment part, then split at the first ';'.
1277 int idx = s.IndexOf ('#');
1279 s = s.Substring (0, idx);
1280 idx = s.IndexOf (';');
// cpspec is either a single hex code point or a "start..end" range.
1283 string cpspec = s.Substring (0, idx);
1284 idx = cpspec.IndexOf ("..");
1285 NumberStyles nf = NumberStyles.HexNumber |
1286 NumberStyles.AllowTrailingWhite;
1287 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1288 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
1289 string value = s.Substring (cpspec.Length + 1).Trim ();
// Code points above char.MaxValue get special handling; the guarded
// statement is not visible in this excerpt.
1292 if (cp > char.MaxValue)
1297 for (int x = cp; x <= cpEnd; x++)
1298 isUppercase [x] = true;
// Reads the scripts file named by `filename` and builds four ordered
// character arrays: orderedGurmukhi, orderedGujarati, orderedGeorgian and
// orderedThaana.
// Each input line is split the same way as in ProcessDerivedCorePropLine:
// a '#' comment is removed, the text before ';' is a hexadecimal code
// point or a "first..last" range, and the trimmed remainder is the value.
// Code points for which IsIgnorable returns true are left out.
// NOTE(review): the branches that pick the target list from `value`
// (and the guards on the IndexOf results) are not shown here -- confirm.
1303 void ParseScripts (string filename)
// One collecting list per script of interest.
1305 ArrayList gurmukhi = new ArrayList ();
1306 ArrayList gujarati = new ArrayList ();
1307 ArrayList georgian = new ArrayList ();
1308 ArrayList thaana = new ArrayList ();
1310 using (StreamReader file =
1311 new StreamReader (filename)) {
1312 while (file.Peek () >= 0) {
1313 string s = file.ReadLine ();
1314 int idx = s.IndexOf ('#');
// Strip the trailing comment (presumably only when a '#' was found).
1316 s = s.Substring (0, idx);
1317 idx = s.IndexOf (';');
1321 string cpspec = s.Substring (0, idx);
// A ".." inside the spec marks a range of code points.
1322 idx = cpspec.IndexOf ("..");
1323 NumberStyles nf = NumberStyles.HexNumber |
1324 NumberStyles.AllowTrailingWhite;
1325 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1326 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
1327 string value = s.Substring (cpspec.Length + 1).Trim ();
// Code points that do not fit into a char get special treatment
// (presumably they are skipped -- the guarded statement is not shown).
1330 if (cp > char.MaxValue)
// Collect the inclusive range into the Gurmukhi list.
1335 for (int x = cp; x <= cpEnd; x++)
1336 if (!IsIgnorable (x))
1337 gurmukhi.Add ((char) x);
// Collect the inclusive range into the Gujarati list.
1340 for (int x = cp; x <= cpEnd; x++)
1341 if (!IsIgnorable (x))
1342 gujarati.Add ((char) x);
// Collect the inclusive range into the Georgian list.
1345 for (int x = cp; x <= cpEnd; x++)
1346 if (!IsIgnorable (x))
1347 georgian.Add ((char) x);
// Collect the inclusive range into the Thaana list.
1350 for (int x = cp; x <= cpEnd; x++)
1351 if (!IsIgnorable (x))
1352 thaana.Add ((char) x);
// Order every list with UCAComparer and publish it as a char array.
1357 gurmukhi.Sort (UCAComparer.Instance);
1358 gujarati.Sort (UCAComparer.Instance);
1359 georgian.Sort (UCAComparer.Instance);
1360 thaana.Sort (UCAComparer.Instance);
1361 orderedGurmukhi = (char []) gurmukhi.ToArray (typeof (char));
1362 orderedGujarati = (char []) gujarati.ToArray (typeof (char));
1363 orderedGeorgian = (char []) georgian.ToArray (typeof (char));
1364 orderedThaana = (char []) thaana.ToArray (typeof (char));
// Reads the file named by `filename` line by line and passes every line
// to ProcessJISOrderLine while advancing the `line` counter.
// When an exception is caught, the current value of `line` is written to
// standard error.
// NOTE(review): the declaration of `line` and what follows the error
// message are not shown here -- confirm the starting value and whether
// the exception is rethrown.
1367 void ParseJISOrder (string filename)
1371 using (StreamReader file =
1372 new StreamReader (filename)) {
// Peek () returns a negative value once the reader is exhausted.
1373 for (;file.Peek () >= 0; line++)
1374 ProcessJISOrderLine (file.ReadLine ());
1376 } catch (Exception) {
// Tell the operator how far the input had been read.
1377 Console.Error.WriteLine ("---- line {0}", line);
// Separators (tab and space) between the two columns of a JIS order line.
1382 char [] ws = new char [] {'\t', ' '};
// Handles one line `s` of the JIS order input: the text from the first
// '#' onward is dropped and the rest trimmed, the first column is parsed
// as the JIS code and the second as the Unicode code point, and the pair
// is appended to jisJapanese as a JISCharacter.
// NOTE(review): the guards on the IndexOf/IndexOfAny results are not
// shown here -- confirm how lines without '#' or without a separator are
// handled.
1384 void ProcessJISOrderLine (string s)
1386 int idx = s.IndexOf ('#');
1388 s = s.Substring (0, idx).Trim ();
// Position of the first tab or space, i.e. the end of the first column.
1391 idx = s.IndexOfAny (ws);
1394 // They start with "0x" so cut them out.
1395 int jis = int.Parse (s.Substring (2, idx - 2), NumberStyles.HexNumber);
1396 int cp = int.Parse (s.Substring (idx).Trim ().Substring (2), NumberStyles.HexNumber);
1397 jisJapanese.Add (new JISCharacter (cp, jis));
// Fills the per-culture CJK weight tables: Chinese Simplified (from the
// 'pinyin' collation rules), Chinese Traditional (from the 'stroke'
// collation rules), a table driven by the jisJapanese list, and Korean
// (from the "reset" elements of the collation rules, also filling
// cjkKOlv2).
// NOTE(review): the declarations of arr, offset, s, v, p and category, the
// statements that load the XML documents named by zhXML, jaXML and koXML,
// and several conditions are not shown here -- the notes below describe
// only the statements that are visible.
1400 void ParseCJK (string zhXML, string jaXML, string koXML)
1402 XmlDocument doc = new XmlDocument ();
// With a null resolver the document does not resolve external resources.
1403 doc.XmlResolver = null;
1410 // Chinese Simplified
1413 offset = 0;//char.MaxValue - arr.Length;
// The characters appear in the rule text in their collation order.
1415 s = doc.SelectSingleNode ("/ldml/collations/collation[@type='pinyin']/rules/pc").InnerText;
1417 foreach (char c in s) {
// Reported for characters that do not receive a value (the test that
// selects them is not shown here).
1419 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
// Each character takes the current value of v, which then advances.
1421 arr [(int) c - offset] = (ushort) v++;
1427 // Chinese Traditional
1430 offset = 0;//char.MaxValue - arr.Length;
1431 s = doc.SelectSingleNode ("/ldml/collations/collation[@type='stroke']/rules/pc").InnerText;
1433 foreach (char c in s) {
1435 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1437 arr [(int) c - offset] = (ushort) v++;
// Table driven by jisJapanese (presumably the Japanese table -- confirm).
1446 offset = 0;//char.MaxValue - arr.Length;
// A few characters get fixed values up front.
1449 arr [0x4EDD] = 0x8002; // Chinese repetition mark?
1450 arr [0x337B] = 0x8004; // Those 4 characters are Gengou
1451 arr [0x337E] = 0x8005;
1452 arr [0x337D] = 0x8006;
1453 arr [0x337C] = 0x8007;
1456 foreach (JISCharacter jc in jisJapanese) {
// Entries with a JIS code below 0x8800 are treated separately
// (presumably skipped -- the guarded statement is not shown).
1457 if (jc.JIS < 0x8800)
1459 char c = (char) jc.CP;
1462 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1464 arr [(int) c - offset] = (ushort) v++;
// Special handling for the ideographs named in the trailing comments
// (the guarded statements are not shown here).
1469 if (c == '\u662D') // U+337C
1471 if (c == '\u5927') // U+337D
1473 if (c == '\u5E73') // U+337B
1475 if (c == '\u660E') // U+337E
1477 if (c == '\u9686') // U+F9DC
1480 // FIXME: there are still remaining
1481 // characters after U+FA0C.
1482 // for (int k = 0; k < char.MaxValue; k++) {
// Code points below U+FA0D whose decomposition starts with c, or whose
// three-element decomposition has c as its second element, take the
// next value as well.
1483 for (int k = 0; k < '\uFA0D'; k++) {
1484 if (decompIndex [k] == 0 || IsIgnorable (k))
1486 if (decompValues [decompIndex [k]] == c /*&&
1487 decompLength [k] == 1*/ ||
1488 decompLength [k] == 3 &&
1489 decompValues [decompIndex [k] + 1] == c) {
1490 arr [k - offset] = (ushort) v++;
1499 // Korean weight is somewhat complex. It first shifts
1500 // Hangul category from 52-x to 80-x (they are anyways
1501 // computed). CJK ideographs are placed at secondary
1502 // weight, like XX YY 01 zz 01, where XX and YY are
1503 // corresponding "reset" value and zz is 41,43,45...
1505 // Unlike chs,cht and ja, Korean value is a combined
1506 // ushort which is computed as category
1510 offset = 0;//char.MaxValue - arr.Length;
1512 foreach (XmlElement reset in doc.SelectNodes ("/ldml/collations/collation/rules/reset")) {
// The element following each "reset" holds the characters to place.
1513 XmlElement sc = (XmlElement) reset.NextSibling;
1514 // compute "category" and "level 1" for the
1515 // target "reset" Hangul syllable
1516 char rc = reset.InnerText [0];
// 1-based position of the reset character counted from U+AC00.
1517 int ri = ((int) rc - 0xAC00) + 1;
1519 ((ri / 254) * 256 + (ri % 254) + 2);
1520 // Place the characters after the target.
1523 foreach (char c in s) {
// Primary part goes to arr, the level 2 byte to cjkKOlv2.
1524 arr [(int) c - offset] = p;
1525 cjkKOlv2 [(int) c - offset] = (byte) v;
// Walks every char value from 0 to char.MaxValue and records in
// ignorableFlags which of the three ignorable tests it passes:
// bit 1 for IsIgnorable, bit 2 for IsIgnorableSymbol and bit 4 for
// IsIgnorableNonSpacing.
// NOTE(review): the statement guarded by the OtherNotAssigned test is not
// shown here -- presumably unassigned code points are skipped; confirm.
1535 void FillIgnorables ()
1537 for (int i = 0; i <= char.MaxValue; i++) {
1538 if (Char.GetUnicodeCategory ((char) i) ==
1539 UnicodeCategory.OtherNotAssigned)
// The flags are OR-ed in, so one entry can carry several of them.
1541 if (IsIgnorable (i))
1542 ignorableFlags [i] |= 1;
1543 if (IsIgnorableSymbol (i))
1544 ignorableFlags [i] |= 2;
1545 if (IsIgnorableNonSpacing (i))
1546 ignorableFlags [i] |= 4;
// Patches the parsed decomposition tables (decompType, decompIndex,
// decompLength, decompValues) and the diacritical table for a fixed set
// of code points. The statements are order-dependent: several of them
// write through an index that an earlier or later statement changes.
1550 void ModifyUnidata ()
1552 // Modify some decomposition equivalence
// U+FE31 and U+FE32 lose their decomposition data entirely.
1553 decompType [0xFE31] = 0;
1554 decompIndex [0xFE31] = 0;
1555 decompLength [0xFE31] = 0;
1556 decompType [0xFE32] = 0;
1557 decompIndex [0xFE32] = 0;
1558 decompLength [0xFE32] = 0;
1560 // Korean parens numbers
1561 for (int i = 0x3200; i <= 0x321C; i++)
1562 diacritical [i] = 0xA;
1563 for (int i = 0x3260; i <= 0x327B; i++)
1564 diacritical [i] = 0xC;
1566 // LAMESPEC: these remappings should not be done.
1567 // Windows has incorrect CJK compat mappings.
// Each line below overwrites the first decomposition value of the code
// point; U+323B and U+3238 are also cut down to a single value.
1568 decompValues [decompIndex [0x32A9]] = 0x91AB;
1569 decompLength [0x323B] = 1;
1570 decompValues [decompIndex [0x323B]] = 0x5B78;
1571 decompValues [decompIndex [0x32AB]] = 0x5B78;
1572 decompValues [decompIndex [0x32A2]] = 0x5BEB;
1573 decompLength [0x3238] = 1;
1574 decompValues [decompIndex [0x3238]] = 0x52DE;
1575 decompValues [decompIndex [0x3298]] = 0x52DE;
1577 // LAMESPEC: custom remappings (not bugs as such, but not standard-compliant either)
// U+FA0C takes over the decompValues slot of U+F929, stores 0x5140 there
// with length 1, and U+F929 is then left without a decomposition.
1578 decompIndex [0xFA0C] = decompIndex [0xF929]; // borrow U+F929 room (being empty)
1579 decompValues [decompIndex [0xFA0C]] = 0x5140;
1580 decompLength [0xFA0C] = 1;
1581 decompIndex [0xF929] = decompLength [0xF929] = 0;
1583 decompValues [decompIndex [0xF92C]] = 0x90DE;
// Post-processes values gathered while parsing:
// 1. assigns secondary (diacritical) weights to number characters, one
//    weight per pair of bounds in numberSecondaryWeightBounds;
// 2. renames or removes selected entries of sortableCharNames.
// NOTE(review): the declaration of `weight`, the switch header and the
// case labels that lead to RemoveAt are not shown here -- confirm the
// starting weight and which code points are removed.
1586 void ModifyParsedValues ()
1588 // number, secondary weights
1590 int [] numarr = numberSecondaryWeightBounds;
// The array holds (start, end) pairs; `weight` advances once per pair.
1591 for (int i = 0; i < numarr.Length; i += 2, weight++)
// The end bound is exclusive.
1592 for (int cp = numarr [i]; cp < numarr [i + 1]; cp++)
1593 if (Char.IsNumber ((char) cp))
1594 diacritical [cp] = weight;
1596 // Update name part of named characters
1597 for (int i = 0; i < sortableCharNames.Count; i++) {
1598 DictionaryEntry de =
1599 (DictionaryEntry) sortableCharNames [i];
// The key of each entry is the code point.
1600 int cp = (int) de.Key;
// Stays null unless one of the cases below picks a new name.
1601 string renamed = null;
1603 case 0x2101: renamed = "A_1"; break;
1604 case 0x33C3: renamed = "A_2"; break;
1605 case 0x2105: renamed = "C_1"; break;
1606 case 0x2106: renamed = "C_2"; break;
1607 case 0x211E: renamed = "R1"; break;
1608 case 0x211F: renamed = "R2"; break;
1609 // Remove some of them!
1620 sortableCharNames.RemoveAt (i);
// Replace the entry with one that keeps the code point but carries the
// new name.
1624 if (renamed != null)
1625 sortableCharNames [i] =
1626 new DictionaryEntry (cp, renamed);
1630 void GenerateCore ()
1634 #region Specially ignored // 01
1635 // This will raise "Defined" flag up.
1636 foreach (char c in specialIgnore)
1637 map [(int) c] = new CharMapEntry (0, 0, 0);
1641 #region Variable weights
1642 // Controls : 06 03 - 06 3D
1644 for (int i = 0; i < 65536; i++) {
1645 if (IsIgnorable (i))
1648 uc = Char.GetUnicodeCategory (c);
1649 // NEL is whitespace but not ignored here.
1650 if (uc == UnicodeCategory.Control &&
1651 !Char.IsWhiteSpace (c) || c == '\u0085')
1652 AddCharMap (c, 6, 1);
1656 fillIndex [6] = 0x80;
1657 AddCharMapGroup ('\'', 6, 1, 0);
1658 AddCharMap ('\uFE63', 6, 1);
1660 // Hyphen/Dash : 06 81 - 06 90
1661 for (int i = 0; i < char.MaxValue; i++) {
1662 if (!IsIgnorable (i) &&
1663 Char.GetUnicodeCategory ((char) i) ==
1664 UnicodeCategory.DashPunctuation) {
1665 AddCharMapGroup2 ((char) i, 6, 1, 0);
1667 // SPECIAL: add 2027 and 2043
1668 // Maybe they are regarded the
1669 // same hyphens in "central"
1671 AddCharMap ('\u2027', 6, 1);
1672 AddCharMap ('\u2043', 6, 1);
1677 // Arabic variable weight chars 06 A0 -
1678 fillIndex [6] = 0xA0;
1680 for (int i = 0x64B; i <= 0x650; i++)
1681 AddArabicCharMap ((char) i);
1683 AddCharMapGroup ('\u0652', 6, 1, 0);
1685 AddCharMapGroup ('\u0651', 6, 1, 0);
1689 #region Nonspacing marks // 01
1690 // FIXME: 01 03 - 01 B6 ... annoyance :(
1692 // Combining diacritical marks: 01 DC -
1694 fillIndex [0x1] = 0x41;
1695 for (int i = 0x030E; i <= 0x0326; i++)
1696 if (!IsIgnorable (i))
1697 AddCharMap ((char) i, 0x1, 1);
1698 for (int i = 0x0329; i <= 0x0334; i++)
1699 if (!IsIgnorable (i))
1700 AddCharMap ((char) i, 0x1, 1);
1701 for (int i = 0x0339; i <= 0x0341; i++)
1702 if (!IsIgnorable (i))
1703 AddCharMap ((char) i, 0x1, 1);
1704 fillIndex [0x1] = 0x72;
1705 for (int i = 0x0346; i <= 0x0348; i++)
1706 if (!IsIgnorable (i))
1707 AddCharMap ((char) i, 0x1, 1);
1708 for (int i = 0x02BE; i <= 0x02BF; i++)
1709 if (!IsIgnorable (i))
1710 AddCharMap ((char) i, 0x1, 1);
1711 for (int i = 0x02C1; i <= 0x02C5; i++)
1712 if (!IsIgnorable (i))
1713 AddCharMap ((char) i, 0x1, 1);
1714 for (int i = 0x02CE; i <= 0x02CF; i++)
1715 if (!IsIgnorable (i))
1716 AddCharMap ((char) i, 0x1, 1);
1717 for (int i = 0x02D1; i <= 0x02D3; i++)
1718 if (!IsIgnorable (i))
1719 AddCharMap ((char) i, 0x1, 1);
1720 AddCharMap ('\u02DE', 0x1, 1);
1721 for (int i = 0x02E4; i <= 0x02E9; i++)
1722 if (!IsIgnorable (i))
1723 AddCharMap ((char) i, 0x1, 1);
1725 // FIXME: needs more love here (it should eliminate
1726 // all the hacky code above).
1727 for (int i = 0x0300; i < 0x0370; i++)
1728 if (!IsIgnorable (i) && diacritical [i] != 0
1729 /* especiall here*/ && !map [i].Defined)
1730 map [i] = new CharMapEntry (
1731 0x1, 0x1, diacritical [i]);
1733 fillIndex [0x1] = 0x94;
1734 // syriac dotted nonspacing marks
1735 AddCharMap ('\u0732', 0x1, 1);
1736 AddCharMap ('\u0735', 0x1, 1);
1737 AddCharMap ('\u0738', 0x1, 1);
1738 AddCharMap ('\u0739', 0x1, 1);
1739 AddCharMap ('\u073C', 0x1, 1);
1740 fillIndex [0x1] = 0x9F;
1741 for (int i = 0x0730; i <= 0x07B0; i++)
1742 if (!IsIgnorable (i) && !map [i].Defined)
1743 AddCharMap ((char) i, 0x1, 1);
1745 fillIndex [0x1] = 0x0C;
1746 for (int i = 0x0EC8; i <= 0x0ECD; i++)
1747 if (!IsIgnorable (i))
1748 AddCharMap ((char) i, 0x1, 1);
1750 // LAMESPEC: It should not stop at '\u20E1'. There are
1751 // a few more characters (that however results in
1752 // overflow of level 2 unless we start before 0xDD).
1753 fillIndex [0x1] = 0xDD;
1754 for (int i = 0x20d0; i <= 0x20e1; i++)
1755 AddCharMap ((char) i, 0x1, 1);
1757 // They are not part of Nonspacing marks, but have
1758 // only diacritical weight.
1759 for (int i = 0x3099; i <= 0x309C; i++)
1760 map [i] = new CharMapEntry (1, 1, 1);
1761 map [0xFF9E] = new CharMapEntry (1, 1, 1);
1762 map [0xFF9F] = new CharMapEntry (1, 1, 2);
1763 map [0x309D] = new CharMapEntry (0xFF, 0xFF, 1);
1764 map [0x309E] = new CharMapEntry (0xFF, 0xFF, 1);
1765 for (int i = 0x30FC; i <= 0x30FE; i++)
1766 map [i] = new CharMapEntry (0xFF, 0xFF, 1);
1771 #region Whitespaces // 07 03 -
1772 fillIndex [0x7] = 0x2;
1773 AddCharMap (' ', 0x7, 2);
1774 AddCharMap ('\u00A0', 0x7, 1);
1775 for (int i = 9; i <= 0xD; i++)
1776 AddCharMap ((char) i, 0x7, 1);
1777 for (int i = 0x2000; i <= 0x200B; i++)
1778 AddCharMap ((char) i, 0x7, 1);
1780 fillIndex [0x7] = 0x17;
1781 AddCharMapGroup ('\u2028', 0x7, 1, 0);
1782 AddCharMapGroup ('\u2029', 0x7, 1, 0);
1784 // Characters which used to represent layout control.
1785 // LAMESPEC: Windows developers seem to have thought
1786 // that those characters are kind of whitespaces,
1787 // while they aren't.
1788 AddCharMap ('\u2422', 0x7, 1, 0); // blank symbol
1789 AddCharMap ('\u2423', 0x7, 1, 0); // open box
1792 // category 09 - continued symbols from 08
1793 fillIndex [0x9] = 2;
1795 for (int cp = 0x2300; cp <= 0x237A; cp++)
1796 AddCharMap ((char) cp, 0x9, 1, 0);
1799 byte [] arrowLv2 = new byte [] {0, 3, 3, 3, 3, 3, 3, 3, 3};
1800 foreach (DictionaryEntry de in arrowValues) {
1801 int idx = (int) de.Value;
1802 int cp = (int) de.Key;
1803 if (map [cp].Defined)
1805 fillIndex [0x9] = (byte) (0xD8 + idx);
1806 AddCharMapGroup ((char) cp, 0x9, 0, arrowLv2 [idx]);
1810 byte [] boxLv2 = new byte [128];
1811 for (int i = 0; i < boxLv2.Length; i++)
1813 foreach (DictionaryEntry de in boxValues) {
1814 int cp = (int) de.Key;
1815 int off = (int) de.Value;
1816 if (map [cp].Defined)
1819 fillIndex [0x9] = (byte) (0xE5 + off);
1820 AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [-off]++);
1823 fillIndex [0x9] = (byte) (0xE5 + off);
1824 AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [off]++);
1827 // Some special characters (slanted)
1828 fillIndex [0x9] = 0xF4;
1829 AddCharMap ('\u2571', 0x9, 3);
1830 AddCharMap ('\u2572', 0x9, 3);
1831 AddCharMap ('\u2573', 0x9, 3);
1833 // FIXME: implement 0A
1835 fillIndex [0xA] = 2;
1836 // byte currency symbols
1837 for (int cp = 0; cp < 0x100; cp++) {
1838 uc = Char.GetUnicodeCategory ((char) cp);
1839 if (!IsIgnorable (cp) &&
1840 uc == UnicodeCategory.CurrencySymbol &&
1843 AddCharMapGroup ((char) cp, 0xA, 1, 0);
1845 // byte other symbols
1846 for (int cp = 0; cp < 0x100; cp++) {
1848 continue; // SPECIAL: skip FIXME: why?
1849 uc = Char.GetUnicodeCategory ((char) cp);
1850 if (!IsIgnorable (cp) &&
1851 uc == UnicodeCategory.OtherSymbol ||
1852 cp == '\u00B5' || cp == '\u00B7')
1853 AddCharMapGroup ((char) cp, 0xA, 1, 0);
1856 fillIndex [0xA] = 0x0F; // FIXME: it won't be needed
1857 for (int cp = 0x2020; cp <= 0x2031; cp++)
1858 if (Char.IsPunctuation ((char) cp))
1859 AddCharMap ((char) cp, 0xA, 1, 0);
1860 // SPECIAL CASES: why?
1861 AddCharMap ('\u203B', 0xA, 1, 0);
1862 AddCharMap ('\u2040', 0xA, 1, 0);
1863 AddCharMap ('\u2041', 0xA, 1, 0);
1864 AddCharMap ('\u2042', 0xA, 1, 0);
1866 for (int cp = 0x20A0; cp <= 0x20AB; cp++)
1867 AddCharMap ((char) cp, 0xA, 1, 0);
1868 fillIndex [0xA] = 0x2F; // FIXME: it won't be needed
1869 for (int cp = 0x2600; cp <= 0x2613; cp++)
1870 AddCharMap ((char) cp, 0xA, 1, 0);
1872 for (int cp = 0x2620; cp <= 0x2770; cp++)
1873 if (Char.IsSymbol ((char) cp))
1874 AddCharMap ((char) cp, 0xA, 1, 0);
1876 for (int i = 0x2440; i < 0x2460; i++)
1877 AddCharMap ((char) i, 0xA, 1, 0);
1881 #region Numbers // 0C 02 - 0C E1
1882 fillIndex [0xC] = 2;
1884 // 9F8 : Bengali "one less than the denominator"
1885 AddCharMap ('\u09F8', 0xC, 1);
1887 ArrayList numbers = new ArrayList ();
1888 for (int i = 0; i < 65536; i++)
1889 if (!IsIgnorable (i) &&
1890 Char.IsNumber ((char) i) &&
1891 (i < 0x3190 || 0x32C0 < i)) // they are CJK characters
1894 ArrayList numberValues = new ArrayList ();
1895 foreach (int i in numbers)
1896 numberValues.Add (new DictionaryEntry (i, decimalValue [(char) i]));
1897 numberValues.Sort (DecimalDictionaryValueComparer.Instance);
1899 //foreach (DictionaryEntry de in numberValues)
1900 //Console.Error.WriteLine ("****** number {0:X04} : {1} {2}", de.Key, de.Value, decompType [(int) de.Key]);
1902 decimal prevValue = -1;
1903 foreach (DictionaryEntry de in numberValues) {
1904 int cp = (int) de.Key;
1905 decimal currValue = (decimal) de.Value;
1906 bool addnew = false;
1907 if (prevValue < currValue &&
1908 prevValue - (int) prevValue == 0 &&
1912 // Process Hangzhou and Roman numbers
1914 // There are some SPECIAL cases.
1915 if (currValue != 4) // no increment for 4
1919 if (currValue <= 10) {
1920 xcp = (int) prevValue + 0x2170 - 1;
1921 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1922 xcp = (int) prevValue + 0x2160 - 1;
1923 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1924 fillIndex [0xC] += 2;
1925 xcp = (int) prevValue + 0x3021 - 1;
1926 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1929 else if (currValue == 11)
1932 if (prevValue < currValue)
1933 prevValue = currValue;
1934 if (map [cp].Defined)
1936 // Hangzhou and Roman are added later
1938 else if (0x3021 <= cp && cp < 0x302A
1939 || 0x2160 <= cp && cp < 0x216A
1940 || 0x2170 <= cp && cp < 0x217A)
1943 if (cp == 0x215B) // FIXME: why?
1944 fillIndex [0xC] += 2;
1945 else if (cp == 0x3021) // FIXME: why?
1947 AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp]);
1948 if (addnew || cp <= '9') {
1949 int mod = (int) currValue - 1;
1951 if (1 <= currValue && currValue <= 10) {
1953 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1955 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1957 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1959 if (1 <= currValue && currValue <= 20) {
1961 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1963 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1965 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1969 if (cp != 0x09E7 && cp != 0x09EA)
1972 // Add special cases that are not regarded as
1973 // numbers in UnicodeCategory speak.
1976 AddCharMapGroup ('\u01BD', 0xC, 0, 0);
1977 AddCharMapGroup ('\u01BC', 0xC, 1, 0);
1979 else if (cp == '6') // FIXME: why?
1984 fillIndex [0xC] = 0xFF;
1985 AddCharMap ('\u221E', 0xC, 1);
1988 #region Letters and NonSpacing Marks (general)
1990 // ASCII Latin alphabets
1991 for (int i = 0; i < alphabets.Length; i++)
1992 AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]);
1995 // non-ASCII Latin alphabets
1996 // FIXME: there are no such characters that are placed
1997 // *after* "alphabets" array items. This is nothing
1998 // more than a hack that creates dummy weight for
1999 // primary characters.
2000 for (int i = 0x0080; i < 0x0300; i++) {
2001 if (!Char.IsLetter ((char) i))
2003 // Latin letters that have an NFKD decomposition are
2004 // not added as independent primary characters.
2005 if (decompIndex [i] != 0)
2008 // 1.some alphabets have primarily
2009 // equivalent ASCII alphabets.
2010 // 2.some have independent primary weights,
2011 // but inside a-to-z range.
2012 // 3.there are some expanded characters that
2013 // are not part of Unicode Standard NFKD.
2014 // 4. some characters are letter in IsLetter
2015 // but not in sortkeys (maybe unicode version
2016 // difference caused it).
2018 // 1. skipping them does not make sense
2019 // case 0xD0: case 0xF0: case 0x131: case 0x138:
2020 // case 0x184: case 0x185: case 0x186: case 0x189:
2021 // case 0x18D: case 0x18E: case 0x18F: case 0x190:
2022 // case 0x194: case 0x195: case 0x196: case 0x19A:
2023 // case 0x19B: case 0x19C:
2024 // 2. skipping them does not make sense
2025 // case 0x14A: // Ng
2026 // case 0x14B: // ng
2030 case 0xDE: // Icelandic Thorn
2031 case 0xFE: // Icelandic Thorn
2032 case 0xDF: // German ss
2033 case 0xFF: // German ss
2035 case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3:
2036 // not classified yet
2037 // case 0x1A6: case 0x1A7: case 0x1A8: case 0x1A9:
2038 // case 0x1AA: case 0x1B1: case 0x1B7: case 0x1B8:
2039 // case 0x1B9: case 0x1BA: case 0x1BB: case 0x1BF:
2043 AddCharMapGroup ((char) i, 0xE, 1, 0);
2047 fillIndex [0xF] = 02;
2048 for (int i = 0x0380; i < 0x0390; i++)
2049 if (Char.IsLetter ((char) i))
2050 AddLetterMap ((char) i, 0xF, 1);
2051 fillIndex [0xF] = 02;
2052 for (int i = 0x0391; i < 0x03CF; i++)
2053 if (Char.IsLetter ((char) i))
2054 AddLetterMap ((char) i, 0xF, 1);
2055 fillIndex [0xF] = 0x40;
2056 for (int i = 0x03D0; i < 0x0400; i++)
2057 if (Char.IsLetter ((char) i))
2058 AddLetterMap ((char) i, 0xF, 1);
2061 // Cyrillic letters are sorted like Latin letters i.e.
2062 // containing culture-specific letters between the
2063 // standard Cyrillic sequence.
2065 // We can't use UCA here; it has different sorting.
2066 char [] orderedCyrillic = new char [] {
2067 '\u0430', '\u0431', '\u0432', '\u0433', '\u0434',
2068 '\u0452', // DJE for Serbocroatian
2070 '\u0454', // IE for Ukrainian
2074 '\u0456', // Byelorussian-Ukrainian I
2084 '\u043F', '\u0440', '\u0441', '\u0442',
2085 '\u045B', // TSHE for Serbocroatian
2087 '\u045E', // Short U for Byelorussian
2088 '\u04B1', // Straight U w/ stroke (diacritical!)
2089 '\u0444', '\u0445', '\u0446', '\u0447',
2091 '\u0448', '\u0449', '\u044A', '\u044B', '\u044C',
2092 '\u044D', '\u044E', '\u044F'};
2094 // For some characters here is a map to basic cyrillic
2095 // letters. See UnicodeData.txt character names for
2096 // the sources. Here I simply declare an equiv. array.
2097 // The content characters are map from U+490(,491),
2098 // skipping small letters.
2099 char [] cymap_src = new char [] {
2100 '\u0433', '\u0433', '\u0433', '\u0436',
2101 '\u0437', '\u043A', '\u043A', '\u043A',
2102 '\u043A', '\u043D', '\u043D', '\u043F',
2103 '\u0445', '\u0441', '\u0442', '\u0443',
2104 '\u0443', '\u0445', '\u0446', '\u0447',
2105 '\u0447', '\u0432', '\u0435', '\u0435',
2106 '\u0406', '\u0436', '\u043A', '\u043D',
2107 '\u0447', '\u0435'};
2109 fillIndex [0x10] = 0x8D;
2110 for (int i = 0x0460; i < 0x0481; i++) {
2111 if (Char.IsLetter ((char) i)) {
2113 // U+476/477 have the same
2114 // primary weight as U+474/475.
2115 fillIndex [0x10] -= 3;
2116 AddLetterMap ((char) i, 0x10, 3);
2120 fillIndex [0x10] = 0x6;
2121 for (int i = 0; i < orderedCyrillic.Length; i++) {
2122 char c = Char.ToUpper (orderedCyrillic [i], CultureInfo.InvariantCulture);
2123 if (!IsIgnorable ((int) c) &&
2124 Char.IsLetter (c) &&
2126 AddLetterMap (c, 0x10, 0);
2127 fillIndex [0x10] += 3;
2131 for (int i = 0; i < cymap_src.Length; i++) {
2132 char c = cymap_src [i];
2133 fillIndex [0x10] = map [c].Level1;
2134 AddLetterMap ((char) (0x0490 + i * 2),
2139 fillIndex [0x11] = 0x3;
2140 for (int i = 0x0531; i < 0x0586; i++)
2141 if (Char.IsLetter ((char) i))
2142 AddLetterMap ((char) i, 0x11, 1);
2146 fillIndex [0x12] = 0x2;
2147 for (int i = 0x05D0; i < 0x05FF; i++)
2148 if (Char.IsLetter ((char) i))
2149 AddLetterMap ((char) i, 0x12, 1);
2151 fillIndex [0x1] = 0x3;
2152 for (int i = 0x0591; i <= 0x05C2; i++) {
2153 if (i == 0x05A3 || i == 0x05BB)
2156 AddCharMap ((char) i, 0x1, 1);
2160 fillIndex [0x1] = 0x8E;
2161 fillIndex [0x13] = 0x3;
2162 for (int i = 0x0621; i <= 0x064A; i++) {
2164 if (Char.GetUnicodeCategory ((char) i)
2165 != UnicodeCategory.OtherLetter) {
2166 // FIXME: arabic nonspacing marks are
2167 // in different order.
2168 AddCharMap ((char) i, 0x1, 1);
2171 // map [i] = new CharMapEntry (0x13,
2172 // (byte) arabicLetterPrimaryValues [i], 1);
2174 (byte) arabicLetterPrimaryValues [i];
2175 AddLetterMap ((char) i, 0x13, 0);
2177 fillIndex [0x13] = 0x84;
2178 for (int i = 0x0674; i < 0x06D6; i++)
2179 if (Char.IsLetter ((char) i))
2180 AddLetterMap ((char) i, 0x13, 1);
2183 // FIXME: it does seem straight codepoint mapping.
2184 fillIndex [0x14] = 04;
2185 for (int i = 0x0901; i < 0x0905; i++)
2186 if (!IsIgnorable (i))
2187 AddLetterMap ((char) i, 0x14, 2);
2188 fillIndex [0x14] = 0xB;
2189 for (int i = 0x0905; i < 0x093A; i++) {
2191 AddCharMap ('\u0929', 0x14, 0, 8);
2193 AddCharMap ('\u0931', 0x14, 0, 8);
2195 AddCharMap ('\u0934', 0x14, 0, 8);
2196 if (Char.IsLetter ((char) i))
2197 AddLetterMap ((char) i, 0x14, 4);
2199 AddCharMap ('\u0960', 0x14, 4);
2201 AddCharMap ('\u0961', 0x14, 4);
2203 fillIndex [0x14] = 0xDA;
2204 for (int i = 0x093E; i < 0x0945; i++)
2205 if (!IsIgnorable (i))
2206 AddLetterMap ((char) i, 0x14, 2);
2207 fillIndex [0x14] = 0xEC;
2208 for (int i = 0x0945; i < 0x094F; i++)
2209 if (!IsIgnorable (i))
2210 AddLetterMap ((char) i, 0x14, 2);
2214 fillIndex [0x15] = 02;
2215 for (int i = 0x0980; i < 0x9FF; i++) {
2216 if (IsIgnorable (i))
2219 fillIndex [0x15] = 0x3B;
2220 switch (Char.GetUnicodeCategory ((char) i)) {
2221 case UnicodeCategory.NonSpacingMark:
2222 case UnicodeCategory.DecimalDigitNumber:
2223 case UnicodeCategory.OtherNumber:
2226 AddLetterMap ((char) i, 0x15, 1);
2229 fillIndex [0x1] = 0x3;
2230 for (int i = 0x0981; i < 0x0A00; i++)
2231 if (Char.GetUnicodeCategory ((char) i) ==
2232 UnicodeCategory.NonSpacingMark)
2233 AddCharMap ((char) i, 0x1, 1);
2235 // Gurmukhi. orderedGurmukhi is from UCA
2236 // FIXME: it does not look equivalent to UCA.
2237 fillIndex [0x16] = 04;
2238 fillIndex [0x1] = 3;
2239 for (int i = 0; i < orderedGurmukhi.Length; i++) {
2240 char c = orderedGurmukhi [i];
2241 if (IsIgnorable ((int) c))
2243 if (IsIgnorableNonSpacing (c)) {
2244 AddLetterMap (c, 0x1, 1);
2247 if (c == '\u0A3C' || c == '\u0A4D' ||
2248 '\u0A66' <= c && c <= '\u0A71')
2250 // SPECIAL CASE: U+A38 = U+A36 at primary level (why?)
2252 if (c == '\u0A36' || c == '\u0A16' || c == '\u0A17' || c == '\u0A5B' || c == '\u0A5E')
2254 AddLetterMap (c, 0x16, shift);
2257 // Gujarati. orderedGujarati is from UCA
2258 fillIndex [0x17] = 0x4;
2260 map [0x0A4D] = new CharMapEntry (1, 0, 0x3);
2261 map [0x0ABD] = new CharMapEntry (1, 0, 0x3);
2262 map [0x0A3C] = new CharMapEntry (1, 0, 0x4);
2263 map [0x0A71] = new CharMapEntry (1, 0, 0x6);
2264 map [0x0ABC] = new CharMapEntry (1, 0, 0xB);
2265 map [0x0A70] = new CharMapEntry (1, 0, 0xE);
2266 // letters go first.
2267 for (int i = 0; i < orderedGujarati.Length; i++) {
2269 char c = orderedGujarati [i];
2270 if (Char.IsLetter (c)) {
2272 if (c == '\u0AB3' || c == '\u0A32')
2274 if (c == '\u0A33') {
2275 AddCharMap ('\u0A32', 0x17, 0);
2276 AddCharMap ('\u0A33', 0x17, 4, 4);
2280 AddCharMap ('\u0AE0', 0x17, 0, 5);
2281 AddCharMap (c, 0x17, 4);
2284 AddCharMap ('\u0AB3', 0x17, 6);
2288 byte gujaratiShift = 4;
2289 fillIndex [0x17] = 0xC0;
2290 for (int i = 0; i < orderedGujarati.Length; i++) {
2291 char c = orderedGujarati [i];
2292 if (fillIndex [0x17] == 0xCC)
2294 if (!Char.IsLetter (c)) {
2297 AddCharMap ('\u0A81', 0x17, 2);
2300 AddLetterMap (c, 0x17, gujaratiShift);
2305 fillIndex [0x1] = 03;
2306 fillIndex [0x18] = 02;
2307 for (int i = 0x0B00; i < 0x0B7F; i++) {
2308 switch (Char.GetUnicodeCategory ((char) i)) {
2309 case UnicodeCategory.NonSpacingMark:
2310 case UnicodeCategory.DecimalDigitNumber:
2311 AddLetterMap ((char) i, 0x1, 1);
2314 AddLetterMap ((char) i, 0x18, 1);
2318 fillIndex [0x19] = 2;
2319 AddCharMap ('\u0BD7', 0x19, 0);
2320 fillIndex [0x19] = 0xA;
2322 for (int i = 0x0B82; i <= 0x0B94; i++)
2323 if (!IsIgnorable ((char) i))
2324 AddCharMap ((char) i, 0x19, 2);
2326 fillIndex [0x19] = 0x28;
2327 // The array for Tamil consonants is a constant.
2328 // Windows have almost similar sequence to TAM from
2329 // tamilnet but a bit different in Grantha.
2330 for (int i = 0; i < orderedTamilConsonants.Length; i++)
2331 AddLetterMap (orderedTamilConsonants [i], 0x19, 4);
2333 fillIndex [0x19] = 0x82;
2334 for (int i = 0x0BBE; i < 0x0BCD; i++)
2335 if (Char.GetUnicodeCategory ((char) i) ==
2336 UnicodeCategory.SpacingCombiningMark
2338 AddLetterMap ((char) i, 0x19, 2);
2341 fillIndex [0x1A] = 0x4;
2342 for (int i = 0x0C00; i < 0x0C62; i++) {
2343 if (i == 0x0C55 || i == 0x0C56)
2345 AddCharMap ((char) i, 0x1A, 3);
2346 char supp = (i == 0x0C0B) ? '\u0C60':
2347 i == 0x0C0C ? '\u0C61' : char.MinValue;
2348 if (supp == char.MinValue)
2350 AddCharMap (supp, 0x1A, 3);
2354 fillIndex [0x1B] = 4;
2355 for (int i = 0x0C80; i < 0x0CE5; i++) {
2356 if (i == 0x0CD5 || i == 0x0CD6)
2358 if (i == 0x0CB1 || i == 0x0CB3 || i == 0x0CDE)
2359 continue; // shift after 0xCB9
2360 AddCharMap ((char) i, 0x1B, 3);
2362 // SPECIAL CASES: but why?
2363 AddCharMap ('\u0CB1', 0x1B, 3); // RRA
2364 AddCharMap ('\u0CB3', 0x1B, 3); // LLA
2365 AddCharMap ('\u0CDE', 0x1B, 3); // FA
2368 AddCharMap ('\u0CE1', 0x1B, 3); // vocalic LL
2372 fillIndex [0x1C] = 2;
2373 for (int i = 0x0D02; i < 0x0D61; i++)
2374 // FIXME: I avoided MSCompatUnicodeTable usage
2375 // here (it results in recursion). So check if
2376 // using NonSpacingMark makes sense or not.
2377 if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark)
2378 // if (!MSCompatUnicodeTable.IsIgnorable ((char) i))
2379 AddCharMap ((char) i, 0x1C, 1);
2381 // Thai ... note that it breaks 0x1E wall after E2B!
2382 // Also, all Thai characters have level 2 value 3.
2383 fillIndex [0x1E] = 2;
2384 for (int i = 0xE40; i <= 0xE44; i++)
2385 AddCharMap ((char) i, 0x1E, 1, 3);
2386 for (int i = 0xE01; i < 0xE2B; i++)
2387 AddCharMap ((char) i, 0x1E, 6, 3);
2388 fillIndex [0x1F] = 5;
2389 for (int i = 0xE2B; i < 0xE30; i++)
2390 AddCharMap ((char) i, 0x1F, 6, 3);
2391 fillIndex [0x1F] = 0x1E;
2392 for (int i = 0xE30; i < 0xE3B; i++)
2393 AddCharMap ((char) i, 0x1F, 1, 3);
2394 // some Thai characters remain.
2395 char [] specialThai = new char [] {'\u0E45', '\u0E46',
2396 '\u0E4E', '\u0E4F', '\u0E5A', '\u0E5B'};
2397 foreach (char c in specialThai)
2398 AddCharMap (c, 0x1F, 1);
2401 fillIndex [0x1F] = 2;
2402 for (int i = 0xE80; i < 0xEDF; i++)
2403 if (Char.IsLetter ((char) i))
2404 AddCharMap ((char) i, 0x1F, 1);
2406 // Georgian. orderedGeorgian is from UCA DUCET.
2407 fillIndex [0x21] = 5;
2408 for (int i = 0; i < orderedGeorgian.Length; i++) {
2409 char c = orderedGeorgian [i];
2410 if (map [(int) c].Defined)
2412 AddCharMap (c, 0x21, 0);
2414 AddCharMap ((char) (c - 0x30), 0x21, 0);
2415 fillIndex [0x21] += 5;
2419 fillIndex [0x22] = 2;
2420 int kanaOffset = 0x3041;
2421 byte [] kanaLines = new byte [] {2, 2, 2, 2, 1, 3, 1, 2, 1};
2423 for (int gyo = 0; gyo < 9; gyo++) {
2424 for (int dan = 0; dan < 5; dan++) {
2425 if (gyo == 7 && dan % 2 == 1) {
2428 kanaOffset -= 2; // There is no space for yi and ye.
2431 int cp = kanaOffset + dan * kanaLines [gyo];
2432 // small lines (a-gyo, ya-gyo)
2433 if (gyo == 0 || gyo == 7) {
2434 AddKanaMap (cp, 1); // small
2435 AddKanaMap (cp + 1, 1);
2438 AddKanaMap (cp, kanaLines [gyo]);
2442 // add small 'ka' (before normal one)
2443 AddKanaMap (0x30F5, 1);
2447 // add small 'ke' (before normal one)
2448 AddKanaMap (0x30F6, 1);
2452 // add small 'Tsu' (before normal one)
2453 AddKanaMap (0x3063, 1);
2457 fillIndex [0x22] += 3;
2458 kanaOffset += 5 * kanaLines [gyo];
2461 // Wa-gyo is almost special, so I just manually add.
2462 AddLetterMap ((char) 0x308E, 0x22, 0);
2463 AddLetterMap ((char) (0x308E + 0x60), 0x22, 0);
2464 AddLetterMap ((char) 0x308F, 0x22, 0);
2465 AddLetterMap ((char) (0x308F + 0x60), 0x22, 0);
2467 AddLetterMap ((char) 0x3090, 0x22, 0);
2468 AddLetterMap ((char) (0x3090 + 0x60), 0x22, 0);
2469 fillIndex [0x22] += 2;
2470 // no "Wu" in Japanese.
2471 AddLetterMap ((char) 0x3091, 0x22, 0);
2472 AddLetterMap ((char) (0x3091 + 0x60), 0x22, 0);
2474 AddLetterMap ((char) 0x3092, 0x22, 0);
2475 AddLetterMap ((char) (0x3092 + 0x60), 0x22, 0);
2477 fillIndex [0x22] = 0x80;
2478 AddLetterMap ((char) 0x3093, 0x22, 0);
2479 AddLetterMap ((char) (0x3093 + 0x60), 0x22, 0);
2481 map [0x3094] = new CharMapEntry (map [0x30A6].Category,
2482 map [0x30A6].Level1, 3);// voiced hiragana U
2483 map [0x30F4] = new CharMapEntry (map [0x30A6].Category,
2484 map [0x30A6].Level1, 3);// voiced katakana U
2486 map [0x30F5] = new CharMapEntry (map [0x30AB].Category,
2487 map [0x30AB].Level1, 0);// small katakana Ka
2488 map [0x30F6] = new CharMapEntry (map [0x30B1].Category,
2489 map [0x30B1].Level1, 0);// small katakana Ke
2491 for (int i = 0x30F7; i < 0x30FB; i++)
2492 map [i] = new CharMapEntry (map [i - 8].Category,
2496 // JIS Japanese square chars.
2497 fillIndex [0x22] = 0x97;
2498 jisJapanese.Sort (JISComparer.Instance);
2499 foreach (JISCharacter j in jisJapanese)
2500 if (0x3300 <= j.CP && j.CP <= 0x3357)
2501 AddCharMap ((char) j.CP, 0x22, 1);
2502 // non-JIS Japanese square chars.
2503 nonJisJapanese.Sort (NonJISComparer.Instance);
2504 foreach (NonJISCharacter j in nonJisJapanese)
2505 AddCharMap ((char) j.CP, 0x22, 1);
2508 fillIndex [0x23] = 0x02;
2509 for (int i = 0x3105; i <= 0x312C; i++)
2510 AddCharMap ((char) i, 0x23, 1);
2512 // Estrangela: ancient Syriac
2513 fillIndex [0x24] = 0x0B;
2514 // FIXME: is 0x71E really alternative form?
2515 ArrayList syriacAlternatives = new ArrayList (
2516 new int [] {0x714, 0x716, 0x71C, 0x71E, 0x724, 0x727});
2517 for (int i = 0x0710; i <= 0x072C; i++) {
2518 if (i == 0x0711) // NonSpacingMark
2520 if (syriacAlternatives.Contains (i))
2522 AddCharMap ((char) i, 0x24, 4);
2527 foreach (int cp in syriacAlternatives)
2528 map [cp] = new CharMapEntry (0x24,
2529 (byte) (map [cp - 1].Level1 + 2),
2531 // FIXME: Syriac NonSpacingMark should go here.
2534 // FIXME: it turned out that it does not look like UCA
2535 fillIndex [0x24] = 0x6E;
2536 for (int i = 0; i < orderedThaana.Length; i++) {
2537 char c = orderedThaana [i];
2538 if (IsIgnorableNonSpacing ((int) c))
2540 AddCharMap (c, 0x24, 2);
2541 if (c == '\u0782') // SPECIAL CASE: why?
2542 fillIndex [0x24] += 2;
2546 // FIXME: Add more culture-specific letters (that are
2547 // not supported in Windows collation) here.
2549 // Surrogate ... they are computed.
2554 // Unlike UCA Windows Hangul sequence mixes Jongseong
2555 // with Choseong sequence as well as Jungseong,
2556 // adjusted to have the same primary weight for the
2557 // same base character. So it is impossible to compute
2560 // Here I introduce an ordered sequence of mixed
2561 // 'commands' and 'characters' that is similar to
2563 // - ',' increases primary weight.
2564 // - [A B] means a range, increasing index
2565 // - {A B} means a range, without increasing index
2566 // - '=' is no operation (it means the characters
2567 // of both sides have the same weight).
2568 // - '>' inserts a Hangul Syllable block that
2569 // contains 0x251 characters.
2570 // - '<' decreases the index
2571 // - '0'-'9' means skip count
2572 // - whitespaces are ignored
2575 string hangulSequence =
2576 + "\u1100=\u11A8 > \u1101=\u11A9 >"
2577 + "\u11C3, \u11AA, \u11C4, \u1102=\u11AB >"
2578 + "<{\u1113 \u1116}, \u3165,"
2579 + "\u11C5, \u11C6=\u3166,, \u11C7, \u11C8,"
2580 + "\u11AC, \u11C9, \u11AD, \u1103=\u11AE >"
2581 + "<\u1117, \u11CA, \u1104, \u11CB > \u1105=\u11AF >"
2582 + "<{\u1118 \u111B}, \u11B0, [\u11CC \u11D0], \u11B1,"
2583 + "[\u11D1 \u11D2], \u11B2,"
2584 + "[\u11D3 \u11D5], \u11B3,"
2585 + "[\u11D6 \u11D7], \u11B4, \u11B5,"
2586 + "\u11B6=\u11D8, \u3140,, \u11D9, \u1106=\u11B7 >"
2587 + "<{\u111C \u111D}, [\u11DA \u11E2], \u1107=\u11B8 >"
2588 + "<{\u111E \u1120}, \u3172,, \u3173, \u11E3, \u1108 >"
2589 + "<{\u1121 \u112C}, \u3144 \u11B9, \u3174, \u3175,,,, "
2590 + "\u3176,, \u3177, [\u11E4 \u11E6] \u3178,"
2591 + "\u3179, \u1109=\u11BA,,, \u3214=\u3274 <>"
2592 + "<{\u112D \u1133}, \u11E7 \u317A, \u317B, \u317C "
2593 + "[\u11E8 \u11E9],, \u11EA \u317D,, \u110A=\u11BB,,, >"
2594 + "<{\u1134 \u1140}, \u317E,,,,,,, \u11EB,"
2595 + "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >"
2596 + "<{\u1141 \u114C}, \u11EE, \u11EC, \u11ED,,,,, "
2597 + "\u11F1,, \u11F2,,,"
2598 + "\u11EF,,, \u11F0, \u110C=\u11BD,, >"
2599 + "<\u114D, \u110D,, >"
2600 + "<{\u114E \u1151},, \u110E=\u11BE,, >"
2601 + "<{\u1152 \u1155},,, \u110F=\u11BF >"
2602 + "\u1110=\u11C0 > \u1111=\u11C1 >"
2603 + "<\u1156=\u1157, \u11F3, \u11F4, \u1112=\u11C2 >"
2604 + "<\u1158=\u1159=\u115F, \u3185, \u11F9,"
2608 byte hangulCat = 0x52;
2609 fillIndex [hangulCat] = 0x2;
2611 int syllableBlock = 0;
2612 for (int n = 0; n < hangulSequence.Length; n++) {
2613 char c = hangulSequence [n];
2615 if (Char.IsWhiteSpace (c))
2621 IncrementSequentialIndex (ref hangulCat);
2624 if (fillIndex [hangulCat] == 2)
2625 throw new Exception ("FIXME: handle it correctly (yes it is hacky, it is really unfortunate).");
2626 fillIndex [hangulCat]--;
2629 IncrementSequentialIndex (ref hangulCat);
2630 for (int l = 0; l < 0x15; l++)
2631 for (int v = 0; v < 0x1C; v++) {
2633 (char) (0xAC00 + syllableBlock * 0x1C * 0x15 + l * 0x1C + v), hangulCat, 0);
2634 IncrementSequentialIndex (ref hangulCat);
2639 start = hangulSequence [n + 1];
2640 end = hangulSequence [n + 3];
2641 for (int i = start; i <= end; i++) {
2642 AddCharMap ((char) i, hangulCat, 0);
2644 IncrementSequentialIndex (ref hangulCat);
2646 n += 4; // consumes 5 characters for this operation
2649 start = hangulSequence [n + 1];
2650 end = hangulSequence [n + 3];
2651 for (int i = start; i <= end; i++)
2652 AddCharMap ((char) i, hangulCat, 0);
2653 n += 4; // consumes 5 characters for this operation
2656 AddCharMap (c, hangulCat, 0);
2662 for (int i = 0x3200; i < 0x3300; i++) {
2663 if (IsIgnorable (i) || map [i].Defined)
2667 if (decompLength [i] == 4 &&
2668 decompValues [decompIndex [i]] == '(')
2669 ch = decompIndex [i] + 1;
2671 else if (decompLength [i] == 2 &&
2672 decompValues [decompIndex [i] + 1] == '\u1161')
2673 ch = decompIndex [i];
2674 else if (decompLength [i] == 1)
2675 ch = decompIndex [i];
2678 ch = decompValues [ch];
2679 if (ch < 0x1100 || 0x1200 < ch &&
2680 ch < 0xAC00 || 0xD800 < ch)
2684 int offset = i < 0x3260 ? 1 : 0;
2685 if (0x326E <= i && i <= 0x3273)
2688 map [i] = new CharMapEntry (map [ch].Category,
2689 (byte) (map [ch].Level1 + offset),
2691 // Console.Error.WriteLine ("Jamo {0:X04} -> {1:X04}", i, decompValues [decompIndex [i] + 1]);
2697 // Letterlike characters and CJK compatibility square
2698 sortableCharNames.Sort (StringDictionaryValueComparer.Instance);
2699 int [] counts = new int ['Z' - 'A' + 1];
2700 char [] namedChars = new char [sortableCharNames.Count];
2702 foreach (DictionaryEntry de in sortableCharNames) {
2703 counts [((string) de.Value) [0] - 'A']++;
2704 namedChars [nCharNames++] = (char) ((int) de.Key);
2706 nCharNames = 0; // reset
2707 for (int a = 0; a < counts.Length; a++) {
2708 fillIndex [0xE] = (byte) (alphaWeights [a + 1] - counts [a]);
2709 for (int i = 0; i < counts [a]; i++)
2710 //Console.Error.WriteLine ("---- {0:X04} : {1:x02} / {2} {3}", (int) namedChars [nCharNames], fillIndex [0xE], ((DictionaryEntry) sortableCharNames [nCharNames]).Value, Char.GetUnicodeCategory (namedChars [nCharNames]));
2711 AddCharMap (namedChars [nCharNames++], 0xE, 1);
2714 // CJK unified ideograph.
2716 fillIndex [cjkCat] = 0x2;
2717 for (int cp = 0x4E00; cp <= 0x9FBB; cp++)
2718 if (!IsIgnorable (cp))
2719 AddCharMapGroupCJK ((char) cp, ref cjkCat);
2720 // CJK Extensions goes here.
2721 // LAMESPEC: With this Windows style CJK layout, it is
2722 // impossible to add more CJK ideograph i.e. 0x9FA6-
2723 // 0x9FBB can never be added w/o breaking compat.
2724 for (int cp = 0xF900; cp <= 0xFA2D; cp++)
2725 if (!IsIgnorable (cp))
2726 AddCharMapGroupCJK ((char) cp, ref cjkCat);
2728 // PrivateUse ... computed.
2729 // remaining Surrogate ... computed.
2731 #region Special "biggest" area (FF FF)
2732 fillIndex [0xFF] = 0xFF;
2733 char [] specialBiggest = new char [] {
2734 '\u3005', '\u3031', '\u3032', '\u309D',
2735 '\u309E', '\u30FC', '\u30FD', '\u30FE',
2736 '\uFE7C', '\uFE7D', '\uFF70'};
2737 foreach (char c in specialBiggest)
2738 AddCharMap (c, 0xFF, 0);
2741 #region 07 - ASCII non-alphanumeric + 3001, 3002 // 07
2742 // non-alphanumeric ASCII except for: + - < = > '
2743 for (int i = 0x21; i < 0x7F; i++) {
2744 if (Char.IsLetterOrDigit ((char) i)
2745 || "+-<=>'".IndexOf ((char) i) >= 0)
2746 continue; // they are not added here.
2747 AddCharMapGroup2 ((char) i, 0x7, 1, 0);
2748 // Insert 3001 after ',' and 3002 after '.'
2750 AddCharMapGroup2 ('\u3001', 0x7, 1, 0);
2752 AddCharMapGroup2 ('\u3002', 0x7, 1, 0);
2754 AddCharMap ('\uFE30', 0x7, 1, 0);
2758 #region 07 - Punctuations and something else
2759 for (int i = 0xA0; i < char.MaxValue; i++) {
2760 if (IsIgnorable (i))
2763 // FIXME: actually those reset should not be
2764 // done but here I put for easy goal.
2766 fillIndex [0x7] = 0xE2;
2768 fillIndex [0x7] = 0x77;
2780 switch (Char.GetUnicodeCategory ((char) i)) {
2781 case UnicodeCategory.OtherPunctuation:
2782 case UnicodeCategory.ClosePunctuation:
2783 case UnicodeCategory.OpenPunctuation:
2784 case UnicodeCategory.InitialQuotePunctuation:
2785 case UnicodeCategory.FinalQuotePunctuation:
2786 case UnicodeCategory.ModifierSymbol:
2787 // SPECIAL CASES: // 0xA
2788 if (0x2020 <= i && i <= 0x2031)
2790 AddCharMapGroup ((char) i, 0x7, 1, 0);
2793 if (i == 0xA6) // SPECIAL CASE. FIXME: why?
2794 goto case UnicodeCategory.OtherPunctuation;
2799 // FIXME: it should not need to reset level 1, but
2800 // it's for easy goal.
2801 fillIndex [0x7] = 0xB6;
2802 for (int i = 0x2400; i <= 0x2421; i++)
2803 AddCharMap ((char) i, 0x7, 1, 0);
2806 // FIXME: for 07 xx we need more love.
2808 // Characters w/ diacritical marks (NFKD)
2809 for (int i = 0; i <= char.MaxValue; i++) {
2810 if (map [i].Defined || IsIgnorable (i))
2812 if (decompIndex [i] == 0)
2815 int start = decompIndex [i];
2816 int primaryChar = decompValues [start];
2819 int length = decompLength [i];
2820 // special processing for parenthesized ones.
2822 decompValues [start] == '(' &&
2823 decompValues [start + 2] == ')') {
2824 primaryChar = decompValues [start + 1];
2828 if (map [primaryChar].Level1 == 0)
2831 for (int l = 1; l < length; l++) {
2832 int c = decompValues [start + l];
2833 if (map [c].Level1 != 0)
2835 secondary += diacritical [c];
2839 map [i] = new CharMapEntry (
2840 map [primaryChar].Category,
2841 map [primaryChar].Level1,
2846 // category 08 - symbols
2847 fillIndex [0x8] = 2;
2848 // Here Windows mapping is not straightforward. It is
2849 // not based on computation but seems manual sorting.
2850 AddCharMapGroup ('+', 0x8, 1, 0); // plus
2851 AddCharMapGroup ('\u2212', 0x8, 1, 0); // minus
2852 AddCharMapGroup ('\u229D', 0x8, 1, 0); // minus
2853 AddCharMapGroup ('\u2297', 0x8, 1, 0); // mul
2854 AddCharMapGroup ('\u2044', 0x8, 1, 0); // div
2855 AddCharMapGroup ('\u2215', 0x8, 1, 0); // div
2856 AddCharMapGroup ('\u2217', 0x8, 1, 0); // mul
2857 AddCharMapGroup ('\u2218', 0x8, 1, 0); // ring
2858 AddCharMapGroup ('\u2219', 0x8, 1, 0); // bullet
2859 AddCharMapGroup ('\u2213', 0x8, 1, 0); // minus-or-plus
2860 AddCharMapGroup ('\u003C', 0x8, 1, 0); // <
2861 AddCharMapGroup ('\u227A', 0x8, 1, 0); // precedes relation
2862 AddCharMapGroup ('\u22B0', 0x8, 1, 0); // precedes under relation
2864 for (int cp = 0; cp < 0x2300; cp++) {
2865 if (cp == 0xAC) // SPECIAL CASE: skip
2868 cp = 0x2200; // skip to 2200
2869 fillIndex [0x8] = 0x21;
2872 fillIndex [0x8] = 0x3;
2874 fillIndex [0x8] = 0xB9;
2875 if (!map [cp].Defined &&
2876 // Char.GetUnicodeCategory ((char) cp) ==
2877 // UnicodeCategory.MathSymbol)
2878 Char.IsSymbol ((char) cp))
2879 AddCharMapGroup ((char) cp, 0x8, 1, diacritical [cp]);
2880 // SPECIAL CASES: no idea why Windows sorts as such
2883 AddCharMap ('\u227B', 0x8, 1, 0);
2884 AddCharMap ('\u22B1', 0x8, 1, 0);
2887 AddCharMapGroup ('\u00AB', 0x8, 1, 0);
2888 AddCharMapGroup ('\u226A', 0x8, 1, 0);
2889 AddCharMapGroup ('\u00BB', 0x8, 1, 0);
2890 AddCharMapGroup ('\u226B', 0x8, 1, 0);
2893 AddCharMap ('\u01C0', 0x8, 1, 0);
2894 AddCharMap ('\u01C1', 0x8, 1, 0);
2895 AddCharMap ('\u01C2', 0x8, 1, 0);
2900 #region Level2 adjustment
2902 diacritical [0x624] = 0x5;
2903 diacritical [0x626] = 0x7;
2904 diacritical [0x622] = 0x9;
2905 diacritical [0x623] = 0xA;
2906 diacritical [0x625] = 0xB;
2907 diacritical [0x649] = 0x5; // 'alif maqs.uurah
2908 diacritical [0x64A] = 0x7; // Yaa'
2910 for (int i = 0; i < char.MaxValue; i++) {
2912 byte cat = map [i].Category;
2914 case 0xE: // Latin diacritics
2915 case 0x22: // Japanese: circled characters
2916 mod = diacritical [i];
2918 case 0x13: // Arabic
2919 if (diacritical [i] == 0 && i >= 0xFE8D)
2920 mod = 0x8; // default for arabic
2923 if (0x52 <= cat && cat <= 0x7F) // Hangul
2924 mod = diacritical [i];
2926 map [i] = new CharMapEntry (
2927 cat, map [i].Level1, mod);
2931 // FIXME: this is hack but those NonSpacingMark
2932 // characters and still undefined are likely to
2934 for (int i = 0; i < char.MaxValue; i++)
2935 if (!map [i].Defined &&
2937 Char.GetUnicodeCategory ((char) i) ==
2938 UnicodeCategory.NonSpacingMark)
2939 AddCharMap ((char) i, 1, 1);
2941 // FIXME: this is hack but those Symbol characters
2942 // are likely to fall into 0xA category.
2943 for (int i = 0; i < char.MaxValue; i++)
2944 if (!map [i].Defined &&
2946 Char.IsSymbol ((char) i))
2947 AddCharMap ((char) i, 0xA, 1);
// Advances fillIndex for the category referenced by hangulCat by one.
// A value of 0 after the increment is treated as overflow of the byte-sized
// index; fillIndex [hangulCat] is then set to 0x2.
// NOTE(review): hangulCat is passed by ref, so the category itself is
// presumably advanced on overflow; that statement is not visible in this
// extract - confirm.
2950 private void IncrementSequentialIndex (ref byte hangulCat)
2952 fillIndex [hangulCat]++;
2953 if (fillIndex [hangulCat] == 0) { // wrapped around to 0 (byte overflow)
2955 fillIndex [hangulCat] = 0x2;
2959 // Reset fillIndex to fixed value and call AddLetterMap().
// c           - base letter to map.
// category    - category whose fill index is overwritten with alphaWeight.
// alphaWeight - fixed weight assigned to fillIndex [category].
// After mapping c, every code point listed for c in latinMap is mapped
// through AddLetterMap as well; all calls pass updateCount 0.
2960 private void AddAlphaMap (char c, byte category, byte alphaWeight)
2962 fillIndex [category] = alphaWeight;
2963 AddLetterMap (c, category, 0);
// NOTE(review): "as" yields null when latinMap has no ArrayList for c;
// the handling between this line and the foreach is not visible here.
2965 ArrayList al = latinMap [c] as ArrayList;
2969 foreach (int cp in al)
2970 AddLetterMap ((char) cp, category, 0);
// Maps "voices" consecutive code points starting at i into category 0x22,
// each together with the code point 0x60 above it.
// i      - first code point of the run.
// voices - number of consecutive code points to add.
2973 private void AddKanaMap (int i, byte voices)
2975 for (byte b = 0; b < voices; b++) {
2976 char c = (char) (i + b);
// level2 argument: 0 for the first code point, b + 2 for the following ones.
2977 byte arg = (byte) (b > 0 ? b + 2 : 0);
2979 AddLetterMapCore (c, 0x22, 0, arg);
// Same mapping for the character 0x60 above c (callers pair hiragana
// U+3094 with katakana U+30F4 the same way).
2981 AddLetterMapCore ((char) (c + 0x60), 0x22, 0, arg);
// Convenience overload: forwards to AddLetterMapCore with level2 = 0.
2985 private void AddLetterMap (char c, byte category, byte updateCount)
2987 AddLetterMapCore (c, category, updateCount, 0);
// Maps a letter and its related forms into "category" with the given level2.
// Visible order of operations: the small form of c (ToSmallForm), the
// invariant-culture lowercase form (recursively, when it differs from c and
// is not yet defined), then c itself; finally fillIndex [category] is
// advanced by updateCount.
// NOTE(review): the declaration of c2 and the conditions guarding several
// of these calls are on lines not visible in this extract.
2990 private void AddLetterMapCore (char c, byte category, byte updateCount, byte level2)
2993 // <small> updates index
2994 c2 = ToSmallForm (c);
2996 AddCharMapGroup (c2, category, updateCount, level2);
2997 c2 = Char.ToLower (c, CultureInfo.InvariantCulture);
// Lowercase counterpart is added with updateCount 0.
2998 if (c2 != c && !map [(int) c2].Defined)
2999 AddLetterMapCore (c2, category, 0, level2);
// NOTE(review): doUpdate presumably gates the fillIndex update below and is
// presumably cleared when c is ignorable or already defined - confirm.
3000 bool doUpdate = true;
3001 if (IsIgnorable ((int) c) || map [(int) c].Defined)
3004 AddCharMapGroup (c, category, 0, level2);
3006 fillIndex [category] += updateCount;
// Convenience overload: forwards to the four-argument AddCharMap with alt = 0
// and returns its result.
3009 private bool AddCharMap (char c, byte category, byte increment)
3011 return AddCharMap (c, category, increment, 0);
// Stores a map entry for c unless c is ignorable or already has a defined entry.
// c         - character to map.
// category  - category of the new entry; also selects the fillIndex slot.
// increment - amount added to fillIndex [category] after the entry is stored.
// alt       - the weight that is not taken from fillIndex (see below).
// Returns false when nothing was stored.
// NOTE(review): the success return is on a line not visible here.
3014 private bool AddCharMap (char c, byte category, byte increment, byte alt)
3016 if (IsIgnorable ((int) c) || map [(int) c].Defined)
3017 return false; // do nothing
// CharMapEntry takes (category, level1, level2). For category 1 the fill
// index is stored as level2 and alt as level1; for every other category the
// fill index is level1 and alt is level2.
3018 map [(int) c] = new CharMapEntry (category,
3019 category == 1 ? alt : fillIndex [category],
3020 category == 1 ? fillIndex [category] : alt);
3021 fillIndex [category] += increment;
// Adds c and its "tail" decomposition relatives to "category".
// Visible order: the character returned by ToSmallFormTail (c), then c
// itself, then a recursive call for the character returned by
// ToFullWidthTail (c). All AddCharMap calls pass level2 0.
// NOTE(review): the conditions guarding the first and last calls are on
// lines not visible in this extract.
3025 private void AddCharMapGroupTail (char c, byte category, byte updateCount)
3027 char c2 = ToSmallFormTail (c);
3029 AddCharMap (c2, category, updateCount, 0);
3031 AddCharMap (c, category, updateCount, 0);
3033 c2 = ToFullWidthTail (c);
3035 AddCharMapGroupTail (c2, category, updateCount);
3039 // Adds characters to table in the order below
3040 // (+ increases weight):
3044 // <full> | <super> | <sub>
3045 // <circle> | <wide> (| <narrow>)
3049 // level2 is fixed (does not increase).
// Decomposition types whose characters are added with the same weight as
// the base character in AddCharMapGroup.
// NOTE(review): only some of the array elements are visible in this extract.
3050 int [] sameWeightItems = new int [] {
3051 DecompositionFraction,
3055 DecompositionCircle,
3057 DecompositionNarrow,
// c           - base character.
// category    - target category / fillIndex slot.
// updateCount - amount by which fillIndex [category] is advanced.
// level2      - level2 weight passed on to the added entries.
// Does nothing further when c is already defined (presumably returns; the
// statement is not visible here).
3059 private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2)
3061 if (map [(int) c].Defined)
// char.MinValue means "no such variant".
3064 char small = char.MinValue;
3065 char vertical = char.MinValue;
// nfkdMap [c] is a table keyed by decomposition type (as byte) whose values
// are code points (cast to int below).
3066 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
3068 object smv = nfkd [(byte) DecompositionSmall];
3070 small = (char) ((int) smv);
3071 object vv = nfkd [(byte) DecompositionVertical];
3073 vertical = (char) ((int) vv);
3076 // <small> updates index
3077 if (small != char.MinValue)
3078 AddCharMap (small, category, updateCount);
// The base character and all "same weight" variants are added with
// increment 0, so they share one fill index value.
3081 AddCharMap (c, category, 0, level2);
3084 foreach (int weight in sameWeightItems) {
3085 object wv = nfkd [(byte) weight];
3087 AddCharMap ((char) ((int) wv), category, 0, level2);
3091 // update index here.
3092 fillIndex [category] += updateCount;
// The vertical variant is added after the index update.
3094 if (vertical != char.MinValue)
3095 AddCharMap (vertical, category, updateCount, level2);
// Adds c to the category referenced by "category" with increment 0 and
// level2 0, then advances the sequential index (which may update the
// referenced category).
3098 private void AddCharMapCJK (char c, ref byte category)
3100 AddCharMap (c, category, 0, 0);
3101 IncrementSequentialIndex (ref category);
3103 // Special. I wonder why but Windows skips 9E F9.
// When the position reached is category 0x9E / index 0xF9, advance once more.
3104 if (category == 0x9E && fillIndex [category] == 0xF9)
3105 IncrementSequentialIndex (ref category);
// Adds a CJK character through AddCharMapCJK, then a few hard-coded
// U+32xx characters after specific ideographs, then the code points found
// in nfkdMap [c] for decomposition types 0 through 0x12.
// NOTE(review): several conditions, the declaration of "w" and the switch
// header are on lines not visible in this extract.
3108 private void AddCharMapGroupCJK (char c, ref byte category)
3110 AddCharMapCJK (c, ref category);
3112 // LAMESPEC: see below.
3113 if (c == '\u5B78') {
3114 AddCharMapCJK ('\u32AB', ref category);
3115 AddCharMapCJK ('\u323B', ref category);
3117 if (c == '\u52DE') {
3118 AddCharMapCJK ('\u3298', ref category);
3119 AddCharMapCJK ('\u3238', ref category);
3122 AddCharMapCJK ('\u32A2', ref category);
3124 // Especially this mapping order totally does
3125 // not make sense to me.
3126 AddCharMapCJK ('\u32A9', ref category);
3128 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
// Iterate every decomposition type value (0 .. 0x12).
3131 for (byte weight = 0; weight <= 0x12; weight++) {
3132 object wv = nfkd [weight];
3137 // Special: they are ignored in this area.
3138 // FIXME: check if it is sane
3139 if (0xF900 <= w && w <= 0xFAD9)
3141 // LAMESPEC: on Windows some of CJK characters
3142 // in 3200-32B0 are incorrectly mapped. They
3143 // mix Chinese and Japanese Kanji when
3144 // ordering those characters.
// These are the same code points that are added explicitly above.
3146 case 0x32A2: case 0x3298: case 0x3238:
3147 case 0x32A9: case 0x323B: case 0x32AB:
3151 AddCharMapCJK ((char) w, ref category);
3155 // For now it is only for 0x7 category.
// Variant of AddCharMapGroup in which the base character and every related
// character are added with increment updateCount (AddCharMapGroup passes 0
// for the base character and the same-weight variants).
// Order: small variant, base character, characters whose one-element
// decomposition is c with type DecompositionCompat, vertical variant.
3156 private void AddCharMapGroup2 (char c, byte category, byte updateCount, byte level2)
// char.MinValue means "no such variant".
3158 char small = char.MinValue;
3159 char vertical = char.MinValue;
3160 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
3162 object smv = nfkd [(byte) DecompositionSmall];
3164 small = (char) ((int) smv);
3165 object vv = nfkd [(byte) DecompositionVertical];
3167 vertical = (char) ((int) vv);
3170 // <small> updates index
3171 if (small != char.MinValue)
3172 // SPECIAL CASE excluded (FIXME: why?)
3173 if (small != '\u2024')
3174 AddCharMap (small, category, updateCount);
3177 AddCharMap (c, category, updateCount, level2);
3179 // Since nfkdMap is problematic to have two or more
3180 // NFKD to an identical character, here I iterate all.
// Note: the loop bound excludes char.MaxValue (U+FFFF) itself.
3181 for (int c2 = 0; c2 < char.MaxValue; c2++) {
3182 if (decompLength [c2] == 1 &&
3183 (int) (decompValues [decompIndex [c2]]) == (int) c) {
3184 switch (decompType [c2]) {
3185 case DecompositionCompat:
3186 AddCharMap ((char) c2, category, updateCount, level2);
3192 if (vertical != char.MinValue)
3193 // SPECIAL CASE excluded (FIXME: why?)
3194 if (vertical != '\uFE33' && vertical != '\uFE34')
3195 AddCharMap (vertical, category, updateCount, level2);
// Adds c, then every character whose decomposition ends with c, and finally
// advances fillIndex [category] by updateCount (1).
// NOTE(review): the declarations of "category" and "level2" and the tail of
// the inner AddCharMap call are on lines not visible in this extract.
3198 private void AddArabicCharMap (char c)
3201 byte updateCount = 1;
3205 AddCharMap (c, category, 0, level2);
3207 // Since nfkdMap is problematic to have two or more
3208 // NFKD to an identical character, here I iterate all.
// Note: the loop bound excludes char.MaxValue (U+FFFF) itself.
3209 for (int c2 = 0; c2 < char.MaxValue; c2++) {
// Characters without a decomposition are presumably skipped here.
3210 if (decompLength [c2] == 0)
// Index of the last element of c2's decomposition.
3212 int idx = decompIndex [c2] + decompLength [c2] - 1;
3213 if ((int) (decompValues [idx]) == (int) c)
3214 AddCharMap ((char) c2, category,
3217 fillIndex [category] += updateCount;
// Shorthand for ToDecomposed (c, DecompositionFull, false).
3220 char ToFullWidth (char c)
3222 return ToDecomposed (c, DecompositionFull, false);
// Shorthand for ToDecomposed (c, DecompositionFull, true).
3225 char ToFullWidthTail (char c)
3227 return ToDecomposed (c, DecompositionFull, true);
// Shorthand for ToDecomposed (c, DecompositionSmall, false).
3230 char ToSmallForm (char c)
3232 return ToDecomposed (c, DecompositionSmall, false);
// Shorthand for ToDecomposed (c, DecompositionSmall, true).
3235 char ToSmallFormTail (char c)
3237 return ToDecomposed (c, DecompositionSmall, true);
// Returns an element of c's decomposition when c's decomposition type is d.
// c    - character to look up.
// d    - required decomposition type (one of the Decomposition* constants).
// tail - presumably selects the last element of the decomposition instead
//        of the first; the condition guarding the "idx +=" line is not
//        visible in this extract - confirm.
// NOTE(review): the value returned when the type does not match is on a
// line not visible here.
3240 char ToDecomposed (char c, byte d, bool tail)
3242 if (decompType [(int) c] != d)
// Start of c's decomposition inside decompValues.
3244 int idx = decompIndex [(int) c];
// Move to the last element of the decomposition.
3246 idx += decompLength [(int) c] - 1;
3247 return (char) decompValues [idx];
// Scans the jisJapanese list for the code point cp.
// NOTE(review): the loop body and the return statements are not visible in
// this extract; presumably returns true when an entry's CP equals cp.
3250 bool ExistsJIS (int cp)
3252 foreach (JISCharacter j in jisJapanese)
3260 #region Level 3 properties (Case/Width)
// Returns the level 3 weight of c: the raw weight from
// ComputeLevel3WeightRaw plus 2 when the raw weight is non-zero, otherwise 0.
3262 private byte ComputeLevel3Weight (char c)
3264 byte b = ComputeLevel3WeightRaw (c);
3265 return b > 0 ? (byte) (b + 2) : b;
// Computes the raw level 3 weight of c from a series of code point range
// tests, then from its decomposition type and the isSmallCapital /
// isUppercase tables.
// NOTE(review): the value returned by each range test, the declaration of
// "ret" and the final return are on lines not visible in this extract.
3268 private byte ComputeLevel3WeightRaw (char c) // add 2 for sortkey value
3271 if ('\u3192' <= c && c <= '\u319F')
3273 // Japanese reading marks
3274 if (c == '\u3001' || c == '\u3002')
3277 if ('\u11A8' <= c && c <= '\u11F9')
3279 if ('\uFFA0' <= c && c <= '\uFFDC')
3281 if ('\u3130' <= c && c <= '\u3164')
3283 if ('\u3165' <= c && c <= '\u318E')
3285 // Georgian Capital letters
3286 if ('\u10A0' <= c && c <= '\u10C5')
3289 if ('\u2776' <= c && c <= '\u277F')
3291 if ('\u2780' <= c && c <= '\u2789')
// NOTE(review): this range overlaps the two tests above; if those return
// (not visible here), only U+278A..U+2793 can reach this branch.
3293 if ('\u2776' <= c && c <= '\u2793')
3295 if ('\u2160' <= c && c <= '\u216F')
3297 if ('\u2181' <= c && c <= '\u2182')
3300 if ('\u2135' <= c && c <= '\u2138')
3302 if ('\uFE80' <= c && c < '\uFF00') {
3303 // 2(Isolated)/8(Final)/0x18(Medial)
3304 switch (decompType [(int) c]) {
3305 case DecompositionIsolated:
3307 case DecompositionFinal:
3309 case DecompositionMedial:
3314 // actually I dunno the reason why they have weights.
3337 switch (decompType [(int) c]) {
// For these three types the numeric constant itself (1, 2 or 0xC) is OR-ed
// into the result.
3338 case DecompositionWide: // <wide>
3339 case DecompositionSub: // <sub>
3340 case DecompositionSuper: // <super>
3341 ret |= decompType [(int) c];
3344 if (isSmallCapital [(int) c]) // grep "SMALL CAPITAL"
3346 if (isUppercase [(int) c]) // DerivedCoreProperties
// Variant of IsIgnorable based on the unicodeAge table (age >= 3.1) and the
// Unicode category (OtherNotAssigned, Format).
// NOTE(review): another IsIgnorable (int) with the same signature follows,
// so this one is presumably commented out or conditionally compiled; the
// enclosing markers and the return statements are not visible here.
3356 static bool IsIgnorable (int i)
3358 if (unicodeAge [i] >= 3.1)
3360 switch (char.GetUnicodeCategory ((char) i)) {
3361 case UnicodeCategory.OtherNotAssigned:
3362 case UnicodeCategory.Format:
3369 // FIXME: In the future use DerivedAge.txt to examine character
3370 // versions and set those ones that have higher version than
3371 // 1.0 as ignorable.
// Decides whether code point i is ignorable for this table, using a list
// of individual code points, a list of exceptions, a set of code point
// ranges and finally the Unicode category of i.
// NOTE(review): the switch headers and every return statement are on lines
// not visible in this extract; the notes below on what each group returns
// are inferred from the comments only - confirm.
3372 static bool IsIgnorable (int i)
3376 // I guess, those characters are added between
3377 // Unicode 1.0 (LCMapString) and Unicode 3.1
3378 // (UnicodeCategory), so they used to be
3379 // something like OtherNotAssigned as of Unicode 1.1.
3380 case 0x2df: case 0x387:
3381 case 0x3d7: case 0x3d8: case 0x3d9:
3382 case 0x3f3: case 0x3f4: case 0x3f5: case 0x3f6:
3383 case 0x400: case 0x40d: case 0x450: case 0x45d:
3384 case 0x587: case 0x58a: case 0x5c4: case 0x640:
3385 case 0x653: case 0x654: case 0x655: case 0x66d:
3387 case 0x1e9b: case 0x202f: case 0x20ad:
3388 case 0x20ae: case 0x20af:
3389 case 0x20e2: case 0x20e3:
3390 case 0x2139: case 0x213a: case 0x2183:
3391 case 0x2425: case 0x2426: case 0x2619:
3392 case 0x2670: case 0x2671: case 0x3007:
3393 case 0x3190: case 0x3191:
3394 case 0xfffc: case 0xfffd:
3396 // exceptional characters filtered by the
3397 // following conditions. Originally those exceptional
3398 // ranges are incorrect (they should not be ignored)
3399 // and most of those characters are unfortunately in
// These code points lie inside ranges tested further below (for example
// 0x4d8 in 0x04D0-0x04FF, 0x303f in 0x3033-0x303F), so they are presumably
// reported as not ignorable before the range test is reached.
3401 case 0x4d8: case 0x4d9:
3402 case 0x4e8: case 0x4e9:
3404 case 0x3036: case 0x303f:
3405 case 0x337b: case 0xfb1e:
// Code point ranges, presumably all treated as ignorable.
3410 // The whole Sinhala characters.
3411 0x0D82 <= i && i <= 0x0DF4
3412 // The whole Tibetan characters.
3413 || 0x0F00 <= i && i <= 0x0FD1
3414 // The whole Myanmar characters.
3415 || 0x1000 <= i && i <= 0x1059
3416 // The whole Ethiopic, Cherokee,
3417 // Canadian Syllabics, Ogham, Runic,
3418 // Tagalog, Hanunoo, Philippine,
3419 // Buhid, Tagbanwa, Khmer and Mongolian
3421 || 0x1200 <= i && i <= 0x1DFF
3422 // Greek extension characters.
3423 || 0x1F00 <= i && i <= 0x1FFF
3424 // The whole Braille characters.
3425 || 0x2800 <= i && i <= 0x28FF
3426 // CJK radical characters.
3427 || 0x2E80 <= i && i <= 0x2EF3
3428 // Kangxi radical characters.
3429 || 0x2F00 <= i && i <= 0x2FD5
3430 // Ideographic description characters.
3431 || 0x2FF0 <= i && i <= 0x2FFB
3432 // Bopomofo letter and final
3433 || 0x31A0 <= i && i <= 0x31B7
3434 // White square with quadrant characters.
3435 || 0x25F0 <= i && i <= 0x25F7
3436 // Ideographic telegraph symbols.
3437 || 0x32C0 <= i && i <= 0x32CB
3438 || 0x3358 <= i && i <= 0x3370
3439 || 0x33E0 <= i && i <= 0x33FF
3440 // The whole YI characters.
3441 || 0xA000 <= i && i <= 0xA48C
3442 || 0xA490 <= i && i <= 0xA4C6
3443 // Armenian small ligatures
3444 || 0xFB13 <= i && i <= 0xFB17
3445 // hebrew, arabic, variation selector.
3446 || 0xFB1D <= i && i <= 0xFE2F
3447 // Arabic ligatures.
3448 || 0xFEF5 <= i && i <= 0xFEFC
3449 // FIXME: why are they excluded?
3450 || 0x01F6 <= i && i <= 0x01F9
3451 || 0x0218 <= i && i <= 0x0233
3452 || 0x02A9 <= i && i <= 0x02AD
3453 || 0x02EA <= i && i <= 0x02EE
3454 || 0x0349 <= i && i <= 0x036F
3455 || 0x0488 <= i && i <= 0x048F
3456 || 0x04D0 <= i && i <= 0x04FF
3457 || 0x0500 <= i && i <= 0x050F // actually it matters only for 2.0
3458 || 0x06D6 <= i && i <= 0x06ED
3459 || 0x06FA <= i && i <= 0x06FE
3460 || 0x2048 <= i && i <= 0x204D
3461 || 0x20e4 <= i && i <= 0x20ea
3462 || 0x213C <= i && i <= 0x214B
3463 || 0x21EB <= i && i <= 0x21FF
3464 || 0x22F2 <= i && i <= 0x22FF
3465 || 0x237B <= i && i <= 0x239A
3466 || 0x239B <= i && i <= 0x23CF
3467 || 0x24EB <= i && i <= 0x24FF
3468 || 0x2596 <= i && i <= 0x259F
3469 || 0x25F8 <= i && i <= 0x25FF
3470 || 0x2672 <= i && i <= 0x2689
3471 || 0x2768 <= i && i <= 0x2775
3472 || 0x27d0 <= i && i <= 0x27ff
3473 || 0x2900 <= i && i <= 0x2aff
3474 || 0x3033 <= i && i <= 0x303F
3475 || 0x31F0 <= i && i <= 0x31FF
3476 || 0x3250 <= i && i <= 0x325F
3477 || 0x32B1 <= i && i <= 0x32BF
3478 || 0x3371 <= i && i <= 0x337B
3479 || 0xFA30 <= i && i <= 0xFA6A
// Fallback: decide by Unicode category.
3483 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3485 case UnicodeCategory.PrivateUse:
3486 case UnicodeCategory.Surrogate:
3488 // ignored by nature
3489 case UnicodeCategory.Format:
3490 case UnicodeCategory.OtherNotAssigned:
3497 // To check IsIgnorable sanity, try the driver below under MS.NET.
// Calls Dump for every code point 0..char.MaxValue together with the
// IsIgnorable result.
// NOTE(review): the class already has Main (string []), so this driver is
// presumably commented out; the enclosing markers are not visible here.
3500 public static void Main ()
3502 for (int i = 0; i <= char.MaxValue; i++)
3503 Dump (i, IsIgnorable (i));
// Driver helper: compares IsIgnorable's verdict for code point i with the
// result of an invariant-culture, case-insensitive string comparison and
// writes a line containing "o" or "x", the code point in hex and its
// Unicode category.
// NOTE(review): the declaration of s1 and the statement controlled by the
// "(ret == 0) == ignore" test are not visible here; presumably only
// mismatches are printed - confirm.
3506 static void Dump (int i, bool ignore)
3508 switch (Char.GetUnicodeCategory ((char) i)) {
3509 case UnicodeCategory.PrivateUse:
3510 case UnicodeCategory.Surrogate:
3511 return; // check nothing
// s2 is the character repeated 10 times.
3515 string s2 = new string ((char) i, 10);
3516 int ret = CultureInfo.InvariantCulture.CompareInfo.Compare (s1, s2, CompareOptions.IgnoreCase);
3517 if ((ret == 0) == ignore)
3519 Console.WriteLine ("{0} : {1:x} {2}", ignore ? "o" : "x", i, Char.GetUnicodeCategory ((char) i));
3522 #endregion // IsIgnorable
3524 #region IsIgnorableSymbol
// Decides whether code point i is treated as an ignorable symbol.
// Checks, in order: IsIgnorable (i), several lists of individual code
// points, a few code point ranges, the Unicode category, a set of
// "letter-like symbol" ranges, and finally general Char.Is* predicates.
// NOTE(review): switch headers, the declaration of "use" and every return
// statement except one are on lines not visible in this extract, so the
// result for each group cannot be stated from here.
3525 static bool IsIgnorableSymbol (int i)
3527 if (IsIgnorable (i))
3532 case 0x00b5: case 0x01C0: case 0x01C1:
3533 case 0x01C2: case 0x01C3: case 0x01F6:
3534 case 0x01F7: case 0x01F8: case 0x01F9:
3535 case 0x02D0: case 0x02EE: case 0x037A:
3536 case 0x03D7: case 0x03F3:
3537 case 0x0400: case 0x040d:
3538 case 0x0450: case 0x045d:
3539 case 0x048C: case 0x048D:
3540 case 0x048E: case 0x048F:
3541 case 0x0587: case 0x0640: case 0x06E5:
3542 case 0x06E6: case 0x06FA: case 0x06FB:
3543 case 0x06FC: case 0x093D: case 0x0950:
3544 case 0x1E9B: case 0x2139: case 0x3006:
3545 case 0x3033: case 0x3034: case 0x3035:
3546 case 0xFE7E: case 0xFE7F:
3548 case 0x16EE: case 0x16EF: case 0x16F0:
3550 case 0x2183: // ROMAN NUMERAL REVERSED ONE HUNDRED
3551 case 0x3007: // IDEOGRAPHIC NUMBER ZERO
3552 case 0x3038: // HANGZHOU NUMERAL TEN
3553 case 0x3039: // HANGZHOU NUMERAL TWENTY
3554 case 0x303a: // HANGZHOU NUMERAL THIRTY
3560 case 0x02B9: case 0x02BA: case 0x02C2:
3561 case 0x02C3: case 0x02C4: case 0x02C5:
3562 case 0x02C8: case 0x02CC: case 0x02CD:
3563 case 0x02CE: case 0x02CF: case 0x02D2:
3564 case 0x02D3: case 0x02D4: case 0x02D5:
3565 case 0x02D6: case 0x02D7: case 0x02DE:
3566 case 0x02E5: case 0x02E6: case 0x02E7:
3567 case 0x02E8: case 0x02E9:
3568 case 0x309B: case 0x309C:
3570 case 0x055A: // Armenian apostrophe
3571 case 0x05C0: // Hebrew Punct
3572 case 0x0E4F: // Thai FONGMAN
3573 case 0x0E5A: // Thai ANGKHANKHU
3574 case 0x0E5B: // Thai KHOMUT
3576 case 0x09F2: // Bengali Rupee Mark
3577 case 0x09F3: // Bengali Rupee Sign
3579 case 0x221e: // INF.
3588 if (0xFE70 <= i && i < 0xFE7C // ARABIC LIGATURES B
3590 || 0x0501 <= i && i <= 0x0510 // CYRILLIC KOMI
3591 || 0xFA30 <= i && i < 0xFA70 // CJK COMPAT
// Decide by Unicode category.
3596 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3598 case UnicodeCategory.Surrogate:
3599 return false; // inconsistent
3601 case UnicodeCategory.SpacingCombiningMark:
3602 case UnicodeCategory.EnclosingMark:
3603 case UnicodeCategory.NonSpacingMark:
3604 case UnicodeCategory.PrivateUse:
3606 if (0x064B <= i && i <= 0x0652) // Arabic
3610 case UnicodeCategory.Format:
3611 case UnicodeCategory.OtherNotAssigned:
// Ranges for which "use" is computed as "not a letter or digit".
3618 // latin in a circle
3619 0x249A <= i && i <= 0x24E9
3620 || 0x2100 <= i && i <= 0x2132
3622 || 0x3196 <= i && i <= 0x31A0
3624 || 0x3200 <= i && i <= 0x321C
3626 || 0x322A <= i && i <= 0x3243
3628 || 0x3260 <= i && i <= 0x32B0
3629 || 0x32D0 <= i && i <= 0x3357
3630 || 0x337B <= i && i <= 0x33DD
3632 use = !Char.IsLetterOrDigit ((char) i);
3636 // This "Digit" rule is mystery.
3637 // It filters some symbols out.
3638 if (Char.IsLetterOrDigit ((char) i))
3640 if (Char.IsNumber ((char) i))
3642 if (Char.IsControl ((char) i)
3643 || Char.IsSeparator ((char) i)
3644 || Char.IsPunctuation ((char) i))
3646 if (Char.IsSymbol ((char) i))
3649 // FIXME: should check more
3654 // To check IsIgnorableSymbol sanity, try the driver below under MS.NET.
// For every non-surrogate code point, compares "TEST " with "TEST " followed
// by the character using CompareOptions.IgnoreSymbols, and reports the code
// points for which IsIgnorableSymbol disagrees with the comparison result.
// NOTE(review): the class already has Main (string []), so this driver is
// presumably commented out; the enclosing markers and the remaining
// WriteLine arguments are not visible here.
3656 public static void Main ()
3658 CompareInfo ci = CultureInfo.InvariantCulture.CompareInfo;
3659 for (int i = 0; i <= char.MaxValue; i++) {
3660 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3661 if (uc == UnicodeCategory.Surrogate)
3664 bool ret = IsIgnorableSymbol (i);
3666 string s1 = "TEST ";
3667 string s2 = "TEST " + (char) i;
3669 int result = ci.Compare (s1, s2, CompareOptions.IgnoreSymbols);
// A comparison result of 0 means the appended character was ignored.
3671 if (ret != (result == 0))
3672 Console.WriteLine ("{0} : {1:x}[{2}]({3})",
3673 ret ? "should not ignore" :
// Decides whether code point i is treated as an ignorable non-spacing
// character. Checks IsIgnorable (i), two lists of individual code points,
// two groups of code point ranges, and otherwise returns whether the
// Unicode category of i is NonSpacingMark.
// NOTE(review): the switch header and the returns for the lists and range
// groups are on lines not visible in this extract.
3682 static bool IsIgnorableNonSpacing (int i)
3684 if (IsIgnorable (i))
3688 case 0x02C8: case 0x02DE: case 0x0559: case 0x055A:
3689 case 0x05C0: case 0x0ABD: case 0x0CD5: case 0x0CD6:
3690 case 0x309B: case 0x309C: case 0xFF9E: case 0xFF9F:
3692 case 0x02D0: case 0x0670: case 0x0901: case 0x0902:
3693 case 0x094D: case 0x0962: case 0x0963: case 0x0A41:
3694 case 0x0A42: case 0x0A47: case 0x0A48: case 0x0A4B:
3695 case 0x0A4C: case 0x0A81: case 0x0A82: case 0x0B82:
3696 case 0x0BC0: case 0x0CBF: case 0x0CC6: case 0x0CCC:
3697 case 0x0CCD: case 0x0E4E:
3701 if (0x02b9 <= i && i <= 0x02c5
3702 || 0x02cc <= i && i <= 0x02d7
3703 || 0x02e4 <= i && i <= 0x02ef
3704 || 0x20DD <= i && i <= 0x20E0
// Note: 0x00652 below is the same value as 0x0652.
3708 if (0x064B <= i && i <= 0x00652
3709 || 0x0941 <= i && i <= 0x0948
3710 || 0x0AC1 <= i && i <= 0x0ACD
3711 || 0x0C3E <= i && i <= 0x0C4F
3712 || 0x0E31 <= i && i <= 0x0E3F
// Default: true exactly for NonSpacingMark characters.
3716 return Char.GetUnicodeCategory ((char) i) ==
3717 UnicodeCategory.NonSpacingMark;
3720 // We can reuse IsIgnorableSymbol testcode
3721 // for IsIgnorableNonSpacing.
// One entry of the "map" table: category plus level 1 / level 2 weights of
// a character. Callers test Defined to skip characters that already have
// an entry.
// NOTE(review): the type header, the Level1 field and most of the
// constructor body are on lines not visible in this extract; presumably the
// constructor also sets Defined - confirm.
3727 public byte Category;
3729 public byte Level2; // It is always single byte.
3730 public bool Defined;
// Arguments are (category, level1, level2), in that order.
3732 public CharMapEntry (byte category, byte level1, byte level2)
3734 Category = category;
// Pairs a code point (CP) with a JIS value (JIS); JISComparer orders
// instances by JIS.
// NOTE(review): the type header and the constructor body are not visible in
// this extract; presumably cp -> CP and cpJIS -> JIS.
3743 public readonly int CP;
3744 public readonly int JIS;
3746 public JISCharacter (int cp, int cpJIS)
// IComparer that orders JISCharacter objects by ascending JIS value.
3753 class JISComparer : IComparer
// Shared instance (initializer is on a line not visible in this extract).
3755 public static readonly JISComparer Instance =
// Both arguments must be JISCharacter; the casts throw otherwise.
3758 public int Compare (object o1, object o2)
3760 JISCharacter j1 = (JISCharacter) o1;
3761 JISCharacter j2 = (JISCharacter) o2;
// Negative, zero or positive as j1.JIS is less than, equal to or greater
// than j2.JIS.
3762 return j1.JIS - j2.JIS;
// Pairs a code point (CP) with a name (Name); NonJISComparer orders
// instances by Name.
// NOTE(review): the constructor body is not visible in this extract;
// presumably cp -> CP and name -> Name.
3766 class NonJISCharacter
3768 public readonly int CP;
3769 public readonly string Name;
3771 public NonJISCharacter (int cp, string name)
// IComparer that orders NonJISCharacter objects by ordinal comparison of
// their Name.
3778 class NonJISComparer : IComparer
// Shared instance.
3780 public static readonly NonJISComparer Instance =
3781 new NonJISComparer ();
// Both arguments must be NonJISCharacter; the casts throw otherwise.
3783 public int Compare (object o1, object o2)
3785 NonJISCharacter j1 = (NonJISCharacter) o1;
3786 NonJISCharacter j2 = (NonJISCharacter) o2;
3787 return string.CompareOrdinal (j1.Name, j2.Name);
// IComparer for DictionaryEntry objects whose Value is a decimal and whose
// Key is an int. Entries are compared by Value.
// NOTE(review): the handling of "ret" and the use of i1 / i2 are on lines
// not visible in this extract; presumably the keys break ties - confirm.
3791 class DecimalDictionaryValueComparer : IComparer
// Shared instance; the constructor is private.
3793 public static readonly DecimalDictionaryValueComparer Instance
3794 = new DecimalDictionaryValueComparer ();
3796 private DecimalDictionaryValueComparer ()
// Both arguments must be boxed DictionaryEntry values; the casts throw
// otherwise.
3800 public int Compare (object o1, object o2)
3802 DictionaryEntry e1 = (DictionaryEntry) o1;
3803 DictionaryEntry e2 = (DictionaryEntry) o2;
3804 // FIXME: in case of 0, compare decomposition categories
3805 int ret = Decimal.Compare ((decimal) e1.Value, (decimal) e2.Value);
3808 int i1 = (int) e1.Key;
3809 int i2 = (int) e2.Key;
// Orders DictionaryEntry items by their string Value, falling back to
// the int Key when the values compare equal.
class StringDictionaryValueComparer : IComparer
{
	public static readonly StringDictionaryValueComparer Instance
		= new StringDictionaryValueComparer ();

	// Private: the only instance is the shared Instance above.
	private StringDictionaryValueComparer ()
	{
	}

	// Both arguments must be boxed DictionaryEntry values whose Value
	// is a string (or null) and whose Key is a boxed int.
	public int Compare (object o1, object o2)
	{
		DictionaryEntry e1 = (DictionaryEntry) o1;
		DictionaryEntry e2 = (DictionaryEntry) o2;
		// Compare under the invariant culture so the resulting order
		// (and therefore the generated table) does not depend on the
		// locale of the machine that runs the generator.
		int ret = String.Compare ((string) e1.Value, (string) e2.Value,
			false, CultureInfo.InvariantCulture);
		if (ret != 0)
			return ret;
		// NOTE(review): ascending-key tie-break is inferred from the
		// two key reads below - confirm against the full file.
		int i1 = (int) e1.Key;
		int i2 = (int) e2.Key;
		return i1 - i2;
	}
}
// Orders char values by the sort keys that CollationElementTable
// reports for them, comparing key by key and, within a key, the
// Primary, Secondary, Thirtiary and Quarternary members in that order.
class UCAComparer : IComparer
{
	public static readonly UCAComparer Instance
		= new UCAComparer ();

	// Private: the only instance is the shared Instance above.
	private UCAComparer ()
	{
	}

	// o1 and o2 must be boxed char values.
	public int Compare (object o1, object o2)
	{
		char i1 = (char) o1;
		char i2 = (char) o2;

		int l1 = CollationElementTable.GetSortKeyCount (i1);
		int l2 = CollationElementTable.GetSortKeyCount (i2);
		// Only the key positions that both characters have can be
		// compared pairwise.
		int l = l1 > l2 ? l2 : l1;

		for (int i = 0; i < l; i++) {
			SortKeyValue k1 = CollationElementTable.GetSortKey (i1, i);
			SortKeyValue k2 = CollationElementTable.GetSortKey (i2, i);
			// NOTE(review): the "return on first difference" checks
			// are inferred from the repeated reassignment of v -
			// confirm against the full file.
			int v = k1.Primary - k2.Primary;
			if (v != 0)
				return v;
			v = k1.Secondary - k2.Secondary;
			if (v != 0)
				return v;
			v = k1.Thirtiary - k2.Thirtiary;
			if (v != 0)
				return v;
			v = k1.Quarternary - k2.Quarternary;
			if (v != 0)
				return v;
		}
		// NOTE(review): all shared keys are equal here; ordering the
		// character with fewer keys first is an assumption - confirm.
		return l1 - l2;
	}
}
3879 ArrayList items = new ArrayList ();
3881 public Tailoring (int lcid)
3886 public Tailoring (int lcid, int alias)
3893 get { return lcid; }
3897 get { return alias; }
// Gets or sets the frenchSort flag stored on this tailoring.
public bool FrenchSort {
	get { return frenchSort; }
	set { frenchSort = value; }
}
// Appends a DiacriticalMap built from target and replace to this
// tailoring's item list.
public void AddDiacriticalMap (byte target, byte replace)
{
	items.Add (new DiacriticalMap (target, replace));
}
// Appends a SortKeyMap built from source and sortkey to this
// tailoring's item list.
public void AddSortKeyMap (string source, byte [] sortkey)
{
	items.Add (new SortKeyMap (source, sortkey));
}
// Appends a ReplacementMap built from source and replace to this
// tailoring's item list.
public void AddReplacementMap (string source, string replace)
{
	items.Add (new ReplacementMap (source, replace));
}
// Concatenates the ToCharArray () output of every item, in the order
// the items were added, into a single char array.
public char [] ItemToCharArray ()
{
	ArrayList al = new ArrayList ();
	foreach (ITailoringMap m in items)
		al.AddRange (m.ToCharArray ());
	// ToArray (typeof (char)) yields a char [], so cast directly
	// rather than with "as", which would hide a mismatch as null.
	return (char []) al.ToArray (typeof (char));
}
// A tailoring item that can serialize itself to a char array.
// Tailoring.ItemToCharArray concatenates these arrays.
interface ITailoringMap
{
	char [] ToCharArray ();
}
// Tailoring item holding a (Target, Replace) byte pair.
class DiacriticalMap : ITailoringMap
{
	public readonly byte Target;
	public readonly byte Replace;

	// NOTE(review): the assignments below are inferred from the
	// field/parameter pairing - confirm against the full file.
	public DiacriticalMap (byte target, byte replace)
	{
		Target = target;
		Replace = replace;
	}

	// Serializes this item as three chars:
	//   [0] kind marker 2, [1] Target, [2] Replace.
	public char [] ToCharArray ()
	{
		char [] ret = new char [3];
		ret [0] = (char) 02; // kind:DiacriticalMap
		ret [1] = (char) Target;
		ret [2] = (char) Replace;
		return ret;
	}
}
// Tailoring item that pairs a source string with sort key bytes.
class SortKeyMap : ITailoringMap
{
	public readonly string Source;
	public readonly byte [] SortKey;

	// NOTE(review): the assignments below are inferred from the
	// field/parameter pairing - confirm against the full file.
	public SortKeyMap (string source, byte [] sortkey)
	{
		Source = source;
		SortKey = sortkey;
	}

	// Serializes this item as Source.Length + 7 chars:
	//   [0]                kind marker 1
	//   [1 .. Length]      the chars of Source
	//   [Length + 1]       left at '\0' (never written)
	//   [Length + 2 .. +5] SortKey [0] .. SortKey [3]
	//   [Length + 6]       left at '\0' (never written)
	// Only the first four SortKey bytes are read; a shorter array
	// makes the indexer throw IndexOutOfRangeException.
	public char [] ToCharArray ()
	{
		char [] ret = new char [Source.Length + 7];
		ret [0] = (char) 01; // kind:SortKeyMap
		for (int i = 0; i < Source.Length; i++)
			ret [i + 1] = Source [i];
		// ret [Source.Length + 1] stays '\0' from allocation.
		for (int i = 0; i < 4; i++)
			ret [i + Source.Length + 2] = (char) SortKey [i];
		return ret;
	}
}
3978 class ReplacementMap : ITailoringMap
3980 public readonly string Source;
3981 public readonly string Replace;
3983 public ReplacementMap (string source, string replace)
3989 public char [] ToCharArray ()
3991 char [] ret = new char [Source.Length + Replace.Length + 3];
3992 ret [0] = (char) 03; // kind:ReplaceMap
3994 for (int i = 0; i < Source.Length; i++)
3995 ret [pos++] = Source [i];
3998 for (int i = 0; i < Replace.Length; i++)
3999 ret [pos++] = Replace [i];