3 // There are two kinds of sort keys: those which are computed and those which
4 // are laid out as an indexed array. Computed sort keys are:
9 // Also, for composite characters it should prepare a different index table.
11 // Though it is possible to "compute" level 3 weights, they are still dumped
12 // to an array to avoid execution cost.
16 // * sortkey getter signature
18 // int GetSortKey (string s, int index, SortKeyBuffer buf)
19 // Stores sort key for corresponding character element into buf and
20 // returns the length of the consumed _source_ character element in s.
22 // * character length to consume
24 // If there are characters whose primary weight is 0, they are consumed
25 // and considered as a part of the character element.
31 using System.Collections;
32 using System.Globalization;
36 namespace Mono.Globalization.Unicode
38 internal class MSCompatSortKeyTableGenerator
// Program entry point: creates a generator instance and passes the
// command-line arguments on to Run.
40 public static void Main (string [] args)
42 new MSCompatSortKeyTableGenerator ().Run (args);
// Numeric codes stored in decompType [] to record which kind of
// decomposition a code point has. ProcessUnidataLine assigns them, and the
// table serialization switches on Narrow/Wide/Super/Sub when it fills
// widthCompat.
45 const int DecompositionWide = 1; // fixed
46 const int DecompositionSub = 2; // fixed
47 const int DecompositionSmall = 3;
48 const int DecompositionIsolated = 4;
49 const int DecompositionInitial = 5;
50 const int DecompositionFinal = 6;
51 const int DecompositionMedial = 7;
52 const int DecompositionNoBreak = 8;
53 const int DecompositionVertical = 9;
54 const int DecompositionFraction = 0xA;
55 const int DecompositionFont = 0xB;
56 const int DecompositionSuper = 0xC; // fixed
// NOTE(review): Full (0xE) is declared before Narrow (0xD). The values are
// distinct; only the declaration order is out of numeric sequence.
57 const int DecompositionFull = 0xE;
58 const int DecompositionNarrow = 0xD;
59 const int DecompositionCircle = 0xF;
60 const int DecompositionSquare = 0x10;
61 const int DecompositionCompat = 0x11;
// Assigned by ProcessUnidataLine on the path that does not go through the
// "<tag>" switch.
62 const int DecompositionCanonical = 0x12;
// Writer that receives the generated C# source text; initialised to
// standard output.
64 TextWriter Result = Console.Out;
66 byte [] fillIndex = new byte [256]; // by category
// One entry per UTF-16 code unit. Category, Level1 and Level2 are read
// from these entries when the tables are serialized.
67 CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1];
// NOTE(review): the use site of this list is not visible in this excerpt;
// presumably these code points receive special "ignorable" handling.
69 char [] specialIgnore = new char [] {
70 '\u3099', '\u309A', '\u309B', '\u309C', '\u0BCD',
71 '\u0E47', '\u0E4C', '\uFF9E', '\uFF9F'
74 // FIXME: need more love (as always)
// 29 entries: 'A'..'Z' followed by three extra Latin code points.
75 char [] alphabets = new char [] {'A', 'B', 'C', 'D', 'E', 'F',
76 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
77 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
78 '\u0292', '\u01BE', '\u0298'};
// Also 29 entries. Presumably alphaWeights [i] is the weight paired with
// alphabets [i] - TODO confirm at the use site, which is not shown here.
79 byte [] alphaWeights = new byte [] {
80 2, 9, 0xA, 0x1A, 0x21,
81 0x23, 0x25, 0x2C, 0x32, 0x35,
82 0x36, 0x48, 0x51, 0x70, 0x7C,
83 0x7E, 0x89, 0x8A, 0x91, 0x99,
84 0x9F, 0xA2, 0xA4, 0xA6, 0xA7,
85 0xA9, 0xAA, 0xB3, 0xB4};
// Set by ProcessUnidataLine when the data line contains "SMALL CAPITAL".
87 bool [] isSmallCapital = new bool [char.MaxValue + 1];
// Set by ProcessDerivedCorePropLine for each code point in a parsed range.
88 bool [] isUppercase = new bool [char.MaxValue + 1];
// Per code point: decomposition kind (a Decomposition* constant), start
// index into the flat decomposition value list, and number of values.
90 byte [] decompType = new byte [char.MaxValue + 1];
91 int [] decompIndex = new int [char.MaxValue + 1];
92 int [] decompLength = new int [char.MaxValue + 1];
// Numeric value parsed from the numeric fields of UnicodeData.txt.
94 decimal [] decimalValue = new decimal [char.MaxValue + 1];
// Per code point diacritical weight accumulated by ProcessUnidataLine.
96 byte [] diacritical = new byte [char.MaxValue + 1];
// Character-name fragments that ProcessUnidataLine searches for in each
// UnicodeData line. When diacritics [d] is found, diacriticWeights [d] is
// added to diacritical [cp], so the two tables are parallel and must have
// the same length (ProcessUnidataLine throws if they do not).
// Several entries end with ';' so that they only match at the end of the
// name field of the semicolon-separated data line.
98 string [] diacritics = new string [] {
99 // LATIN, CYRILLIC etc.
100 "UPTURN", "DOUBLE-STRUCK",
101 "MIDDLE HOOK", "WITH VERTICAL LINE ABOVE;",
102 "WITH GRAVE ACCENT;", "WITH ACUTE ACCENT;", "WITH CIRCUMFLEX ACCENT;",
103 "WITH ACUTE;", "WITH GRAVE;", "WITH DOT ABOVE;", " MIDDLE DOT;",
104 "WITH CIRCUMFLEX;", "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;",
105 " DIALYTIKA AND TONOS;", "WITH MACRON;", "WITH TILDE;", "WITH RING ABOVE;",
106 "WITH OGONEK;", "WITH CEDILLA;",
108 " DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;",
109 "WITH STROKE;", " CIRCUMFLEX AND ACUTE;",
111 " DIAERESIS AND ACUTE;", "WITH CIRCUMFLEX AND GRAVE;", " L SLASH;",
112 " DIAERESIS AND GRAVE;",
114 " CARON AND DOT ABOVE;", " BREVE AND GRAVE;",
115 " MACRON AND ACUTE;",
116 " MACRON AND GRAVE;",
118 " DIAERESIS AND CARON", " DOT ABOVE AND MACRON", " TILDE AND ACUTE",
119 " RING ABOVE AND ACUTE",
120 " DIAERESIS AND MACRON", " CEDILLA AND ACUTE", " MACRON AND DIAERESIS",
121 " CIRCUMFLEX AND TILDE",
122 " TILDE AND DIAERESIS",
125 " CEDILLA AND BREVE",
126 " OGONEK AND MACRON",
129 "WITH HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;",
133 " PRECEDED BY APOSTROPHE",
135 " LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE",
138 " RETROFLEX;", "DIAERESIS BELOW",
141 " CIRCUMFLEX BELOW", "HORN AND ACUTE",
142 " BREVE BELOW;", " HORN AND GRAVE",
145 " DOT BELOW AND DOT ABOVE",
146 " RIGHT HALF RING", " HORN AND TILDE",
147 " CIRCUMFLEX AND DOT BELOW",
148 " BREVE AND DOT BELOW",
149 " DOT BELOW AND MACRON",
151 " HORN AND HOOK ABOVE",
153 // CIRCLED, PARENTHESIZED and so on
154 "CIRCLED DIGIT", "CIRCLED NUMBER", "CIRCLED LATIN",
155 "CIRCLED KATAKANA", "CIRCLED SANS-SERIF",
156 "PARENTHESIZED DIGIT", "PARENTHESIZED NUMBER", "PARENTHESIZED LATIN",
// Weight for the name fragment at the same index in diacritics.
// Keep the entry count in step with that table when editing either one.
158 byte [] diacriticWeights = new byte [] {
162 0xE, 0xF, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
163 0x17, 0x19, 0x1A, 0x1B, 0x1C,
165 0x1D, 0x1D, 0x1E, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F,
166 0x20, 0x21, 0x22, 0x22, 0x23, 0x24,
168 0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28,
169 0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30,
171 0x40, 0x43, 0x43, 0x43, 0x44, 0x46, 0x47, 0x48,
172 0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x5A,
174 0x60, 0x60, 0x61, 0x61, 0x63, 0x68, 0x68,
175 0x69, 0x69, 0x6A, 0x6D, 0x6E,
177 // CIRCLED, PARENTHESIZED and so on.
178 0xEE, 0xEE, 0xEE, 0xEE, 0xEE,
// NOTE(review): the use site is not visible in this excerpt; the values
// read like consecutive (start, end) code point bounds - TODO confirm.
182 int [] numberSecondaryWeightBounds = new int [] {
183 0x660, 0x680, 0x6F0, 0x700, 0x960, 0x970,
184 0x9E0, 0x9F0, 0x9F4, 0xA00, 0xA60, 0xA70,
185 0xAE0, 0xAF0, 0xB60, 0xB70, 0xBE0, 0xC00,
186 0xC60, 0xC70, 0xCE0, 0xCF0, 0xD60, 0xD70,
187 0xE50, 0xE60, 0xED0, 0xEE0
// Filled by ParseScripts: the collected characters of each list, sorted
// with UCAComparer.Instance.
190 char [] orderedGurmukhi;
191 char [] orderedGujarati;
192 char [] orderedGeorgian;
193 char [] orderedThaana;
195 static readonly char [] orderedTamilConsonants = new char [] {
196 // based on traditional Tamil consonants, except for
197 // Grantha (where Microsoft breaks traditionalism).
198 // http://www.angelfire.com/empire/thamizh/padanGaL
199 '\u0B95', '\u0B99', '\u0B9A', '\u0B9E', '\u0B9F',
200 '\u0BA3', '\u0BA4', '\u0BA8', '\u0BAA', '\u0BAE',
201 '\u0BAF', '\u0BB0', '\u0BB2', '\u0BB5', '\u0BB4',
202 '\u0BB3', '\u0BB1', '\u0BA9', '\u0B9C', '\u0BB8',
205 // cp -> character name (only for some characters)
// Holds DictionaryEntry items added by ProcessUnidataLine.
206 ArrayList sortableCharNames = new ArrayList ();
208 // cp -> arrow value (int)
209 ArrayList arrowValues = new ArrayList ();
211 // cp -> box value (int)
212 ArrayList boxValues = new ArrayList ();
214 // cp -> level1 value
215 Hashtable arabicLetterPrimaryValues = new Hashtable ();
// Arabic letter name -> code point recorded for that name; used by
// ProcessUnidataLine to reuse the primary value of an earlier letter that
// has the same base name.
218 Hashtable arabicNameMap = new Hashtable ();
220 // cp -> Hashtable [decompType] -> cp
221 Hashtable nfkdMap = new Hashtable ();
223 // Latin letter -> ArrayList [int]
224 Hashtable latinMap = new Hashtable ();
226 ArrayList jisJapanese = new ArrayList ();
// Receives NonJISCharacter items for U+3300..U+3357 code points for which
// ExistsJIS returns false.
227 ArrayList nonJisJapanese = new ArrayList ();
// CJK weight tables, one slot per UTF-16 code unit. They are compressed
// and then written out through SerializeCJK.
229 ushort [] cjkJA = new ushort [char.MaxValue +1];// - 0x4E00];
230 ushort [] cjkCHS = new ushort [char.MaxValue +1];// - 0x3100];
231 ushort [] cjkCHT = new ushort [char.MaxValue +1];// - 0x4E00];
232 ushort [] cjkKO = new ushort [char.MaxValue +1];// - 0x4E00];
233 byte [] cjkKOlv2 = new byte [char.MaxValue +1];// - 0x4E00];
235 byte [] ignorableFlags = new byte [char.MaxValue + 1];
// Age value per code point; read by Run when it writes agelog.txt.
// ParseDerivedAge forces entry 0 to double.MaxValue.
237 static double [] unicodeAge = new double [char.MaxValue + 1];
// Tailoring objects; iterated by SerializeTailorings.
239 ArrayList tailorings = new ArrayList ();
// Top-level driver of the tool.
// args [0], when present, names the directory holding the source data
// files; with no arguments "downloaded" is used.
// After parsing it calls ModifyParsedValues and, at the end, writes one
// diagnostic line per code point to "agelog.txt" in the current directory.
241 void Run (string [] args)
243 string dirname = args.Length == 0 ? "downloaded" : args [0];
244 ParseSources (dirname);
245 Console.Error.WriteLine ("parse done.");
247 ModifyParsedValues ();
249 Console.Error.WriteLine ("generation done.");
251 Console.Error.WriteLine ("serialization done.");
// NOTE(review): no Close/Dispose of this writer is visible in this
// excerpt - confirm that the log is flushed before the process exits.
253 StreamWriter sw = new StreamWriter ("agelog.txt");
// NOTE(review): the bound is '<', so U+FFFF itself is never examined.
254 for (int i = 0; i < char.MaxValue; i++) {
// shouldBe: whether the runtime's Unicode category (Format or
// OtherNotAssigned) suggests that the code point is ignorable.
255 bool shouldBe = false;
256 switch (Char.GetUnicodeCategory ((char) i)) {
257 case UnicodeCategory.Format: case UnicodeCategory.OtherNotAssigned:
258 shouldBe = true; break;
260 if (unicodeAge [i] >= 3.1)
262 //if (IsIgnorable (i) != shouldBe)
// Columns: age, IsIgnorable, IsIgnorableSymbol, code point (hex), Unicode
// category, and '!' when IsIgnorable disagrees with shouldBe.
263 sw.WriteLine ("{1} {2} {3} {0:X04} {4} {5}", i, unicodeAge [i], IsIgnorable (i), IsIgnorableSymbol (i), char.GetUnicodeCategory ((char) i), IsIgnorable (i) != shouldBe ? '!' : ' ');
// Typed convenience wrapper: passes the byte table and the indexer to
// CodePointIndexer.CompressArray and casts the result back to byte [].
269 byte [] CompressArray (byte [] source, CodePointIndexer i)
271 return (byte []) CodePointIndexer.CompressArray (
272 source, typeof (byte), i);
// Typed convenience wrapper: passes the ushort table and the indexer to
// CodePointIndexer.CompressArray and casts the result back to ushort [].
275 ushort [] CompressArray (ushort [] source, CodePointIndexer i)
277 return (ushort []) CodePointIndexer.CompressArray (
278 source, typeof (ushort), i);
284 SerializeTailorings ();
286 byte [] categories = new byte [map.Length];
287 byte [] level1 = new byte [map.Length];
288 byte [] level2 = new byte [map.Length];
289 byte [] level3 = new byte [map.Length];
290 ushort [] widthCompat = new ushort [map.Length];
291 for (int i = 0; i < map.Length; i++) {
292 categories [i] = map [i].Category;
293 level1 [i] = map [i].Level1;
294 level2 [i] = map [i].Level2;
295 level3 [i] = ComputeLevel3Weight ((char) i);
296 // For Japanese Half-width characters, don't
297 // map widthCompat. It is IgnoreKanaType that
298 // handles those width differences.
299 if (0xFF6D <= i && i <= 0xFF9D)
301 switch (decompType [i]) {
302 case DecompositionNarrow:
303 case DecompositionWide:
304 case DecompositionSuper:
305 case DecompositionSub:
306 // they are always 1 char
307 widthCompat [i] = (ushort) decompValues [decompIndex [i]];
313 ignorableFlags = CompressArray (ignorableFlags,
314 MSCompatUnicodeTableUtil.Ignorable);
315 categories = CompressArray (categories,
316 MSCompatUnicodeTableUtil.Category);
317 level1 = CompressArray (level1,
318 MSCompatUnicodeTableUtil.Level1);
319 level2 = CompressArray (level2,
320 MSCompatUnicodeTableUtil.Level2);
321 level3 = CompressArray (level3,
322 MSCompatUnicodeTableUtil.Level3);
323 widthCompat = (ushort []) CodePointIndexer.CompressArray (
324 widthCompat, typeof (ushort),
325 MSCompatUnicodeTableUtil.WidthCompat);
326 cjkCHS = CompressArray (cjkCHS,
327 MSCompatUnicodeTableUtil.CjkCHS);
328 cjkCHT = CompressArray (cjkCHT,
329 MSCompatUnicodeTableUtil.Cjk);
330 cjkJA = CompressArray (cjkJA,
331 MSCompatUnicodeTableUtil.Cjk);
332 cjkKO = CompressArray (cjkKO,
333 MSCompatUnicodeTableUtil.Cjk);
334 cjkKOlv2 = CompressArray (cjkKOlv2,
335 MSCompatUnicodeTableUtil.Cjk);
338 Result.WriteLine ("internal static readonly byte [] ignorableFlags = new byte [] {");
340 MemoryStream ms = new MemoryStream ();
341 BinaryWriter binary = new BinaryWriter (ms);
342 binary.Write (ignorableFlags.Length);
344 for (int i = 0; i < ignorableFlags.Length; i++) {
345 byte value = ignorableFlags [i];
347 Result.Write ("{0},", value);
349 Result.Write ("0x{0:X02},", value);
351 binary.Write (value);
353 if ((i & 0xF) == 0xF)
354 Result.WriteLine ("// {0:X04}", i - 0xF);
356 Result.WriteLine ("};");
360 Result.WriteLine ("internal static readonly byte [] categories = new byte [] {");
362 binary.Write (categories.Length);
364 for (int i = 0; i < categories.Length; i++) {
365 byte value = categories [i];
367 Result.Write ("{0},", value);
369 Result.Write ("0x{0:X02},", value);
371 binary.Write (value);
373 if ((i & 0xF) == 0xF)
374 Result.WriteLine ("// {0:X04}", i - 0xF);
376 Result.WriteLine ("};");
379 // Primary weight value
380 Result.WriteLine ("internal static readonly byte [] level1 = new byte [] {");
382 binary.Write (level1.Length);
384 for (int i = 0; i < level1.Length; i++) {
385 byte value = level1 [i];
387 Result.Write ("{0},", value);
389 Result.Write ("0x{0:X02},", value);
391 binary.Write (value);
393 if ((i & 0xF) == 0xF)
394 Result.WriteLine ("// {0:X04}", i - 0xF);
396 Result.WriteLine ("};");
400 Result.WriteLine ("internal static readonly byte [] level2 = new byte [] {");
402 binary.Write (level2.Length);
404 for (int i = 0; i < level2.Length; i++) {
405 byte value = level2 [i];
407 Result.Write ("{0},", value);
409 Result.Write ("0x{0:X02},", value);
411 binary.Write (value);
413 if ((i & 0xF) == 0xF)
414 Result.WriteLine ("// {0:X04}", i - 0xF);
416 Result.WriteLine ("};");
420 Result.WriteLine ("internal static readonly byte [] level3 = new byte [] {");
422 binary.Write (level3.Length);
424 for (int i = 0; i < level3.Length; i++) {
425 byte value = level3 [i];
427 Result.Write ("{0},", value);
429 Result.Write ("0x{0:X02},", value);
431 binary.Write (value);
433 if ((i & 0xF) == 0xF)
434 Result.WriteLine ("// {0:X04}", i - 0xF);
436 Result.WriteLine ("};");
439 // Width insensitivity mappings
440 // (for now it is more lightweight than dumping the
441 // entire NFKD table).
442 Result.WriteLine ("internal static readonly ushort [] widthCompat = new ushort [] {");
444 binary.Write (widthCompat.Length);
446 for (int i = 0; i < widthCompat.Length; i++) {
447 ushort value = widthCompat [i];
449 Result.Write ("{0},", value);
451 Result.Write ("0x{0:X02},", value);
453 binary.Write (value);
455 if ((i & 0xF) == 0xF)
456 Result.WriteLine ("// {0:X04}", i - 0xF);
458 Result.WriteLine ("};");
461 using (FileStream fs = File.Create ("../collation.core.bin")) {
462 byte [] array = ms.ToArray ();
463 fs.Write (array, 0, array.Length);
468 SerializeCJK ("cjkCHS", cjkCHS, char.MaxValue);
469 SerializeCJK ("cjkCHT", cjkCHT, 0x9FB0);
470 SerializeCJK ("cjkJA", cjkJA, 0x9FB0);
471 SerializeCJK ("cjkKO", cjkKO, 0x9FB0);
472 SerializeCJK ("cjkKOlv2", cjkKOlv2, 0x9FB0);
// Writes one ushort CJK table in two forms: as a C# array initializer on
// Result, and as binary data in "../collation.<name>.bin".
//   name - identifier used in the generated source and in the file name.
//   cjk  - table to dump.
//   max  - index compared against i + offset inside the loop.
// NOTE(review): the statement guarded by that comparison is not visible
// in this excerpt - presumably the loop stops there; confirm.
475 void SerializeCJK (string name, ushort [] cjk, int max)
477 int offset = 0;//char.MaxValue - cjk.Length;
478 Result.WriteLine ("static ushort [] {0} = new ushort [] {{", name);
480 MemoryStream ms = new MemoryStream ();
481 BinaryWriter binary = new BinaryWriter (ms);
// The binary stream starts with the element count of the table.
482 binary.Write (cjk.Length);
484 for (int i = 0; i < cjk.Length; i++) {
485 if (i + offset == max)
487 ushort value = cjk [i];
489 Result.Write ("{0},", value);
491 Result.Write ("0x{0:X04},", value);
493 binary.Write (value);
// After every 16th element, end the text row with a comment giving the
// index of the row's first element.
495 if ((i & 0xF) == 0xF)
496 Result.WriteLine ("// {0:X04}", i - 0xF + offset);
498 Result.WriteLine ("};");
// The path is relative to the current working directory.
501 using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) {
502 byte [] array = ms.ToArray ();
503 fs.Write (array, 0, array.Length);
// Byte-table overload: writes the table as a C# array initializer on
// Result and as binary data in "../collation.<name>.bin".
//   name - identifier used in the generated source and in the file name.
//   cjk  - table to dump.
//   max  - index compared against i + offset inside the loop.
// NOTE(review): unlike the ushort overload, no binary.Write (cjk.Length)
// is visible here before the loop - confirm whether the binary layout of
// this file is meant to lack the length prefix.
508 void SerializeCJK (string name, byte [] cjk, int max)
510 int offset = 0;//char.MaxValue - cjk.Length;
511 Result.WriteLine ("static byte [] {0} = new byte [] {{", name);
513 MemoryStream ms = new MemoryStream ();
514 BinaryWriter binary = new BinaryWriter (ms);
516 for (int i = 0; i < cjk.Length; i++) {
517 if (i + offset == max)
519 byte value = cjk [i];
521 Result.Write ("{0},", value);
523 Result.Write ("0x{0:X02},", value);
525 binary.Write (value);
// After every 16th element, end the text row with a comment giving the
// index of the row's first element.
527 if ((i & 0xF) == 0xF)
528 Result.WriteLine ("// {0:X04}", i - 0xF + offset);
530 Result.WriteLine ("};");
// The path is relative to the current working directory.
533 using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) {
534 byte [] array = ms.ToArray ();
535 fs.Write (array, 0, array.Length);
// Emits the tailoring data in two forms: C# source on Result (a flat char
// array plus a TailoringInfo table) and binary data in
// "../collation.tailoring.bin".
// Throws Exception when a tailoring aliases an LCID that has no recorded
// definition.
540 void SerializeTailorings ()
// LCID -> start offset in the flat char array / LCID -> item count.
542 Hashtable indexes = new Hashtable ();
543 Hashtable counts = new Hashtable ();
544 Result.WriteLine ("static char [] tailorings = new char [] {");
547 MemoryStream ms = new MemoryStream ();
548 BinaryWriter binary = new BinaryWriter (ms);
// Pass 1: concatenate the chars of every tailoring, remembering where each
// LCID starts and how many chars it contributes.
550 foreach (Tailoring t in tailorings) {
553 Result.Write ("/*{0}*/", t.LCID);
554 indexes.Add (t.LCID, count);
555 char [] values = t.ItemToCharArray ();
556 counts.Add (t.LCID, values.Length);
557 foreach (char c in values) {
558 Result.Write ("'\\x{0:X}', ", (int) c);
559 if (++count % 16 == 0)
560 Result.WriteLine (" // {0:X04}", count - 16);
// Each char is stored as a 16-bit value in the raw stream.
562 binary.Write ((ushort) c);
566 Result.WriteLine ("};");
568 Result.WriteLine ("static TailoringInfo [] tailoringInfos = new TailoringInfo [] {");
// Keep the char data aside and start a fresh stream: the final file holds
// the info table first and the char data after it.
570 byte [] rawdata = ms.ToArray ();
571 ms = new MemoryStream ();
572 binary = new BinaryWriter (ms);
573 binary.Write (tailorings.Count);
// Pass 2: one info record per tailoring; an alias reuses the offset and
// count of the LCID it points to.
575 foreach (Tailoring t in tailorings) {
576 int target = t.Alias != 0 ? t.Alias : t.LCID;
577 if (!indexes.ContainsKey (target)) {
// NOTE(review): the message says "WARNING" although it is thrown as a
// fatal exception.
578 throw new Exception (String.Format ("WARNING: no corresponding definition for tailoring alias. From {0} to {1}", t.LCID, t.Alias));
581 int idx = (int) indexes [target];
582 int cnt = (int) counts [target];
583 bool french = t.FrenchSort;
// NOTE(review): this scan compares against t.LCID, so it can only find
// entries with t's own LCID. If the intent is to inherit FrenchSort from
// the alias target, the comparison would need to use 'target' - confirm.
585 foreach (Tailoring t2 in tailorings)
586 if (t2.LCID == t.LCID)
587 french = t2.FrenchSort;
588 Result.WriteLine ("new TailoringInfo ({0}, 0x{1:X}, {2}, {3}), ", t.LCID, idx, cnt, french ? "true" : "false");
590 binary.Write (t.LCID);
593 binary.Write (french);
596 Result.WriteLine ("};");
// Two 0xFF bytes are written ahead of the char data section.
598 binary.Write ((byte) 0xFF);
599 binary.Write ((byte) 0xFF);
// rawdata holds 2 bytes per char, so this is the number of chars.
600 binary.Write (rawdata.Length / 2);
601 binary.Write (rawdata, 0, rawdata.Length);
// The path is relative to the current working directory.
604 using (FileStream fs = File.Create ("../collation.tailoring.bin")) {
605 byte [] array = ms.ToArray ();
606 fs.Write (array, 0, array.Length);
// Builds the paths of the input data files below 'dirname' and runs the
// individual parsers on them.
//   dirname - directory that contains the Unicode data files, CP932.TXT
//             and the common/collation/*.xml files.
613 void ParseSources (string dirname)
616 dirname + "/UnicodeData.txt";
617 string derivedCoreProps =
618 dirname + "/DerivedCoreProperties.txt";
620 dirname + "/Scripts.txt";
622 dirname + "/CP932.TXT";
624 dirname + "/DerivedAge.txt";
625 string chXML = dirname + "/common/collation/zh.xml";
626 string jaXML = dirname + "/common/collation/ja.xml";
627 string koXML = dirname + "/common/collation/ko.xml";
629 ParseDerivedAge (derivedAge);
// Order matters here: the JIS order has to be loaded before UnicodeData
// is processed.
633 ParseJISOrder (cp932); // in prior to ParseUnidata()
634 ParseUnidata (unidata);
636 ParseDerivedCoreProperties (derivedCoreProps);
637 ParseScripts (scripts);
638 ParseCJK (chXML, jaXML, koXML);
// This file name is not prefixed with dirname; it is resolved against the
// current working directory.
640 ParseTailorings ("mono-tailoring-source.txt");
// Reads the tailoring source file line by line and feeds each trimmed
// line to ProcessTailoringLine.
//   filename - path of the tailoring source file.
// On an exception the failing line number is written to standard error.
// NOTE(review): whether the exception is rethrown afterwards is not
// visible in this excerpt - confirm.
643 void ParseTailorings (string filename)
647 using (StreamReader sr = new StreamReader (filename)) {
649 while (sr.Peek () >= 0) {
// 't' is passed by reference so the line processor can replace the
// tailoring that subsequent lines are applied to.
651 ProcessTailoringLine (ref t,
652 sr.ReadLine ().Trim ());
654 } catch (Exception) {
655 Console.Error.WriteLine ("ERROR at line {0}", line);
661 // For now this is enough.
// Converts a tailoring source token into the string it denotes, decoding
// "\uXXXX" (backslash, 'u', four hex digits) into the corresponding char.
//   s - token text taken from the tailoring source file.
// Returns the decoded string.
// NOTE(review): the escape test below is s.StartsWith ("\\u"), which looks
// at the start of the whole string rather than at position i, and the hex
// digits are always taken from s.Substring (2, 4). As written this only
// appears to decode an escape correctly when it is the first thing in the
// token - confirm against the loop lines that are not shown here.
662 string ParseTailoringSourceValue (string s)
664 StringBuilder sb = new StringBuilder ();
665 for (int i = 0; i < s.Length; i++) {
666 if (s.StartsWith ("\\u")) {
667 sb.Append ((char) int.Parse (
668 s.Substring (2, 4), NumberStyles.HexNumber),
675 return sb.ToString ();
// Interprets one line of the tailoring source file.
//   t - tailoring currently being filled; replaced when the line starts a
//       new tailoring definition.
//   s - the line, already trimmed by the caller.
// Line forms handled by the visible code:
//   - text after '#' is dropped; empty lines are skipped
//   - a definition header: one marker character followed by an integer,
//     optionally "=<integer>" (two-argument form)
//   - "*FrenchSort"
//   - "*Diacritical <hex> -> <hex>"
//   - "<source> : <hex> <hex> <hex> <hex>" (sort key mapping)
//   - "<source> = <replacement>"
// NOTE(review): the test for the header marker character is on a line not
// visible in this excerpt.
678 void ProcessTailoringLine (ref Tailoring t, string s)
680 int idx = s.IndexOf ('#');
682 s = s.Substring (0, idx).Trim ();
683 if (s.Length == 0 || s [0] == '#')
686 idx = s.IndexOf ('=');
// Substring (1, ...) skips the leading marker character.
689 int.Parse (s.Substring (1, idx - 1)),
690 int.Parse (s.Substring (idx + 1)));
692 t = new Tailoring (int.Parse (s.Substring (1)));
696 if (s.StartsWith ("*FrenchSort")) {
700 string d = "*Diacritical";
701 if (s.StartsWith (d)) {
// Both sides of "->" are parsed as hexadecimal byte values.
702 idx = s.IndexOf ("->");
703 t.AddDiacriticalMap (
704 byte.Parse (s.Substring (d.Length, idx - d.Length).Trim (),
705 NumberStyles.HexNumber),
706 byte.Parse (s.Substring (idx + 2).Trim (),
707 NumberStyles.HexNumber));
710 idx = s.IndexOf (':');
712 string source = s.Substring (0, idx).Trim ();
// The right-hand side is split on single spaces; up to four entries are
// read as hexadecimal bytes.
713 string [] l = s.Substring (idx + 1).Trim ().Split (' ');
714 byte [] b = new byte [4];
715 for (int i = 0; i < 4; i++) {
719 b [i] = byte.Parse (l [i],
720 NumberStyles.HexNumber);
722 t.AddSortKeyMap (ParseTailoringSourceValue (source),
725 idx = s.IndexOf ('=');
727 t.AddReplacementMap (
728 ParseTailoringSourceValue (
729 s.Substring (0, idx).Trim ()),
730 ParseTailoringSourceValue (
731 s.Substring (idx + 1).Trim ()));
// Reads a DerivedAge-style file: each data line is "<cp>[..<cpEnd>] ;
// <age>", where the code points are hexadecimal, with '#' starting a
// comment.
//   filename - path of the file to read.
// The parsed age is applied to every code point of the range; entry 0 of
// unicodeAge is finally forced to double.MaxValue.
734 void ParseDerivedAge (string filename)
736 using (StreamReader file =
737 new StreamReader (filename)) {
738 while (file.Peek () >= 0) {
739 string s = file.ReadLine ();
740 int idx = s.IndexOf ('#');
742 s = s.Substring (0, idx);
743 idx = s.IndexOf (';');
747 string cpspec = s.Substring (0, idx);
// A ".." inside the first field marks a range; otherwise it is a single
// code point.
748 idx = cpspec.IndexOf ("..");
749 NumberStyles nf = NumberStyles.HexNumber |
750 NumberStyles.AllowTrailingWhite;
751 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
752 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
753 string value = s.Substring (cpspec.Length + 1).Trim ();
// Only the first code point of a range is tested against char.MaxValue.
756 if (cp > char.MaxValue)
// NOTE(review): double.Parse without a format provider uses the current
// culture; under a culture with ',' as decimal separator a value such as
// "3.1" would not parse as intended - confirm the tool's expected culture.
759 double v = double.Parse (value);
760 for (int i = cp; i <= cpEnd; i++)
764 unicodeAge [0] = double.MaxValue; // never be supported
// Reads a UnicodeData-style file and hands every line to
// ProcessUnidataLine, collecting the decomposition values of all lines in
// one flat list that is stored in this.decompValues as an int [] at the
// end.
//   filename - path of the file to read.
// On an exception the 1-based line number is written to standard error.
// NOTE(review): whether the exception is rethrown afterwards is not
// visible in this excerpt - confirm.
767 void ParseUnidata (string filename)
// The local list has the same name as the field assigned below
// (this.decompValues).
769 ArrayList decompValues = new ArrayList ();
770 using (StreamReader unidata =
771 new StreamReader (filename)) {
772 for (int line = 1; unidata.Peek () >= 0; line++) {
774 ProcessUnidataLine (unidata.ReadLine (), decompValues);
775 } catch (Exception) {
776 Console.Error.WriteLine ("**** At line " + line);
781 this.decompValues = (int [])
782 decompValues.ToArray (typeof (int));
// State carried between successive ProcessUnidataLine calls: the Latin
// base letter chosen for an earlier line, used as the fallback target
// when the current line yields none.
785 char previousLatinTarget = char.MinValue;
// One counter per letter 'A'..'Z'; ProcessUnidataLine increments the slot
// for a letter and derives a diacritical weight (counter + 0x7C) from it.
786 byte [] diacriticalOffset = new byte ['Z' - 'A' + 1];
// Processes one line of UnicodeData.txt and records everything the
// generator needs for that code point: small-capital flag, Latin base
// letter grouping, arrow and box-drawing ordering values, sortable symbol
// names, diacritical weight, Arabic letter primary value, non-JIS Japanese
// square letters, decomposition type and values, and the numeric value.
//   s            - the raw data line (semicolon-separated fields).
//   decompValues - flat list shared by all lines; the decomposition code
//                  points of this line are appended to it.
// Many checks below search the whole line 's' rather than only the name
// field, which is why several patterns include the trailing ';'.
788 void ProcessUnidataLine (string s, ArrayList decompValues)
790 int idx = s.IndexOf ('#');
792 s = s.Substring (0, idx);
793 idx = s.IndexOf (';');
// First field: the code point in hexadecimal. 'values' holds the remaining
// fields, so values [0] is the character name.
796 int cp = int.Parse (s.Substring (0, idx), NumberStyles.HexNumber);
797 string [] values = s.Substring (idx + 1).Split (';');
800 if (cp > char.MaxValue)
802 if (IsIgnorable (cp))
805 string name = values [0];
807 // SPECIAL CASE: rename some characters for diacritical
808 // remapping. FIXME: why are they different?
809 // FIXME: it's still not working.
810 if (cp == 0x018B || cp == 0x018C)
811 name = name.Replace ("TOPBAR", "STROKE");
814 if (s.IndexOf ("SMALL CAPITAL") > 0)
815 isSmallCapital [cp] = true;
817 // latin mapping by character name
818 if (s.IndexOf ("LATIN") >= 0) {
// 15 is the length of "LETTER DOTLESS "; 'offset' is meant to point at the
// first character after the matched "LETTER ..." prefix.
819 int lidx = s.IndexOf ("LETTER DOTLESS ");
820 int offset = lidx + 15;
822 lidx = s.IndexOf ("LETTER TURNED ");
826 lidx = s.IndexOf ("LETTER CAPITAL ");
830 lidx = s.IndexOf ("LETTER SCRIPT ");
834 lidx = s.IndexOf ("LETTER ");
// c: candidate base letter; n: the character right after it.
837 char c = lidx > 0 ? s [offset] : char.MinValue;
838 char n = s [offset + 1];
839 char target = char.MinValue;
// NOTE(review): '&&' binds tighter than '||', so this condition reads
// ((A <= c && c <= Z && n == ' ') || n == ';'); the range check on c does
// not apply when n is ';'. If a single-letter test was intended, the
// parentheses should enclose (n == ' ' || n == ';'). Changing it would
// alter the generated tables, so confirm before touching it.
840 if ('A' <= c && c <= 'Z' &&
841 (n == ' ') || n == ';') {
843 // FIXME: After 'Z', I cannot reset this state.
844 previousLatinTarget = c == 'Z' ? char.MinValue : c;
// Letters whose names are words rather than a single A-Z character.
847 if (s.Substring (offset).StartsWith ("ALPHA"))
849 else if (s.Substring (offset).StartsWith ("TONE SIX"))
851 else if (s.Substring (offset).StartsWith ("OPEN O"))
853 else if (s.Substring (offset).StartsWith ("SCHWA"))
855 else if (s.Substring (offset).StartsWith ("ENG"))
857 else if (s.Substring (offset).StartsWith ("OI;")) // 01A2,01A3
859 else if (s.Substring (offset).StartsWith ("YR;")) // 01A2,01A3
861 else if (s.Substring (offset).StartsWith ("TONE TWO"))
863 else if (s.Substring (offset).StartsWith ("ESH"))
// No target found on this line: fall back to the one remembered from an
// earlier line.
866 if (target == char.MinValue)
867 target = previousLatinTarget;
869 if (target != char.MinValue) {
870 ArrayList entry = (ArrayList) latinMap [target];
872 entry = new ArrayList ();
873 latinMap [target] = entry;
876 // FIXME: This secondary weight is hack.
877 // They are here because they must not
878 // be identical to the corresponding
880 if (c != target && diacritical [cp] == 0) {
881 diacriticalOffset [c - 'A']++;
882 diacritical [cp] = (byte) (diacriticalOffset [c - 'A'] + 0x7C);
// Arrow ordering value for code points in U+2000..U+2FFF.
888 if (0x2000 <= cp && cp < 0x3000) {
890 // SPECIAL CASES. FIXME: why?
892 case 0x21C5: value = -1; break; // E2
893 case 0x261D: value = 1; break;
894 case 0x27A6: value = 3; break;
895 case 0x21B0: value = 7; break;
896 case 0x21B1: value = 3; break;
897 case 0x21B2: value = 7; break;
898 case 0x21B4: value = 5; break;
899 case 0x21B5: value = 7; break;
900 case 0x21B9: value = -1; break; // E1
901 case 0x21CF: value = 7; break;
902 case 0x21D0: value = 3; break;
904 string [] arrowTargets = new string [] {
// The scan starts at index 1 and only runs while no special-case value was
// assigned above.
916 for (int i = 1; value == 0 && i < arrowTargets.Length; i++)
917 if (s.IndexOf (arrowTargets [i]) > 0 &&
918 s.IndexOf ("BARB " + arrowTargets [i]) < 0 &&
919 s.IndexOf (" OVER") < 0
923 arrowValues.Add (new DictionaryEntry (
// Box drawing / block / geometric shape ordering value for
// U+2500..U+25FF.
928 if (0x2500 <= cp && cp < 0x2600) {
931 // up:1 down:2 right:4 left:8 vert:16 horiz:32
934 // [dr] [dl] [ur] [ul]
938 ArrayList flags = new ArrayList (new int [] {
941 4 + 2, 8 + 2, 4 + 1, 8 + 1,
942 16 + 4, 1 + 2 + 4, 16 + 8, 1 + 2 + 8,
943 32 + 2, 4 + 8 + 2, 32 + 1, 4 + 8 + 1,
944 16 + 32, 1 + 2 + 4 + 8, 4 + 8 + 16, 1 + 2 + 32
946 byte [] offsets = new byte [] {
953 if (s.IndexOf ("BOX DRAWINGS ") >= 0) {
955 if (s.IndexOf (" UP") >= 0)
957 if (s.IndexOf (" DOWN") >= 0)
959 if (s.IndexOf (" RIGHT") >= 0)
961 if (s.IndexOf (" LEFT") >= 0)
963 if (s.IndexOf (" VERTICAL") >= 0)
965 if (s.IndexOf (" HORIZONTAL") >= 0)
// Look the direction bit combination up in 'flags'; an unknown combination
// yields the negative index returned by IndexOf.
968 int fidx = flags.IndexOf (flag);
969 value = fidx < 0 ? fidx : offsets [fidx];
970 } else if (s.IndexOf ("BLOCK") >= 0) {
971 if (s.IndexOf ("ONE EIGHTH") >= 0)
973 else if (s.IndexOf ("ONE QUARTER") >= 0)
975 else if (s.IndexOf ("THREE EIGHTHS") >= 0)
977 else if (s.IndexOf ("HALF") >= 0)
979 else if (s.IndexOf ("FIVE EIGHTHS") >= 0)
981 else if (s.IndexOf ("THREE QUARTERS") >= 0)
983 else if (s.IndexOf ("SEVEN EIGHTHS") >= 0)
988 else if (s.IndexOf ("SHADE") >= 0)
990 else if (s.IndexOf ("SQUARE") >= 0)
// "VERTICAL RECTANGLE" has to be tested before the plain "RECTANGLE"
// substring, which would match it as well.
992 else if (s.IndexOf ("VERTICAL RECTANGLE") >= 0)
994 else if (s.IndexOf ("RECTANGLE") >= 0)
996 else if (s.IndexOf ("PARALLELOGRAM") >= 0)
// The shape values below are written as (0xNN - 0xE5), i.e. relative to a
// base of 0xE5.
998 else if (s.IndexOf ("TRIANGLE") >= 0) {
999 if (s.IndexOf ("UP-POINTING") >= 0)
1000 value = 0xC0 - 0xE5;
1001 else if (s.IndexOf ("RIGHT-POINTING") >= 0)
1002 value = 0xC1 - 0xE5;
1003 else if (s.IndexOf ("DOWN-POINTING") >= 0)
1004 value = 0xC2 - 0xE5;
1005 else if (s.IndexOf ("LEFT-POINTING") >= 0)
1006 value = 0xC3 - 0xE5;
1008 else if (s.IndexOf ("POINTER") >= 0) {
1009 if (s.IndexOf ("RIGHT-POINTING") >= 0)
1010 value = 0xC4 - 0xE5;
1011 else if (s.IndexOf ("LEFT-POINTING") >= 0)
1012 value = 0xC5 - 0xE5;
1014 else if (s.IndexOf ("DIAMOND") >= 0)
1015 value = 0xC6 - 0xE5;
1016 else if (s.IndexOf ("FISHEYE") >= 0)
1017 value = 0xC7 - 0xE5;
1018 else if (s.IndexOf ("LOZENGE") >= 0)
1019 value = 0xC8 - 0xE5;
1020 else if (s.IndexOf ("BULLSEYE") >= 0)
1021 value = 0xC9 - 0xE5;
1022 else if (s.IndexOf ("CIRCLE") >= 0) {
1023 if (cp == 0x25D6) // it could be IndexOf ("LEFT HALF BLACK CIRCLE")
1024 value = 0xCA - 0xE5;
1025 else if (cp == 0x25D7) // it could be IndexOf ("RIGHT HALF BLACK CIRCLE")
1026 value = 0xCB - 0xE5;
1028 value = 0xC9 - 0xE5;
// U+25DA..U+25E5 get consecutive values starting at (0xCD - 0xE5).
1030 if (0x25DA <= cp && cp <= 0x25E5)
1031 value = 0xCD + cp - 0x25DA - 0xE5;
1033 // SPECIAL CASE: BOX DRAWING DIAGONAL patterns
1035 case 0x2571: value = 0xF; break;
1036 case 0x2572: value = 0x10; break;
1037 case 0x2573: value = 0x11; break;
1040 boxValues.Add (new DictionaryEntry (
1044 // For some characters store the name and sort later
1045 // to determine sorting.
1046 if (0x2100 <= cp && cp <= 0x213F &&
1047 Char.IsSymbol ((char) cp))
1048 sortableCharNames.Add (
1049 new DictionaryEntry (cp, name));
// For U+3380..U+33DD the first 7 characters of the name are dropped before
// it is stored.
1050 else if (0x3380 <= cp && cp <= 0x33DD)
1051 sortableCharNames.Add (new DictionaryEntry (
1052 cp, name.Substring (7)));
1054 if (Char.GetUnicodeCategory ((char) cp) ==
1055 UnicodeCategory.MathSymbol) {
1056 if (name.StartsWith ("CIRCLED "))
1057 diacritical [cp] = 0xEE;
1058 if (name.StartsWith ("SQUARED "))
1059 diacritical [cp] = 0xEF;
1062 // diacritical weights by character name
// This length check is repeated on every processed line although neither
// table changes between lines.
1063 if (diacritics.Length != diacriticWeights.Length)
1064 throw new Exception (String.Format ("Should not happen. weights are {0} while labels are {1}", diacriticWeights.Length, diacritics.Length));
1065 for (int d = 0; d < diacritics.Length; d++) {
// The weight is added, not assigned, so several matching fragments
// accumulate in the byte value.
1066 if (s.IndexOf (diacritics [d]) > 0) {
1067 diacritical [cp] += diacriticWeights [d];
1068 if (s.IndexOf ("COMBINING") >= 0)
1069 diacritical [cp] -= (byte) 2;
1072 // also process "COMBINING blah" here
1073 // For now it is limited to cp < 0x0370
1074 // if (cp < 0x0300 || cp >= 0x0370)
// Build the "COMBINING ..." spelling of the fragment: drop the trailing
// ';' and a leading "WITH".
1076 string tmp = diacritics [d].TrimEnd (';');
1077 if (tmp.IndexOf ("WITH ") == 0)
1078 tmp = tmp.Substring (4);
1079 tmp = String.Concat ("COMBINING", (tmp [0] != ' ' ? " " : ""), tmp);
1081 diacritical [cp] = (byte) (diacriticWeights [d] - 2);
1083 //Console.Error.WriteLine ("======= {2:X04} : '{0}' / '{1}'", name, tmp, cp);
1085 // Two-step grep required for it.
1086 if (s.IndexOf ("FULL STOP") > 0 &&
1087 (s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0))
1088 diacritical [cp] |= 0xF4;
1090 // Arabic letter name
1091 if (0x0621 <= cp && cp <= 0x064A &&
1092 Char.GetUnicodeCategory ((char) cp)
1093 == UnicodeCategory.OtherLetter) {
// Default primary value: grows by 4 for every distinct letter name
// recorded so far, starting at 0x0B.
1094 byte value = (byte) (arabicNameMap.Count * 4 + 0x0B);
1099 // hamza, waw, yeh ... special cases.
1104 value = 0x77; // special cases.
1107 // Get primary letter name i.e.
1108 // XXX part of ARABIC LETTER XXX yyy
1109 // e.g. that of "TEH MARBUTA" is "TEH".
1112 // 0x0640 is special: it does
1113 // not start with ARABIC LETTER
1115 name.Substring (14);
1116 int tmpIdx = letterName.IndexOf (' ');
1117 letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
1118 //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
// A letter whose base name was already recorded reuses the primary value
// of the code point stored for that name.
1119 if (arabicNameMap.ContainsKey (letterName))
1120 value = (byte) arabicLetterPrimaryValues [arabicNameMap [letterName]];
1122 arabicNameMap [letterName] = cp;
1125 arabicLetterPrimaryValues [cp] = value;
1128 // Japanese square letter
1129 if (0x3300 <= cp && cp <= 0x3357)
1130 if (!ExistsJIS (cp))
1131 nonJisJapanese.Add (new NonJISCharacter (cp, name));
1133 // normalizationType
// values [4] is the decomposition field: an optional "<tag>" followed by
// space-separated hexadecimal code points.
1134 string decomp = values [4];
1135 idx = decomp.IndexOf ('<');
1137 switch (decomp.Substring (idx + 1, decomp.IndexOf ('>') - 1)) {
1139 decompType [cp] = DecompositionFull;
1142 decompType [cp] = DecompositionSub;
1145 decompType [cp] = DecompositionSuper;
1148 decompType [cp] = DecompositionSmall;
1151 decompType [cp] = DecompositionIsolated;
1154 decompType [cp] = DecompositionInitial;
1157 decompType [cp] = DecompositionFinal;
1160 decompType [cp] = DecompositionMedial;
1163 decompType [cp] = DecompositionNoBreak;
1166 decompType [cp] = DecompositionCompat;
1169 decompType [cp] = DecompositionFraction;
1172 decompType [cp] = DecompositionFont;
1175 decompType [cp] = DecompositionCircle;
1178 decompType [cp] = DecompositionSquare;
1181 decompType [cp] = DecompositionWide;
1184 decompType [cp] = DecompositionNarrow;
1187 decompType [cp] = DecompositionVertical;
// NOTE(review): the message reads "Support NFKD type"; presumably
// "Unsupported" was meant - confirm before changing the runtime text.
1190 throw new Exception ("Support NFKD type : " + decomp);
1194 decompType [cp] = DecompositionCanonical;
// Strip the "<tag> " prefix (the '>' plus the following space) when one is
// present.
1195 decomp = idx < 0 ? decomp : decomp.Substring (decomp.IndexOf ('>') + 2);
1196 if (decomp.Length > 0) {
1198 string [] velems = decomp.Split (' ');
// Remember where this code point's values start in the shared flat list
// and how many there are.
1199 int didx = decompValues.Count;
1200 decompIndex [cp] = didx;
1201 foreach (string v in velems)
1202 decompValues.Add (int.Parse (v, NumberStyles.HexNumber));
1203 decompLength [cp] = velems.Length;
1205 // [decmpType] -> this_cp
1206 int targetCP = (int) decompValues [didx];
1207 // for "(x)" it specially maps to 'x' .
1208 // FIXME: check if it is sane
1209 if (velems.Length == 3 &&
1210 (int) decompValues [didx] == '(' &&
1211 (int) decompValues [didx + 2] == ')')
1212 targetCP = (int) decompValues [didx + 1];
1213 // special: 0x215F "1/"
1214 else if (cp == 0x215F)
1216 else if (velems.Length > 1 &&
1217 (targetCP < 0x4C00 || 0x9FBB < targetCP))
1218 // skip them, except for CJK ideograph compat
// Reverse map: decomposition target -> (decomposition type -> this cp).
1221 if (targetCP != 0) {
1222 Hashtable entry = (Hashtable) nfkdMap [targetCP];
1223 if (entry == null) {
1224 entry = new Hashtable ();
1225 nfkdMap [targetCP] = entry;
1227 entry [(byte) decompType [cp]] = cp;
// Numeric value: the first non-empty of values [5], values [6] and
// values [7] is used.
// NOTE(review): decimal.Parse without a format provider uses the current
// culture - confirm the tool's expected culture.
1231 if (values [5].Length > 0)
1232 decimalValue [cp] = decimal.Parse (values [5]);
1233 else if (values [6].Length > 0)
1234 decimalValue [cp] = decimal.Parse (values [6]);
1235 else if (values [7].Length > 0) {
1236 string decstr = values [7];
1237 idx = decstr.IndexOf ('/');
1238 if (cp == 0x215F) // special. "1/"
1239 decimalValue [cp] = 0x1;
// "a/b" is evaluated as a decimal quotient.
1243 decimal.Parse (decstr.Substring (0, idx))
1244 / decimal.Parse (decstr.Substring (idx + 1));
// "(n)": the value between the parentheses.
1245 else if (decstr [0] == '(' &&
1246 decstr [decstr.Length - 1] == ')')
1249 decimal.Parse (decstr.Substring (1, decstr.Length - 2));
// "n.": the value without the trailing dot.
1250 else if (decstr [decstr.Length - 1] == '.')
1253 decimal.Parse (decstr.Substring (0, decstr.Length - 1));
1255 decimalValue [cp] = decimal.Parse (decstr);
// Reads a DerivedCoreProperties-style file and passes every line to
// ProcessDerivedCorePropLine.
//   filename - path of the file to read.
// On an exception the 1-based line number is written to standard error.
// NOTE(review): whether the exception is rethrown afterwards is not
// visible in this excerpt - confirm.
1259 void ParseDerivedCoreProperties (string filename)
1262 using (StreamReader file =
1263 new StreamReader (filename)) {
1264 for (int line = 1; file.Peek () >= 0; line++) {
1266 ProcessDerivedCorePropLine (file.ReadLine ());
1267 } catch (Exception) {
1268 Console.Error.WriteLine ("**** At line " + line);
// Processes one "<cp>[..<cpEnd>] ; <property>" line: strips the '#'
// comment, parses the hexadecimal code point or range, and marks the
// covered code points in isUppercase.
//   s - the raw line.
// NOTE(review): the test on 'value' that selects which property lines
// trigger the marking is on a line not visible in this excerpt.
1275 void ProcessDerivedCorePropLine (string s)
1277 int idx = s.IndexOf ('#');
1279 s = s.Substring (0, idx);
1280 idx = s.IndexOf (';');
1283 string cpspec = s.Substring (0, idx);
// A ".." inside the first field marks a range; otherwise it is a single
// code point.
1284 idx = cpspec.IndexOf ("..");
1285 NumberStyles nf = NumberStyles.HexNumber |
1286 NumberStyles.AllowTrailingWhite;
1287 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1288 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
1289 string value = s.Substring (cpspec.Length + 1).Trim ();
// NOTE(review): only the start of the range is compared with
// char.MaxValue; no clamp of cpEnd is visible here - confirm that a range
// cannot straddle U+FFFF.
1292 if (cp > char.MaxValue)
1297 for (int x = cp; x <= cpEnd; x++)
1298 isUppercase [x] = true;
// Reads the scripts file named by `filename` and collects code points into
// four lists (Gurmukhi, Gujarati, Georgian, Thaana), leaving out those for
// which IsIgnorable returns true. Each list is then sorted with
// UCAComparer.Instance and stored in the matching ordered* char array.
// NOTE(review): the tests on `value` that choose the list, the guards on
// `idx` and the action taken when cp > char.MaxValue are not visible in this
// excerpt -- confirm.
1303 void ParseScripts (string filename)
1305 ArrayList gurmukhi = new ArrayList ();
1306 ArrayList gujarati = new ArrayList ();
1307 ArrayList georgian = new ArrayList ();
1308 ArrayList thaana = new ArrayList ();
1310 using (StreamReader file =
1311 new StreamReader (filename)) {
1312 while (file.Peek () >= 0) {
1313 string s = file.ReadLine ();
// Cut off everything from '#' onwards, then split at ';'.
1314 int idx = s.IndexOf ('#');
1316 s = s.Substring (0, idx);
1317 idx = s.IndexOf (';');
1321 string cpspec = s.Substring (0, idx);
// The spec is a single hexadecimal code point or a "start..end" range.
1322 idx = cpspec.IndexOf ("..");
1323 NumberStyles nf = NumberStyles.HexNumber |
1324 NumberStyles.AllowTrailingWhite;
1325 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1326 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
1327 string value = s.Substring (cpspec.Length + 1).Trim ();
1330 if (cp > char.MaxValue)
// Each loop below adds the non-ignorable code points of cp..cpEnd
// (inclusive) to one of the four lists.
1335 for (int x = cp; x <= cpEnd; x++)
1336 if (!IsIgnorable (x))
1337 gurmukhi.Add ((char) x);
1340 for (int x = cp; x <= cpEnd; x++)
1341 if (!IsIgnorable (x))
1342 gujarati.Add ((char) x);
1345 for (int x = cp; x <= cpEnd; x++)
1346 if (!IsIgnorable (x))
1347 georgian.Add ((char) x);
1350 for (int x = cp; x <= cpEnd; x++)
1351 if (!IsIgnorable (x))
1352 thaana.Add ((char) x);
// Order every list with the UCA comparer before freezing it into an array.
1357 gurmukhi.Sort (UCAComparer.Instance);
1358 gujarati.Sort (UCAComparer.Instance);
1359 georgian.Sort (UCAComparer.Instance);
1360 thaana.Sort (UCAComparer.Instance);
1361 orderedGurmukhi = (char []) gurmukhi.ToArray (typeof (char));
1362 orderedGujarati = (char []) gujarati.ToArray (typeof (char));
1363 orderedGeorgian = (char []) georgian.ToArray (typeof (char));
1364 orderedThaana = (char []) thaana.ToArray (typeof (char));
// Reads the file named by `filename` line by line and passes every line to
// ProcessJISOrderLine. If an exception is raised, the current value of
// `line` is written to standard error.
// NOTE(review): the declaration and initial value of `line`, and what
// happens after the diagnostic is printed, are not visible in this excerpt.
1367 void ParseJISOrder (string filename)
1371 using (StreamReader file =
1372 new StreamReader (filename)) {
1373 for (;file.Peek () >= 0; line++)
1374 ProcessJISOrderLine (file.ReadLine ());
1376 } catch (Exception) {
1377 Console.Error.WriteLine ("---- line {0}", line);
// Column separators (tab and space) searched for by ProcessJISOrderLine.
1382 char [] ws = new char [] {'\t', ' '};
// Parses one line `s` of the JIS order file and appends a JISCharacter to
// jisJapanese. Everything from '#' onwards is cut off and the rest trimmed;
// the text before the first tab or space is the JIS code and the text after
// it is the Unicode code point. Both are hexadecimal with a two-character
// prefix that is skipped before parsing.
// NOTE(review): the guards on `idx` (no '#', no separator) are not visible
// in this excerpt -- confirm.
1384 void ProcessJISOrderLine (string s)
1386 int idx = s.IndexOf ('#');
1388 s = s.Substring (0, idx).Trim ();
1391 idx = s.IndexOfAny (ws);
1394 // They start with "0x" so cut them out.
1395 int jis = int.Parse (s.Substring (2, idx - 2), NumberStyles.HexNumber);
1396 int cp = int.Parse (s.Substring (idx).Trim ().Substring (2), NumberStyles.HexNumber);
1397 jisJapanese.Add (new JISCharacter (cp, jis));
// Fills the CJK weight tables. Four sections are visible below:
//  * Chinese Simplified:  characters of the 'pinyin' collation rule text
//    receive consecutive values of `v`.
//  * Chinese Traditional: same, using the 'stroke' collation rule text.
//  * Japanese: a few fixed entries, then consecutive values in the order of
//    jisJapanese, with compatibility characters placed after their base.
//  * Korean: every character of a rule gets the value `p` computed from the
//    preceding <reset> Hangul syllable, plus a level 2 value in cjkKOlv2.
// NOTE(review): the declarations of arr, v, s, offset, category and p, the
// document loads for zhXML/jaXML/koXML and several conditions are not
// visible in this excerpt; the notes below describe only what is shown.
1400 void ParseCJK (string zhXML, string jaXML, string koXML)
1402 XmlDocument doc = new XmlDocument ();
// No resolver: external resources referenced by the XML are not fetched.
1403 doc.XmlResolver = null;
1410 // Chinese Simplified
1413 offset = 0;//char.MaxValue - arr.Length;
1415 s = doc.SelectSingleNode ("/ldml/collations/collation[@type='pinyin']/rules/pc").InnerText;
1417 foreach (char c in s) {
1419 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
// The position of a character in the rule text decides its weight.
1421 arr [(int) c - offset] = (ushort) v++;
1427 // Chinese Traditional
1430 offset = 0;//char.MaxValue - arr.Length;
1431 s = doc.SelectSingleNode ("/ldml/collations/collation[@type='stroke']/rules/pc").InnerText;
1433 foreach (char c in s) {
1435 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1437 arr [(int) c - offset] = (ushort) v++;
// Japanese section (driven by jisJapanese rather than by an XML rule text).
1446 offset = 0;//char.MaxValue - arr.Length;
1449 arr [0x4EDD] = 0x8002; // Chinese repetition mark?
1450 arr [0x337B] = 0x8004; // Those 4 characters are Gengou
1451 arr [0x337E] = 0x8005;
1452 arr [0x337D] = 0x8006;
1453 arr [0x337C] = 0x8007;
1456 foreach (JISCharacter jc in jisJapanese) {
// NOTE(review): the statement guarded by this test is not visible here;
// presumably entries below 0x8800 are skipped -- confirm.
1457 if (jc.JIS < 0x8800)
1459 char c = (char) jc.CP;
1462 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1464 arr [(int) c - offset] = (ushort) v++;
// Special cases keyed on single ideographs; the statements they guard are
// not visible in this excerpt.
1469 if (c == '\u662D') // U+337C
1471 if (c == '\u5927') // U+337D
1473 if (c == '\u5E73') // U+337B
1475 if (c == '\u660E') // U+337E
1477 if (c == '\u9686') // U+F9DC
1480 // FIXME: there are still remaining
1481 // characters after U+FA0C.
1482 // for (int k = 0; k < char.MaxValue; k++) {
// Scan code points below U+FA0D whose decomposition refers to c.
1483 for (int k = 0; k < '\uFA0D'; k++) {
1484 if (decompIndex [k] == 0 || IsIgnorable (k))
// Matches when the first decomposition value is c, or when the
// decomposition has length 3 and its second value is c
// (&& binds tighter than ||).
1486 if (decompValues [decompIndex [k]] == c /*&&
1487 decompLength [k] == 1*/ ||
1488 decompLength [k] == 3 &&
1489 decompValues [decompIndex [k] + 1] == c) {
1490 arr [k - offset] = (ushort) v++;
1499 // Korean weight is somewhat complex. It first shifts
1500 // Hangul category from 52-x to 80-x (they are computed
1501 // anyway). CJK ideographs are placed at secondary
1502 // weight, like XX YY 01 zz 01, where XX and YY are
1503 // corresponding "reset" value and zz is 41,43,45...
1505 // Unlike chs,cht and ja, Korean value is a combined
1506 // ushort which is computed as category
1510 offset = 0;//char.MaxValue - arr.Length;
1512 foreach (XmlElement reset in doc.SelectNodes ("/ldml/collations/collation/rules/reset")) {
// The element following <reset> carries the characters to place.
1513 XmlElement sc = (XmlElement) reset.NextSibling;
1514 // compute "category" and "level 1" for the
1515 // target "reset" Hangul syllable
1516 char rc = reset.InnerText [0];
// 1-based position of the syllable counted from U+AC00.
1517 int ri = ((int) rc - 0xAC00) + 1;
1519 ((ri / 254) * 256 + (ri % 254) + 2);
1520 // Place the characters after the target.
1523 foreach (char c in s) {
1524 arr [(int) c - offset] = p;
1525 cjkKOlv2 [(int) c - offset] = (byte) v;
// Fills ignorableFlags for every UTF-16 code unit (0 .. char.MaxValue).
// Bit 1 is set when IsIgnorable is true, bit 2 when IsIgnorableSymbol is
// true and bit 4 when IsIgnorableNonSpacing is true.
// NOTE(review): the statement guarded by the OtherNotAssigned test is not
// visible in this excerpt; presumably unassigned code points are skipped.
1535 void FillIgnorables ()
1537 for (int i = 0; i <= char.MaxValue; i++) {
1538 if (Char.GetUnicodeCategory ((char) i) ==
1539 UnicodeCategory.OtherNotAssigned)
1541 if (IsIgnorable (i))
1542 ignorableFlags [i] |= 1;
1543 if (IsIgnorableSymbol (i))
1544 ignorableFlags [i] |= 2;
1545 if (IsIgnorableNonSpacing (i))
1546 ignorableFlags [i] |= 4;
// Patches the tables parsed from the Unicode data (decompType, decompIndex,
// decompLength, decompValues, diacritical) with hand-written overrides.
// The statements are order dependent: several of them read decompIndex
// entries that are reassigned a few lines later.
1550 void ModifyUnidata ()
1552 // Modify some decomposition equivalence
// U+FE31 and U+FE32 lose their decomposition entirely.
1553 decompType [0xFE31] = 0;
1554 decompIndex [0xFE31] = 0;
1555 decompLength [0xFE31] = 0;
1556 decompType [0xFE32] = 0;
1557 decompIndex [0xFE32] = 0;
1558 decompLength [0xFE32] = 0;
1560 // Korean parens numbers
1561 for (int i = 0x3200; i <= 0x321C; i++)
1562 diacritical [i] = 0xA;
1563 for (int i = 0x3260; i <= 0x327B; i++)
1564 diacritical [i] = 0xC;
1566 // LAMESPEC: these remappings should not be done.
1567 // Windows has incorrect CJK compat mappings.
// Each line overwrites the first decomposition value of the code point; for
// U+323B and U+3238 the decomposition is also shortened to one element.
1568 decompValues [decompIndex [0x32A9]] = 0x91AB;
1569 decompLength [0x323B] = 1;
1570 decompValues [decompIndex [0x323B]] = 0x5B78;
1571 decompValues [decompIndex [0x32AB]] = 0x5B78;
1572 decompValues [decompIndex [0x32A2]] = 0x5BEB;
1573 decompLength [0x3238] = 1;
1574 decompValues [decompIndex [0x3238]] = 0x52DE;
1575 decompValues [decompIndex [0x3298]] = 0x52DE;
1577 // LAMESPEC: custom remapping (these are not bugs, but they are not standard compliant)
// U+FA0C takes over the decompValues slot of U+F929 and maps to U+5140;
// U+F929 is then left without a decomposition.
1578 decompIndex [0xFA0C] = decompIndex [0xF929]; // borrow U+F929 room (being empty)
1579 decompValues [decompIndex [0xFA0C]] = 0x5140;
1580 decompLength [0xFA0C] = 1;
1581 decompIndex [0xF929] = decompLength [0xF929] = 0;
1583 decompValues [decompIndex [0xF92C]] = 0x90DE;
// Adjusts values collected by the parsers:
//  1. assigns secondary (diacritical) weights to number characters inside
//     the ranges listed in numberSecondaryWeightBounds;
//  2. renames, or removes, selected entries of sortableCharNames.
// NOTE(review): the declaration of `weight`, the switch header and the case
// labels that lead to RemoveAt are not visible in this excerpt.
1586 void ModifyParsedValues ()
1588 // number, secondary weights
// numarr is read as (start, end) pairs with an exclusive end; every pair
// gets the next value of `weight`.
1590 int [] numarr = numberSecondaryWeightBounds;
1591 for (int i = 0; i < numarr.Length; i += 2, weight++)
1592 for (int cp = numarr [i]; cp < numarr [i + 1]; cp++)
1593 if (Char.IsNumber ((char) cp))
1594 diacritical [cp] = weight;
1596 // Update name part of named characters
1597 for (int i = 0; i < sortableCharNames.Count; i++) {
// Each entry is a DictionaryEntry whose key is the code point.
1598 DictionaryEntry de =
1599 (DictionaryEntry) sortableCharNames [i];
1600 int cp = (int) de.Key;
1601 string renamed = null;
1603 case 0x2101: renamed = "A_1"; break;
1604 case 0x33C3: renamed = "A_2"; break;
1605 case 0x2105: renamed = "C_1"; break;
1606 case 0x2106: renamed = "C_2"; break;
1607 case 0x211E: renamed = "R1"; break;
1608 case 0x211F: renamed = "R2"; break;
1609 // Remove some of them!
1620 sortableCharNames.RemoveAt (i);
// Entries that got a new name are replaced in place, keeping the code point.
1624 if (renamed != null)
1625 sortableCharNames [i] =
1626 new DictionaryEntry (cp, renamed);
1630 void GenerateCore ()
1634 #region Specially ignored // 01
1635 // This will raise "Defined" flag up.
1636 foreach (char c in specialIgnore)
1637 map [(int) c] = new CharMapEntry (0, 0, 0);
1641 #region Variable weights
1642 // Controls : 06 03 - 06 3D
1644 for (int i = 0; i < 65536; i++) {
1645 if (IsIgnorable (i))
1648 uc = Char.GetUnicodeCategory (c);
1649 // NEL is whitespace but not ignored here.
1650 if (uc == UnicodeCategory.Control &&
1651 !Char.IsWhiteSpace (c) || c == '\u0085')
1652 AddCharMap (c, 6, 1);
1656 fillIndex [6] = 0x80;
1657 AddCharMapGroup ('\'', 6, 1, 0);
1658 AddCharMap ('\uFE63', 6, 1);
1660 // Hyphen/Dash : 06 81 - 06 90
1661 for (int i = 0; i < char.MaxValue; i++) {
1662 if (!IsIgnorable (i) &&
1663 Char.GetUnicodeCategory ((char) i) ==
1664 UnicodeCategory.DashPunctuation) {
1665 AddCharMapGroup2 ((char) i, 6, 1, 0);
1667 // SPECIAL: add 2027 and 2043
1668 // Maybe they are regarded the
1669 // same hyphens in "central"
1671 AddCharMap ('\u2027', 6, 1);
1672 AddCharMap ('\u2043', 6, 1);
1677 // Arabic variable weight chars 06 A0 -
1678 fillIndex [6] = 0xA0;
1680 for (int i = 0x64B; i <= 0x650; i++)
1681 AddArabicCharMap ((char) i);
1683 AddCharMapGroup ('\u0652', 6, 1, 0);
1685 AddCharMapGroup ('\u0651', 6, 1, 0);
1689 #region Nonspacing marks // 01
1690 // FIXME: 01 03 - 01 B6 ... annoyance :(
1692 // Combining diacritical marks: 01 DC -
1694 fillIndex [0x1] = 0x41;
1695 for (int i = 0x030E; i <= 0x0326; i++)
1696 if (!IsIgnorable (i))
1697 AddCharMap ((char) i, 0x1, 1);
1698 for (int i = 0x0329; i <= 0x0334; i++)
1699 if (!IsIgnorable (i))
1700 AddCharMap ((char) i, 0x1, 1);
1701 for (int i = 0x0339; i <= 0x0341; i++)
1702 if (!IsIgnorable (i))
1703 AddCharMap ((char) i, 0x1, 1);
1704 fillIndex [0x1] = 0x72;
1705 for (int i = 0x0346; i <= 0x0348; i++)
1706 if (!IsIgnorable (i))
1707 AddCharMap ((char) i, 0x1, 1);
1708 for (int i = 0x02BE; i <= 0x02BF; i++)
1709 if (!IsIgnorable (i))
1710 AddCharMap ((char) i, 0x1, 1);
1711 for (int i = 0x02C1; i <= 0x02C5; i++)
1712 if (!IsIgnorable (i))
1713 AddCharMap ((char) i, 0x1, 1);
1714 for (int i = 0x02CE; i <= 0x02CF; i++)
1715 if (!IsIgnorable (i))
1716 AddCharMap ((char) i, 0x1, 1);
1717 for (int i = 0x02D1; i <= 0x02D3; i++)
1718 if (!IsIgnorable (i))
1719 AddCharMap ((char) i, 0x1, 1);
1720 AddCharMap ('\u02DE', 0x1, 1);
1721 for (int i = 0x02E4; i <= 0x02E9; i++)
1722 if (!IsIgnorable (i))
1723 AddCharMap ((char) i, 0x1, 1);
1725 // FIXME: needs more love here (it should eliminate
1726 // all the hacky code above).
1727 for (int i = 0x0300; i < 0x0370; i++)
1728 if (!IsIgnorable (i) && diacritical [i] != 0
1729 /* especiall here*/ && !map [i].Defined)
1730 map [i] = new CharMapEntry (
1731 0x1, 0x1, diacritical [i]);
1733 fillIndex [0x1] = 0x8D;
1734 // syriac dotted nonspacing marks (1)
1735 AddCharMap ('\u0740', 0x1, 1);
1736 AddCharMap ('\u0741', 0x1, 1);
1737 AddCharMap ('\u0742', 0x1, 1);
1738 // syriac oblique nonspacing marks
1739 AddCharMap ('\u0747', 0x1, 1);
1740 AddCharMap ('\u0748', 0x1, 1);
1741 // syriac dotted nonspacing marks (2)
1742 fillIndex [0x1] = 0x94; // this reset is mandatory
1743 AddCharMap ('\u0732', 0x1, 1);
1744 AddCharMap ('\u0735', 0x1, 1);
1745 AddCharMap ('\u0738', 0x1, 1);
1746 AddCharMap ('\u0739', 0x1, 1);
1747 AddCharMap ('\u073C', 0x1, 1);
1748 // SPECIAL CASES: superscripts
1749 AddCharMap ('\u073F', 0x1, 1);
1750 AddCharMap ('\u0711', 0x1, 1);
1752 for (int i = 0x0743; i <= 0x0746; i++)
1753 AddCharMap ((char) i, 0x1, 1);
1754 for (int i = 0x0730; i <= 0x0780; i++)
1755 if (!map [i].Defined &&
1756 Char.GetUnicodeCategory ((char) i) ==
1757 UnicodeCategory.NonSpacingMark)
1758 AddCharMap ((char) i, 0x1, 1);
1760 // LAMESPEC: It should not stop at '\u20E1'. There are
1761 // a few more characters (that however results in
1762 // overflow of level 2 unless we start before 0xDD).
1763 fillIndex [0x1] = 0xDD;
1764 for (int i = 0x20d0; i <= 0x20e1; i++)
1765 AddCharMap ((char) i, 0x1, 1);
1767 // They are not part of Nonspacing marks, but have
1768 // only diacritical weight.
1769 for (int i = 0x3099; i <= 0x309C; i++)
1770 map [i] = new CharMapEntry (1, 1, 1);
1771 map [0xFF9E] = new CharMapEntry (1, 1, 1);
1772 map [0xFF9F] = new CharMapEntry (1, 1, 2);
1773 map [0x309D] = new CharMapEntry (0xFF, 0xFF, 1);
1774 map [0x309E] = new CharMapEntry (0xFF, 0xFF, 1);
1775 for (int i = 0x30FC; i <= 0x30FE; i++)
1776 map [i] = new CharMapEntry (0xFF, 0xFF, 1);
1781 #region Whitespaces // 07 03 -
1782 fillIndex [0x7] = 0x2;
1783 AddCharMap (' ', 0x7, 2);
1784 AddCharMap ('\u00A0', 0x7, 1);
1785 for (int i = 9; i <= 0xD; i++)
1786 AddCharMap ((char) i, 0x7, 1);
1787 for (int i = 0x2000; i <= 0x200B; i++)
1788 AddCharMap ((char) i, 0x7, 1);
1790 fillIndex [0x7] = 0x17;
1791 AddCharMapGroup ('\u2028', 0x7, 1, 0);
1792 AddCharMapGroup ('\u2029', 0x7, 1, 0);
1794 // Characters which used to represent layout control.
1795 // LAMESPEC: Windows developers seem to have thought
1796 // that those characters are kind of whitespaces,
1797 // while they aren't.
1798 AddCharMap ('\u2422', 0x7, 1, 0); // blank symbol
1799 AddCharMap ('\u2423', 0x7, 1, 0); // open box
1802 // category 09 - continued symbols from 08
1803 fillIndex [0x9] = 2;
1805 for (int cp = 0x2300; cp <= 0x237A; cp++)
1806 AddCharMap ((char) cp, 0x9, 1, 0);
1809 byte [] arrowLv2 = new byte [] {0, 3, 3, 3, 3, 3, 3, 3, 3};
1810 foreach (DictionaryEntry de in arrowValues) {
1811 int idx = (int) de.Value;
1812 int cp = (int) de.Key;
1813 if (map [cp].Defined)
1815 fillIndex [0x9] = (byte) (0xD8 + idx);
1816 AddCharMapGroup ((char) cp, 0x9, 0, arrowLv2 [idx]);
1820 byte [] boxLv2 = new byte [128];
1821 for (int i = 0; i < boxLv2.Length; i++)
1823 foreach (DictionaryEntry de in boxValues) {
1824 int cp = (int) de.Key;
1825 int off = (int) de.Value;
1826 if (map [cp].Defined)
1829 fillIndex [0x9] = (byte) (0xE5 + off);
1830 AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [-off]++);
1833 fillIndex [0x9] = (byte) (0xE5 + off);
1834 AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [off]++);
1837 // Some special characters (slanted)
1838 fillIndex [0x9] = 0xF4;
1839 AddCharMap ('\u2571', 0x9, 3);
1840 AddCharMap ('\u2572', 0x9, 3);
1841 AddCharMap ('\u2573', 0x9, 3);
1843 // FIXME: implement 0A
1845 fillIndex [0xA] = 2;
1846 // byte currency symbols
1847 for (int cp = 0; cp < 0x100; cp++) {
1848 uc = Char.GetUnicodeCategory ((char) cp);
1849 if (!IsIgnorable (cp) &&
1850 uc == UnicodeCategory.CurrencySymbol &&
1853 AddCharMapGroup ((char) cp, 0xA, 1, 0);
1855 // byte other symbols
1856 for (int cp = 0; cp < 0x100; cp++) {
1858 continue; // SPECIAL: skip FIXME: why?
1859 uc = Char.GetUnicodeCategory ((char) cp);
1860 if (!IsIgnorable (cp) &&
1861 uc == UnicodeCategory.OtherSymbol ||
1862 cp == '\u00B5' || cp == '\u00B7')
1863 AddCharMapGroup ((char) cp, 0xA, 1, 0);
1866 fillIndex [0xA] = 0x0F; // FIXME: it won't be needed
1867 for (int cp = 0x2020; cp <= 0x2031; cp++)
1868 if (Char.IsPunctuation ((char) cp))
1869 AddCharMap ((char) cp, 0xA, 1, 0);
1870 // SPECIAL CASES: why?
1871 AddCharMap ('\u203B', 0xA, 1, 0);
1872 AddCharMap ('\u2040', 0xA, 1, 0);
1873 AddCharMap ('\u2041', 0xA, 1, 0);
1874 AddCharMap ('\u2042', 0xA, 1, 0);
1876 for (int cp = 0x20A0; cp <= 0x20AB; cp++)
1877 AddCharMap ((char) cp, 0xA, 1, 0);
1878 fillIndex [0xA] = 0x2F; // FIXME: it won't be needed
1879 for (int cp = 0x2600; cp <= 0x2613; cp++)
1880 AddCharMap ((char) cp, 0xA, 1, 0);
1882 for (int cp = 0x2620; cp <= 0x2770; cp++)
1883 if (Char.IsSymbol ((char) cp))
1884 AddCharMap ((char) cp, 0xA, 1, 0);
1886 for (int i = 0x2440; i < 0x2460; i++)
1887 AddCharMap ((char) i, 0xA, 1, 0);
1891 #region Numbers // 0C 02 - 0C E1
1892 fillIndex [0xC] = 2;
1894 // 9F8 : Bengali "one less than the denominator"
1895 AddCharMap ('\u09F8', 0xC, 1);
1897 ArrayList numbers = new ArrayList ();
1898 for (int i = 0; i < 65536; i++)
1899 if (!IsIgnorable (i) &&
1900 Char.IsNumber ((char) i) &&
1901 (i < 0x3190 || 0x32C0 < i)) // they are CJK characters
1904 ArrayList numberValues = new ArrayList ();
1905 foreach (int i in numbers)
1906 numberValues.Add (new DictionaryEntry (i, decimalValue [(char) i]));
1907 numberValues.Sort (DecimalDictionaryValueComparer.Instance);
1909 //foreach (DictionaryEntry de in numberValues)
1910 //Console.Error.WriteLine ("****** number {0:X04} : {1} {2}", de.Key, de.Value, decompType [(int) de.Key]);
1912 decimal prevValue = -1;
1913 foreach (DictionaryEntry de in numberValues) {
1914 int cp = (int) de.Key;
1915 decimal currValue = (decimal) de.Value;
1916 bool addnew = false;
1917 if (prevValue < currValue &&
1918 prevValue - (int) prevValue == 0 &&
1922 // Process Hangzhou and Roman numbers
1924 // There are some SPECIAL cases.
1925 if (currValue != 4) // no increment for 4
1929 if (currValue <= 10) {
1930 xcp = (int) prevValue + 0x2170 - 1;
1931 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1932 xcp = (int) prevValue + 0x2160 - 1;
1933 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1934 fillIndex [0xC] += 2;
1935 xcp = (int) prevValue + 0x3021 - 1;
1936 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1939 else if (currValue == 11)
1942 if (prevValue < currValue)
1943 prevValue = currValue;
1944 if (map [cp].Defined)
1946 // Hangzhou and Roman numerals are added later
1948 else if (0x3021 <= cp && cp < 0x302A
1949 || 0x2160 <= cp && cp < 0x216A
1950 || 0x2170 <= cp && cp < 0x217A)
1953 if (cp == 0x215B) // FIXME: why?
1954 fillIndex [0xC] += 2;
1955 else if (cp == 0x3021) // FIXME: why?
1957 AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp]);
1958 if (addnew || cp <= '9') {
1959 int mod = (int) currValue - 1;
1961 if (1 <= currValue && currValue <= 10) {
1963 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1965 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1967 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1969 if (1 <= currValue && currValue <= 20) {
1971 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1973 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1975 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1979 if (cp != 0x09E7 && cp != 0x09EA)
1982 // Add special cases that are not regarded as
1983 // numbers in UnicodeCategory speak.
1986 AddCharMapGroup ('\u01BD', 0xC, 0, 0);
1987 AddCharMapGroup ('\u01BC', 0xC, 1, 0);
1989 else if (cp == '6') // FIXME: why?
1994 fillIndex [0xC] = 0xFF;
1995 AddCharMap ('\u221E', 0xC, 1);
1998 #region Letters and NonSpacing Marks (general)
2000 // ASCII Latin alphabets
2001 for (int i = 0; i < alphabets.Length; i++)
2002 AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]);
2004 // non-ASCII Latin alphabets
2005 // FIXME: there is no such characters that are placed
2006 // *after* "alphabets" array items. This is nothing
2007 // more than a hack that creates dummy weight for
2008 // primary characters.
2009 for (int i = 0x0080; i < 0x0300; i++) {
2010 if (!Char.IsLetter ((char) i))
2012 // For those Latin Letters which has NFKD are
2013 // not added as independent primary character.
2014 if (decompIndex [i] != 0)
2017 // 1.some alphabets have primarily
2018 // equivalent ASCII alphabets.
2019 // 2.some have independent primary weights,
2020 // but inside a-to-z range.
2021 // 3.there are some expanded characters that
2022 // are not part of Unicode Standard NFKD.
2023 // 4. some characters are letter in IsLetter
2024 // but not in sortkeys (maybe unicode version
2025 // difference caused it).
2027 // 1. skipping them does not make sense
2028 // case 0xD0: case 0xF0: case 0x131: case 0x138:
2029 // case 0x184: case 0x185: case 0x186: case 0x189:
2030 // case 0x18D: case 0x18E: case 0x18F: case 0x190:
2031 // case 0x194: case 0x195: case 0x196: case 0x19A:
2032 // case 0x19B: case 0x19C:
2033 // 2. skipping them does not make sense
2034 // case 0x14A: // Ng
2035 // case 0x14B: // ng
2039 case 0xDE: // Icelandic Thorn
2040 case 0xFE: // Icelandic Thorn
2041 case 0xDF: // German ss
2042 case 0xFF: // German ss
2044 case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3:
2045 // not classified yet
2046 // case 0x1A6: case 0x1A7: case 0x1A8: case 0x1A9:
2047 // case 0x1AA: case 0x1B1: case 0x1B7: case 0x1B8:
2048 // case 0x1B9: case 0x1BA: case 0x1BB: case 0x1BF:
2052 AddCharMapGroup ((char) i, 0xE, 1, 0);
2056 fillIndex [0xF] = 02;
2057 for (int i = 0x0380; i < 0x0390; i++)
2058 if (Char.IsLetter ((char) i))
2059 AddLetterMap ((char) i, 0xF, 1);
2060 fillIndex [0xF] = 02;
2061 for (int i = 0x0391; i < 0x03CF; i++)
2062 if (Char.IsLetter ((char) i))
2063 AddLetterMap ((char) i, 0xF, 1);
2064 fillIndex [0xF] = 0x40;
2065 for (int i = 0x03D0; i < 0x0400; i++)
2066 if (Char.IsLetter ((char) i))
2067 AddLetterMap ((char) i, 0xF, 1);
2070 // Cyrillic letters are sorted like Latin letters i.e.
2071 // containing culture-specific letters between the
2072 // standard Cyrillic sequence.
2074 // We can't use UCA here; it has different sorting.
2075 char [] orderedCyrillic = new char [] {
2076 '\u0430', '\u0431', '\u0432', '\u0433', '\u0434',
2077 '\u0452', // DJE for Serbocroatian
2079 '\u0454', // IE for Ukrainian
2083 '\u0456', // Byelorussian-Ukrainian I
2093 '\u043F', '\u0440', '\u0441', '\u0442',
2094 '\u045B', // TSHE for Serbocroatian
2096 '\u045E', // Short U for Byelorussian
2097 '\u04B1', // Straight U w/ stroke (diacritical!)
2098 '\u0444', '\u0445', '\u0446', '\u0447',
2100 '\u0448', '\u0449', '\u044A', '\u044B', '\u044C',
2101 '\u044D', '\u044E', '\u044F'};
2103 // Some extended Cyrillic letters map to basic Cyrillic
2104 // letters. See UnicodeData.txt character names for
2105 // the sources. Here I simply declare an equivalence array.
2106 // Its entries correspond to every second code point
2107 // starting at U+0490, i.e. skipping the small letters.
2108 char [] cymap_src = new char [] {
2109 '\u0433', '\u0433', '\u0433', '\u0436',
2110 '\u0437', '\u043A', '\u043A', '\u043A',
2111 '\u043A', '\u043D', '\u043D', '\u043F',
2112 '\u0445', '\u0441', '\u0442', '\u0443',
2113 '\u0443', '\u0445', '\u0446', '\u0447',
2114 '\u0447', '\u0432', '\u0435', '\u0435',
2115 '\u0406', '\u0436', '\u043A', '\u043D',
2116 '\u0447', '\u0435'};
2118 fillIndex [0x10] = 0x8D;
2119 for (int i = 0x0460; i < 0x0481; i++) {
2120 if (Char.IsLetter ((char) i)) {
2122 // U+476/477 have the same
2123 // primary weight as U+474/475.
2124 fillIndex [0x10] -= 3;
2125 AddLetterMap ((char) i, 0x10, 3);
2129 fillIndex [0x10] = 0x6;
2130 for (int i = 0; i < orderedCyrillic.Length; i++) {
2131 char c = Char.ToUpper (orderedCyrillic [i], CultureInfo.InvariantCulture);
2132 if (!IsIgnorable ((int) c) &&
2133 Char.IsLetter (c) &&
2135 AddLetterMap (c, 0x10, 0);
2136 fillIndex [0x10] += 3;
2140 for (int i = 0; i < cymap_src.Length; i++) {
2141 char c = cymap_src [i];
2142 fillIndex [0x10] = map [c].Level1;
2143 AddLetterMap ((char) (0x0490 + i * 2),
2148 fillIndex [0x11] = 0x3;
2149 for (int i = 0x0531; i < 0x0586; i++)
2150 if (Char.IsLetter ((char) i))
2151 AddLetterMap ((char) i, 0x11, 1);
2155 fillIndex [0x12] = 0x2;
2156 for (int i = 0x05D0; i < 0x05FF; i++)
2157 if (Char.IsLetter ((char) i))
2158 AddLetterMap ((char) i, 0x12, 1);
2160 fillIndex [0x1] = 0x3;
2161 for (int i = 0x0591; i <= 0x05C2; i++) {
2162 if (i == 0x05A3 || i == 0x05BB)
2165 AddCharMap ((char) i, 0x1, 1);
2169 fillIndex [0x1] = 0x8E;
2170 fillIndex [0x13] = 0x3;
2171 for (int i = 0x0621; i <= 0x064A; i++) {
2173 if (Char.GetUnicodeCategory ((char) i)
2174 != UnicodeCategory.OtherLetter) {
2175 // FIXME: arabic nonspacing marks are
2176 // in different order.
2177 AddCharMap ((char) i, 0x1, 1);
2180 // map [i] = new CharMapEntry (0x13,
2181 // (byte) arabicLetterPrimaryValues [i], 1);
2183 (byte) arabicLetterPrimaryValues [i];
2184 byte formDiacritical = 8; // default
2187 case 0x0622: formDiacritical = 9; break;
2188 case 0x0623: formDiacritical = 0xA; break;
2189 case 0x0624: formDiacritical = 5; break;
2190 case 0x0625: formDiacritical = 0xB; break;
2191 case 0x0626: formDiacritical = 7; break;
2192 case 0x0649: formDiacritical = 5; break;
2193 case 0x064A: formDiacritical = 7; break;
2195 AddLetterMapCore ((char) i, 0x13, 1, formDiacritical);
2197 fillIndex [0x13] = 0x84;
2198 for (int i = 0x0674; i < 0x06D6; i++)
2199 if (Char.IsLetter ((char) i))
2200 AddLetterMap ((char) i, 0x13, 1);
2203 // FIXME: it does seem straight codepoint mapping.
2204 fillIndex [0x14] = 04;
2205 for (int i = 0x0901; i < 0x0905; i++)
2206 if (!IsIgnorable (i))
2207 AddLetterMap ((char) i, 0x14, 2);
2208 fillIndex [0x14] = 0xB;
2209 for (int i = 0x0905; i < 0x093A; i++) {
2211 AddCharMap ('\u0929', 0x14, 0, 8);
2213 AddCharMap ('\u0931', 0x14, 0, 8);
2215 AddCharMap ('\u0934', 0x14, 0, 8);
2216 if (Char.IsLetter ((char) i))
2217 AddLetterMap ((char) i, 0x14, 4);
2219 AddCharMap ('\u0960', 0x14, 4);
2221 AddCharMap ('\u0961', 0x14, 4);
2223 fillIndex [0x14] = 0xDA;
2224 for (int i = 0x093E; i < 0x0945; i++)
2225 if (!IsIgnorable (i))
2226 AddLetterMap ((char) i, 0x14, 2);
2227 fillIndex [0x14] = 0xEC;
2228 for (int i = 0x0945; i < 0x094F; i++)
2229 if (!IsIgnorable (i))
2230 AddLetterMap ((char) i, 0x14, 2);
2234 fillIndex [0x15] = 02;
2235 for (int i = 0x0980; i < 0x9FF; i++) {
2236 if (IsIgnorable (i))
2239 fillIndex [0x15] = 0x3B;
2240 switch (Char.GetUnicodeCategory ((char) i)) {
2241 case UnicodeCategory.NonSpacingMark:
2242 case UnicodeCategory.DecimalDigitNumber:
2243 case UnicodeCategory.OtherNumber:
2246 AddLetterMap ((char) i, 0x15, 1);
2249 fillIndex [0x1] = 0x3;
2250 for (int i = 0x0981; i < 0x0A00; i++)
2251 if (Char.GetUnicodeCategory ((char) i) ==
2252 UnicodeCategory.NonSpacingMark)
2253 AddCharMap ((char) i, 0x1, 1);
2255 // Gurmukhi. orderedGurmukhi is from UCA
2256 // FIXME: it does not look equivalent to UCA.
2257 fillIndex [0x16] = 04;
2258 fillIndex [0x1] = 3;
2259 for (int i = 0; i < orderedGurmukhi.Length; i++) {
2260 char c = orderedGurmukhi [i];
2261 if (IsIgnorable ((int) c))
2263 if (IsIgnorableNonSpacing (c)) {
2264 AddLetterMap (c, 0x1, 1);
2267 if (c == '\u0A3C' || c == '\u0A4D' ||
2268 '\u0A66' <= c && c <= '\u0A71')
2270 // SPECIAL CASE: U+A38 = U+A36 at primary level (why?)
2272 if (c == '\u0A36' || c == '\u0A16' || c == '\u0A17' || c == '\u0A5B' || c == '\u0A5E')
2274 AddLetterMap (c, 0x16, shift);
2277 // Gujarati. orderedGujarati is from UCA
2278 fillIndex [0x17] = 0x4;
2280 map [0x0A4D] = new CharMapEntry (1, 0, 0x3);
2281 map [0x0ABD] = new CharMapEntry (1, 0, 0x3);
2282 map [0x0A3C] = new CharMapEntry (1, 0, 0x4);
2283 map [0x0A71] = new CharMapEntry (1, 0, 0x6);
2284 map [0x0ABC] = new CharMapEntry (1, 0, 0xB);
2285 map [0x0A70] = new CharMapEntry (1, 0, 0xE);
2286 // letters go first.
2287 for (int i = 0; i < orderedGujarati.Length; i++) {
2289 char c = orderedGujarati [i];
2290 if (Char.IsLetter (c)) {
2292 if (c == '\u0AB3' || c == '\u0A32')
2294 if (c == '\u0A33') {
2295 AddCharMap ('\u0A32', 0x17, 0);
2296 AddCharMap ('\u0A33', 0x17, 4, 4);
2300 AddCharMap ('\u0AE0', 0x17, 0, 5);
2301 AddCharMap (c, 0x17, 4);
2304 AddCharMap ('\u0AB3', 0x17, 6);
2308 byte gujaratiShift = 4;
2309 fillIndex [0x17] = 0xC0;
2310 for (int i = 0; i < orderedGujarati.Length; i++) {
2311 char c = orderedGujarati [i];
2312 if (fillIndex [0x17] == 0xCC)
2314 if (!Char.IsLetter (c)) {
2317 AddCharMap ('\u0A81', 0x17, 2);
2320 AddLetterMap (c, 0x17, gujaratiShift);
2325 fillIndex [0x1] = 03;
2326 fillIndex [0x18] = 02;
2327 for (int i = 0x0B00; i < 0x0B7F; i++) {
2328 switch (Char.GetUnicodeCategory ((char) i)) {
2329 case UnicodeCategory.NonSpacingMark:
2330 case UnicodeCategory.DecimalDigitNumber:
2331 AddLetterMap ((char) i, 0x1, 1);
2334 AddLetterMap ((char) i, 0x18, 1);
2338 fillIndex [0x19] = 2;
2339 AddCharMap ('\u0BD7', 0x19, 0);
2340 fillIndex [0x19] = 0xA;
2342 for (int i = 0x0B82; i <= 0x0B94; i++)
2343 if (!IsIgnorable ((char) i))
2344 AddCharMap ((char) i, 0x19, 2);
2346 fillIndex [0x19] = 0x28;
2347 // The array for Tamil consonants is a constant.
2348 // Windows have almost similar sequence to TAM from
2349 // tamilnet but a bit different in Grantha.
2350 for (int i = 0; i < orderedTamilConsonants.Length; i++)
2351 AddLetterMap (orderedTamilConsonants [i], 0x19, 4);
2353 fillIndex [0x19] = 0x82;
2354 for (int i = 0x0BBE; i < 0x0BCD; i++)
2355 if (Char.GetUnicodeCategory ((char) i) ==
2356 UnicodeCategory.SpacingCombiningMark
2358 AddLetterMap ((char) i, 0x19, 2);
2361 fillIndex [0x1A] = 0x4;
2362 for (int i = 0x0C00; i < 0x0C62; i++) {
2363 if (i == 0x0C55 || i == 0x0C56)
2365 AddCharMap ((char) i, 0x1A, 3);
2366 char supp = (i == 0x0C0B) ? '\u0C60':
2367 i == 0x0C0C ? '\u0C61' : char.MinValue;
2368 if (supp == char.MinValue)
2370 AddCharMap (supp, 0x1A, 3);
2374 fillIndex [0x1B] = 4;
2375 for (int i = 0x0C80; i < 0x0CE5; i++) {
2376 if (i == 0x0CD5 || i == 0x0CD6)
2378 if (i == 0x0CB1 || i == 0x0CB3 || i == 0x0CDE)
2379 continue; // shift after 0xCB9
2380 AddCharMap ((char) i, 0x1B, 3);
2382 // SPECIAL CASES: but why?
2383 AddCharMap ('\u0CB1', 0x1B, 3); // RRA
2384 AddCharMap ('\u0CB3', 0x1B, 3); // LLA
2385 AddCharMap ('\u0CDE', 0x1B, 3); // FA
2388 AddCharMap ('\u0CE1', 0x1B, 3); // vocalic LL
2392 fillIndex [0x1C] = 2;
2393 fillIndex [0x1] = 3;
2394 for (int i = 0x0D02; i < 0x0D61; i++) {
2395 // FIXME: I avoided MSCompatUnicodeTable usage
2396 // here (it results in recursion). So check if
2397 // using NonSpacingMark makes sense or not.
2398 if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark)
2399 // if (!MSCompatUnicodeTable.IsIgnorable ((char) i))
2400 AddCharMap ((char) i, 0x1C, 1);
2401 else if (!IsIgnorable ((char) i))
2402 AddCharMap ((char) i, 1, 1);
2405 // Thai ... note that it breaks 0x1E wall after E2B!
2406 // Also, all Thai characters have level 2 value 3.
2407 fillIndex [0x1E] = 2;
2408 fillIndex [0x1] = 3;
2409 for (int i = 0xE40; i <= 0xE44; i++)
2410 AddCharMap ((char) i, 0x1E, 1, 3);
2411 for (int i = 0xE01; i < 0xE2B; i++)
2412 AddCharMap ((char) i, 0x1E, 6, 3);
2413 fillIndex [0x1F] = 5;
2414 for (int i = 0xE2B; i < 0xE30; i++)
2415 AddCharMap ((char) i, 0x1F, 6, 3);
2416 fillIndex [0x1F] = 0x1E;
2417 for (int i = 0xE30; i < 0xE3B; i++)
2418 AddCharMap ((char) i, 0x1F, 1, 3);
2419 // some Thai characters remain.
2420 char [] specialThai = new char [] {'\u0E45', '\u0E46',
2421 '\u0E4E', '\u0E4F', '\u0E5A', '\u0E5B'};
2422 foreach (char c in specialThai)
2423 AddCharMap (c, 0x1F, 1, 3);
2425 for (int i = 0xE00; i < 0xE80; i++)
2426 if (Char.GetUnicodeCategory ((char) i) ==
2427 UnicodeCategory.NonSpacingMark)
2428 AddCharMap ((char) i, 1, 1);
2431 fillIndex [0x1F] = 2;
2432 fillIndex [0x1] = 3;
2433 for (int i = 0xE80; i < 0xEDF; i++) {
2434 if (IsIgnorable ((char) i))
2436 else if (Char.IsLetter ((char) i))
2437 AddCharMap ((char) i, 0x1F, 1);
2438 else if (Char.GetUnicodeCategory ((char) i) ==
2439 UnicodeCategory.NonSpacingMark)
2440 AddCharMap ((char) i, 1, 1);
2443 // Georgian. orderedGeorgian is from UCA DUCET.
2444 fillIndex [0x21] = 5;
2445 for (int i = 0; i < orderedGeorgian.Length; i++) {
2446 char c = orderedGeorgian [i];
2447 if (map [(int) c].Defined)
2449 AddCharMap (c, 0x21, 0);
2451 AddCharMap ((char) (c - 0x30), 0x21, 0);
2452 fillIndex [0x21] += 5;
2456 fillIndex [0x22] = 2;
2457 int kanaOffset = 0x3041;
2458 byte [] kanaLines = new byte [] {2, 2, 2, 2, 1, 3, 1, 2, 1};
2460 for (int gyo = 0; gyo < 9; gyo++) {
2461 for (int dan = 0; dan < 5; dan++) {
2462 if (gyo == 7 && dan % 2 == 1) {
2465 kanaOffset -= 2; // There is no space for yi and ye.
2468 int cp = kanaOffset + dan * kanaLines [gyo];
2469 // small lines (a-gyo, ya-gyo)
2470 if (gyo == 0 || gyo == 7) {
2471 AddKanaMap (cp, 1); // small
2472 AddKanaMap (cp + 1, 1);
2475 AddKanaMap (cp, kanaLines [gyo]);
2479 // add small 'ka' (before normal one)
2480 AddKanaMap (0x30F5, 1);
2484 // add small 'ke' (before normal one)
2485 AddKanaMap (0x30F6, 1);
2489 // add small 'Tsu' (before normal one)
2490 AddKanaMap (0x3063, 1);
2494 fillIndex [0x22] += 3;
2495 kanaOffset += 5 * kanaLines [gyo];
2498 // Wa-gyo is almost special, so I just manually add.
2499 AddLetterMap ((char) 0x308E, 0x22, 0);
2500 AddLetterMap ((char) (0x308E + 0x60), 0x22, 0);
2501 AddLetterMap ((char) 0x308F, 0x22, 0);
2502 AddLetterMap ((char) (0x308F + 0x60), 0x22, 0);
2504 AddLetterMap ((char) 0x3090, 0x22, 0);
2505 AddLetterMap ((char) (0x3090 + 0x60), 0x22, 0);
2506 fillIndex [0x22] += 2;
2507 // no "Wu" in Japanese.
2508 AddLetterMap ((char) 0x3091, 0x22, 0);
2509 AddLetterMap ((char) (0x3091 + 0x60), 0x22, 0);
2511 AddLetterMap ((char) 0x3092, 0x22, 0);
2512 AddLetterMap ((char) (0x3092 + 0x60), 0x22, 0);
2514 fillIndex [0x22] = 0x80;
2515 AddLetterMap ((char) 0x3093, 0x22, 0);
2516 AddLetterMap ((char) (0x3093 + 0x60), 0x22, 0);
2518 map [0x3094] = new CharMapEntry (map [0x30A6].Category,
2519 map [0x30A6].Level1, 3);// voiced hiragana U
2520 map [0x30F4] = new CharMapEntry (map [0x30A6].Category,
2521 map [0x30A6].Level1, 3);// voiced katakana U
2523 map [0x30F5] = new CharMapEntry (map [0x30AB].Category,
2524 map [0x30AB].Level1, 0);// small katakana Ka
2525 map [0x30F6] = new CharMapEntry (map [0x30B1].Category,
2526 map [0x30B1].Level1, 0);// small katakana Ke
2528 for (int i = 0x30F7; i < 0x30FB; i++)
2529 map [i] = new CharMapEntry (map [i - 8].Category,
2533 // JIS Japanese square chars.
2534 fillIndex [0x22] = 0x97;
2535 jisJapanese.Sort (JISComparer.Instance);
2536 foreach (JISCharacter j in jisJapanese)
2537 if (0x3300 <= j.CP && j.CP <= 0x3357)
2538 AddCharMap ((char) j.CP, 0x22, 1);
2539 // non-JIS Japanese square chars.
2540 nonJisJapanese.Sort (NonJISComparer.Instance);
2541 foreach (NonJISCharacter j in nonJisJapanese)
2542 AddCharMap ((char) j.CP, 0x22, 1);
2545 fillIndex [0x23] = 0x02;
2546 for (int i = 0x3105; i <= 0x312C; i++)
2547 AddCharMap ((char) i, 0x23, 1);
2549 // Estrangela: ancient Syriac
2550 fillIndex [0x24] = 0x0B;
2551 // FIXME: is 0x71E really alternative form?
2552 ArrayList syriacAlternatives = new ArrayList (
2553 new int [] {0x714, 0x716, 0x71C, 0x71E, 0x724, 0x727});
2554 for (int i = 0x0710; i <= 0x072C; i++) {
2555 if (i == 0x0711) // NonSpacingMark
2557 if (syriacAlternatives.Contains (i))
2559 AddCharMap ((char) i, 0x24, 4);
2564 foreach (int cp in syriacAlternatives)
2565 map [cp] = new CharMapEntry (0x24,
2566 (byte) (map [cp - 1].Level1 + 2),
2568 // FIXME: Syriac NonSpacingMark should go here.
2571 // FIXME: it turned out that it does not look like UCA
2572 fillIndex [0x24] = 0x6E;
2573 for (int i = 0; i < orderedThaana.Length; i++) {
2574 char c = orderedThaana [i];
2575 if (IsIgnorableNonSpacing ((int) c))
2577 AddCharMap (c, 0x24, 2);
2578 if (c == '\u0782') // SPECIAL CASE: why?
2579 fillIndex [0x24] += 2;
2583 // FIXME: Add more culture-specific letters (that are
2584 // not supported in Windows collation) here.
2586 // Surrogate ... they are computed.
2591 // Unlike UCA Windows Hangul sequence mixes Jongseong
2592 // with Choseong sequence as well as Jungseong,
2593 // adjusted to have the same primary weight for the
2594 // same base character. So it is impossible to compute
2597 // Here I introduce an ordered sequence of mixed
2598 // 'commands' and 'characters' that is similar to
2600 // - ',' increases primary weight.
2601 // - [A B] means a range, increasing index
2602 // - {A B} means a range, without increasing index
2603 // - '=' is no operation (it means the characters
2604 // of both sides have the same weight).
2605 // - '>' inserts a Hangul Syllable block that
2606 // contains 0x251 characters.
2607 // - '<' decreases the index
2608 // - '0'-'9' means skip count
2609 // - whitespaces are ignored
2612 string hangulSequence =
2613 + "\u1100=\u11A8 > \u1101=\u11A9 >"
2614 + "\u11C3, \u11AA, \u11C4, \u1102=\u11AB >"
2615 + "<{\u1113 \u1116}, \u3165,"
2616 + "\u11C5, \u11C6=\u3166,, \u11C7, \u11C8,"
2617 + "\u11AC, \u11C9, \u11AD, \u1103=\u11AE >"
2618 + "<\u1117, \u11CA, \u1104, \u11CB > \u1105=\u11AF >"
2619 + "<{\u1118 \u111B}, \u11B0, [\u11CC \u11D0], \u11B1,"
2620 + "[\u11D1 \u11D2], \u11B2,"
2621 + "[\u11D3 \u11D5], \u11B3,"
2622 + "[\u11D6 \u11D7], \u11B4, \u11B5,"
2623 + "\u11B6=\u11D8, \u3140,, \u11D9, \u1106=\u11B7 >"
2624 + "<{\u111C \u111D}, [\u11DA \u11E2], \u1107=\u11B8 >"
2625 + "<{\u111E \u1120}, \u3172,, \u3173, \u11E3, \u1108 >"
2626 + "<{\u1121 \u112C}, \u3144 \u11B9, \u3174, \u3175,,,, "
2627 + "\u3176,, \u3177, [\u11E4 \u11E6] \u3178,"
2628 + "\u3179, \u1109=\u11BA,,, \u3214=\u3274 <>"
2629 + "<{\u112D \u1133}, \u11E7 \u317A, \u317B, \u317C "
2630 + "[\u11E8 \u11E9],, \u11EA \u317D,, \u110A=\u11BB,,, >"
2631 + "<{\u1134 \u1140}, \u317E,,,,,,, \u11EB,"
2632 + "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >"
2633 + "<{\u1141 \u114C}, \u11EE, \u11EC, \u11ED,,,,, "
2634 + "\u11F1,, \u11F2,,,"
2635 + "\u11EF,,, \u11F0, \u110C=\u11BD,, >"
2636 + "<\u114D, \u110D,, >"
2637 + "<{\u114E \u1151},, \u110E=\u11BE,, >"
2638 + "<{\u1152 \u1155},,, \u110F=\u11BF >"
2639 + "\u1110=\u11C0 > \u1111=\u11C1 >"
2640 + "<\u1156=\u1157, \u11F3, \u11F4, \u1112=\u11C2 >"
2641 + "<\u1158=\u1159=\u115F, \u3185, \u11F9,"
2645 byte hangulCat = 0x52;
2646 fillIndex [hangulCat] = 0x2;
2648 int syllableBlock = 0;
2649 for (int n = 0; n < hangulSequence.Length; n++) {
2650 char c = hangulSequence [n];
2652 if (Char.IsWhiteSpace (c))
2658 IncrementSequentialIndex (ref hangulCat);
2661 if (fillIndex [hangulCat] == 2)
2662 throw new Exception ("FIXME: handle it correctly (yes it is hacky, it is really unfortunate).");
2663 fillIndex [hangulCat]--;
2666 IncrementSequentialIndex (ref hangulCat);
2667 for (int l = 0; l < 0x15; l++)
2668 for (int v = 0; v < 0x1C; v++) {
2670 (char) (0xAC00 + syllableBlock * 0x1C * 0x15 + l * 0x1C + v), hangulCat, 0);
2671 IncrementSequentialIndex (ref hangulCat);
2676 start = hangulSequence [n + 1];
2677 end = hangulSequence [n + 3];
2678 for (int i = start; i <= end; i++) {
2679 AddCharMap ((char) i, hangulCat, 0);
2681 IncrementSequentialIndex (ref hangulCat);
2683 n += 4; // consumes 5 characters for this operation
2686 start = hangulSequence [n + 1];
2687 end = hangulSequence [n + 3];
2688 for (int i = start; i <= end; i++)
2689 AddCharMap ((char) i, hangulCat, 0);
2690 n += 4; // consumes 5 characters for this operation
2693 AddCharMap (c, hangulCat, 0);
2699 for (int i = 0x3200; i < 0x3300; i++) {
2700 if (IsIgnorable (i) || map [i].Defined)
2704 if (decompLength [i] == 4 &&
2705 decompValues [decompIndex [i]] == '(')
2706 ch = decompIndex [i] + 1;
2708 else if (decompLength [i] == 2 &&
2709 decompValues [decompIndex [i] + 1] == '\u1161')
2710 ch = decompIndex [i];
2711 else if (decompLength [i] == 1)
2712 ch = decompIndex [i];
2715 ch = decompValues [ch];
2716 if (ch < 0x1100 || 0x1200 < ch &&
2717 ch < 0xAC00 || 0xD800 < ch)
2721 int offset = i < 0x3260 ? 1 : 0;
2722 if (0x326E <= i && i <= 0x3273)
2725 map [i] = new CharMapEntry (map [ch].Category,
2726 (byte) (map [ch].Level1 + offset),
2728 // Console.Error.WriteLine ("Jamo {0:X04} -> {1:X04}", i, decompValues [decompIndex [i] + 1]);
2734 // Letterlike characters and CJK compatibility square
2735 sortableCharNames.Sort (StringDictionaryValueComparer.Instance);
2736 int [] counts = new int ['Z' - 'A' + 1];
2737 char [] namedChars = new char [sortableCharNames.Count];
2739 foreach (DictionaryEntry de in sortableCharNames) {
2740 counts [((string) de.Value) [0] - 'A']++;
2741 namedChars [nCharNames++] = (char) ((int) de.Key);
2743 nCharNames = 0; // reset
2744 for (int a = 0; a < counts.Length; a++) {
2745 fillIndex [0xE] = (byte) (alphaWeights [a + 1] - counts [a]);
2746 for (int i = 0; i < counts [a]; i++)
2747 //Console.Error.WriteLine ("---- {0:X04} : {1:x02} / {2} {3}", (int) namedChars [nCharNames], fillIndex [0xE], ((DictionaryEntry) sortableCharNames [nCharNames]).Value, Char.GetUnicodeCategory (namedChars [nCharNames]));
2748 AddCharMap (namedChars [nCharNames++], 0xE, 1);
2751 // CJK unified ideograph.
2753 fillIndex [cjkCat] = 0x2;
2754 for (int cp = 0x4E00; cp <= 0x9FBB; cp++)
2755 if (!IsIgnorable (cp))
2756 AddCharMapGroupCJK ((char) cp, ref cjkCat);
2757 // CJK Extensions go here.
2758 // LAMESPEC: With this Windows style CJK layout, it is
2759 // impossible to add more CJK ideograph i.e. 0x9FA6-
2760 // 0x9FBB can never be added w/o breaking compat.
2761 for (int cp = 0xF900; cp <= 0xFA2D; cp++)
2762 if (!IsIgnorable (cp))
2763 AddCharMapGroupCJK ((char) cp, ref cjkCat);
2765 // PrivateUse ... computed.
2766 // remaining Surrogate ... computed.
2768 #region Special "biggest" area (FF FF)
2769 fillIndex [0xFF] = 0xFF;
2770 char [] specialBiggest = new char [] {
2771 '\u3005', '\u3031', '\u3032', '\u309D',
2772 '\u309E', '\u30FC', '\u30FD', '\u30FE',
2773 '\uFE7C', '\uFE7D', '\uFF70'};
2774 foreach (char c in specialBiggest)
2775 AddCharMap (c, 0xFF, 0);
2778 #region 07 - ASCII non-alphanumeric + 3001, 3002 // 07
2779 // non-alphanumeric ASCII except for: + - < = > '
2780 for (int i = 0x21; i < 0x7F; i++) {
2781 if (Char.IsLetterOrDigit ((char) i)
2782 || "+-<=>'".IndexOf ((char) i) >= 0)
2783 continue; // they are not added here.
2784 AddCharMapGroup2 ((char) i, 0x7, 1, 0);
2785 // Insert 3001 after ',' and 3002 after '.'
2787 AddCharMapGroup2 ('\u3001', 0x7, 1, 0);
2789 AddCharMapGroup2 ('\u3002', 0x7, 1, 0);
2791 AddCharMap ('\uFE30', 0x7, 1, 0);
2795 #region 07 - Punctuations and something else
2796 for (int i = 0xA0; i < char.MaxValue; i++) {
2797 if (IsIgnorable (i))
2800 // FIXME: actually those reset should not be
2801 // done but here I put for easy goal.
2803 fillIndex [0x7] = 0xE2;
2805 fillIndex [0x7] = 0x77;
2817 switch (Char.GetUnicodeCategory ((char) i)) {
2818 case UnicodeCategory.OtherPunctuation:
2819 case UnicodeCategory.ClosePunctuation:
2820 case UnicodeCategory.OpenPunctuation:
2821 case UnicodeCategory.InitialQuotePunctuation:
2822 case UnicodeCategory.FinalQuotePunctuation:
2823 case UnicodeCategory.ModifierSymbol:
2824 // SPECIAL CASES: // 0xA
2825 if (0x2020 <= i && i <= 0x2031)
2827 AddCharMapGroup ((char) i, 0x7, 1, 0);
2830 if (i == 0xA6) // SPECIAL CASE. FIXME: why?
2831 goto case UnicodeCategory.OtherPunctuation;
2836 // FIXME: it should not need to reset level 1, but
2837 // it's for easy goal.
2838 fillIndex [0x7] = 0xB6;
2839 for (int i = 0x2400; i <= 0x2421; i++)
2840 AddCharMap ((char) i, 0x7, 1, 0);
2843 // FIXME: for 07 xx we need more love.
2845 // Characters w/ diacritical marks (NFKD)
2846 for (int i = 0; i <= char.MaxValue; i++) {
2847 if (map [i].Defined || IsIgnorable (i))
2849 if (decompIndex [i] == 0)
2852 int start = decompIndex [i];
2853 int primaryChar = decompValues [start];
2856 int length = decompLength [i];
2857 // special processing for parenthesized ones.
2859 decompValues [start] == '(' &&
2860 decompValues [start + 2] == ')') {
2861 primaryChar = decompValues [start + 1];
2865 if (map [primaryChar].Level1 == 0)
2868 for (int l = 1; l < length; l++) {
2869 int c = decompValues [start + l];
2870 if (map [c].Level1 != 0)
2872 secondary += diacritical [c];
2876 map [i] = new CharMapEntry (
2877 map [primaryChar].Category,
2878 map [primaryChar].Level1,
2883 // category 08 - symbols
2884 fillIndex [0x8] = 2;
2885 // Here Windows mapping is not straightforward. It is
2886 // not based on computation but seems manual sorting.
2887 AddCharMapGroup ('+', 0x8, 1, 0); // plus
2888 AddCharMapGroup ('\u2212', 0x8, 1, 0); // minus
2889 AddCharMapGroup ('\u229D', 0x8, 1, 0); // minus
2890 AddCharMapGroup ('\u2297', 0x8, 1, 0); // mul
2891 AddCharMapGroup ('\u2044', 0x8, 1, 0); // div
2892 AddCharMapGroup ('\u2215', 0x8, 1, 0); // div
2893 AddCharMapGroup ('\u2217', 0x8, 1, 0); // mul
2894 AddCharMapGroup ('\u2218', 0x8, 1, 0); // ring
2895 AddCharMapGroup ('\u2219', 0x8, 1, 0); // bullet
2896 AddCharMapGroup ('\u2213', 0x8, 1, 0); // minus-or-plus
2897 AddCharMapGroup ('\u003C', 0x8, 1, 0); // <
2898 AddCharMapGroup ('\u227A', 0x8, 1, 0); // precedes relation
2899 AddCharMapGroup ('\u22B0', 0x8, 1, 0); // precedes under relation
2901 for (int cp = 0; cp < 0x2300; cp++) {
2902 if (cp == 0xAC) // SPECIAL CASE: skip
2905 cp = 0x2200; // skip to 2200
2906 fillIndex [0x8] = 0x21;
2909 fillIndex [0x8] = 0x3;
2911 fillIndex [0x8] = 0xB9;
2912 if (!map [cp].Defined &&
2913 // Char.GetUnicodeCategory ((char) cp) ==
2914 // UnicodeCategory.MathSymbol)
2915 Char.IsSymbol ((char) cp))
2916 AddCharMapGroup ((char) cp, 0x8, 1, diacritical [cp]);
2917 // SPECIAL CASES: no idea why Windows sorts as such
2920 AddCharMap ('\u227B', 0x8, 1, 0);
2921 AddCharMap ('\u22B1', 0x8, 1, 0);
2924 AddCharMapGroup ('\u00AB', 0x8, 1, 0);
2925 AddCharMapGroup ('\u226A', 0x8, 1, 0);
2926 AddCharMapGroup ('\u00BB', 0x8, 1, 0);
2927 AddCharMapGroup ('\u226B', 0x8, 1, 0);
2930 AddCharMap ('\u01C0', 0x8, 1, 0);
2931 AddCharMap ('\u01C1', 0x8, 1, 0);
2932 AddCharMap ('\u01C2', 0x8, 1, 0);
2937 #region Level2 adjustment
2939 diacritical [0x624] = 0x5;
2940 diacritical [0x626] = 0x7;
2941 diacritical [0x622] = 0x9;
2942 diacritical [0x623] = 0xA;
2943 diacritical [0x625] = 0xB;
2944 diacritical [0x649] = 0x5; // 'alif maqs.uurah
2945 diacritical [0x64A] = 0x7; // Yaa'
2947 for (int i = 0; i < char.MaxValue; i++) {
2949 byte cat = map [i].Category;
2951 case 0xE: // Latin diacritics
2952 case 0x22: // Japanese: circled characters
2953 mod = diacritical [i];
2955 case 0x13: // Arabic
2956 if (diacritical [i] == 0 && i >= 0xFE8D)
2957 mod = 0x8; // default for arabic
2960 if (0x52 <= cat && cat <= 0x7F) // Hangul
2961 mod = diacritical [i];
2963 map [i] = new CharMapEntry (
2964 cat, map [i].Level1, mod);
2968 // FIXME: this is hack but those NonSpacingMark
2969 // characters and still undefined are likely to
2971 for (int i = 0; i < char.MaxValue; i++)
2972 if (!map [i].Defined &&
2974 Char.GetUnicodeCategory ((char) i) ==
2975 UnicodeCategory.NonSpacingMark)
2976 AddCharMap ((char) i, 1, 1);
2978 // FIXME: this is hack but those Symbol characters
2979 // are likely to fall into 0xA category.
2980 for (int i = 0; i < char.MaxValue; i++)
2981 if (!map [i].Defined &&
2983 Char.IsSymbol ((char) i))
2984 AddCharMap ((char) i, 0xA, 1);
// Advances fillIndex for the category referenced by hangulCat by one.
// fillIndex is a byte array, so a value of 0 after the increment means the
// index wrapped around; in that case the index is restarted at 0x2.
// NOTE(review): the statement between the overflow test and the reset is not
// visible in this listing; presumably it moves hangulCat on to the next
// category (which would explain the ref parameter) -- confirm.
2987 private void IncrementSequentialIndex (ref byte hangulCat)
2989 fillIndex [hangulCat]++;
2990 if (fillIndex [hangulCat] == 0) { // byte wrapped around: overflow
2992 fillIndex [hangulCat] = 0x2;
2996 // Resets fillIndex [category] to the fixed alphaWeight and maps c through AddLetterMap ().
// Afterwards every code point stored for c in latinMap is mapped through
// AddLetterMap () too, with an update count of 0 so the index is not advanced
// by this method.
// NOTE(review): the lines between the latinMap lookup and the foreach are not
// visible here; presumably they bail out when no list exists for c -- confirm.
2997 private void AddAlphaMap (char c, byte category, byte alphaWeight)
2999 fillIndex [category] = alphaWeight;
3000 AddLetterMap (c, category, 0);
// "as" yields null when latinMap has no ArrayList for c.
3002 ArrayList al = latinMap [c] as ArrayList;
3006 foreach (int cp in al)
3007 AddLetterMap ((char) cp, category, 0);
// Maps "voices" consecutive code points starting at i into category 0x22,
// and for each of them also the code point 0x60 above it.
// The first code point of the run gets level2 = 0; the following ones get
// level2 = offset + 2 (3, 4, ...). The fill index is not advanced here
// (update count 0 is passed).
// NOTE(review): the +0x60 offset looks like the hiragana -> katakana
// distance used by the callers of this file (e.g. 0x308E + 0x60) -- confirm.
3010 private void AddKanaMap (int i, byte voices)
3012 for (byte b = 0; b < voices; b++) {
3013 char c = (char) (i + b);
3014 byte arg = (byte) (b > 0 ? b + 2 : 0);
3016 AddLetterMapCore (c, 0x22, 0, arg);
3018 AddLetterMapCore ((char) (c + 0x60), 0x22, 0, arg);
// Convenience overload: forwards to AddLetterMapCore () with level2 = 0.
3022 private void AddLetterMap (char c, byte category, byte updateCount)
3024 AddLetterMapCore (c, category, updateCount, 0);
// Maps the letter c together with related forms into "category":
//  - the value returned by ToSmallForm (c), via AddCharMapGroup ();
//  - the invariant-culture lower-case form of c, recursively, when it differs
//    from c and is not mapped yet (with update count 0);
//  - c itself, via AddCharMapGroup () with update count 0.
// Finally fillIndex [category] is advanced by updateCount.
// NOTE(review): several lines are not visible in this listing (the
// declaration of c2, the guard around the small-form add, and the use of
// doUpdate); presumably doUpdate gates the final index update -- confirm.
3027 private void AddLetterMapCore (char c, byte category, byte updateCount, byte level2)
3030 // <small> updates index
3031 c2 = ToSmallForm (c);
3033 AddCharMapGroup (c2, category, updateCount, level2);
3034 c2 = Char.ToLower (c, CultureInfo.InvariantCulture);
3035 if (c2 != c && !map [(int) c2].Defined)
3036 AddLetterMapCore (c2, category, 0, level2);
3037 bool doUpdate = true;
// c is skipped when it is ignorable or already has a map entry.
3038 if (IsIgnorable ((int) c) || map [(int) c].Defined)
3041 AddCharMapGroup (c, category, 0, level2);
3043 fillIndex [category] += updateCount;
// Convenience overload: forwards to the four-argument AddCharMap () with
// alt = 0 and returns its result.
3046 private bool AddCharMap (char c, byte category, byte increment)
3048 return AddCharMap (c, category, increment, 0);
// Creates the map entry for c unless c is ignorable or already defined, in
// which case nothing is changed and false is returned.
// For every category except 1 the entry gets level1 = fillIndex [category]
// and level2 = alt. For category 1 the two are swapped: level1 = alt and
// level2 = fillIndex [category].
// After the entry is stored, fillIndex [category] is advanced by "increment".
// NOTE(review): the final return statement is not visible in this listing;
// presumably it returns true -- confirm.
3051 private bool AddCharMap (char c, byte category, byte increment, byte alt)
3053 if (IsIgnorable ((int) c) || map [(int) c].Defined)
3054 return false; // do nothing
3055 map [(int) c] = new CharMapEntry (category,
3056 category == 1 ? alt : fillIndex [category],
3057 category == 1 ? fillIndex [category] : alt);
3058 fillIndex [category] += increment;
// Maps, in this order: the value ToSmallFormTail (c) returns, c itself, and
// then recurses on the value ToFullWidthTail (c) returns. Every AddCharMap ()
// call passes updateCount as the increment and 0 as alt.
// NOTE(review): the conditions guarding the small-form add and the recursive
// call are on lines not visible in this listing -- confirm how the
// recursion terminates.
3062 private void AddCharMapGroupTail (char c, byte category, byte updateCount)
3064 char c2 = ToSmallFormTail (c);
3066 AddCharMap (c2, category, updateCount, 0);
3068 AddCharMap (c, category, updateCount, 0);
3070 c2 = ToFullWidthTail (c);
3072 AddCharMapGroupTail (c2, category, updateCount);
3076 // Adds characters to table in the order below
3077 // (+ increases weight):
3081 // <full> | <super> | <sub>
3082 // <circle> | <wide> (| <narrow>)
3086 // level2 is fixed (does not increase).
// Decomposition types whose variants AddCharMapGroup () maps with the same
// level-1 weight as the base character (it iterates this array and adds each
// matching variant with increment 0).
// NOTE(review): some initializer entries are not visible in this listing.
3087 int [] sameWeightItems = new int [] {
3088 DecompositionFraction,
3092 DecompositionCircle,
3094 DecompositionNarrow,
// Maps c and its compatibility variants, looked up in nfkdMap [c], into
// "category":
//  1. the <small> variant, advancing the index by updateCount;
//  2. c itself, without advancing the index;
//  3. every variant whose decomposition type is listed in sameWeightItems,
//     without advancing the index (they share c's level-1 weight);
//  4. the index is advanced by updateCount;
//  5. the <vertical> variant, advancing the index by updateCount again.
// level2 is passed through unchanged to every add except the <small> one,
// which uses the three-argument overload (alt = 0).
// NOTE(review): the statement after the initial Defined test and the null
// checks around the nfkd lookups are on lines not visible here; presumably
// the method returns early for an already-defined c -- confirm.
3096 private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2)
3098 if (map [(int) c].Defined)
// char.MinValue is used as the "no such variant" marker below.
3101 char small = char.MinValue;
3102 char vertical = char.MinValue;
3103 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
3105 object smv = nfkd [(byte) DecompositionSmall];
3107 small = (char) ((int) smv);
3108 object vv = nfkd [(byte) DecompositionVertical];
3110 vertical = (char) ((int) vv);
3113 // <small> updates index
3114 if (small != char.MinValue)
3115 AddCharMap (small, category, updateCount);
3118 AddCharMap (c, category, 0, level2);
3121 foreach (int weight in sameWeightItems) {
3122 object wv = nfkd [(byte) weight];
3124 AddCharMap ((char) ((int) wv), category, 0, level2);
3128 // update index here.
3129 fillIndex [category] += updateCount;
3131 if (vertical != char.MinValue)
3132 AddCharMap (vertical, category, updateCount, level2);
// Maps a single CJK character at the current position of "category" and
// then steps to the next sequential position via IncrementSequentialIndex ().
// The position category 0x9E / index 0xF9 is skipped by stepping once more.
3135 private void AddCharMapCJK (char c, ref byte category)
// Increment 0: the index is advanced by IncrementSequentialIndex () instead.
3137 AddCharMap (c, category, 0, 0);
3138 IncrementSequentialIndex (ref category);
3140 // Special. I wonder why but Windows skips 9E F9.
3141 if (category == 0x9E && fillIndex [category] == 0xF9)
3142 IncrementSequentialIndex (ref category);
// Maps the CJK character c via AddCharMapCJK () and then the characters that
// are ordered right after it:
//  - hard-coded enclosed characters from the 3200-32B0 area for a few
//    specific ideographs (see the LAMESPEC notes below);
//  - every compatibility variant of c found in nfkdMap [c] for decomposition
//    types 0 .. 0x12, except variants in 0xF900-0xFAD9 and the six
//    hard-coded code points listed in the switch.
// NOTE(review): the conditions for the 32A2 and 32A9 additions, the null
// checks on nfkd / wv, the declaration of w and the bodies of the filtering
// branches are on lines not visible in this listing; presumably the
// filtered values are skipped with "continue" -- confirm.
3145 private void AddCharMapGroupCJK (char c, ref byte category)
3147 AddCharMapCJK (c, ref category);
3149 // LAMESPEC: see below.
3150 if (c == '\u5B78') {
3151 AddCharMapCJK ('\u32AB', ref category);
3152 AddCharMapCJK ('\u323B', ref category);
3154 if (c == '\u52DE') {
3155 AddCharMapCJK ('\u3298', ref category);
3156 AddCharMapCJK ('\u3238', ref category);
3159 AddCharMapCJK ('\u32A2', ref category);
3161 // Especially this mapping order totally does
3162 // not make sense to me.
3163 AddCharMapCJK ('\u32A9', ref category);
3165 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
// 0x12 is DecompositionCanonical, the highest decomposition type constant.
3168 for (byte weight = 0; weight <= 0x12; weight++) {
3169 object wv = nfkd [weight];
3174 // Special: they are ignored in this area.
3175 // FIXME: check if it is sane
3176 if (0xF900 <= w && w <= 0xFAD9)
3178 // LAMESPEC: on Windows some of CJK characters
3179 // in 3200-32B0 are incorrectly mapped. They
3180 // mix Chinese and Japanese Kanji when
3181 // ordering those characters.
3183 case 0x32A2: case 0x3298: case 0x3238:
3184 case 0x32A9: case 0x323B: case 0x32AB:
3188 AddCharMapCJK ((char) w, ref category);
3192 // For now it is only for 0x7 category.
// Variant of AddCharMapGroup () in which every member of the group advances
// the index by updateCount (c itself included). Order of additions:
//  1. the <small> variant of c, unless it is U+2024;
//  2. c itself;
//  3. every code point c2 whose decomposition is the single character c and
//     whose decomposition type is DecompositionCompat (found by scanning
//     decompLength / decompValues / decompType for all c2 below
//     char.MaxValue);
//  4. the <vertical> variant of c, unless it is U+FE33 or U+FE34.
// NOTE(review): the null checks around the nfkd lookups and the remainder of
// the switch are on lines not visible in this listing -- confirm.
3193 private void AddCharMapGroup2 (char c, byte category, byte updateCount, byte level2)
// char.MinValue is used as the "no such variant" marker below.
3195 char small = char.MinValue;
3196 char vertical = char.MinValue;
3197 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
3199 object smv = nfkd [(byte) DecompositionSmall];
3201 small = (char) ((int) smv);
3202 object vv = nfkd [(byte) DecompositionVertical];
3204 vertical = (char) ((int) vv);
3207 // <small> updates index
3208 if (small != char.MinValue)
3209 // SPECIAL CASE excluded (FIXME: why?)
3210 if (small != '\u2024')
3211 AddCharMap (small, category, updateCount);
3214 AddCharMap (c, category, updateCount, level2);
3216 // Since nfkdMap is problematic to have two or more
3217 // NFKD to an identical character, here I iterate all.
3218 for (int c2 = 0; c2 < char.MaxValue; c2++) {
3219 if (decompLength [c2] == 1 &&
3220 (int) (decompValues [decompIndex [c2]]) == (int) c) {
3221 switch (decompType [c2]) {
3222 case DecompositionCompat:
3223 AddCharMap ((char) c2, category, updateCount, level2);
3229 if (vertical != char.MinValue)
3230 // SPECIAL CASE excluded (FIXME: why?)
3231 if (vertical != '\uFE33' && vertical != '\uFE34')
3232 AddCharMap (vertical, category, updateCount, level2);
// Maps the Arabic character c and then every code point c2 whose
// decomposition ends with c (the last decomposition value equals c), all at
// the same index; fillIndex [category] is advanced by updateCount (1) only
// once, at the end.
// NOTE(review): the declarations of category and level2, the statement
// executed for code points without a decomposition, and the remaining
// arguments of the inner AddCharMap () call are on lines not visible in this
// listing -- confirm.
3235 private void AddArabicCharMap (char c)
3238 byte updateCount = 1;
3242 AddCharMap (c, category, 0, level2);
3244 // Since nfkdMap is problematic to have two or more
3245 // NFKD to an identical character, here I iterate all.
3246 for (int c2 = 0; c2 < char.MaxValue; c2++) {
3247 if (decompLength [c2] == 0)
// Index of the last value in c2's decomposition sequence.
3249 int idx = decompIndex [c2] + decompLength [c2] - 1;
3250 if ((int) (decompValues [idx]) == (int) c)
3251 AddCharMap ((char) c2, category,
3254 fillIndex [category] += updateCount;
// Returns ToDecomposed (c, DecompositionFull, false), i.e. the lookup uses
// the first decomposition value of c.
3257 char ToFullWidth (char c)
3259 return ToDecomposed (c, DecompositionFull, false);
// Returns ToDecomposed (c, DecompositionFull, true), i.e. the "tail" flavour
// of ToFullWidth ().
3262 char ToFullWidthTail (char c)
3264 return ToDecomposed (c, DecompositionFull, true);
// Returns ToDecomposed (c, DecompositionSmall, false), i.e. the lookup uses
// the first decomposition value of c.
3267 char ToSmallForm (char c)
3269 return ToDecomposed (c, DecompositionSmall, false);
// Returns ToDecomposed (c, DecompositionSmall, true), i.e. the "tail"
// flavour of ToSmallForm ().
3272 char ToSmallFormTail (char c)
3274 return ToDecomposed (c, DecompositionSmall, true);
// Looks up a value from the decomposition sequence of c when the
// decomposition type of c is d. The value is taken at decompIndex [c],
// optionally moved to the last element of the sequence.
// NOTE(review): the result for a non-matching decomposition type and the
// condition on the "idx +=" line are not visible in this listing;
// presumably the former returns c unchanged and the latter is "if (tail)"
// -- confirm.
3277 char ToDecomposed (char c, byte d, bool tail)
3279 if (decompType [(int) c] != d)
3281 int idx = decompIndex [(int) c];
// Moves idx to the last value of the decomposition sequence.
3283 idx += decompLength [(int) c] - 1;
3284 return (char) decompValues [idx];
// Scans the jisJapanese list for the code point cp.
// NOTE(review): the loop body and the return statements are not visible in
// this listing; presumably it returns true when some entry's CP equals cp
// and false otherwise -- confirm.
3287 bool ExistsJIS (int cp)
3289 foreach (JISCharacter j in jisJapanese)
3297 #region Level 3 properties (Case/Width)
// Returns the level 3 weight of c as stored in the sort key: the raw weight
// from ComputeLevel3WeightRaw () plus 2, except that a raw weight of 0 stays 0.
3299 private byte ComputeLevel3Weight (char c)
3301 byte b = ComputeLevel3WeightRaw (c);
3302 return b > 0 ? (byte) (b + 2) : b;
// Computes the raw level 3 weight of c; ComputeLevel3Weight () adds 2 to a
// non-zero result to obtain the sort key value.
// The method first tests c against a series of fixed code point ranges, then
// looks at the decomposition type for the 0xFE80-0xFEFF range, and finally
// derives the weight from decompType, isSmallCapital and isUppercase.
// NOTE(review): the value returned by each range test, the declaration of
// "ret" and the final return are on lines not visible in this listing, so
// the concrete weights are not documented here.
3305 private byte ComputeLevel3WeightRaw (char c) // add 2 for sortkey value
3308 if ('\u3192' <= c && c <= '\u319F')
3310 // Japanese reading marks
3311 if (c == '\u3001' || c == '\u3002')
3314 if ('\u11A8' <= c && c <= '\u11F9')
3316 if ('\uFFA0' <= c && c <= '\uFFDC')
3318 if ('\u3130' <= c && c <= '\u3164')
3320 if ('\u3165' <= c && c <= '\u318E')
3322 // Georgian Capital letters
3323 if ('\u10A0' <= c && c <= '\u10C5')
3326 if ('\u2776' <= c && c <= '\u277F')
3328 if ('\u2780' <= c && c <= '\u2789')
// This range overlaps the two tests above; assuming each of those returns,
// only 0x278A-0x2793 can still reach it -- confirm.
3330 if ('\u2776' <= c && c <= '\u2793')
3332 if ('\u2160' <= c && c <= '\u216F')
3334 if ('\u2181' <= c && c <= '\u2182')
3337 if ('\u2135' <= c && c <= '\u2138')
3339 if ('\uFE80' <= c && c < '\uFF00') {
3340 // 2(Isolated)/8(Final)/0x18(Medial)
3341 switch (decompType [(int) c]) {
3342 case DecompositionIsolated:
3344 case DecompositionFinal:
3346 case DecompositionMedial:
3351 // actually I dunno the reason why they have weights.
3374 switch (decompType [(int) c]) {
3375 case DecompositionWide: // <wide>
3376 case DecompositionSub: // <sub>
3377 case DecompositionSuper: // <super>
// These three constants are marked "fixed" at their declaration; their
// numeric value is OR-ed directly into the weight.
3378 ret |= decompType [(int) c];
3381 if (isSmallCapital [(int) c]) // grep "SMALL CAPITAL"
3383 if (isUppercase [(int) c]) // DerivedCoreProperties
// Age-based variant of IsIgnorable (): consults unicodeAge [i] (threshold
// 3.1) and the Unicode category of i (OtherNotAssigned, Format).
// NOTE(review): a second IsIgnorable (int) with the same signature follows
// in this file, so one of the two is presumably compiled out by a
// preprocessor directive not visible in this listing. The return statements
// are not visible either; presumably all tested cases return true -- confirm.
3393 static bool IsIgnorable (int i)
3395 if (unicodeAge [i] >= 3.1)
3397 switch (char.GetUnicodeCategory ((char) i)) {
3398 case UnicodeCategory.OtherNotAssigned:
3399 case UnicodeCategory.Format:
3406 // FIXME: In the future use DerivedAge.txt to examine character
3407 // versions and set those ones that have higher version than
3408 // 1.0 as ignorable.
// Table-based variant of IsIgnorable (): classifies the code point i using,
// in order, a list of individual code points, a second list of exceptional
// code points, a chain of inclusive code point ranges, and finally the
// Unicode category of i.
// NOTE(review): the switch headers, the value returned by each group and the
// final return are on lines not visible in this listing. Going by the
// comments below, the first list and the ranges are presumably "ignorable"
// (true) and the exceptional list "not ignorable" (false) -- confirm.
3409 static bool IsIgnorable (int i)
3413 // I guess, those characters are added between
3414 // Unicode 1.0 (LCMapString) and Unicode 3.1
3415 // (UnicodeCategory), so they used to be
3416 // something like OtherNotAssigned as of Unicode 1.1.
3417 case 0x2df: case 0x387:
3418 case 0x3d7: case 0x3d8: case 0x3d9:
3419 case 0x3f3: case 0x3f4: case 0x3f5: case 0x3f6:
3420 case 0x400: case 0x40d: case 0x450: case 0x45d:
3421 case 0x587: case 0x58a: case 0x5c4: case 0x640:
3422 case 0x653: case 0x654: case 0x655: case 0x66d:
3424 case 0x1e9b: case 0x202f: case 0x20ad:
3425 case 0x20ae: case 0x20af:
3426 case 0x20e2: case 0x20e3:
3427 case 0x2139: case 0x213a: case 0x2183:
3428 case 0x2425: case 0x2426: case 0x2619:
3429 case 0x2670: case 0x2671: case 0x3007:
3430 case 0x3190: case 0x3191:
3431 case 0xfffc: case 0xfffd:
3433 // exceptional characters filtered by the
3434 // following conditions. Originally those exceptional
3435 // ranges are incorrect (they should not be ignored)
3436 // and most of those characters are unfortunately in
3438 case 0x4d8: case 0x4d9:
3439 case 0x4e8: case 0x4e9:
3441 case 0x3036: case 0x303f:
3442 case 0x337b: case 0xfb1e:
3447 // The whole Sinhala characters.
3448 0x0D82 <= i && i <= 0x0DF4
3449 // The whole Tibetan characters.
3450 || 0x0F00 <= i && i <= 0x0FD1
3451 // The whole Myanmar characters.
3452 || 0x1000 <= i && i <= 0x1059
3453 // The whole Ethiopic, Cherokee,
3454 // Canadian Syllabic, Ogham, Runic,
3455 // Tagalog, Hanunoo, Philippine,
3456 // Buhid, Tagbanwa, Khmer and Mongolian
3458 || 0x1200 <= i && i <= 0x1DFF
3459 // Greek extension characters.
3460 || 0x1F00 <= i && i <= 0x1FFF
3461 // The whole Braille characters.
3462 || 0x2800 <= i && i <= 0x28FF
3463 // CJK radical characters.
3464 || 0x2E80 <= i && i <= 0x2EF3
3465 // Kangxi radical characters.
3466 || 0x2F00 <= i && i <= 0x2FD5
3467 // Ideographic description characters.
3468 || 0x2FF0 <= i && i <= 0x2FFB
3469 // Bopomofo letter and final
3470 || 0x31A0 <= i && i <= 0x31B7
3471 // White square with quadrant characters.
3472 || 0x25F0 <= i && i <= 0x25F7
3473 // Ideographic telegraph symbols.
3474 || 0x32C0 <= i && i <= 0x32CB
3475 || 0x3358 <= i && i <= 0x3370
3476 || 0x33E0 <= i && i <= 0x33FF
3477 // The whole YI characters.
3478 || 0xA000 <= i && i <= 0xA48C
3479 || 0xA490 <= i && i <= 0xA4C6
3480 // Armenian small ligatures (U+FB13..U+FB17)
3481 || 0xFB13 <= i && i <= 0xFB17
3482 // Hebrew, Arabic, variation selector.
3483 || 0xFB1D <= i && i <= 0xFE2F
3484 // Arabic ligatures.
3485 || 0xFEF5 <= i && i <= 0xFEFC
3486 // FIXME: why are they excluded?
3487 || 0x01F6 <= i && i <= 0x01F9
3488 || 0x0218 <= i && i <= 0x0233
3489 || 0x02A9 <= i && i <= 0x02AD
3490 || 0x02EA <= i && i <= 0x02EE
3491 || 0x0349 <= i && i <= 0x036F
3492 || 0x0488 <= i && i <= 0x048F
3493 || 0x04D0 <= i && i <= 0x04FF
3494 || 0x0500 <= i && i <= 0x050F // actually it matters only for 2.0
3495 || 0x06D6 <= i && i <= 0x06ED
3496 || 0x06FA <= i && i <= 0x06FE
3497 || 0x2048 <= i && i <= 0x204D
3498 || 0x20e4 <= i && i <= 0x20ea
3499 || 0x213C <= i && i <= 0x214B
3500 || 0x21EB <= i && i <= 0x21FF
3501 || 0x22F2 <= i && i <= 0x22FF
3502 || 0x237B <= i && i <= 0x239A
3503 || 0x239B <= i && i <= 0x23CF
3504 || 0x24EB <= i && i <= 0x24FF
3505 || 0x2596 <= i && i <= 0x259F
3506 || 0x25F8 <= i && i <= 0x25FF
3507 || 0x2672 <= i && i <= 0x2689
3508 || 0x2768 <= i && i <= 0x2775
3509 || 0x27d0 <= i && i <= 0x27ff
3510 || 0x2900 <= i && i <= 0x2aff
3511 || 0x3033 <= i && i <= 0x303F
3512 || 0x31F0 <= i && i <= 0x31FF
3513 || 0x3250 <= i && i <= 0x325F
3514 || 0x32B1 <= i && i <= 0x32BF
3515 || 0x3371 <= i && i <= 0x337B
3516 || 0xFA30 <= i && i <= 0xFA6A
// Everything not decided above falls through to a Unicode category test.
3520 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3522 case UnicodeCategory.PrivateUse:
3523 case UnicodeCategory.Surrogate:
3525 // ignored by nature
3526 case UnicodeCategory.Format:
3527 case UnicodeCategory.OtherNotAssigned:
3534 // To check IsIgnorable sanity, try the driver below under MS.NET.
// Sanity-check driver for IsIgnorable (): calls Dump () for every code point
// from 0 through char.MaxValue with the value IsIgnorable () reports for it.
// NOTE(review): the class already has a Main (string []) entry point, so
// this driver is presumably disabled (commented out or compiled out) by
// lines not visible in this listing -- confirm.
3537 public static void Main ()
3539 for (int i = 0; i <= char.MaxValue; i++)
3540 Dump (i, IsIgnorable (i));
// Helper of the IsIgnorable () sanity-check driver. Private-use and surrogate
// code points are not checked. For any other code point it compares s1 with
// a string made of ten copies of the character, using the invariant
// culture's CompareInfo and CompareOptions.IgnoreCase, and relates "the
// strings compare equal" to the expected "ignore" flag; the WriteLine prints
// "o" or "x" (from "ignore"), the code point in hex and its Unicode category.
// NOTE(review): the declaration of s1 and the statement under the
// "(ret == 0) == ignore" test are not visible in this listing; presumably s1
// is an empty string and agreeing results return without printing -- confirm.
3543 static void Dump (int i, bool ignore)
3545 switch (Char.GetUnicodeCategory ((char) i)) {
3546 case UnicodeCategory.PrivateUse:
3547 case UnicodeCategory.Surrogate:
3548 return; // check nothing
3552 string s2 = new string ((char) i, 10);
3553 int ret = CultureInfo.InvariantCulture.CompareInfo.Compare (s1, s2, CompareOptions.IgnoreCase);
3554 if ((ret == 0) == ignore)
3556 Console.WriteLine ("{0} : {1:x} {2}", ignore ? "o" : "x", i, Char.GetUnicodeCategory ((char) i));
3559 #endregion // IsIgnorable
3561 #region IsIgnorableSymbol
// Decides whether the code point i is treated as an ignorable symbol.
// The checks run in this order: IsIgnorable (i); several lists of individual
// code points; three code point ranges; the Unicode category of i; a set of
// ranges of enclosed / squared characters combined with
// Char.IsLetterOrDigit (); and finally a sequence of Char.IsLetterOrDigit /
// IsNumber / IsControl / IsSeparator / IsPunctuation / IsSymbol tests.
// NOTE(review): the value returned by each group, the switch headers and the
// declaration of "use" are on lines not visible in this listing, so the
// outcome of the individual groups is not documented here. The driver that
// follows this method in the file compares the result against
// CompareOptions.IgnoreSymbols.
3562 static bool IsIgnorableSymbol (int i)
3564 if (IsIgnorable (i))
3569 case 0x00b5: case 0x01C0: case 0x01C1:
3570 case 0x01C2: case 0x01C3: case 0x01F6:
3571 case 0x01F7: case 0x01F8: case 0x01F9:
3572 case 0x02D0: case 0x02EE: case 0x037A:
3573 case 0x03D7: case 0x03F3:
3574 case 0x0400: case 0x040d:
3575 case 0x0450: case 0x045d:
3576 case 0x048C: case 0x048D:
3577 case 0x048E: case 0x048F:
3578 case 0x0587: case 0x0640: case 0x06E5:
3579 case 0x06E6: case 0x06FA: case 0x06FB:
3580 case 0x06FC: case 0x093D: case 0x0950:
3581 case 0x1E9B: case 0x2139: case 0x3006:
3582 case 0x3033: case 0x3034: case 0x3035:
3583 case 0xFE7E: case 0xFE7F:
3585 case 0x16EE: case 0x16EF: case 0x16F0:
3587 case 0x2183: // ROMAN NUMERAL REVERSED ONE HUNDRED
3588 case 0x3007: // IDEOGRAPHIC NUMBER ZERO
3589 case 0x3038: // HANGZHOU NUMERAL TEN
3590 case 0x3039: // HANGZHOU NUMERAL TWENTY
3591 case 0x303a: // HANGZHOU NUMERAL THIRTY
3597 case 0x02B9: case 0x02BA: case 0x02C2:
3598 case 0x02C3: case 0x02C4: case 0x02C5:
3599 case 0x02C8: case 0x02CC: case 0x02CD:
3600 case 0x02CE: case 0x02CF: case 0x02D2:
3601 case 0x02D3: case 0x02D4: case 0x02D5:
3602 case 0x02D6: case 0x02D7: case 0x02DE:
3603 case 0x02E5: case 0x02E6: case 0x02E7:
3604 case 0x02E8: case 0x02E9:
3605 case 0x309B: case 0x309C:
3607 case 0x055A: // Armenian apostrophe
3608 case 0x05C0: // Hebrew Punct
3609 case 0x0E4F: // Thai FONGMAN
3610 case 0x0E5A: // Thai ANGKHANKHU
3611 case 0x0E5B: // Thai KHOMUT
3613 case 0x09F2: // Bengali Rupee Mark
3614 case 0x09F3: // Bengali Rupee Sign
3616 case 0x221e: // INF.
3625 if (0xFE70 <= i && i < 0xFE7C // ARABIC LIGATURES B
3627 || 0x0501 <= i && i <= 0x0510 // CYRILLIC KOMI
3628 || 0xFA30 <= i && i < 0xFA70 // CJK COMPAT
3633 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3635 case UnicodeCategory.Surrogate:
3636 return false; // inconsistent
3638 case UnicodeCategory.SpacingCombiningMark:
3639 case UnicodeCategory.EnclosingMark:
3640 case UnicodeCategory.NonSpacingMark:
3641 case UnicodeCategory.PrivateUse:
3643 if (0x064B <= i && i <= 0x0652) // Arabic
3647 case UnicodeCategory.Format:
3648 case UnicodeCategory.OtherNotAssigned:
3655 // latin in a circle
3656 0x249A <= i && i <= 0x24E9
3657 || 0x2100 <= i && i <= 0x2132
3659 || 0x3196 <= i && i <= 0x31A0
3661 || 0x3200 <= i && i <= 0x321C
3663 || 0x322A <= i && i <= 0x3243
3665 || 0x3260 <= i && i <= 0x32B0
3666 || 0x32D0 <= i && i <= 0x3357
3667 || 0x337B <= i && i <= 0x33DD
3669 use = !Char.IsLetterOrDigit ((char) i);
3673 // This "Digit" rule is mystery.
3674 // It filters some symbols out.
3675 if (Char.IsLetterOrDigit ((char) i))
3677 if (Char.IsNumber ((char) i))
3679 if (Char.IsControl ((char) i)
3680 || Char.IsSeparator ((char) i)
3681 || Char.IsPunctuation ((char) i))
3683 if (Char.IsSymbol ((char) i))
3686 // FIXME: should check more
3691 // To check IsIgnorableSymbol sanity, try the driver below under MS.NET.
// Sanity-check driver for IsIgnorableSymbol (): for every non-surrogate code
// point it compares "TEST " with "TEST " followed by the character, using
// the invariant culture's CompareInfo and CompareOptions.IgnoreSymbols, and
// prints a line whenever IsIgnorableSymbol (i) disagrees with "the two
// strings compare equal".
// NOTE(review): the statement under the surrogate test and the remaining
// WriteLine arguments are not visible in this listing. The class already has
// a Main (string []) entry point, so this driver is presumably disabled by
// lines not visible here -- confirm.
3693 public static void Main ()
3695 CompareInfo ci = CultureInfo.InvariantCulture.CompareInfo;
3696 for (int i = 0; i <= char.MaxValue; i++) {
3697 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3698 if (uc == UnicodeCategory.Surrogate)
3701 bool ret = IsIgnorableSymbol (i);
3703 string s1 = "TEST ";
3704 string s2 = "TEST " + (char) i;
3706 int result = ci.Compare (s1, s2, CompareOptions.IgnoreSymbols);
3708 if (ret != (result == 0))
3709 Console.WriteLine ("{0} : {1:x}[{2}]({3})",
3710 ret ? "should not ignore" :
// Decides whether the code point i is treated as an ignorable non-spacing
// character. The checks run in this order: IsIgnorable (i); two lists of
// individual code points; two groups of code point ranges; and, for
// everything else, whether the Unicode category of i is NonSpacingMark.
// NOTE(review): the switch header and the value returned by each list and
// range group are on lines not visible in this listing, so their outcome is
// not documented here.
3719 static bool IsIgnorableNonSpacing (int i)
3721 if (IsIgnorable (i))
3725 case 0x02C8: case 0x02DE: case 0x0559: case 0x055A:
3726 case 0x05C0: case 0x0ABD: case 0x0CD5: case 0x0CD6:
3727 case 0x309B: case 0x309C: case 0xFF9E: case 0xFF9F:
3729 case 0x02D0: case 0x0670: case 0x0901: case 0x0902:
3730 case 0x094D: case 0x0962: case 0x0963: case 0x0A41:
3731 case 0x0A42: case 0x0A47: case 0x0A48: case 0x0A4B:
3732 case 0x0A4C: case 0x0A81: case 0x0A82: case 0x0B82:
3733 case 0x0BC0: case 0x0CBF: case 0x0CC6: case 0x0CCC:
3734 case 0x0CCD: case 0x0E4E:
3738 if (0x02b9 <= i && i <= 0x02c5
3739 || 0x02cc <= i && i <= 0x02d7
3740 || 0x02e4 <= i && i <= 0x02ef
3741 || 0x20DD <= i && i <= 0x20E0
// 0x00652 below is the same value as 0x0652 (extra leading zero only).
3745 if (0x064B <= i && i <= 0x00652
3746 || 0x0941 <= i && i <= 0x0948
3747 || 0x0AC1 <= i && i <= 0x0ACD
3748 || 0x0C3E <= i && i <= 0x0C4F
3749 || 0x0E31 <= i && i <= 0x0E3F
// Default rule for code points not covered by the tables above.
3753 return Char.GetUnicodeCategory ((char) i) ==
3754 UnicodeCategory.NonSpacingMark;
3757 // The IsIgnorableSymbol test driver can be reused
3758 // to check IsIgnorableNonSpacing as well.
// Per-character map entry fields and constructor.
// NOTE(review): the type header, any Level1 member and most of the
// constructor body are not visible in this excerpt.
3764 public byte Category;
3766 public byte Level2; // It is always single byte.
3767 public bool Defined;
// Builds an entry from a category and two level values; the visible part of
// the body stores the category argument in the Category field.
3769 public CharMapEntry (byte category, byte level1, byte level2)
3771 Category = category;
// Pairs two integer codes for one character.
// NOTE(review): presumably CP is the Unicode code point and JIS the
// corresponding JIS code -- confirm; the type header and the constructor
// body are not visible in this excerpt.
3780 public readonly int CP;
3781 public readonly int JIS;
// Takes the two codes (cp, cpJIS); the assignments are not visible here.
3783 public JISCharacter (int cp, int cpJIS)
// Non-generic IComparer that orders JISCharacter values by their JIS field.
3790 class JISComparer : IComparer
// Shared instance (its initializer is on a line not shown in this excerpt).
3792 public static readonly JISComparer Instance =
// Both arguments must be JISCharacter values; the casts fail at run time
// otherwise. The result is the difference of the two JIS fields, so it is
// negative, zero or positive as o1's JIS is less than, equal to or greater
// than o2's.
// NOTE(review): plain subtraction assumes the JIS values are small enough
// that the difference cannot overflow -- confirm.
3795 public int Compare (object o1, object o2)
3797 JISCharacter j1 = (JISCharacter) o1;
3798 JISCharacter j2 = (JISCharacter) o2;
3799 return j1.JIS - j2.JIS;
// Pairs an integer code with a name string.
// NOTE(review): presumably CP is the Unicode code point and Name the
// character's name -- confirm.
3803 class NonJISCharacter
3805 public readonly int CP;
3806 public readonly string Name;
// Takes the code and the name; the assignments are not visible in this
// excerpt.
3808 public NonJISCharacter (int cp, string name)
// Non-generic IComparer that orders NonJISCharacter objects by Name.
3815 class NonJISComparer : IComparer
// Shared instance.
3817 public static readonly NonJISComparer Instance =
3818 new NonJISComparer ();
// Both arguments must be NonJISCharacter objects; the casts fail at run
// time otherwise. Names are compared with string.CompareOrdinal, i.e. an
// ordinal (culture-independent) comparison.
3820 public int Compare (object o1, object o2)
3822 NonJISCharacter j1 = (NonJISCharacter) o1;
3823 NonJISCharacter j2 = (NonJISCharacter) o2;
3824 return string.CompareOrdinal (j1.Name, j2.Name);
// Non-generic IComparer for DictionaryEntry values whose Value is a decimal
// and whose Key is an int.
3828 class DecimalDictionaryValueComparer : IComparer
// Shared instance; the constructor is private, so other code uses this.
3830 public static readonly DecimalDictionaryValueComparer Instance
3831 = new DecimalDictionaryValueComparer ();
3833 private DecimalDictionaryValueComparer ()
// Compares the two entries' Values with Decimal.Compare, then reads both
// Keys as int.
// NOTE(review): the lines that return are not visible in this excerpt;
// presumably the keys act as a tie-breaker when the values compare equal --
// confirm.
3837 public int Compare (object o1, object o2)
3839 DictionaryEntry e1 = (DictionaryEntry) o1;
3840 DictionaryEntry e2 = (DictionaryEntry) o2;
3841 // FIXME: in case of 0, compare decomposition categories
3842 int ret = Decimal.Compare ((decimal) e1.Value, (decimal) e2.Value);
3845 int i1 = (int) e1.Key;
3846 int i2 = (int) e2.Key;
// Non-generic IComparer for DictionaryEntry values whose Value is a string
// and whose Key is an int.
3851 class StringDictionaryValueComparer : IComparer
// Shared instance; the constructor is private, so other code uses this.
3853 public static readonly StringDictionaryValueComparer Instance
3854 = new StringDictionaryValueComparer ();
3856 private StringDictionaryValueComparer ()
// Compares the two entries' Values with String.Compare (string, string),
// which is a culture-sensitive comparison, then reads both Keys as int.
// NOTE(review): the lines that return are not visible in this excerpt;
// presumably the keys act as a tie-breaker when the values compare equal --
// confirm.
3860 public int Compare (object o1, object o2)
3862 DictionaryEntry e1 = (DictionaryEntry) o1;
3863 DictionaryEntry e2 = (DictionaryEntry) o2;
3864 int ret = String.Compare ((string) e1.Value, (string) e2.Value);
3867 int i1 = (int) e1.Key;
3868 int i2 = (int) e2.Key;
// Non-generic IComparer for boxed char values, based on the sort keys that
// CollationElementTable reports for each character.
3873 class UCAComparer : IComparer
// Shared instance; the constructor is private, so other code uses this.
3875 public static readonly UCAComparer Instance
3876 = new UCAComparer ();
3878 private UCAComparer ()
// Both arguments must be boxed chars; the casts fail at run time otherwise.
3882 public int Compare (object o1, object o2)
3884 char i1 = (char) o1;
3885 char i2 = (char) o2;
// Only the sort keys both characters have (the smaller of the two counts)
// are compared position by position.
3887 int l1 = CollationElementTable.GetSortKeyCount (i1);
3888 int l2 = CollationElementTable.GetSortKeyCount (i2);
3889 int l = l1 > l2 ? l2 : l1;
// For each shared position the differences of the Primary, Secondary,
// Thirtiary and Quarternary members are computed in that order.
// NOTE(review): the statements between these assignments and the code after
// the loop are not visible in this excerpt; presumably the first non-zero
// difference is returned -- confirm.
3891 for (int i = 0; i < l; i++) {
3892 SortKeyValue k1 = CollationElementTable.GetSortKey (i1, i);
3893 SortKeyValue k2 = CollationElementTable.GetSortKey (i2, i);
3894 int v = k1.Primary - k2.Primary;
3897 v = k1.Secondary - k2.Secondary;
3900 v = k1.Thirtiary - k2.Thirtiary;
3903 v = k1.Quarternary - k2.Quarternary;
// Members of the Tailoring type: a list of tailoring map items plus an
// lcid, an alias and a French-sort flag.
// NOTE(review): the type header, the lcid/alias/frenchSort field
// declarations, the constructor bodies and the property headers for the two
// bare getters are not visible in this excerpt.

// Map items in the order they were added through the Add* methods below.
3916 ArrayList items = new ArrayList ();
// Constructors taking an lcid, optionally with an alias.
3918 public Tailoring (int lcid)
3923 public Tailoring (int lcid, int alias)
// Getters returning the lcid and alias values.
3930 get { return lcid; }
3934 get { return alias; }
// Read/write flag backed by the frenchSort field.
3937 public bool FrenchSort {
3938 get { return frenchSort; }
3939 set { frenchSort = value; }
// Appends a DiacriticalMap built from (target, replace) to the item list.
3942 public void AddDiacriticalMap (byte target, byte replace)
3944 items.Add (new DiacriticalMap (target, replace));
// Appends a SortKeyMap built from (source, sortkey) to the item list.
3947 public void AddSortKeyMap (string source, byte [] sortkey)
3949 items.Add (new SortKeyMap (source, sortkey));
// Appends a ReplacementMap built from (source, replace) to the item list.
3952 public void AddReplacementMap (string source, string replace)
3954 items.Add (new ReplacementMap (source, replace));
// Concatenates the ToCharArray () output of every item, in insertion order,
// and returns the result as a single char array.
3957 public char [] ItemToCharArray ()
3959 ArrayList al = new ArrayList ();
3960 foreach (ITailoringMap m in items)
3961 al.AddRange (m.ToCharArray ());
3962 return al.ToArray (typeof (char)) as char [];
// Contract for tailoring map items: each one can serialize itself to a
// char array. Tailoring.ItemToCharArray concatenates these arrays.
3965 interface ITailoringMap
3967 char [] ToCharArray ();
// Tailoring map item holding a (Target, Replace) byte pair.
3970 class DiacriticalMap : ITailoringMap
3972 public readonly byte Target;
3973 public readonly byte Replace;
// Takes the pair; the assignments are not visible in this excerpt.
3975 public DiacriticalMap (byte target, byte replace)
// Serializes to three chars: the kind marker 2, then Target, then Replace.
// NOTE(review): the return statement is not visible in this excerpt.
3981 public char [] ToCharArray ()
3983 char [] ret = new char [3];
3984 ret [0] = (char) 02; // kind:DiacriticalMap
3985 ret [1] = (char) Target;
3986 ret [2] = (char) Replace;
// Tailoring map item holding a source string and its sort key bytes.
3991 class SortKeyMap : ITailoringMap
3993 public readonly string Source;
3994 public readonly byte [] SortKey;
// Takes the pair; the assignments are not visible in this excerpt.
3996 public SortKeyMap (string source, byte [] sortkey)
// Serializes into an array of Source.Length + 7 chars:
//   [0]                        kind marker 1
//   [1 .. Source.Length]       the characters of Source
//   [Source.Length + 2 .. +5]  the first four bytes of SortKey
// SortKey must have at least four elements, otherwise indexing it throws.
// NOTE(review): no visible line writes slot Source.Length + 1 or the last
// slot, so they would keep the default '\0'; a line between the two loops
// and the return statement are not visible in this excerpt -- confirm.
4002 public char [] ToCharArray ()
4004 char [] ret = new char [Source.Length + 7];
4005 ret [0] = (char) 01; // kind:SortKeyMap
4006 for (int i = 0; i < Source.Length; i++)
4007 ret [i + 1] = Source [i];
4009 for (int i = 0; i < 4; i++)
4010 ret [i + Source.Length + 2] = (char) SortKey [i];
4015 class ReplacementMap : ITailoringMap
4017 public readonly string Source;
4018 public readonly string Replace;
4020 public ReplacementMap (string source, string replace)
4026 public char [] ToCharArray ()
4028 char [] ret = new char [Source.Length + Replace.Length + 3];
4029 ret [0] = (char) 03; // kind:ReplaceMap
4031 for (int i = 0; i < Source.Length; i++)
4032 ret [pos++] = Source [i];
4035 for (int i = 0; i < Replace.Length; i++)
4036 ret [pos++] = Replace [i];