3 // There are two kinds of sort keys: those which are computed and those which are
4 // laid out as an indexed array. Computed sort keys are:
9 // Also, for composite characters it should prepare a different index table.
11 // Though it is possible to "compute" level 3 weights, they are still dumped
12 // to an array to avoid execution cost.
16 // * sortkey getter signature
18 // int GetSortKey (string s, int index, SortKeyBuffer buf)
19 // Stores the sort key for the corresponding character element into buf and
20 // returns the length of the consumed _source_ character element in s.
22 // * character length to consume
24 // If there are characters whose primary weight is 0, they are consumed
25 // and considered as a part of the character element.
30 using System.Collections;
31 using System.Globalization;
35 namespace Mono.Globalization.Unicode
37 internal class MSCompatSortKeyTableGenerator
// Program entry point: builds a generator instance and hands it the
// command-line arguments unchanged.
public static void Main (string [] args)
{
	MSCompatSortKeyTableGenerator generator =
		new MSCompatSortKeyTableGenerator ();
	generator.Run (args);
}
44 const int DecompositionWide = 1; // fixed
45 const int DecompositionSub = 2; // fixed
46 const int DecompositionSmall = 3;
47 const int DecompositionIsolated = 4;
48 const int DecompositionInitial = 5;
49 const int DecompositionFinal = 6;
50 const int DecompositionMedial = 7;
51 const int DecompositionNoBreak = 8;
52 const int DecompositionVertical = 9;
53 const int DecompositionFraction = 0xA;
54 const int DecompositionFont = 0xB;
55 const int DecompositionSuper = 0xC; // fixed
56 const int DecompositionFull = 0xE;
57 const int DecompositionNarrow = 0xD;
58 const int DecompositionCircle = 0xF;
59 const int DecompositionSquare = 0x10;
60 const int DecompositionCompat = 0x11;
61 const int DecompositionCanonical = 0x12;
63 TextWriter Result = Console.Out;
65 byte [] fillIndex = new byte [256]; // by category
66 CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1];
68 char [] specialIgnore = new char [] {
69 '\u3099', '\u309A', '\u309B', '\u309C', '\u0BCD',
70 '\u0E47', '\u0E4C', '\uFF9E', '\uFF9F'
73 // FIXME: need more love (as always)
74 char [] alphabets = new char [] {'A', 'B', 'C', 'D', 'E', 'F',
75 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
76 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
77 '\u0292', '\u01BE', '\u0298'};
78 byte [] alphaWeights = new byte [] {
79 2, 9, 0xA, 0x1A, 0x21,
80 0x23, 0x25, 0x2C, 0x32, 0x35,
81 0x36, 0x48, 0x51, 0x70, 0x7C,
82 0x7E, 0x89, 0x8A, 0x91, 0x99,
83 0x9F, 0xA2, 0xA4, 0xA6, 0xA7,
84 0xA9, 0xAA, 0xB3, 0xB4};
86 bool [] isSmallCapital = new bool [char.MaxValue + 1];
87 bool [] isUppercase = new bool [char.MaxValue + 1];
89 byte [] decompType = new byte [char.MaxValue + 1];
90 int [] decompIndex = new int [char.MaxValue + 1];
91 int [] decompLength = new int [char.MaxValue + 1];
93 decimal [] decimalValue = new decimal [char.MaxValue + 1];
95 byte [] diacritical = new byte [char.MaxValue + 1];
97 string [] diacritics = new string [] {
99 "WITH ACUTE;", "WITH GRAVE;", " DOT ABOVE;", " MIDDLE DOT;",
100 "WITH CIRCUMFLEX;", "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;",
101 " DIALYTIKA AND TONOS;", "WITH MACRON;", "WITH TILDE;", " RING ABOVE;",
102 " OGONEK;", " CEDILLA;",
103 " DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;",
104 " STROKE;", " CIRCUMFLEX AND ACUTE;",
105 " DIAERESIS AND ACUTE;", "WITH CIRCUMFLEX AND GRAVE;", " L SLASH;",
106 " DIAERESIS AND GRAVE;",
108 " CARON AND DOT ABOVE;", " BREVE AND GRAVE;",
109 " MACRON AND ACUTE;",
110 " MACRON AND GRAVE;",
111 " DIAERESIS AND CARON", " DOT ABOVE AND MACRON", " TILDE AND ACUTE",
112 " RING ABOVE AND ACUTE",
113 " DIAERESIS AND MACRON", " CEDILLA AND ACUTE", " MACRON AND DIAERESIS",
114 " CIRCUMFLEX AND TILDE",
115 " TILDE AND DIAERESIS",
118 " CEDILLA AND BREVE",
119 " OGONEK AND MACRON",
120 " HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;",
123 " PRECEDED BY APOSTROPHE",
125 " LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE",
128 " RETROFLEX;", "DIAERESIS BELOW",
130 " CIRCUMFLEX BELOW", "HORN AND ACUTE",
131 " BREVE BELOW;", " HORN AND GRAVE",
133 " DOT BELOW AND DOT ABOVE",
134 " RIGHT HALF RING", " HORN AND TILDE",
135 " CIRCUMFLEX AND DOT BELOW",
136 " BREVE AND DOT BELOW",
137 " DOT BELOW AND MACRON",
138 " HORN AND HOOK ABOVE",
140 // CIRCLED, PARENTHESIZED and so on
141 "CIRCLED DIGIT", "CIRCLED NUMBER", "CIRCLED LATIN", "CIRCLED KATAKANA",
142 "PARENTHESIZED DIGIT", "PARENTHESIZED NUMBER", "PARENTHESIZED LATIN",
144 byte [] diacriticWeights = new byte [] {
146 0xE, 0xF, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
147 0x17, 0x19, 0x1A, 0x1B, 0x1C,
148 0x1D, 0x1D, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F,
149 0x20, 0x21, 0x22, 0x22, 0x23, 0x24,
150 0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28,
151 0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30,
152 0x43, 0x43, 0x43, 0x44, 0x46, 0x48,
153 0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x5A,
154 0x60, 0x60, 0x61, 0x61, 0x63, 0x68,
155 0x69, 0x69, 0x6A, 0x6D, 0x6E,
157 // CIRCLED, PARENTHESIZED and so on.
158 0xEE, 0xEE, 0xEE, 0xEE, 0xF3, 0xF3, 0xF3, 0xF3
161 int [] numberSecondaryWeightBounds = new int [] {
162 0x660, 0x680, 0x6F0, 0x700, 0x960, 0x970,
163 0x9E0, 0x9F0, 0x9F4, 0xA00, 0xA60, 0xA70,
164 0xAE0, 0xAF0, 0xB60, 0xB70, 0xBE0, 0xC00,
165 0xC60, 0xC70, 0xCE0, 0xCF0, 0xD60, 0xD70,
166 0xE50, 0xE60, 0xED0, 0xEE0
169 char [] orderedCyrillic;
170 char [] orderedGurmukhi;
171 char [] orderedGujarati;
172 char [] orderedGeorgian;
173 char [] orderedThaana;
175 static readonly char [] orderedTamilConsonants = new char [] {
176 // based on traditional Tamil consonants, except for
177 // Grantha (where Microsoft breaks traditionalism).
178 // http://www.angelfire.com/empire/thamizh/padanGaL
179 '\u0B99', '\u0B9A', '\u0B9E', '\u0B9F', '\u0BA3',
180 '\u0BA4', '\u0BA8', '\u0BAA', '\u0BAE', '\u0BAF',
181 '\u0BB0', '\u0BB2', '\u0BB5', '\u0BB4', '\u0BB3',
182 '\u0BB1', '\u0BA9', '\u0B9C', '\u0BB8', '\u0BB7',
185 // cp -> character name (only for some characters)
186 ArrayList sortableCharNames = new ArrayList ();
188 // cp -> arrow value (int)
189 ArrayList arrowValues = new ArrayList ();
191 // cp -> box value (int)
192 ArrayList boxValues = new ArrayList ();
194 // cp -> level1 value
195 Hashtable arabicLetterPrimaryValues = new Hashtable ();
196 Hashtable cyrillicLetterPrimaryValues = new Hashtable ();
199 Hashtable arabicNameMap = new Hashtable ();
200 Hashtable cyrillicNameMap = new Hashtable ();
202 // cp -> Hashtable [decompType] -> cp
203 Hashtable nfkdMap = new Hashtable ();
205 // Latin letter -> ArrayList [int]
206 Hashtable latinMap = new Hashtable ();
208 ArrayList jisJapanese = new ArrayList ();
209 ArrayList nonJisJapanese = new ArrayList ();
211 ushort [] cjkJA = new ushort [char.MaxValue +1];// - 0x4E00];
212 ushort [] cjkCHS = new ushort [char.MaxValue +1];// - 0x3100];
213 ushort [] cjkCHT = new ushort [char.MaxValue +1];// - 0x4E00];
214 ushort [] cjkKO = new ushort [char.MaxValue +1];// - 0x4E00];
215 byte [] cjkKOlv2 = new byte [char.MaxValue +1];// - 0x4E00];
217 byte [] ignorableFlags = new byte [char.MaxValue + 1];
219 static double [] unicodeAge = new double [char.MaxValue + 1];
221 ArrayList tailorings = new ArrayList ();
223 void Run (string [] args)
225 string dirname = args.Length == 0 ? "downloaded" : args [0];
226 ParseSources (dirname);
227 Console.Error.WriteLine ("parse done.");
229 ModifyParsedValues ();
231 Console.Error.WriteLine ("generation done.");
233 Console.Error.WriteLine ("serialization done.");
235 StreamWriter sw = new StreamWriter ("agelog.txt");
236 for (int i = 0; i < char.MaxValue; i++) {
237 bool shouldBe = false;
238 switch (Char.GetUnicodeCategory ((char) i)) {
239 case UnicodeCategory.Format: case UnicodeCategory.OtherNotAssigned:
240 shouldBe = true; break;
242 if (unicodeAge [i] >= 3.1)
244 //if (IsIgnorable (i) != shouldBe)
245 sw.WriteLine ("{1} {2} {3} {0:X04} {4} {5}", i, unicodeAge [i], IsIgnorable (i), IsIgnorableSymbol (i), char.GetUnicodeCategory ((char) i), IsIgnorable (i) != shouldBe ? '!' : ' ');
// Byte-table overload: forwards to CodePointIndexer.CompressArray with
// typeof (byte) as the element type and casts the result back to byte [].
byte [] CompressArray (byte [] source, CodePointIndexer i)
{
	object compressed = CodePointIndexer.CompressArray (
		source, typeof (byte), i);
	return (byte []) compressed;
}
// UInt16-table overload: forwards to CodePointIndexer.CompressArray with
// typeof (ushort) as the element type and casts the result back to ushort [].
ushort [] CompressArray (ushort [] source, CodePointIndexer i)
{
	object compressed = CodePointIndexer.CompressArray (
		source, typeof (ushort), i);
	return (ushort []) compressed;
}
266 SerializeTailorings ();
268 byte [] categories = new byte [map.Length];
269 byte [] level1 = new byte [map.Length];
270 byte [] level2 = new byte [map.Length];
271 byte [] level3 = new byte [map.Length];
272 int [] widthCompat = new int [map.Length];
273 for (int i = 0; i < map.Length; i++) {
274 categories [i] = map [i].Category;
275 level1 [i] = map [i].Level1;
276 level2 [i] = map [i].Level2;
277 level3 [i] = ComputeLevel3Weight ((char) i);
278 switch (decompType [i]) {
279 case DecompositionNarrow:
280 case DecompositionWide:
281 case DecompositionSuper:
282 case DecompositionSub:
283 // they are always 1 char
284 widthCompat [i] = decompValues [decompIndex [i]];
290 ignorableFlags = CompressArray (ignorableFlags,
291 MSCompatUnicodeTableUtil.Ignorable);
292 categories = CompressArray (categories,
293 MSCompatUnicodeTableUtil.Category);
294 level1 = CompressArray (level1,
295 MSCompatUnicodeTableUtil.Level1);
296 level2 = CompressArray (level2,
297 MSCompatUnicodeTableUtil.Level2);
298 level3 = CompressArray (level3,
299 MSCompatUnicodeTableUtil.Level3);
300 widthCompat = (int []) CodePointIndexer.CompressArray (
301 widthCompat, typeof (int),
302 MSCompatUnicodeTableUtil.WidthCompat);
303 cjkCHS = CompressArray (cjkCHS,
304 MSCompatUnicodeTableUtil.CjkCHS);
305 cjkCHT = CompressArray (cjkCHT,
306 MSCompatUnicodeTableUtil.Cjk);
307 cjkJA = CompressArray (cjkJA,
308 MSCompatUnicodeTableUtil.Cjk);
309 cjkKO = CompressArray (cjkKO,
310 MSCompatUnicodeTableUtil.Cjk);
311 cjkKOlv2 = CompressArray (cjkKOlv2,
312 MSCompatUnicodeTableUtil.Cjk);
315 Result.WriteLine ("static byte [] ignorableFlags = new byte [] {");
316 for (int i = 0; i < ignorableFlags.Length; i++) {
317 byte value = ignorableFlags [i];
319 Result.Write ("{0},", value);
321 Result.Write ("0x{0:X02},", value);
322 if ((i & 0xF) == 0xF)
323 Result.WriteLine ("// {0:X04}", i - 0xF);
325 Result.WriteLine ("};");
329 Result.WriteLine ("static byte [] categories = new byte [] {");
330 for (int i = 0; i < categories.Length; i++) {
331 byte value = categories [i];
333 Result.Write ("{0},", value);
335 Result.Write ("0x{0:X02},", value);
336 if ((i & 0xF) == 0xF)
337 Result.WriteLine ("// {0:X04}", i - 0xF);
339 Result.WriteLine ("};");
342 // Primary weight value
343 Result.WriteLine ("static byte [] level1 = new byte [] {");
344 for (int i = 0; i < level1.Length; i++) {
345 byte value = level1 [i];
347 Result.Write ("{0},", value);
349 Result.Write ("0x{0:X02},", value);
350 if ((i & 0xF) == 0xF)
351 Result.WriteLine ("// {0:X04}", i - 0xF);
353 Result.WriteLine ("};");
357 Result.WriteLine ("static byte [] level2 = new byte [] {");
358 for (int i = 0; i < level2.Length; i++) {
359 int value = level2 [i];
361 Result.Write ("{0},", value);
363 Result.Write ("0x{0:X02},", value);
364 if ((i & 0xF) == 0xF)
365 Result.WriteLine ("// {0:X04}", i - 0xF);
367 Result.WriteLine ("};");
371 Result.WriteLine ("static byte [] level3 = new byte [] {");
372 for (int i = 0; i < level3.Length; i++) {
373 byte value = level3 [i];
375 Result.Write ("{0},", value);
377 Result.Write ("0x{0:X02},", value);
378 if ((i & 0xF) == 0xF)
379 Result.WriteLine ("// {0:X04}", i - 0xF);
381 Result.WriteLine ("};");
384 // Width insensitivity mappings
385 // (for now it is more lightweight than dumping the
386 // entire NFKD table).
387 Result.WriteLine ("static int [] widthCompat = new int [] {");
388 for (int i = 0; i < widthCompat.Length; i++) {
389 int value = widthCompat [i];
391 Result.Write ("{0},", value);
393 Result.Write ("0x{0:X02},", value);
394 if ((i & 0xF) == 0xF)
395 Result.WriteLine ("// {0:X04}", i - 0xF);
397 Result.WriteLine ("};");
401 SerializeCJK ("cjkCHS", cjkCHS, char.MaxValue);
402 SerializeCJK ("cjkCHT", cjkCHT, 0x9FB0);
403 SerializeCJK ("cjkJA", cjkJA, 0x9FB0);
404 SerializeCJK ("cjkKO", cjkKO, 0x9FB0);
405 SerializeCJK ("cjkKOlv2", cjkKOlv2, 0x9FB0);
408 void SerializeCJK (string name, ushort [] cjk, int max)
410 int offset = 0;//char.MaxValue - cjk.Length;
411 Result.WriteLine ("static ushort [] {0} = new ushort [] {{", name);
412 for (int i = 0; i < cjk.Length; i++) {
413 if (i + offset == max)
415 ushort value = cjk [i];
417 Result.Write ("{0},", value);
419 Result.Write ("0x{0:X04},", value);
420 if ((i & 0xF) == 0xF)
421 Result.WriteLine ("// {0:X04}", i - 0xF + offset);
423 Result.WriteLine ("};");
427 void SerializeCJK (string name, byte [] cjk, int max)
429 int offset = 0;//char.MaxValue - cjk.Length;
430 Result.WriteLine ("static byte [] {0} = new byte [] {{", name);
431 for (int i = 0; i < cjk.Length; i++) {
432 if (i + offset == max)
434 byte value = cjk [i];
436 Result.Write ("{0},", value);
438 Result.Write ("0x{0:X02},", value);
439 if ((i & 0xF) == 0xF)
440 Result.WriteLine ("// {0:X04}", i - 0xF + offset);
442 Result.WriteLine ("};");
446 void SerializeTailorings ()
448 Hashtable indexes = new Hashtable ();
449 Hashtable counts = new Hashtable ();
450 Result.WriteLine ("static char [] tailorings = new char [] {");
452 foreach (Tailoring t in tailorings) {
455 Result.Write ("/*{0}*/", t.LCID);
456 indexes.Add (t.LCID, count);
457 char [] values = t.ItemToCharArray ();
458 counts.Add (t.LCID, values.Length);
459 foreach (char c in values) {
460 Result.Write ("'\\x{0:X}', ", (int) c);
461 if (++count % 16 == 0)
462 Result.WriteLine (" // {0:X04}", count - 16);
465 Result.WriteLine ("};");
467 Result.WriteLine ("static TailoringInfo [] tailoringInfos = new TailoringInfo [] {");
468 foreach (Tailoring t in tailorings) {
469 int target = t.Alias != 0 ? t.Alias : t.LCID;
470 if (!indexes.ContainsKey (target)) {
471 Console.Error.WriteLine ("WARNING: no corresponding definition for tailoring alias. From {0} to {1}", t.LCID, t.Alias);
474 int idx = (int) indexes [target];
475 int cnt = (int) counts [target];
476 bool french = t.FrenchSort;
478 foreach (Tailoring t2 in tailorings)
479 if (t2.LCID == t.LCID)
480 french = t2.FrenchSort;
481 Result.WriteLine ("new TailoringInfo ({0}, 0x{1:X}, {2}, {3}), ", t.LCID, idx, cnt, french ? "true" : "false");
483 Result.WriteLine ("};");
488 void ParseSources (string dirname)
491 dirname + "/UnicodeData.txt";
492 string derivedCoreProps =
493 dirname + "/DerivedCoreProperties.txt";
495 dirname + "/Scripts.txt";
497 dirname + "/CP932.TXT";
499 dirname + "/DerivedAge.txt";
500 string chXML = dirname + "/common/collation/zh.xml";
501 string jaXML = dirname + "/common/collation/ja.xml";
502 string koXML = dirname + "/common/collation/ko.xml";
504 ParseDerivedAge (derivedAge);
508 ParseJISOrder (cp932); // in prior to ParseUnidata()
509 ParseUnidata (unidata);
510 ParseDerivedCoreProperties (derivedCoreProps);
511 ParseScripts (scripts);
512 ParseCJK (chXML, jaXML, koXML);
514 ParseTailorings ("mono-tailoring-source.txt");
517 void ParseTailorings (string filename)
521 using (StreamReader sr = new StreamReader (filename)) {
523 while (sr.Peek () >= 0) {
525 ProcessTailoringLine (ref t,
526 sr.ReadLine ().Trim ());
528 } catch (Exception) {
529 Console.Error.WriteLine ("ERROR at line {0}", line);
535 // For now this is enough.
536 string ParseTailoringSourceValue (string s)
538 StringBuilder sb = new StringBuilder ();
539 for (int i = 0; i < s.Length; i++) {
540 if (s.StartsWith ("\\u")) {
541 sb.Append ((char) int.Parse (
542 s.Substring (2, 4), NumberStyles.HexNumber),
549 return sb.ToString ();
552 void ProcessTailoringLine (ref Tailoring t, string s)
554 int idx = s.IndexOf ('#');
556 s = s.Substring (0, idx).Trim ();
557 if (s.Length == 0 || s [0] == '#')
560 idx = s.IndexOf ('=');
563 int.Parse (s.Substring (1, idx - 1)),
564 int.Parse (s.Substring (idx + 1)));
566 t = new Tailoring (int.Parse (s.Substring (1)));
570 if (s.StartsWith ("*FrenchSort")) {
574 string d = "*Diacritical";
575 if (s.StartsWith (d)) {
576 idx = s.IndexOf ("->");
577 t.AddDiacriticalMap (
578 byte.Parse (s.Substring (d.Length, idx - d.Length).Trim (),
579 NumberStyles.HexNumber),
580 byte.Parse (s.Substring (idx + 2).Trim (),
581 NumberStyles.HexNumber));
584 idx = s.IndexOf (':');
586 string source = s.Substring (0, idx).Trim ();
587 string [] l = s.Substring (idx + 1).Trim ().Split (' ');
588 byte [] b = new byte [5];
589 for (int i = 0; i < 5; i++) {
593 b [i] = byte.Parse (l [i],
594 NumberStyles.HexNumber);
596 t.AddSortKeyMap (ParseTailoringSourceValue (source),
599 idx = s.IndexOf ('=');
601 t.AddReplacementMap (
602 ParseTailoringSourceValue (
603 s.Substring (0, idx).Trim ()),
604 ParseTailoringSourceValue (
605 s.Substring (idx + 1).Trim ()));
608 void ParseDerivedAge (string filename)
610 using (StreamReader file =
611 new StreamReader (filename)) {
612 while (file.Peek () >= 0) {
613 string s = file.ReadLine ();
614 int idx = s.IndexOf ('#');
616 s = s.Substring (0, idx);
617 idx = s.IndexOf (';');
621 string cpspec = s.Substring (0, idx);
622 idx = cpspec.IndexOf ("..");
623 NumberStyles nf = NumberStyles.HexNumber |
624 NumberStyles.AllowTrailingWhite;
625 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
626 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
627 string value = s.Substring (cpspec.Length + 1).Trim ();
630 if (cp > char.MaxValue)
633 double v = double.Parse (value);
634 for (int i = cp; i <= cpEnd; i++)
638 unicodeAge [0] = double.MaxValue; // never be supported
641 void ParseUnidata (string filename)
643 ArrayList decompValues = new ArrayList ();
644 using (StreamReader unidata =
645 new StreamReader (filename)) {
646 for (int line = 1; unidata.Peek () >= 0; line++) {
648 ProcessUnidataLine (unidata.ReadLine (), decompValues);
649 } catch (Exception) {
650 Console.Error.WriteLine ("**** At line " + line);
655 this.decompValues = (int [])
656 decompValues.ToArray (typeof (int));
659 void ProcessUnidataLine (string s, ArrayList decompValues)
661 int idx = s.IndexOf ('#');
663 s = s.Substring (0, idx);
664 idx = s.IndexOf (';');
667 int cp = int.Parse (s.Substring (0, idx), NumberStyles.HexNumber);
668 string [] values = s.Substring (idx + 1).Split (';');
671 if (cp > char.MaxValue)
673 if (IsIgnorable (cp))
676 string name = values [0];
679 if (s.IndexOf ("SMALL CAPITAL") > 0)
680 isSmallCapital [cp] = true;
682 // latin mapping by character name
683 if (s.IndexOf ("LATIN") > 0) {
684 int lidx = s.IndexOf ("LETTER DOTLESS ");
685 int offset = lidx + 15;
687 lidx = s.IndexOf ("LETTER TURNED ");
691 lidx = s.IndexOf ("LETTER ");
694 char c = lidx > 0 ? s [offset] : char.MinValue;
695 if ('A' <= c && c <= 'Z' &&
696 (s.Length == offset + 1 || s [offset + 1] == ' ')) {
697 ArrayList entry = (ArrayList) latinMap [c];
699 entry = new ArrayList ();
700 latinMap [c] = entry;
707 if (0x2000 <= cp && cp < 0x3000) {
709 // SPECIAL CASES. FIXME: why?
711 case 0x21C5: value = -1; break; // E2
712 case 0x261D: value = 1; break;
713 case 0x27A6: value = 3; break;
714 case 0x21B0: value = 7; break;
715 case 0x21B1: value = 3; break;
716 case 0x21B2: value = 7; break;
717 case 0x21B4: value = 5; break;
718 case 0x21B5: value = 7; break;
719 case 0x21B9: value = -1; break; // E1
720 case 0x21CF: value = 7; break;
721 case 0x21D0: value = 3; break;
723 string [] arrowTargets = new string [] {
735 for (int i = 1; value == 0 && i < arrowTargets.Length; i++)
736 if (s.IndexOf (arrowTargets [i]) > 0 &&
737 s.IndexOf ("BARB " + arrowTargets [i]) < 0 &&
738 s.IndexOf (" OVER") < 0
742 arrowValues.Add (new DictionaryEntry (
747 if (0x2500 <= cp && cp < 0x25B0) {
750 // up:1 down:2 right:4 left:8 vert:16 horiz:32
753 // [dr] [dl] [ur] [ul]
757 ArrayList flags = new ArrayList (new int [] {
760 4 + 2, 8 + 2, 4 + 1, 8 + 1,
761 16 + 4, 1 + 2 + 4, 16 + 8, 1 + 2 + 8,
762 32 + 2, 4 + 8 + 2, 32 + 1, 4 + 8 + 1,
763 16 + 32, 1 + 2 + 4 + 8, 4 + 8 + 16, 1 + 2 + 32
765 byte [] offsets = new byte [] {
772 if (s.IndexOf ("BOX DRAWINGS ") > 0) {
774 if (s.IndexOf (" UP") > 0)
776 if (s.IndexOf (" DOWN") > 0)
778 if (s.IndexOf (" RIGHT") > 0)
780 if (s.IndexOf (" LEFT") > 0)
782 if (s.IndexOf (" VERTICAL") > 0)
784 if (s.IndexOf (" HORIZONTAL") > 0)
787 int fidx = flags.IndexOf (flag);
788 value = fidx < 0 ? fidx : offsets [fidx];
789 } else if (s.IndexOf ("BLOCK") > 0) {
790 if (s.IndexOf ("ONE EIGHTH") > 0)
792 else if (s.IndexOf ("ONE QUARTER") > 0)
794 else if (s.IndexOf ("THREE EIGHTHS") > 0)
796 else if (s.IndexOf ("HALF") > 0)
798 else if (s.IndexOf ("FIVE EIGHTHS") > 0)
800 else if (s.IndexOf ("THREE QUARTERS") > 0)
802 else if (s.IndexOf ("SEVEN EIGHTHS") > 0)
808 boxValues.Add (new DictionaryEntry (
812 // For some characters, store the name and sort later
813 // to determine their order.
814 if (0x2100 <= cp && cp <= 0x213F &&
815 Char.IsSymbol ((char) cp))
816 sortableCharNames.Add (
817 new DictionaryEntry (cp, values [0]));
818 else if (0x3380 <= cp && cp <= 0x33DD)
819 sortableCharNames.Add (new DictionaryEntry (
820 cp, values [0].Substring (7)));
822 // diacritical weights by character name
823 for (int d = 0; d < diacritics.Length; d++)
824 if (s.IndexOf (diacritics [d]) > 0)
825 diacritical [cp] |= diacriticWeights [d];
826 // Two-step grep required for it.
827 if (s.IndexOf ("FULL STOP") > 0 &&
828 (s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0))
829 diacritical [cp] |= 0xF4;
831 // Cyrillic letter name
832 if (0x0430 <= cp && cp <= 0x0486 &&
833 Char.IsLetter ((char) cp)) {
834 byte value = (byte) (cyrillicNameMap.Count * 3 + 0x06);
835 // Get primary letter name i.e.
836 // XXX part of CYRILLIC LETTER XXX yyy
837 // e.g. "IZHITSA" for "IZHITSA DOUBLE GRAVE".
839 values [0].Substring (values [0].IndexOf ("LETTER ") + 7);
840 int tmpIdx = letterName.IndexOf (' ');
841 letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
842 //Console.Error.WriteLine ("Cyrillic name for {0:X04} is {1}", cp, letterName);
843 if (cyrillicNameMap.ContainsKey (letterName))
844 value = (byte) cyrillicLetterPrimaryValues [cyrillicNameMap [letterName]];
846 cyrillicNameMap [letterName] = cp;
848 cyrillicLetterPrimaryValues [cp] = value;
851 // Arabic letter name
852 if (0x0621 <= cp && cp <= 0x064A &&
853 Char.GetUnicodeCategory ((char) cp)
854 == UnicodeCategory.OtherLetter) {
855 byte value = (byte) (arabicNameMap.Count * 4 + 0x0B);
860 // hamza, waw, yeh ... special cases.
865 value = 0x77; // special cases.
868 // Get primary letter name i.e.
869 // XXX part of ARABIC LETTER XXX yyy
870 // e.g. that of "TEH MARBUTA" is "TEH".
873 // 0x0640 is special: it does
874 // not start with ARABIC LETTER
876 values [0].Substring (14);
877 int tmpIdx = letterName.IndexOf (' ');
878 letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
879 //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
880 if (arabicNameMap.ContainsKey (letterName))
881 value = (byte) arabicLetterPrimaryValues [arabicNameMap [letterName]];
883 arabicNameMap [letterName] = cp;
886 arabicLetterPrimaryValues [cp] = value;
889 // Japanese square letter
890 if (0x3300 <= cp && cp <= 0x3357)
892 nonJisJapanese.Add (new NonJISCharacter (cp, values [0]));
895 string decomp = values [4];
896 idx = decomp.IndexOf ('<');
898 switch (decomp.Substring (idx + 1, decomp.IndexOf ('>') - 1)) {
900 decompType [cp] = DecompositionFull;
903 decompType [cp] = DecompositionSub;
906 decompType [cp] = DecompositionSuper;
909 decompType [cp] = DecompositionSmall;
912 decompType [cp] = DecompositionIsolated;
915 decompType [cp] = DecompositionInitial;
918 decompType [cp] = DecompositionFinal;
921 decompType [cp] = DecompositionMedial;
924 decompType [cp] = DecompositionNoBreak;
927 decompType [cp] = DecompositionCompat;
930 decompType [cp] = DecompositionFraction;
933 decompType [cp] = DecompositionFont;
936 decompType [cp] = DecompositionCircle;
939 decompType [cp] = DecompositionSquare;
942 decompType [cp] = DecompositionWide;
945 decompType [cp] = DecompositionNarrow;
948 decompType [cp] = DecompositionVertical;
951 throw new Exception ("Support NFKD type : " + decomp);
955 decompType [cp] = DecompositionCanonical;
956 decomp = idx < 0 ? decomp : decomp.Substring (decomp.IndexOf ('>') + 2);
957 if (decomp.Length > 0) {
959 string [] velems = decomp.Split (' ');
960 int didx = decompValues.Count;
961 decompIndex [cp] = didx;
962 foreach (string v in velems)
963 decompValues.Add (int.Parse (v, NumberStyles.HexNumber));
964 decompLength [cp] = velems.Length;
966 // [decompType] -> this_cp
967 int targetCP = (int) decompValues [didx];
968 // for "(x)" it specially maps to 'x' .
969 // FIXME: check if it is sane
970 if (velems.Length == 3 &&
971 (int) decompValues [didx] == '(' &&
972 (int) decompValues [didx + 2] == ')')
973 targetCP = (int) decompValues [didx + 1];
974 // special: 0x215F "1/"
975 else if (cp == 0x215F)
977 else if (velems.Length > 1 &&
978 (targetCP < 0x4C00 || 0x9FBB < targetCP))
979 // skip them, except for CJK ideograph compat
983 Hashtable entry = (Hashtable) nfkdMap [targetCP];
985 entry = new Hashtable ();
986 nfkdMap [targetCP] = entry;
988 entry [(byte) decompType [cp]] = cp;
992 if (values [5].Length > 0)
993 decimalValue [cp] = decimal.Parse (values [5]);
994 else if (values [6].Length > 0)
995 decimalValue [cp] = decimal.Parse (values [6]);
996 else if (values [7].Length > 0) {
997 string decstr = values [7];
998 idx = decstr.IndexOf ('/');
999 if (cp == 0x215F) // special. "1/"
1000 decimalValue [cp] = 0x1;
1004 decimal.Parse (decstr.Substring (0, idx))
1005 / decimal.Parse (decstr.Substring (idx + 1));
1006 else if (decstr [0] == '(' &&
1007 decstr [decstr.Length - 1] == ')')
1010 decimal.Parse (decstr.Substring (1, decstr.Length - 2));
1011 else if (decstr [decstr.Length - 1] == '.')
1014 decimal.Parse (decstr.Substring (0, decstr.Length - 1));
1016 decimalValue [cp] = decimal.Parse (decstr);
1020 void ParseDerivedCoreProperties (string filename)
1023 using (StreamReader file =
1024 new StreamReader (filename)) {
1025 for (int line = 1; file.Peek () >= 0; line++) {
1027 ProcessDerivedCorePropLine (file.ReadLine ());
1028 } catch (Exception) {
1029 Console.Error.WriteLine ("**** At line " + line);
1036 void ProcessDerivedCorePropLine (string s)
1038 int idx = s.IndexOf ('#');
1040 s = s.Substring (0, idx);
1041 idx = s.IndexOf (';');
1044 string cpspec = s.Substring (0, idx);
1045 idx = cpspec.IndexOf ("..");
1046 NumberStyles nf = NumberStyles.HexNumber |
1047 NumberStyles.AllowTrailingWhite;
1048 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1049 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
1050 string value = s.Substring (cpspec.Length + 1).Trim ();
1053 if (cp > char.MaxValue)
1058 for (int x = cp; x <= cpEnd; x++)
1059 isUppercase [x] = true;
1064 void ParseScripts (string filename)
1066 ArrayList cyrillic = new ArrayList ();
1067 ArrayList gurmukhi = new ArrayList ();
1068 ArrayList gujarati = new ArrayList ();
1069 ArrayList georgian = new ArrayList ();
1070 ArrayList thaana = new ArrayList ();
1072 using (StreamReader file =
1073 new StreamReader (filename)) {
1074 while (file.Peek () >= 0) {
1075 string s = file.ReadLine ();
1076 int idx = s.IndexOf ('#');
1078 s = s.Substring (0, idx);
1079 idx = s.IndexOf (';');
1083 string cpspec = s.Substring (0, idx);
1084 idx = cpspec.IndexOf ("..");
1085 NumberStyles nf = NumberStyles.HexNumber |
1086 NumberStyles.AllowTrailingWhite;
1087 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1088 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
1089 string value = s.Substring (cpspec.Length + 1).Trim ();
1092 if (cp > char.MaxValue)
1097 for (int x = cp; x <= cpEnd; x++)
1098 if (!IsIgnorable (x))
1099 cyrillic.Add ((char) x);
1102 for (int x = cp; x <= cpEnd; x++)
1103 if (!IsIgnorable (x))
1104 gurmukhi.Add ((char) x);
1107 for (int x = cp; x <= cpEnd; x++)
1108 if (!IsIgnorable (x))
1109 gujarati.Add ((char) x);
1112 for (int x = cp; x <= cpEnd; x++)
1113 if (!IsIgnorable (x))
1114 georgian.Add ((char) x);
1117 for (int x = cp; x <= cpEnd; x++)
1118 if (!IsIgnorable (x))
1119 thaana.Add ((char) x);
1124 cyrillic.Sort (UCAComparer.Instance);
1125 gurmukhi.Sort (UCAComparer.Instance);
1126 gujarati.Sort (UCAComparer.Instance);
1127 georgian.Sort (UCAComparer.Instance);
1128 thaana.Sort (UCAComparer.Instance);
1129 orderedCyrillic = (char []) cyrillic.ToArray (typeof (char));
1130 orderedGurmukhi = (char []) gurmukhi.ToArray (typeof (char));
1131 orderedGujarati = (char []) gujarati.ToArray (typeof (char));
1132 orderedGeorgian = (char []) georgian.ToArray (typeof (char));
1133 orderedThaana = (char []) thaana.ToArray (typeof (char));
1136 void ParseJISOrder (string filename)
1138 using (StreamReader file =
1139 new StreamReader (filename)) {
1140 while (file.Peek () >= 0) {
1141 string s = file.ReadLine ();
1142 int idx = s.IndexOf ('#');
1144 s = s.Substring (0, idx).Trim ();
1147 idx = s.IndexOf (' ');
1150 // They start with "0x" so cut them out.
1151 int jis = int.Parse (s.Substring (2, idx), NumberStyles.HexNumber);
1152 int cp = int.Parse (s.Substring (idx + 3).Trim (), NumberStyles.HexNumber);
1153 jisJapanese.Add (new JISCharacter (cp, jis));
1158 void ParseCJK (string zhXML, string jaXML, string koXML)
1160 XmlDocument doc = new XmlDocument ();
1161 doc.XmlResolver = null;
1168 // Chinese Simplified
1171 offset = 0;//char.MaxValue - arr.Length;
1173 s = doc.SelectSingleNode ("/ldml/collations/collation[@type='pinyin']/rules/pc").InnerText;
1175 foreach (char c in s) {
1177 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1179 arr [(int) c - offset] = (ushort) v++;
1185 // Chinese Traditional
1188 offset = 0;//char.MaxValue - arr.Length;
1189 s = doc.SelectSingleNode ("/ldml/collations/collation[@type='stroke']/rules/pc").InnerText;
1191 foreach (char c in s) {
1193 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1195 arr [(int) c - offset] = (ushort) v++;
1204 offset = 0;//char.MaxValue - arr.Length;
1206 s = doc.SelectSingleNode ("/ldml/collations/collation/rules/pc").InnerText;
1208 foreach (char c in s) {
1210 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1212 arr [(int) c - offset] = (ushort) v++;
1219 // Korean weight is somewhat complex. It first shifts
1220 // Hangul category from 52-x to 80-x (they are anyways
1221 // computed). CJK ideographs are placed at secondary
1222 // weight, like XX YY 01 zz 01, where XX and YY are
1223 // corresponding "reset" value and zz is 41,43,45...
1225 // Unlike chs,cht and ja, Korean value is a combined
1226 // ushort which is computed as category
1230 offset = 0;//char.MaxValue - arr.Length;
1232 foreach (XmlElement reset in doc.SelectNodes ("/ldml/collations/collation/rules/reset")) {
1233 XmlElement sc = (XmlElement) reset.NextSibling;
1234 // compute "category" and "level 1" for the
1235 // target "reset" Hangul syllable
1236 char rc = reset.InnerText [0];
1237 int ri = ((int) rc - 0xAC00) + 1;
1239 ((ri / 254) * 256 + (ri % 254) + 2);
1240 // Place the characters after the target.
1243 foreach (char c in s) {
1244 arr [(int) c - offset] = p;
1245 cjkKOlv2 [(int) c - offset] = (byte) v;
1255 void FillIgnorables ()
1257 for (int i = 0; i <= char.MaxValue; i++) {
1258 if (Char.GetUnicodeCategory ((char) i) ==
1259 UnicodeCategory.OtherNotAssigned)
1261 if (IsIgnorable (i))
1262 ignorableFlags [i] |= 1;
1263 if (IsIgnorableSymbol (i))
1264 ignorableFlags [i] |= 2;
1265 if (IsIgnorableNonSpacing (i))
1266 ignorableFlags [i] |= 4;
// Applies hand-written corrections on top of the parsed tables: secondary
// weights for numbers, cleared decompositions for U+FE31 / U+FE32, fixed
// diacritical values for two U+32xx ranges, and renamed / removed entries in
// sortableCharNames.
1270 void ModifyParsedValues ()
1272 // number, secondary weights
// numberSecondaryWeightBounds is read as consecutive (start, end) pairs; end is
// exclusive. Each pair gets the next `weight` value, assigned only to code
// points for which Char.IsNumber is true.
// NOTE(review): the declaration / initial value of `weight` is not visible in
// this listing.
1274 int [] numarr = numberSecondaryWeightBounds;
1275 for (int i = 0; i < numarr.Length; i += 2, weight++)
1276 for (int cp = numarr [i]; cp < numarr [i + 1]; cp++)
1277 if (Char.IsNumber ((char) cp))
1278 diacritical [cp] = weight;
1280 // Modify some decomposition equivalence
// Zeroing type, index and length removes the decomposition data for these two
// code points.
1281 decompType [0xFE31] = 0;
1282 decompIndex [0xFE31] = 0;
1283 decompLength [0xFE31] = 0;
1284 decompType [0xFE32] = 0;
1285 decompIndex [0xFE32] = 0;
1286 decompLength [0xFE32] = 0;
1288 // Korean parens numbers
1289 for (int i = 0x3200; i <= 0x321C; i++)
1290 diacritical [i] = 0xA;
1291 for (int i = 0x3260; i <= 0x327B; i++)
1292 diacritical [i] = 0xC;
1294 // Update name part of named characters
// Entries are DictionaryEntry (code point -> name). A non-null `renamed`
// replaces the entry's name; some code points are removed from the list instead.
1295 for (int i = 0; i < sortableCharNames.Count; i++) {
1296 DictionaryEntry de =
1297 (DictionaryEntry) sortableCharNames [i];
1298 int cp = (int) de.Key;
1299 string renamed = null;
1301 case 0x2101: renamed = "A_1"; break;
1302 case 0x33C3: renamed = "A_2"; break;
1303 case 0x2105: renamed = "C_1"; break;
1304 case 0x2106: renamed = "C_2"; break;
1305 case 0x211E: renamed = "R1"; break;
1306 case 0x211F: renamed = "R2"; break;
1307 // Remove some of them!
// NOTE(review): RemoveAt (i) inside an ascending index loop skips the element
// that slides into slot i unless i is decremented; that adjustment is not
// visible in this listing — confirm it exists.
1318 sortableCharNames.RemoveAt (i);
1322 if (renamed != null)
1323 sortableCharNames [i] =
1324 new DictionaryEntry (cp, renamed);
// Fills `map` with sort-key entries, one character class / script at a time.
// Each section seeds fillIndex [category] with a starting weight and then calls
// the Add*Map helpers. Those helpers leave a character untouched when it is
// ignorable or already defined, so earlier sections take precedence over
// later ones.
1328 void GenerateCore ()
1332 #region Specially ignored // 01
1333 // This will raise "Defined" flag up.
1334 foreach (char c in specialIgnore)
1335 map [(int) c] = new CharMapEntry (0, 0, 0);
1339 #region Variable weights
1340 // Controls : 06 03 - 06 3D
1342 for (int i = 0; i < 65536; i++) {
1343 if (IsIgnorable (i))
1346 uc = Char.GetUnicodeCategory (c);
1347 // NEL is whitespace but not ignored here.
// && binds tighter than ||: this selects non-whitespace control characters,
// plus U+0085 unconditionally.
1348 if (uc == UnicodeCategory.Control &&
1349 !Char.IsWhiteSpace (c) || c == '\u0085')
1350 AddCharMap (c, 6, 1);
1354 fillIndex [6] = 0x80;
1355 AddCharMapGroup ('\'', 6, 1, 0);
1356 AddCharMap ('\uFE63', 6, 1);
1358 // Hyphen/Dash : 06 81 - 06 90
1359 for (int i = 0; i < char.MaxValue; i++) {
1360 if (!IsIgnorable (i) &&
1361 Char.GetUnicodeCategory ((char) i) ==
1362 UnicodeCategory.DashPunctuation) {
1363 AddCharMapGroup2 ((char) i, 6, 1, 0);
1365 // SPECIAL: add 2027 and 2043
1366 // Maybe they are regarded the
1367 // same hyphens in "central"
1369 AddCharMap ('\u2027', 6, 1);
1370 AddCharMap ('\u2043', 6, 1);
1375 // Arabic variable weight chars 06 A0 -
1376 fillIndex [6] = 0xA0;
1378 for (int i = 0x64B; i <= 0x650; i++)
1379 AddArabicCharMap ((char) i);
1381 AddCharMapGroup ('\u0652', 6, 1, 0);
1383 AddCharMapGroup ('\u0651', 6, 1, 0);
1387 #region Nonspacing marks // 01
1388 // FIXME: 01 03 - 01 B6 ... annoyance :(
1390 // Combining diacritical marks: 01 DC -
// For category 1, AddCharMap stores fillIndex in the level 2 slot, so the
// values seeded below are level 2 weights.
1392 fillIndex [0x1] = 0x41;
1393 for (int i = 0x030E; i <= 0x0326; i++)
1394 if (!IsIgnorable (i))
1395 AddCharMap ((char) i, 0x1, 1);
1396 for (int i = 0x0329; i <= 0x0334; i++)
1397 if (!IsIgnorable (i))
1398 AddCharMap ((char) i, 0x1, 1);
1399 for (int i = 0x0339; i <= 0x0341; i++)
1400 if (!IsIgnorable (i))
1401 AddCharMap ((char) i, 0x1, 1);
1402 fillIndex [0x1] = 0x72;
1403 for (int i = 0x0346; i <= 0x0348; i++)
1404 if (!IsIgnorable (i))
1405 AddCharMap ((char) i, 0x1, 1);
1406 for (int i = 0x02BE; i <= 0x02BF; i++)
1407 if (!IsIgnorable (i))
1408 AddCharMap ((char) i, 0x1, 1);
1409 for (int i = 0x02C1; i <= 0x02C5; i++)
1410 if (!IsIgnorable (i))
1411 AddCharMap ((char) i, 0x1, 1);
1412 for (int i = 0x02CE; i <= 0x02CF; i++)
1413 if (!IsIgnorable (i))
1414 AddCharMap ((char) i, 0x1, 1);
1415 for (int i = 0x02D1; i <= 0x02D3; i++)
1416 if (!IsIgnorable (i))
1417 AddCharMap ((char) i, 0x1, 1);
1418 AddCharMap ('\u02DE', 0x1, 1);
1419 for (int i = 0x02E4; i <= 0x02E9; i++)
1420 if (!IsIgnorable (i))
1421 AddCharMap ((char) i, 0x1, 1);
1423 // LAMESPEC: It should not stop at '\u20E1'. There are
1424 // a few more characters (that however results in
1425 // overflow of level 2 unless we start before 0xDD).
1426 fillIndex [0x1] = 0xDC;
1427 for (int i = 0x20d0; i <= 0x20e1; i++)
1428 AddCharMap ((char) i, 0x1, 1);
1432 #region Whitespaces // 07 03 -
1433 fillIndex [0x7] = 0x2;
1434 AddCharMap (' ', 0x7, 2);
1435 AddCharMap ('\u00A0', 0x7, 1);
1436 for (int i = 9; i <= 0xD; i++)
1437 AddCharMap ((char) i, 0x7, 1);
1438 for (int i = 0x2000; i <= 0x200B; i++)
1439 AddCharMap ((char) i, 0x7, 1);
1441 fillIndex [0x7] = 0x17;
1442 AddCharMapGroup ('\u2028', 0x7, 1, 0);
1443 AddCharMapGroup ('\u2029', 0x7, 1, 0);
1445 // Characters which used to represent layout control.
1446 // LAMESPEC: Windows developers seem to have thought
1447 // that those characters are kind of whitespaces,
1448 // while they aren't.
1449 AddCharMap ('\u2422', 0x7, 1, 0); // blank symbol
1450 AddCharMap ('\u2423', 0x7, 1, 0); // open box
1453 // FIXME: 09 should be more complete.
1454 fillIndex [0x9] = 2;
1456 for (int cp = 0x2300; cp <= 0x237A; cp++)
1457 AddCharMap ((char) cp, 0x9, 1, 0);
// Arrows: the level 1 weight is 0xD8 plus the index stored in arrowValues;
// arrowLv2 supplies the level 2 weight for that index.
1460 byte [] arrowLv2 = new byte [] {0, 3, 3, 3, 3, 3, 3, 3, 3};
1461 foreach (DictionaryEntry de in arrowValues) {
1462 int idx = (int) de.Value;
1463 int cp = (int) de.Key;
1464 if (map [cp].Defined)
1466 fillIndex [0x9] = (byte) (0xD8 + idx);
1467 AddCharMapGroup ((char) cp, 0x9, 0, arrowLv2 [idx]);
// Box drawing: same scheme as arrows, based at 0xE5.
// NOTE(review): the statement that initializes boxLv2 [i] is not visible in
// this listing.
1471 byte [] boxLv2 = new byte [128];
1472 for (int i = 0; i < boxLv2.Length; i++)
1474 foreach (DictionaryEntry de in boxValues) {
1475 int cp = (int) de.Key;
1476 int idx = (int) de.Value;
1477 if (map [cp].Defined)
1479 fillIndex [0x9] = (byte) (0xE5 + idx);
1480 AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [idx]);
1483 // Some special characters (slanted)
1484 fillIndex [0x9] = 0xF4;
1485 AddCharMap ('\u2571', 0x9, 3);
1486 AddCharMap ('\u2572', 0x9, 3);
1487 AddCharMap ('\u2573', 0x9, 3);
1489 // FIXME: implement 0A
1491 fillIndex [0xA] = 2;
1492 // byte currency symbols
1493 for (int cp = 0; cp < 0x100; cp++) {
1494 uc = Char.GetUnicodeCategory ((char) cp);
1495 if (!IsIgnorable (cp) &&
1496 uc == UnicodeCategory.CurrencySymbol &&
1498 AddCharMapGroup ((char) cp, 0xA, 1, 0);
1500 // byte other symbols
1501 for (int cp = 0; cp < 0x100; cp++) {
1503 continue; // SPECIAL: skip FIXME: why?
1504 uc = Char.GetUnicodeCategory ((char) cp);
1505 if (!IsIgnorable (cp) &&
1506 uc == UnicodeCategory.OtherSymbol)
1507 AddCharMapGroup ((char) cp, 0xA, 1, 0);
1510 fillIndex [0xA] = 0x2F; // FIXME: it won't be needed
1511 for (int cp = 0x2600; cp <= 0x2613; cp++)
1512 AddCharMap ((char) cp, 0xA, 1, 0);
1514 for (int cp = 0x2620; cp <= 0x2770; cp++)
1515 if (Char.IsSymbol ((char) cp))
1516 AddCharMap ((char) cp, 0xA, 1, 0);
1518 for (int i = 0x2440; i < 0x2460; i++)
1519 AddCharMap ((char) i, 0xA, 1, 0);
1523 #region Numbers // 0C 02 - 0C E1
1524 fillIndex [0xC] = 2;
1526 // 9F8 : Bengali "one less than the denominator"
1527 AddCharMap ('\u09F8', 0xC, 1);
// Collect the non-ignorable number characters outside 0x3190-0x32C0, pair each
// with its decimalValue entry, and sort the pairs with
// DecimalDictionaryValueComparer before assigning weights.
1529 ArrayList numbers = new ArrayList ();
1530 for (int i = 0; i < 65536; i++)
1531 if (!IsIgnorable (i) &&
1532 Char.IsNumber ((char) i) &&
1533 (i < 0x3190 || 0x32C0 < i)) // they are CJK characters
1536 ArrayList numberValues = new ArrayList ();
1537 foreach (int i in numbers)
1538 numberValues.Add (new DictionaryEntry (i, decimalValue [(char) i]));
1539 numberValues.Sort (DecimalDictionaryValueComparer.Instance);
1541 //foreach (DictionaryEntry de in numberValues)
1542 //Console.Error.WriteLine ("****** number {0:X04} : {1} {2}", de.Key, de.Value, decompType [(int) de.Key]);
1544 decimal prevValue = -1;
1545 foreach (DictionaryEntry de in numberValues) {
1546 int cp = (int) de.Key;
1547 decimal currValue = (decimal) de.Value;
1548 bool addnew = false;
1549 if (prevValue < currValue &&
1550 prevValue - (int) prevValue == 0 &&
1554 // Process Hangzhou and Roman numbers
1556 // There are some SPECIAL cases.
1557 if (currValue != 4) // no increment for 4
// The three bases below are offset by (prevValue - 1): 0x2170, 0x2160, 0x3021.
1561 xcp = (int) prevValue + 0x2170 - 1;
1562 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1563 xcp = (int) prevValue + 0x2160 - 1;
1564 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1565 fillIndex [0xC] += 2;
1566 xcp = (int) prevValue + 0x3021 - 1;
1567 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1570 if (prevValue < currValue)
1571 prevValue = currValue;
1572 if (map [cp].Defined)
1574 // HangZhou and Roman are added later
1576 else if (0x3021 <= cp && cp < 0x302A
1577 || 0x2160 <= cp && cp < 0x216A
1578 || 0x2170 <= cp && cp < 0x217A)
1581 if (cp == 0x215B) // FIXME: why?
1582 fillIndex [0xC] += 2;
1583 else if (cp == 0x3021) // FIXME: why?
1585 AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp]);
1587 if (addnew || cp <= '9') {
// cp - 0x31 is the distance from the digit '1'; the variants below are
// located by adding it to the six base code points.
1589 if (1 <= currValue && currValue <= 10) {
1590 xcp = cp - 0x31 + 0x2776;
1591 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1592 xcp = cp - 0x31 + 0x2780;
1593 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1594 xcp = cp - 0x31 + 0x278A;
1595 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1597 if (1 <= currValue && currValue <= 20) {
1598 xcp = cp - 0x31 + 0x2460;
1599 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1600 xcp = cp - 0x31 + 0x2474;
1601 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1602 xcp = cp - 0x31 + 0x2488;
1603 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1607 if (cp != 0x09E7 && cp != 0x09EA)
1610 // Add special cases that are not regarded as
1611 // numbers in UnicodeCategory speak.
1614 AddCharMapGroup ('\u01BD', 0xC, 0, 0);
1615 AddCharMapGroup ('\u01BC', 0xC, 1, 0);
1617 else if (cp == '6') // FIXME: why?
1622 fillIndex [0xC] = 0xFF;
1623 AddCharMap ('\u221E', 0xC, 1);
1626 #region Letters and NonSpacing Marks (general)
1628 // ASCII Latin alphabets
1629 for (int i = 0; i < alphabets.Length; i++)
1630 AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]);
1633 // non-ASCII Latin alphabets
1634 // FIXME: there is no such characters that are placed
1635 // *after* "alphabets" array items. This is nothing
1636 // more than a hack that creates dummy weight for
1637 // primary characters.
1638 for (int i = 0x0080; i < 0x0300; i++) {
1639 if (!Char.IsLetter ((char) i))
1641 // For those Latin Letters which has NFKD are
1642 // not added as independent primary character.
1643 if (decompIndex [i] != 0)
1646 // 1.some alphabets have primarily
1647 // equivalent ASCII alphabets.
1648 // 2.some have independent primary weights,
1649 // but inside a-to-z range.
1650 // 3.there are some expanded characters that
1651 // are not part of Unicode Standard NFKD.
1653 // 1. skipping them does not make sense
1654 // case 0xD0: case 0xF0: case 0x131: case 0x138:
1655 // case 0x184: case 0x185: case 0x186: case 0x189:
1656 // case 0x18D: case 0x18E: case 0x18F: case 0x190:
1657 // case 0x194: case 0x195: case 0x196: case 0x19A:
1658 // case 0x19B: case 0x19C:
1659 // 2. skipping them does not make sense
1660 // case 0x14A: // Ng
1661 // case 0x14B: // ng
1665 case 0xDE: // Icelandic Thorn
1666 case 0xFE: // Icelandic Thorn
1667 case 0xDF: // German ss
1668 case 0xFF: // German ss
1669 // not classified yet
1670 // case 0x1A6: case 0x1A7: case 0x1A8: case 0x1A9:
1671 // case 0x1AA: case 0x1B1: case 0x1B7: case 0x1B8:
1672 // case 0x1B9: case 0x1BA: case 0x1BB: case 0x1BF:
1673 // case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3:
1677 AddCharMapGroup ((char) i, 0xE, 1, 0);
// Category 0xF. The first two ranges both start from fillIndex 2.
1681 fillIndex [0xF] = 02;
1682 for (int i = 0x0380; i < 0x0390; i++)
1683 if (Char.IsLetter ((char) i))
1684 AddLetterMap ((char) i, 0xF, 1);
1685 fillIndex [0xF] = 02;
1686 for (int i = 0x0391; i < 0x03CF; i++)
1687 if (Char.IsLetter ((char) i))
1688 AddLetterMap ((char) i, 0xF, 1);
1689 fillIndex [0xF] = 0x40;
1690 for (int i = 0x03D0; i < 0x0400; i++)
1691 if (Char.IsLetter ((char) i))
1692 AddLetterMap ((char) i, 0xF, 1);
1694 // Cyrillic - character name order
1695 fillIndex [0x10] = 0x6;
1697 for (int i = 0; i < orderedCyrillic.Length; i++)
1698 Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]);
1700 // table which is mostly from UCA DUCET.
1701 for (int i = 0; i < orderedCyrillic.Length; i++) {
1702 char c = Char.ToUpper (orderedCyrillic [i], CultureInfo.InvariantCulture);
1703 if (!IsIgnorable ((int) c) &&
1705 Char.IsLetter (c)) {
1706 AddLetterMap (c, 0x10, 0);
1707 fillIndex [0x10] += 3;
1711 for (int i = 0x0460; i < 0x0481; i++) {
1712 if (Char.IsLetter ((char) i)) {
1713 AddLetterMap ((char) i, 0x10, 0);
1714 fillIndex [0x10] += 3;
1719 for (int i = 0x0400; i <= 0x0486; i++) {
1720 if (!Char.IsLetter ((char) i)) {
1721 // AddCharMap ((char) i, 0x1, 1);
1724 if (!cyrillicLetterPrimaryValues.ContainsKey (i)) {
1725 Console.Error.WriteLine ("no value for {0:x04}", i);
1729 (byte) cyrillicLetterPrimaryValues [i];
1730 AddLetterMap ((char) i, 0x10, 0);
1735 fillIndex [0x11] = 0x3;
1736 for (int i = 0x0531; i < 0x0586; i++)
1737 if (Char.IsLetter ((char) i))
1738 AddLetterMap ((char) i, 0x11, 1);
1742 fillIndex [0x12] = 0x3;
1743 for (int i = 0x05D0; i < 0x05FF; i++)
1744 if (Char.IsLetter ((char) i))
1745 AddLetterMap ((char) i, 0x12, 1);
1747 fillIndex [0x1] = 0x3;
1748 for (int i = 0x0591; i <= 0x05C2; i++)
1750 AddCharMap ((char) i, 0x1, 1);
1753 fillIndex [0x1] = 0x8E;
1754 fillIndex [0x13] = 0x3;
1755 for (int i = 0x0621; i <= 0x064A; i++) {
1757 if (Char.GetUnicodeCategory ((char) i)
1758 != UnicodeCategory.OtherLetter) {
1759 // FIXME: arabic nonspacing marks are
1760 // in different order.
1761 AddCharMap ((char) i, 0x1, 1);
1764 // map [i] = new CharMapEntry (0x13,
1765 // (byte) arabicLetterPrimaryValues [i], 1);
1767 (byte) arabicLetterPrimaryValues [i];
1768 AddLetterMap ((char) i, 0x13, 0);
1770 fillIndex [0x13] = 0x84;
1771 for (int i = 0x0674; i < 0x06D6; i++)
1772 if (Char.IsLetter ((char) i))
1773 AddLetterMap ((char) i, 0x13, 1);
1776 // FIXME: it does seem straight codepoint mapping.
1777 fillIndex [0x14] = 04;
1778 for (int i = 0x0901; i < 0x0905; i++)
1779 if (!IsIgnorable (i))
1780 AddLetterMap ((char) i, 0x14, 2);
1781 fillIndex [0x14] = 0xB;
1782 for (int i = 0x0905; i < 0x093A; i++)
1783 if (Char.IsLetter ((char) i))
1784 AddLetterMap ((char) i, 0x14, 4);
1785 for (int i = 0x093E; i < 0x094F; i++)
1786 if (!IsIgnorable (i))
1787 AddLetterMap ((char) i, 0x14, 2);
1791 fillIndex [0x15] = 02;
1792 for (int i = 0x0980; i < 0x9FF; i++) {
1793 if (IsIgnorable (i))
1796 fillIndex [0x15] = 0x3B;
1797 switch (Char.GetUnicodeCategory ((char) i)) {
1798 case UnicodeCategory.NonSpacingMark:
1799 case UnicodeCategory.DecimalDigitNumber:
1800 case UnicodeCategory.OtherNumber:
1803 AddLetterMap ((char) i, 0x15, 1);
1806 fillIndex [0x1] = 0x3;
1807 for (int i = 0x0981; i < 0x0A00; i++)
1808 if (Char.GetUnicodeCategory ((char) i) ==
1809 UnicodeCategory.NonSpacingMark)
1810 AddCharMap ((char) i, 0x1, 1);
1812 // Gurmukhi. orderedGurmukhi is from UCA
1813 // FIXME: it does not look equivalent to UCA.
1814 fillIndex [0x1] = 03;
1815 fillIndex [0x16] = 02;
1816 for (int i = 0; i < orderedGurmukhi.Length; i++) {
1817 char c = orderedGurmukhi [i];
1818 if (IsIgnorable ((int) c))
1820 if (!Char.IsLetter (c)) {
1821 AddLetterMap (c, 0x1, 1);
1824 if (c == '\u0A3C' || c == '\u0A4D' ||
1825 '\u0A66' <= c && c <= '\u0A71')
1827 AddLetterMap (c, 0x16, 4);
1830 // Gujarati. orderedGujarati is from UCA
1831 fillIndex [0x17] = 02;
1832 for (int i = 0; i < orderedGujarati.Length; i++)
1833 AddLetterMap (orderedGujarati [i], 0x17, 4);
1836 fillIndex [0x18] = 02;
1837 for (int i = 0x0B00; i < 0x0B7F; i++) {
1838 switch (Char.GetUnicodeCategory ((char) i)) {
1839 case UnicodeCategory.NonSpacingMark:
1840 case UnicodeCategory.DecimalDigitNumber:
1843 AddLetterMap ((char) i, 0x18, 1);
1847 fillIndex [0x19] = 2;
1848 AddCharMap ('\u0BD7', 0x19, 0);
1849 fillIndex [0x19] = 0xA;
// NOTE(review): the start value (0x0BD7) is greater than the bound (0x0B94),
// so this loop body never runs as written — confirm the intended range.
1851 for (int i = 0x0BD7; i < 0x0B94; i++)
1852 if (Char.IsLetter ((char) i))
1853 AddCharMap ((char) i, 0x19, 2);
1855 fillIndex [0x19] = 0x24;
1856 AddCharMap ('\u0B94', 0x19, 0);
1857 fillIndex [0x19] = 0x26;
1858 // The array for Tamil consonants is a constant.
1859 // Windows have almost similar sequence to TAM from
1860 // tamilnet but a bit different in Grantha.
1861 for (int i = 0; i < orderedTamilConsonants.Length; i++)
1862 AddLetterMap (orderedTamilConsonants [i], 0x19, 4);
1864 fillIndex [0x19] = 0x82;
1865 for (int i = 0x0BBE; i < 0x0BCD; i++)
1866 if (Char.GetUnicodeCategory ((char) i) ==
1867 UnicodeCategory.SpacingCombiningMark
1869 AddLetterMap ((char) i, 0x19, 2);
1872 fillIndex [0x1A] = 0x4;
1873 for (int i = 0x0C00; i < 0x0C62; i++) {
1874 if (i == 0x0C55 || i == 0x0C56)
1876 AddCharMap ((char) i, 0x1A, 3);
// U+0C60 / U+0C61 are inserted right after U+0C0B / U+0C0C respectively.
1877 char supp = (i == 0x0C0B) ? '\u0C60':
1878 i == 0x0C0C ? '\u0C61' : char.MinValue;
1879 if (supp == char.MinValue)
1881 AddCharMap (supp, 0x1A, 3);
1885 fillIndex [0x1B] = 4;
1886 for (int i = 0x0C80; i < 0x0CE5; i++) {
1887 if (i == 0x0CD5 || i == 0x0CD6)
1889 AddCharMap ((char) i, 0x1B, 3);
1893 fillIndex [0x1C] = 2;
1894 for (int i = 0x0D02; i < 0x0D61; i++)
1895 // FIXME: I avoided MSCompatUnicodeTable usage
1896 // here (it results in recursion). So check if
1897 // using NonSpacingMark makes sense or not.
1898 if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark)
1899 // if (!MSCompatUnicodeTable.IsIgnorable ((char) i))
1900 AddCharMap ((char) i, 0x1C, 1);
1902 // Thai ... note that it breaks 0x1E wall after E2B!
1903 // Also, all Thai characters have level 2 value 3.
1904 fillIndex [0x1E] = 2;
1905 for (int i = 0xE44; i < 0xE48; i++)
1906 AddCharMap ((char) i, 0x1E, 1, 3);
1907 for (int i = 0xE01; i < 0xE2B; i++)
1908 AddCharMap ((char) i, 0x1E, 6, 0);
1909 fillIndex [0x1F] = 5;
1910 for (int i = 0xE2B; i < 0xE30; i++)
1911 AddCharMap ((char) i, 0x1F, 6, 0);
1912 for (int i = 0xE30; i < 0xE3B; i++)
1913 AddCharMap ((char) i, 0x1F, 1, 3);
1914 // some Thai characters remains.
1915 char [] specialThai = new char [] {'\u0E45', '\u0E46',
1916 '\u0E4E', '\u0E4F', '\u0E5A', '\u0E5B'};
1917 foreach (char c in specialThai)
1918 AddCharMap (c, 0x1F, 1);
1921 fillIndex [0x1F] = 2;
1922 for (int i = 0xE80; i < 0xEDF; i++)
1923 if (Char.IsLetter ((char) i))
1924 AddCharMap ((char) i, 0x1F, 1);
1926 // Georgian. orderedGeorgian is from UCA DUCET.
1927 fillIndex [0x21] = 5;
1928 for (int i = 0; i < orderedGeorgian.Length; i++)
1929 AddLetterMap (orderedGeorgian [i], 0x21, 5);
// Category 0x22: a 9 x 5 grid walk starting at U+3041. kanaLines [gyo] is the
// number of consecutive code points per cell in that row; it is used both as
// the stride between cells and as the `voices` argument of AddKanaMap.
1932 fillIndex [0x22] = 2;
1933 int kanaOffset = 0x3041;
1934 byte [] kanaLines = new byte [] {2, 2, 2, 2, 1, 3, 1, 2, 1};
1936 for (int gyo = 0; gyo < 9; gyo++) {
1937 for (int dan = 0; dan < 5; dan++) {
1938 if (gyo == 7 && dan % 2 == 1) {
1941 kanaOffset -= 2; // There is no space for yi and ye.
1944 int cp = kanaOffset + dan * kanaLines [gyo];
1945 // small lines (a-gyo, ya-gyo)
1946 if (gyo == 0 || gyo == 7) {
1947 AddKanaMap (cp, 1); // small
1948 AddKanaMap (cp + 1, 1);
1951 AddKanaMap (cp, kanaLines [gyo]);
1955 // add small 'Tsu' (before normal one)
1956 AddKanaMap (0x3063, 1);
1960 fillIndex [0x22] += 3;
1961 kanaOffset += 5 * kanaLines [gyo];
1964 // Wa-gyo is almost special, so I just manually add.
// Each character is added together with the one 0x60 code points above it.
1965 AddLetterMap ((char) 0x308E, 0x22, 0);
1966 AddLetterMap ((char) (0x308E + 0x60), 0x22, 0);
1967 AddLetterMap ((char) 0x308F, 0x22, 0);
1968 AddLetterMap ((char) (0x308F + 0x60), 0x22, 0);
1970 AddLetterMap ((char) 0x3090, 0x22, 0);
1971 AddLetterMap ((char) (0x3090 + 0x60), 0x22, 0);
1972 fillIndex [0x22] += 2;
1973 // no "Wu" in Japanese.
1974 AddLetterMap ((char) 0x3091, 0x22, 0);
1975 AddLetterMap ((char) (0x3091 + 0x60), 0x22, 0);
1977 AddLetterMap ((char) 0x3092, 0x22, 0);
1978 AddLetterMap ((char) (0x3092 + 0x60), 0x22, 0);
1980 fillIndex [0x22] = 0x80;
1981 AddLetterMap ((char) 0x3093, 0x22, 0);
1982 AddLetterMap ((char) (0x3093 + 0x60), 0x22, 0);
1984 // JIS Japanese square chars.
1985 fillIndex [0x22] = 0x97;
1986 jisJapanese.Sort (JISComparer.Instance);
1987 foreach (JISCharacter j in jisJapanese)
1988 AddCharMap ((char) j.CP, 0x22, 1);
1989 // non-JIS Japanese square chars.
1990 nonJisJapanese.Sort (NonJISComparer.Instance);
1991 foreach (NonJISCharacter j in nonJisJapanese)
1992 AddCharMap ((char) j.CP, 0x22, 1);
1995 fillIndex [0x23] = 0x02;
1996 for (int i = 0x3105; i <= 0x312C; i++)
1997 AddCharMap ((char) i, 0x23, 1);
1999 // Estrangela: ancient Syriac
2000 fillIndex [0x24] = 0x0B;
2001 // FIXME: is 0x71E really alternative form?
2002 ArrayList syriacAlternatives = new ArrayList (
2003 new int [] {0x714, 0x716, 0x71C, 0x71E, 0x724, 0x727});
2004 for (int i = 0x0710; i <= 0x072C; i++) {
2005 if (i == 0x0711) // NonSpacingMark
2007 if (syriacAlternatives.Contains (i))
2009 AddCharMap ((char) i, 0x24, 4);
// The alternative forms are placed 2 above the level 1 weight of the
// preceding code point (cp - 1).
2014 foreach (int cp in syriacAlternatives)
2015 map [cp] = new CharMapEntry (0x24,
2016 (byte) (map [cp - 1].Level1 + 2),
2020 // FIXME: it turned out that it does not look like UCA
2021 fillIndex [0x24] = 0x6E;
2022 for (int i = 0; i < orderedThaana.Length; i++) {
// NOTE(review): this tests the loop index i, not orderedThaana [i] — confirm
// whether the character itself was meant.
2023 if (IsIgnorableNonSpacing (i))
2025 AddCharMap (orderedThaana [i], 0x24, 2);
2029 // FIXME: Add more culture-specific letters (that are
2030 // not supported in Windows collation) here.
2032 // Surrogate ... they are computed.
2037 // Unlike UCA Windows Hangul sequence mixes Jongseong
2038 // with Choseong sequence as well as Jungseong,
2039 // adjusted to have the same primary weight for the
2040 // same base character. So it is impossible to compute
2043 // Here I introduce an ordered sequence of mixed
2044 // 'commands' and 'characters' that is similar to
2046 // - ',' increases primary weight.
2047 // - [A B] means a range, increasing index
2048 // - {A B} means a range, without increasing index
2049 // - '=' is no operation (it means the characters
2050 // of both sides have the same weight).
2051 // - '>' inserts a Hangul Syllable block that
2052 // contains 0x251 characters.
2053 // - '<' decreases the index
2054 // - '0'-'9' means skip count
2055 // - whitespaces are ignored
2058 string hangulSequence =
2059 + "\u1100=\u11A8 > \u1101=\u11A9 >"
2060 + "\u11C3, \u11AA, \u11C4, \u1102=\u11AB >"
2061 + "<{\u1113 \u1116}, \u3165,"
2062 + "\u11C5, \u11C6=\u3166,, \u11C7, \u11C8,"
2063 + "\u11AC, \u11C9, \u11AD, \u1103=\u11AE >"
2064 + "<\u1117, \u11CA, \u1104, \u11CB > \u1105 >"
2065 + "<{\u1118 \u111B}, \u11B0, [\u11CC \u11D0], \u11B1,"
2066 + "[\u11D1 \u11D2], \u11B2,"
2067 + "[\u11D3 \u11D5], \u11B3,"
2068 + "[\u11D6 \u11D7], \u11B4, \u11B5,"
2069 + "\u11B6=\u11D8, \u3140,, \u11D9, \u1106=\u11B7 >"
2070 + "<{\u111C \u111D}, [\u11DA \u11E2], \u1107=\u11B8 >"
2071 + "<{\u111E \u1120}, \u3172,, \u3173, \u11E3, \u1108 >"
2072 + "<{\u1121 \u112C}, \u11B9,,,,,,,,, [\u11E4 \u11E6],,"
2073 + "\u1109=\u11BA,,, \u3214=\u3274 <>"
2074 + "<{\u112D \u1133}, \u11E7,, [\u11E8 \u11E9],,"
2075 + "\u11EA,, \u110A=\u11BB,,, >"
2076 + "<{\u1134 \u1140}, \u317E,,,,,,, \u11EB,"
2077 + "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >"
2078 + "<{\u1141 \u114C}, \u11EE, \u11EC, \u11ED,,,,, "
2079 + "\u11F1,, \u11F2,,,"
2080 + "\u11EF,,, \u11F0, \u110C=\u11BD,, >"
2081 + "<\u114D, \u110D,, >"
2082 + "<{\u114E \u1151},, \u110E=\u11BE,, >"
2083 + "<{\u1152 \u1155},,, \u110F=\u11BF >"
2084 + "\u1110=\u11C0 > \u1111=\u11C1 >"
2085 + "<\u1156=\u1157, \u11F3, \u11F4, \u1112=\u11C2 >"
2086 + "<\u1158=\u1159=\u115F, \u3185, \u11F9,"
// Interpreter for hangulSequence, starting at category 0x52, index 2.
// hangulCat is passed by ref to IncrementSequentialIndex.
2090 byte hangulCat = 0x52;
2091 fillIndex [hangulCat] = 0x2;
2093 int syllableBlock = 0;
2094 for (int n = 0; n < hangulSequence.Length; n++) {
2095 char c = hangulSequence [n];
2097 if (Char.IsWhiteSpace (c))
2103 IncrementSequentialIndex (ref hangulCat);
2106 if (fillIndex [hangulCat] == 2)
2107 throw new Exception ("FIXME: handle it correctly (yes it is hacky, it is really unfortunate).");
2108 fillIndex [hangulCat]--;
2111 IncrementSequentialIndex (ref hangulCat);
// One syllable block = 0x15 * 0x1C consecutive code points from U+AC00.
2112 for (int l = 0; l < 0x15; l++)
2113 for (int v = 0; v < 0x1C; v++) {
2115 (char) (0xAC00 + syllableBlock * 0x1C * 0x15 + l * 0x1C + v), hangulCat, 0);
2116 IncrementSequentialIndex (ref hangulCat);
// Range forms: the bounds sit at n + 1 and n + 3 (open bracket, start,
// space, end, close bracket).
2121 start = hangulSequence [n + 1];
2122 end = hangulSequence [n + 3];
2123 for (int i = start; i <= end; i++) {
2124 AddCharMap ((char) i, hangulCat, 0);
2126 IncrementSequentialIndex (ref hangulCat);
2128 n += 4; // consumes 5 characters for this operation
2131 start = hangulSequence [n + 1];
2132 end = hangulSequence [n + 3];
2133 for (int i = start; i <= end; i++)
2134 AddCharMap ((char) i, hangulCat, 0);
2135 n += 4; // consumes 5 characters for this operation
2138 AddCharMap (c, hangulCat, 0);
// U+3200-U+32FF: pick one character out of the decomposition and give the
// code point that character's category with level 1 + 1.
2144 for (int i = 0x3200; i < 0x3300; i++) {
2145 if (IsIgnorable (i) || map [i].Defined)
2149 if (decompLength [i] == 4 &&
2150 decompValues [decompIndex [i]] == '(')
2151 ch = decompIndex [i] + 1;
2153 else if (decompLength [i] == 2 &&
2154 decompValues [decompIndex [i] + 1] == '\u1161')
2155 ch = decompIndex [i];
2156 else if (decompLength [i] == 1)
2157 ch = decompIndex [i];
2160 ch = decompValues [ch];
// Parsed as: ch < 0x1100 || (0x1200 < ch && ch < 0xAC00) || 0xD800 < ch.
2161 if (ch < 0x1100 || 0x1200 < ch &&
2162 ch < 0xAC00 || 0xD800 < ch)
2164 map [i] = new CharMapEntry (map [ch].Category,
2165 (byte) (map [ch].Level1 + 1),
2167 // Console.Error.WriteLine ("Jamo {0:X04} -> {1:X04}", i, decompValues [decompIndex [i] + 1]);
2173 // Letterlike characters and CJK compatibility square
// Names are sorted, then counted per initial letter 'A'..'Z'; the characters
// for each letter are then packed directly below the next letter's weight.
2174 sortableCharNames.Sort (StringDictionaryValueComparer.Instance);
2175 int [] counts = new int ['Z' - 'A' + 1];
2176 char [] namedChars = new char [sortableCharNames.Count];
2178 foreach (DictionaryEntry de in sortableCharNames) {
2179 counts [((string) de.Value) [0] - 'A']++;
2180 namedChars [nCharNames++] = (char) ((int) de.Key);
2182 nCharNames = 0; // reset
2183 for (int a = 0; a < counts.Length; a++) {
// NOTE(review): alphaWeights [a + 1] needs alphaWeights to hold at least
// counts.Length + 1 entries — confirm.
2184 fillIndex [0xE] = (byte) (alphaWeights [a + 1] - counts [a]);
2185 for (int i = 0; i < counts [a]; i++)
2186 //Console.Error.WriteLine ("---- {0:X04} : {1:x02} / {2} {3}", (int) namedChars [nCharNames], fillIndex [0xE], ((DictionaryEntry) sortableCharNames [nCharNames]).Value, Char.GetUnicodeCategory (namedChars [nCharNames]));
2187 AddCharMap (namedChars [nCharNames++], 0xE, 1);
2190 // CJK unified ideograph.
2192 fillIndex [cjkCat] = 0x2;
2193 for (int cp = 0x4E00; cp <= 0x9FBB; cp++)
2194 if (!IsIgnorable (cp))
2195 AddCharMapGroupCJK ((char) cp, ref cjkCat);
2196 // CJK Extensions goes here.
2197 // LAMESPEC: With this Windows style CJK layout, it is
2198 // impossible to add more CJK ideograph i.e. 0x9FA6-
2199 // 0x9FBB can never be added w/o breaking compat.
2200 for (int cp = 0xF900; cp <= 0xFA2D; cp++)
2201 if (!IsIgnorable (cp))
2202 AddCharMapGroupCJK ((char) cp, ref cjkCat);
2204 // PrivateUse ... computed.
2205 // remaining Surrogate ... computed.
2207 #region Special "biggest" area (FF FF)
2208 fillIndex [0xFF] = 0xFF;
2209 char [] specialBiggest = new char [] {
2210 '\u3005', '\u3031', '\u3032', '\u309D',
2211 '\u309E', '\u30FC', '\u30FD', '\u30FE',
2212 '\uFE7C', '\uFE7D', '\uFF70'};
2213 foreach (char c in specialBiggest)
2214 AddCharMap (c, 0xFF, 0);
2217 #region 07 - ASCII non-alphanumeric + 3001, 3002 // 07
2218 // non-alphanumeric ASCII except for: + - < = > '
2219 for (int i = 0x21; i < 0x7F; i++) {
2220 if (Char.IsLetterOrDigit ((char) i)
2221 || "+-<=>'".IndexOf ((char) i) >= 0)
2222 continue; // they are not added here.
2223 AddCharMapGroup2 ((char) i, 0x7, 1, 0);
2224 // Insert 3001 after ',' and 3002 after '.'
2226 AddCharMapGroup2 ('\u3001', 0x7, 1, 0);
2227 else if (i == 0x2E) {
2229 AddCharMapGroup2 ('\u3002', 0x7, 1, 0);
2232 AddCharMap ('\uFE30', 0x7, 1, 0);
2236 #region 07 - Punctuations and something else
2237 for (int i = 0xA0; i < char.MaxValue; i++) {
2238 if (IsIgnorable (i))
2250 switch (Char.GetUnicodeCategory ((char) i)) {
2251 case UnicodeCategory.OtherPunctuation:
2252 case UnicodeCategory.ClosePunctuation:
2253 case UnicodeCategory.OpenPunctuation:
2254 case UnicodeCategory.InitialQuotePunctuation:
2255 case UnicodeCategory.FinalQuotePunctuation:
2256 case UnicodeCategory.ModifierSymbol:
2257 // SPECIAL CASES: // 0xA
2258 if (0x2020 <= i && i <= 0x2042)
2260 AddCharMapGroup ((char) i, 0x7, 1, 0);
2263 if (i == 0xA6) // SPECIAL CASE. FIXME: why?
2264 goto case UnicodeCategory.OtherPunctuation;
2269 for (int i = 0x2400; i <= 0x2421; i++)
2270 AddCharMap ((char) i, 0x7, 1, 0);
2273 // FIXME: for 07 xx we need more love.
2275 // FIXME: 08 should be more complete.
2276 fillIndex [0x8] = 2;
2277 for (int cp = 0; cp < char.MaxValue; cp++)
2278 if (!map [cp].Defined &&
2279 Char.GetUnicodeCategory ((char) cp) ==
2280 UnicodeCategory.MathSymbol)
2281 AddCharMapGroup ((char) cp, 0x8, 1, 0);
2283 // Characters w/ diacritical marks (NFKD)
// Still-undefined characters that have a decomposition inherit category and
// level 1 from the first decomposed character (or the parenthesized one);
// `secondary` accumulates the diacritical values of the remaining characters.
2284 for (int i = 0; i <= char.MaxValue; i++) {
2285 if (map [i].Defined || IsIgnorable (i))
2287 if (decompIndex [i] == 0)
2290 int start = decompIndex [i];
2291 int primaryChar = decompValues [start];
2294 int length = decompLength [i];
2295 // special processing for parenthesized ones.
2297 decompValues [start] == '(' &&
2298 decompValues [start + 2] == ')') {
2299 primaryChar = decompValues [start + 1];
2303 if (map [primaryChar].Level1 == 0)
2306 for (int l = 1; l < length; l++) {
2307 int c = decompValues [start + l];
2308 if (map [c].Level1 != 0)
2310 secondary += diacritical [c];
2314 map [i] = new CharMapEntry (
2315 map [primaryChar].Category,
2316 map [primaryChar].Level1,
2321 #region Level2 adjustment
2323 diacritical [0x624] = 0x5;
2324 diacritical [0x626] = 0x7;
2325 diacritical [0x622] = 0x9;
2326 diacritical [0x623] = 0xA;
2327 diacritical [0x625] = 0xB;
2328 diacritical [0x649] = 0x5; // 'alif maqs.uurah
2329 diacritical [0x64A] = 0x7; // Yaa'
// Rewrites entries with a category-dependent level 2 value `mod`, keeping
// category and level 1.
2332 for (int i = 0; i < char.MaxValue; i++) {
2334 byte cat = map [i].Category;
2336 case 0xE: // Latin diacritics
2337 case 0x22: // Japanese: circled characters
2338 mod = diacritical [i];
2340 case 0x13: // Arabic
2341 if (diacritical [i] == 0)
2342 mod = 0x8; // default for arabic
2345 if (0x52 <= cat && cat <= 0x7F) // Hangul
2346 mod = diacritical [i];
2348 map [i] = new CharMapEntry (
2349 cat, map [i].Level1, mod);
2353 // FIXME: this is hack but those which are
2354 // NonSpacingMark characters and still undefined
2355 // are likely to be nonspacing.
2356 for (int i = 0; i < char.MaxValue; i++)
2357 if (!map [i].Defined &&
2359 Char.GetUnicodeCategory ((char) i) ==
2360 UnicodeCategory.NonSpacingMark)
2361 AddCharMap ((char) i, 1, 1);
// Advances fillIndex [hangulCat] by one. fillIndex is a byte array, so the
// value wraps to 0 on overflow; in that case the index restarts at 0x2.
// NOTE(review): hangulCat is passed by ref, but no assignment to it is
// visible in this listing; presumably the category itself is advanced on
// overflow — confirm.
2364 private void IncrementSequentialIndex (ref byte hangulCat)
2366 fillIndex [hangulCat]++;
2367 if (fillIndex [hangulCat] == 0) { // overflown
2369 fillIndex [hangulCat] = 0x2;
2373 // Reset fillIndex to fixed value and call AddLetterMap().
// After mapping c itself, every code point listed for c in latinMap is mapped
// into the same category, also without advancing the index (updateCount 0).
// NOTE(review): `al` comes from an `as` cast and may be null; the guard before
// the foreach is not visible in this listing.
2374 private void AddAlphaMap (char c, byte category, byte alphaWeight)
2376 fillIndex [category] = alphaWeight;
2377 AddLetterMap (c, category, 0);
2379 ArrayList al = latinMap [c] as ArrayList;
2383 foreach (int cp in al)
2384 AddLetterMap ((char) cp, category, 0);
// Maps `voices` consecutive code points starting at i into category 0x22.
// Each one is added together with the code point 0x60 above it (presumably
// its Katakana counterpart — confirm). Neither call advances fillIndex.
// Level 2 is 0 for the first code point (b == 0) and b + 2 for later ones.
2387 private void AddKanaMap (int i, byte voices)
2389 for (byte b = 0; b < voices; b++) {
2390 char c = (char) (i + b);
2391 byte arg = (byte) (b > 0 ? b + 2 : 0);
2393 AddLetterMapCore (c, 0x22, 0, arg);
2395 AddLetterMapCore ((char) (c + 0x60), 0x22, 0, arg);
// Convenience overload: forwards to AddLetterMapCore with level2 = 0.
2399 private void AddLetterMap (char c, byte category, byte updateCount)
2401 AddLetterMapCore (c, category, updateCount, 0);
// Maps a letter and its related forms into `category` with the given level2:
//   1. the group of its small form (ToSmallForm), which may advance the index;
//   2. its invariant-culture lowercase form, recursively, when it differs from
//      c and is not yet defined (updateCount 0);
//   3. the group of c itself (updateCount 0);
// and finally advances fillIndex [category] by updateCount.
// NOTE(review): the conditions guarding step 1, the use of doUpdate and the
// final increment are not visible in this listing.
2404 private void AddLetterMapCore (char c, byte category, byte updateCount, byte level2)
2407 // <small> updates index
2408 c2 = ToSmallForm (c);
2410 AddCharMapGroup (c2, category, updateCount, level2);
2411 c2 = Char.ToLower (c, CultureInfo.InvariantCulture);
2412 if (c2 != c && !map [(int) c2].Defined)
2413 AddLetterMapCore (c2, category, 0, level2);
2414 bool doUpdate = true;
2415 if (IsIgnorable ((int) c) || map [(int) c].Defined)
2418 AddCharMapGroup (c, category, 0, level2);
2420 fillIndex [category] += updateCount;
// Convenience overload: forwards to the four-argument AddCharMap with alt = 0
// and returns its result.
2423 private bool AddCharMap (char c, byte category, byte increment)
2425 return AddCharMap (c, category, increment, 0);
// Defines map [c] from the running fillIndex [category], then advances that
// index by `increment`. Returns false and changes nothing when c is ignorable
// or already defined.
// For category 1 the running index goes into the third CharMapEntry argument
// and `alt` into the second; for every other category it is the other way
// round. (Other call sites suggest the arguments are category, level 1,
// level 2 — confirm against CharMapEntry.)
// NOTE(review): the success-path return statement is not visible in this
// listing.
2428 private bool AddCharMap (char c, byte category, byte increment, byte alt)
2430 if (IsIgnorable ((int) c) || map [(int) c].Defined)
2431 return false; // do nothing
2432 map [(int) c] = new CharMapEntry (category,
2433 category == 1 ? alt : fillIndex [category],
2434 category == 1 ? fillIndex [category] : alt);
2435 fillIndex [category] += increment;
// Maps, in this order: the ToSmallFormTail form of c, c itself, and then
// (recursively) the ToFullWidthTail form of c. All use alt = 0, and each
// AddCharMap call passes updateCount as its increment.
// NOTE(review): the conditions guarding the small-form and full-width steps
// are not visible in this listing.
2439 private void AddCharMapGroupTail (char c, byte category, byte updateCount)
2441 char c2 = ToSmallFormTail (c);
2443 AddCharMap (c2, category, updateCount, 0);
2445 AddCharMap (c, category, updateCount, 0);
2447 c2 = ToFullWidthTail (c);
2449 AddCharMapGroupTail (c2, category, updateCount);
2453 // Adds characters to table in the order below
2454 // (+ increases weight):
2458 // <full> | <super> | <sub>
2459 // <circle> | <wide> (| <narrow>)
2463 // level2 is fixed (does not increase).
// Decomposition types that AddCharMapGroup iterates over; every matching NFKD
// variant is added with increment 0, so it shares the weight of the base character.
2464 int [] sameWeightItems = new int [] {
2465 DecompositionFraction,
2469 DecompositionCircle,
2471 DecompositionNarrow,
// Maps c together with the compatibility variants recorded for it in nfkdMap.
// Order: <small> variant, c itself, the sameWeightItems variants, then (after the
// fill index has been advanced) the <vertical> variant.
2473 private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2)
2475 if (map [(int) c].Defined)
// char.MinValue is used as the "no such variant" marker.
2478 char small = char.MinValue;
2479 char vertical = char.MinValue;
// nfkdMap [c] is a Hashtable keyed by decomposition type (boxed byte) whose values
// are boxed int code points.
2480 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
2482 object smv = nfkd [(byte) DecompositionSmall];
2484 small = (char) ((int) smv);
2485 object vv = nfkd [(byte) DecompositionVertical];
2487 vertical = (char) ((int) vv);
2490 // <small> updates index
2491 if (small != char.MinValue)
2492 AddCharMap (small, category, updateCount);
// c itself: increment 0, the index is advanced explicitly further down.
2495 AddCharMap (c, category, 0, level2);
2498 foreach (int weight in sameWeightItems) {
2499 object wv = nfkd [(byte) weight];
2501 AddCharMap ((char) ((int) wv), category, 0, level2);
2505 // update index here.
2506 fillIndex [category] += updateCount;
2508 if (vertical != char.MinValue)
2509 AddCharMap (vertical, category, updateCount, level2);
// Adds one CJK character at the current position and then moves to the next
// sequential index. `category` is passed by ref to IncrementSequentialIndex, so it
// may be changed by that call.
2512 private void AddCharMapCJK (char c, ref byte category)
2514 AddCharMap (c, category, 0, 0);
2515 IncrementSequentialIndex (ref category);
2517 // Special. I wonder why but Windows skips 9E F9.
// Skip that slot by advancing once more.
2518 if (category == 0x9E && fillIndex [category] == 0xF9)
2519 IncrementSequentialIndex (ref category);
// Adds the CJK character c followed by the characters that nfkdMap records as
// decomposing to it, each via AddCharMapCJK. A few enclosed CJK symbols are placed
// explicitly after specific ideographs to reproduce the Windows ordering.
2522 private void AddCharMapGroupCJK (char c, ref byte category)
2524 AddCharMapCJK (c, ref category);
2526 // LAMESPEC: see below.
2527 if (c == '\u52DE') {
2528 AddCharMapCJK ('\u3298', ref category);
2529 AddCharMapCJK ('\u3238', ref category);
2532 AddCharMapCJK ('\u32A2', ref category);
2534 // Especially this mapping order totally does
2535 // not make sense to me.
2536 AddCharMapCJK ('\u32A9', ref category);
2538 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
// Walk every decomposition type value (0 .. 0x12, i.e. up to DecompositionCanonical).
2541 for (byte weight = 0; weight <= 0x12; weight++) {
2542 object wv = nfkd [weight];
2547 // Special: they are ignored in this area.
2548 // FIXME: check if it is sane
2549 if (0xF900 <= w && w <= 0xFAD9)
2551 // LAMESPEC: on Windows some of CJK characters
2552 // in 3200-32B0 are incorrectly mapped. They
2553 // mix Chinese and Japanese Kanji when
2554 // ordering those characters.
// These four are the symbols already placed explicitly above.
2556 case 0x32A2: case 0x3298: case 0x3238: case 0x32A9:
2560 AddCharMapCJK ((char) w, ref category);
2564 // For now it is only for 0x7 category.
// Variant of AddCharMapGroup in which every added character (small form, c itself,
// <compat> sources, vertical form) is added with `updateCount` as its increment.
2565 private void AddCharMapGroup2 (char c, byte category, byte updateCount, byte level2)
// char.MinValue is used as the "no such variant" marker.
2567 char small = char.MinValue;
2568 char vertical = char.MinValue;
2569 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
2571 object smv = nfkd [(byte) DecompositionSmall];
2573 small = (char) ((int) smv);
2574 object vv = nfkd [(byte) DecompositionVertical];
2576 vertical = (char) ((int) vv);
2579 // <small> updates index
2580 if (small != char.MinValue)
2581 // SPECIAL CASE excluded (FIXME: why?)
2582 if (small != '\u2024')
2583 AddCharMap (small, category, updateCount);
2586 AddCharMap (c, category, updateCount, level2);
2588 // Since nfkdMap is problematic to have two or more
2589 // NFKD to an identical character, here I iterate all.
// Looks for code points whose decomposition is exactly the single character c.
// NOTE(review): the loop bound excludes char.MaxValue itself.
2590 for (int c2 = 0; c2 < char.MaxValue; c2++) {
2591 if (decompLength [c2] == 1 &&
2592 (int) (decompValues [decompIndex [c2]]) == (int) c) {
2593 switch (decompType [c2]) {
2594 case DecompositionCompat:
2595 AddCharMap ((char) c2, category, updateCount, level2);
2601 if (vertical != char.MinValue)
2602 // SPECIAL CASE excluded (FIXME: why?)
2603 if (vertical != '\uFE33' && vertical != '\uFE34')
2604 AddCharMap (vertical, category, updateCount, level2);
// Adds the Arabic character c and every code point whose decomposition ends with c,
// then advances the category's fill index by updateCount (1).
2607 private void AddArabicCharMap (char c)
2610 byte updateCount = 1;
// c itself is added with increment 0; the index is advanced once at the end.
2614 AddCharMap (c, category, 0, level2);
2616 // Since nfkdMap is problematic to have two or more
2617 // NFKD to an identical character, here I iterate all.
2618 for (int c2 = 0; c2 < char.MaxValue; c2++) {
2619 if (decompLength [c2] == 0)
// idx points at the last character of c2's decomposition.
2621 int idx = decompIndex [c2] + decompLength [c2] - 1;
2622 if ((int) (decompValues [idx]) == (int) c)
2623 AddCharMap ((char) c2, category,
2626 fillIndex [category] += updateCount;
// ToDecomposed for the <full> decomposition type, taking the first decomposed character.
2629 char ToFullWidth (char c)
2631 return ToDecomposed (c, DecompositionFull, false);
// ToDecomposed for the <full> decomposition type, with tail = true.
2634 char ToFullWidthTail (char c)
2636 return ToDecomposed (c, DecompositionFull, true);
// ToDecomposed for the <small> decomposition type, taking the first decomposed character.
2639 char ToSmallForm (char c)
2641 return ToDecomposed (c, DecompositionSmall, false);
// ToDecomposed for the <small> decomposition type, with tail = true.
2644 char ToSmallFormTail (char c)
2646 return ToDecomposed (c, DecompositionSmall, true);
// Returns a character from c's decomposition when c's decomposition type is d.
// decompIndex [c] is the start of the decomposition in decompValues; adding
// decompLength [c] - 1 moves to its last character (presumably only when `tail`
// is set -- confirm).
2649 char ToDecomposed (char c, byte d, bool tail)
2651 if (decompType [(int) c] != d)
2653 int idx = decompIndex [(int) c];
2655 idx += decompLength [(int) c] - 1;
2656 return (char) decompValues [idx];
// Iterates over jisJapanese; presumably reports whether any entry has code point
// cp -- TODO confirm.
2659 bool ExistsJIS (int cp)
2661 foreach (JISCharacter j in jisJapanese)
2669 #region Level 3 properties (Case/Width)
// Level 3 weight as stored in a sort key: the raw weight plus 2, except that a raw
// weight of 0 stays 0.
2671 private byte ComputeLevel3Weight (char c)
2673 byte b = ComputeLevel3WeightRaw (c);
2674 return b > 0 ? (byte) (b + 2) : b;
// Computes the raw level 3 weight of c from hard-coded code point ranges, the
// decomposition type and the isSmallCapital / isUppercase tables.
2677 private byte ComputeLevel3WeightRaw (char c) // add 2 for sortkey value
// Hangul ranges (U+11A8.., halfwidth U+FFA0.., compatibility Jamo U+3130..)
2680 if ('\u11A8' <= c && c <= '\u11F9')
2682 if ('\uFFA0' <= c && c <= '\uFFDC')
2684 if ('\u3130' <= c && c <= '\u3164')
// Dingbat circled digits.
// NOTE(review): the third range starts at U+2776 again and overlaps the two checks
// above; if those return early it only takes effect for U+278A..U+2793 -- confirm.
2687 if ('\u2776' <= c && c <= '\u277F')
2689 if ('\u2780' <= c && c <= '\u2789')
2691 if ('\u2776' <= c && c <= '\u2793')
2693 if ('\u2160' <= c && c <= '\u216F')
2695 if ('\u2181' <= c && c <= '\u2182')
2698 if ('\u2135' <= c && c <= '\u2138')
// NOTE(review): the upper bound is exclusive here (c < U+FE8E).
2700 if ('\uFE80' <= c && c < '\uFE8E') {
2701 // 2(Isolated)/8(Final)/0x18(Medial)
2702 switch (decompType [(int) c]) {
2703 case DecompositionIsolated:
2705 case DecompositionFinal:
2707 case DecompositionMedial:
2712 // actually I dunno the reason why they have weights.
2735 switch (decompType [(int) c]) {
// For these three types the decomposition type value itself is OR-ed into the result.
2736 case DecompositionWide: // <wide>
2737 case DecompositionSub: // <sub>
2738 case DecompositionSuper: // <super>
2739 ret |= decompType [(int) c];
2742 if (isSmallCapital [(int) c]) // grep "SMALL CAPITAL"
2744 if (isUppercase [(int) c]) // DerivedCoreProperties
// Variant of IsIgnorable that consults the unicodeAge table (threshold 3.1) and the
// character's Unicode category. A second definition with the same signature follows,
// so presumably only one of the two is compiled -- confirm.
2754 static bool IsIgnorable (int i)
2756 if (unicodeAge [i] >= 3.1)
2758 switch (char.GetUnicodeCategory ((char) i)) {
2759 case UnicodeCategory.OtherNotAssigned:
2760 case UnicodeCategory.Format:
2767 // FIXME: In the future use DerivedAge.txt to examine character
2768 // versions and set those ones that have higher version than
2769 // 1.0 as ignorable.
// Table-driven variant: classifies code point i using an explicit case list, a long
// list of hard-coded ranges, and finally the character's Unicode category.
2770 static bool IsIgnorable (int i)
2774 // I guess, those characters are added between
2775 // Unicode 1.0 (LCMapString) and Unicode 3.1
2776 // (UnicodeCategory), so they used to be
2777 // something like OtherNotAssigned as of Unicode 1.1.
2778 case 0x2df: case 0x387:
2779 case 0x3d7: case 0x3d8: case 0x3d9:
2780 case 0x3f3: case 0x3f4: case 0x3f5: case 0x3f6:
2781 case 0x400: case 0x40d: case 0x450: case 0x45d:
2782 case 0x587: case 0x58a: case 0x5c4: case 0x640:
2783 case 0x653: case 0x654: case 0x655: case 0x66d:
2785 case 0x1e9b: case 0x202f: case 0x20ad:
2786 case 0x20ae: case 0x20af:
2787 case 0x20e2: case 0x20e3:
2788 case 0x2139: case 0x213a: case 0x2183:
2789 case 0x2425: case 0x2426: case 0x2619:
2790 case 0x2670: case 0x2671: case 0x3007:
2791 case 0x3190: case 0x3191:
2792 case 0xfffc: case 0xfffd:
2794 // exceptional characters filtered by the
2795 // following conditions. Originally those exceptional
2796 // ranges are incorrect (they should not be ignored)
2797 // and most of those characters are unfortunately in
2799 case 0x4d8: case 0x4d9:
2800 case 0x4e8: case 0x4e9:
2801 case 0x3036: case 0x303f:
2802 case 0x337b: case 0xfb1e:
2807 // The whole Sinhala characters.
2808 0x0D82 <= i && i <= 0x0DF4
2809 // The whole Tibetan characters.
2810 || 0x0F00 <= i && i <= 0x0FD1
2811 // The whole Myanmar characters.
2812 || 0x1000 <= i && i <= 0x1059
2813 // The whole Ethiopic, Cherokee,
2814 // Canadian Syllabic, Ogham, Runic,
2815 // Tagalog, Hanunoo, Philippine,
2816 // Buhid, Tagbanwa, Khmer and Mongolian
2818 || 0x1200 <= i && i <= 0x1DFF
2819 // Greek extension characters.
2820 || 0x1F00 <= i && i <= 0x1FFF
2821 // The whole Braille characters.
2822 || 0x2800 <= i && i <= 0x28FF
2823 // CJK radical characters.
2824 || 0x2E80 <= i && i <= 0x2EF3
2825 // Kangxi radical characters.
2826 || 0x2F00 <= i && i <= 0x2FD5
2827 // Ideographic description characters.
2828 || 0x2FF0 <= i && i <= 0x2FFB
2829 // Bopomofo letter and final
2830 || 0x31A0 <= i && i <= 0x31B7
2831 // White square with quadrant characters.
2832 || 0x25F0 <= i && i <= 0x25F7
2833 // Ideographic telegraph symbols.
2834 || 0x32C0 <= i && i <= 0x32CB
2835 || 0x3358 <= i && i <= 0x3370
2836 || 0x33E0 <= i && i <= 0x33FF
2837 // The whole YI characters.
2838 || 0xA000 <= i && i <= 0xA48C
2839 || 0xA490 <= i && i <= 0xA4C6
2840 // Armenian small ligatures
2841 || 0xFB13 <= i && i <= 0xFB17
2842 // hebrew, arabic, variation selector.
2843 || 0xFB1D <= i && i <= 0xFE2F
2844 // Arabic ligatures.
2845 || 0xFEF5 <= i && i <= 0xFEFC
2846 // FIXME: why are they excluded?
2847 || 0x01F6 <= i && i <= 0x01F9
2848 || 0x0218 <= i && i <= 0x0233
2849 || 0x02A9 <= i && i <= 0x02AD
2850 || 0x02EA <= i && i <= 0x02EE
2851 || 0x0349 <= i && i <= 0x036F
2852 || 0x0488 <= i && i <= 0x048F
2853 || 0x04D0 <= i && i <= 0x04FF
2854 || 0x0500 <= i && i <= 0x050F // actually it matters only for 2.0
2855 || 0x06D6 <= i && i <= 0x06ED
2856 || 0x06FA <= i && i <= 0x06FE
2857 || 0x2048 <= i && i <= 0x204D
2858 || 0x20e4 <= i && i <= 0x20ea
2859 || 0x213C <= i && i <= 0x214B
2860 || 0x21EB <= i && i <= 0x21FF
2861 || 0x22F2 <= i && i <= 0x22FF
2862 || 0x237B <= i && i <= 0x239A
2863 || 0x239B <= i && i <= 0x23CF
2864 || 0x24EB <= i && i <= 0x24FF
2865 || 0x2596 <= i && i <= 0x259F
2866 || 0x25F8 <= i && i <= 0x25FF
2867 || 0x2672 <= i && i <= 0x2689
2868 || 0x2768 <= i && i <= 0x2775
2869 || 0x27d0 <= i && i <= 0x27ff
2870 || 0x2900 <= i && i <= 0x2aff
2871 || 0x3033 <= i && i <= 0x303F
2872 || 0x31F0 <= i && i <= 0x31FF
2873 || 0x3250 <= i && i <= 0x325F
2874 || 0x32B1 <= i && i <= 0x32BF
2875 || 0x3371 <= i && i <= 0x337B
2876 || 0xFA30 <= i && i <= 0xFA6A
// Fall back to the Unicode category for everything not matched above.
2880 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
2882 case UnicodeCategory.PrivateUse:
2883 case UnicodeCategory.Surrogate:
2885 // ignored by nature
2886 case UnicodeCategory.Format:
2887 case UnicodeCategory.OtherNotAssigned:
2894 // To check IsIgnorable sanity, try the driver below under MS.NET.
// Test driver entry point: runs Dump for every UTF-16 code unit (0 .. char.MaxValue)
// with the IsIgnorable result for it.
2897 public static void Main ()
2899 for (int i = 0; i <= char.MaxValue; i++)
2900 Dump (i, IsIgnorable (i));
// Compares the IsIgnorable prediction `ignore` for code point i with what the
// runtime's invariant CompareInfo does, and prints a line for the code point
// (presumably only when the two disagree -- confirm).
2903 static void Dump (int i, bool ignore)
2905 switch (Char.GetUnicodeCategory ((char) i)) {
2906 case UnicodeCategory.PrivateUse:
2907 case UnicodeCategory.Surrogate:
2908 return; // check nothing
// s2 is the character repeated ten times; comparing it with s1 yields 0 only when
// the collator treats the two strings as equal.
2912 string s2 = new string ((char) i, 10);
2913 int ret = CultureInfo.InvariantCulture.CompareInfo.Compare (s1, s2, CompareOptions.IgnoreCase);
2914 if ((ret == 0) == ignore)
// Output format: "o"/"x" for the predicted flag, the code point in hex, its category.
2916 Console.WriteLine ("{0} : {1:x} {2}", ignore ? "o" : "x", i, Char.GetUnicodeCategory ((char) i));
2919 #endregion // IsIgnorable
2921 #region IsIgnorableSymbol
// Classifies code point i for symbol-ignoring comparison. Checks IsIgnorable first,
// then explicit case lists and ranges, then the Unicode category, and finally the
// Char.Is* predicates.
2922 static bool IsIgnorableSymbol (int i)
2924 if (IsIgnorable (i))
2929 case 0x00b5: case 0x01C0: case 0x01C1:
2930 case 0x01C2: case 0x01C3: case 0x01F6:
2931 case 0x01F7: case 0x01F8: case 0x01F9:
2932 case 0x02D0: case 0x02EE: case 0x037A:
2933 case 0x03D7: case 0x03F3:
2934 case 0x0400: case 0x040d:
2935 case 0x0450: case 0x045d:
2936 case 0x048C: case 0x048D:
2937 case 0x048E: case 0x048F:
2938 case 0x0587: case 0x0640: case 0x06E5:
2939 case 0x06E6: case 0x06FA: case 0x06FB:
2940 case 0x06FC: case 0x093D: case 0x0950:
2941 case 0x1E9B: case 0x2139: case 0x3006:
2942 case 0x3033: case 0x3034: case 0x3035:
2943 case 0xFE7E: case 0xFE7F:
2945 case 0x16EE: case 0x16EF: case 0x16F0:
2947 case 0x2183: // ROMAN NUMERAL REVERSED ONE HUNDRED
2948 case 0x3007: // IDEOGRAPHIC NUMBER ZERO
2949 case 0x3038: // HANGZHOU NUMERAL TEN
2950 case 0x3039: // HANGZHOU NUMERAL TWENTY
2951 case 0x303a: // HANGZHOU NUMERAL THIRTY
2957 case 0x02B9: case 0x02BA: case 0x02C2:
2958 case 0x02C3: case 0x02C4: case 0x02C5:
2959 case 0x02C8: case 0x02CC: case 0x02CD:
2960 case 0x02CE: case 0x02CF: case 0x02D2:
2961 case 0x02D3: case 0x02D4: case 0x02D5:
2962 case 0x02D6: case 0x02D7: case 0x02DE:
2963 case 0x02E5: case 0x02E6: case 0x02E7:
2964 case 0x02E8: case 0x02E9:
2965 case 0x309B: case 0x309C:
2967 case 0x055A: // Armenian Apos
2968 case 0x05C0: // Hebrew Punct
2969 case 0x0E4F: // Thai FONGMAN
2970 case 0x0E5A: // Thai ANGKHANKHU
2971 case 0x0E5B: // Thai KHOMUT
2973 case 0x09F2: // Bengali Rupee Mark
2974 case 0x09F3: // Bengali Rupee Sign
2976 case 0x221e: // INF.
// Range checks. Note that the first and third ranges use an exclusive upper bound.
2985 if (0xFE70 <= i && i < 0xFE7C // ARABIC LIGATURES B
2987 || 0x0501 <= i && i <= 0x0510 // CYRILLIC KOMI
2988 || 0xFA30 <= i && i < 0xFA70 // CJK COMPAT
2993 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
2995 case UnicodeCategory.Surrogate:
2996 return false; // inconsistent
2998 case UnicodeCategory.SpacingCombiningMark:
2999 case UnicodeCategory.EnclosingMark:
3000 case UnicodeCategory.NonSpacingMark:
3001 case UnicodeCategory.PrivateUse:
3003 if (0x064B <= i && i <= 0x0652) // Arabic
3007 case UnicodeCategory.Format:
3008 case UnicodeCategory.OtherNotAssigned:
3015 // latin in a circle
3016 0x249A <= i && i <= 0x24E9
3017 || 0x2100 <= i && i <= 0x2132
3019 || 0x3196 <= i && i <= 0x31A0
3021 || 0x3200 <= i && i <= 0x321C
3023 || 0x322A <= i && i <= 0x3243
3025 || 0x3260 <= i && i <= 0x32B0
3026 || 0x32D0 <= i && i <= 0x3357
3027 || 0x337B <= i && i <= 0x33DD
3029 use = !Char.IsLetterOrDigit ((char) i);
3033 // This "Digit" rule is mystery.
3034 // It filters some symbols out.
3035 if (Char.IsLetterOrDigit ((char) i))
3037 if (Char.IsNumber ((char) i))
3039 if (Char.IsControl ((char) i)
3040 || Char.IsSeparator ((char) i)
3041 || Char.IsPunctuation ((char) i))
3043 if (Char.IsSymbol ((char) i))
3046 // FIXME: should check more
3051 // To check IsIgnorableSymbol sanity, try the driver below under MS.NET.
// Test driver entry point: for every non-surrogate code unit, compares the
// IsIgnorableSymbol prediction with the invariant CompareInfo under
// CompareOptions.IgnoreSymbols and reports mismatches.
3053 public static void Main ()
3055 CompareInfo ci = CultureInfo.InvariantCulture.CompareInfo;
3056 for (int i = 0; i <= char.MaxValue; i++) {
3057 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3058 if (uc == UnicodeCategory.Surrogate)
3061 bool ret = IsIgnorableSymbol (i);
// s2 is s1 plus the character under test; they compare equal only when the
// collator ignores that character.
3063 string s1 = "TEST ";
3064 string s2 = "TEST " + (char) i;
3066 int result = ci.Compare (s1, s2, CompareOptions.IgnoreSymbols);
// Print only when prediction and actual behaviour disagree.
3068 if (ret != (result == 0))
3069 Console.WriteLine ("{0} : {1:x}[{2}]({3})",
3070 ret ? "should not ignore" :
// Classifies code point i for nonspacing-ignoring comparison. Checks IsIgnorable
// first, then explicit case lists and ranges; anything left is decided by whether
// its Unicode category is NonSpacingMark.
3079 static bool IsIgnorableNonSpacing (int i)
3081 if (IsIgnorable (i))
3085 case 0x02C8: case 0x02DE: case 0x0559: case 0x055A:
3086 case 0x05C0: case 0x0ABD: case 0x0CD5: case 0x0CD6:
3087 case 0x309B: case 0x309C: case 0xFF9E: case 0xFF9F:
3089 case 0x02D0: case 0x0670: case 0x0901: case 0x0902:
3090 case 0x094D: case 0x0962: case 0x0963: case 0x0A41:
3091 case 0x0A42: case 0x0A47: case 0x0A48: case 0x0A4B:
3092 case 0x0A4C: case 0x0A81: case 0x0A82: case 0x0B82:
3093 case 0x0BC0: case 0x0CBF: case 0x0CC6: case 0x0CCC:
3094 case 0x0CCD: case 0x0E4E:
3098 if (0x02b9 <= i && i <= 0x02c5
3099 || 0x02cc <= i && i <= 0x02d7
3100 || 0x02e4 <= i && i <= 0x02ef
3101 || 0x20DD <= i && i <= 0x20E0
// 0x00652 below is simply 0x0652 written with an extra leading zero.
3105 if (0x064B <= i && i <= 0x00652
3106 || 0x0941 <= i && i <= 0x0948
3107 || 0x0AC1 <= i && i <= 0x0ACD
3108 || 0x0C3E <= i && i <= 0x0C4F
3109 || 0x0E31 <= i && i <= 0x0E3F
3113 return Char.GetUnicodeCategory ((char) i) ==
3114 UnicodeCategory.NonSpacingMark;
3117 // We can reuse IsIgnorableSymbol testcode
3118 // for IsIgnorableNonSpacing.
// Members of CharMapEntry: the per-character record stored in the `map` array.
3124 public byte Category;
3126 public byte Level2; // It is always single byte.
// Tested by the AddCharMap* methods to avoid mapping a character twice.
3127 public bool Defined;
// Takes the category and the level 1 / level 2 weights of the entry.
3129 public CharMapEntry (byte category, byte level1, byte level2)
3131 Category = category;
// Members of JISCharacter: pairs a code point (CP) with a JIS code (JIS).
// JISComparer orders instances by the JIS field.
3140 public readonly int CP;
3141 public readonly int JIS;
3143 public JISCharacter (int cp, int cpJIS)
// Orders JISCharacter items by ascending JIS code.
class JISComparer : IComparer
{
	// Shared instance; the comparer holds no state.
	public static readonly JISComparer Instance =
		new JISComparer ();

	// Returns a negative value when o1 has the smaller JIS code, zero when both
	// codes are equal, and a positive value when o1 has the larger one.
	// Both arguments must be JISCharacter.
	public int Compare (object o1, object o2)
	{
		JISCharacter j1 = (JISCharacter) o1;
		JISCharacter j2 = (JISCharacter) o2;
		// First operand minus second gives ascending order, the same
		// convention the other comparers in this file follow.
		return j1.JIS - j2.JIS;
	}
}
// Pairs a code point (CP) with a name string; NonJISComparer sorts by Name.
3163 class NonJISCharacter
3165 public readonly int CP;
3166 public readonly string Name;
3168 public NonJISCharacter (int cp, string name)
// Orders NonJISCharacter items by Name using ordinal string comparison.
3175 class NonJISComparer : IComparer
// Shared instance; the comparer holds no state.
3177 public static readonly NonJISComparer Instance =
3178 new NonJISComparer ();
// Both arguments must be NonJISCharacter.
3180 public int Compare (object o1, object o2)
3182 NonJISCharacter j1 = (NonJISCharacter) o1;
3183 NonJISCharacter j2 = (NonJISCharacter) o2;
3184 return string.CompareOrdinal (j1.Name, j2.Name);
// Compares DictionaryEntry items whose Value is a decimal and whose Key is an int.
// The values are compared first; the keys are read afterwards (presumably as a
// tie-breaker -- confirm).
3188 class DecimalDictionaryValueComparer : IComparer
3190 public static readonly DecimalDictionaryValueComparer Instance
3191 = new DecimalDictionaryValueComparer ();
// Private constructor: use Instance.
3193 private DecimalDictionaryValueComparer ()
3197 public int Compare (object o1, object o2)
3199 DictionaryEntry e1 = (DictionaryEntry) o1;
3200 DictionaryEntry e2 = (DictionaryEntry) o2;
3201 // FIXME: in case of 0, compare decomposition categories
3202 int ret = Decimal.Compare ((decimal) e1.Value, (decimal) e2.Value);
3205 int i1 = (int) e1.Key;
3206 int i2 = (int) e2.Key;
// Compares DictionaryEntry items whose Value is a string and whose Key is an int.
// The values are compared first; the keys are read afterwards (presumably as a
// tie-breaker -- confirm).
3211 class StringDictionaryValueComparer : IComparer
3213 public static readonly StringDictionaryValueComparer Instance
3214 = new StringDictionaryValueComparer ();
// Private constructor: use Instance.
3216 private StringDictionaryValueComparer ()
3220 public int Compare (object o1, object o2)
3222 DictionaryEntry e1 = (DictionaryEntry) o1;
3223 DictionaryEntry e2 = (DictionaryEntry) o2;
// NOTE(review): this String.Compare overload is culture-sensitive, so the order
// depends on the current culture of the process running the generator.
3224 int ret = String.Compare ((string) e1.Value, (string) e2.Value);
3227 int i1 = (int) e1.Key;
3228 int i2 = (int) e2.Key;
// Compares two characters (boxed char) by the sort keys that CollationElementTable
// reports for them, element by element and level by level.
3233 class UCAComparer : IComparer
3235 public static readonly UCAComparer Instance
3236 = new UCAComparer ();
// Private constructor: use Instance.
3238 private UCAComparer ()
3242 public int Compare (object o1, object o2)
3244 char i1 = (char) o1;
3245 char i2 = (char) o2;
// Only the elements both characters have are compared: l is the smaller count.
3247 int l1 = CollationElementTable.GetSortKeyCount (i1);
3248 int l2 = CollationElementTable.GetSortKeyCount (i2);
3249 int l = l1 > l2 ? l2 : l1;
3251 for (int i = 0; i < l; i++) {
3252 SortKeyValue k1 = CollationElementTable.GetSortKey (i1, i);
3253 SortKeyValue k2 = CollationElementTable.GetSortKey (i2, i);
// Differences are taken in level order: primary, secondary, tertiary, quaternary.
3254 int v = k1.Primary - k2.Primary;
3257 v = k1.Secondary - k2.Secondary;
3260 v = k1.Thirtiary - k2.Thirtiary;
3263 v = k1.Quarternary - k2.Quarternary;
// Members of Tailoring: a list of ITailoringMap items collected for one lcid,
// optionally tied to an alias lcid, plus a FrenchSort flag.
// Tailoring items in insertion order (DiacriticalMap, SortKeyMap, ReplacementMap).
3276 ArrayList items = new ArrayList ();
3278 public Tailoring (int lcid)
3283 public Tailoring (int lcid, int alias)
3290 get { return lcid; }
3294 get { return alias; }
3297 public bool FrenchSort {
3298 get { return frenchSort; }
3299 set { frenchSort = value; }
// Appends a DiacriticalMap (target, replace) item.
3302 public void AddDiacriticalMap (byte target, byte replace)
3304 items.Add (new DiacriticalMap (target, replace));
// Appends a SortKeyMap (source, sortkey) item.
3307 public void AddSortKeyMap (string source, byte [] sortkey)
3309 items.Add (new SortKeyMap (source, sortkey));
// Appends a ReplacementMap (source, replace) item.
3312 public void AddReplacementMap (string source, string replace)
3314 items.Add (new ReplacementMap (source, replace));
// Concatenates ToCharArray () of every item, in insertion order, into one char array.
3317 public char [] ItemToCharArray ()
3319 ArrayList al = new ArrayList ();
3320 foreach (ITailoringMap m in items)
3321 al.AddRange (m.ToCharArray ());
3322 return al.ToArray (typeof (char)) as char [];
// A tailoring item that can serialize itself to a char array. The implementations
// in this file put a kind code in element 0 (1 = SortKeyMap, 2 = DiacriticalMap,
// 3 = ReplaceMap).
3325 interface ITailoringMap
3327 char [] ToCharArray ();
// Tailoring item holding a (Target, Replace) byte pair.
3330 class DiacriticalMap : ITailoringMap
3332 public readonly byte Target;
3333 public readonly byte Replace;
3335 public DiacriticalMap (byte target, byte replace)
// Serialized form is three chars: kind code 2, Target, Replace.
3341 public char [] ToCharArray ()
3343 char [] ret = new char [3];
3344 ret [0] = (char) 02; // kind:DiacriticalMap
3345 ret [1] = (char) Target;
3346 ret [2] = (char) Replace;
// Tailoring item that assigns an explicit sort key (SortKey) to a source string.
3351 class SortKeyMap : ITailoringMap
3353 public readonly string Source;
3354 public readonly byte [] SortKey;
3356 public SortKeyMap (string source, byte [] sortkey)
// Serialized layout: [0] kind code 1, [1 .. Source.Length] the source characters,
// [Source.Length + 1] left at its default '\0', then SortKey values starting at
// [Source.Length + 2].
3362 public char [] ToCharArray ()
3364 char [] ret = new char [Source.Length + 7];
3365 ret [0] = (char) 01; // kind:SortKeyMap
3366 for (int i = 0; i < Source.Length; i++)
3367 ret [i + 1] = Source [i];
// NOTE(review): copying five values fills the buffer up to its last element and
// requires SortKey.Length >= 5; confirm that the reader of this table expects five
// sort key values with no trailing '\0'.
3369 for (int i = 0; i < 5; i++)
3370 ret [i + Source.Length + 2] = (char) SortKey [i];
3375 class ReplacementMap : ITailoringMap
3377 public readonly string Source;
3378 public readonly string Replace;
3380 public ReplacementMap (string source, string replace)
3386 public char [] ToCharArray ()
3388 char [] ret = new char [Source.Length + Replace.Length + 3];
3389 ret [0] = (char) 03; // kind:ReplaceMap
3391 for (int i = 0; i < Source.Length; i++)
3392 ret [pos++] = Source [i];
3395 for (int i = 0; i < Replace.Length; i++)
3396 ret [pos++] = Replace [i];