3 // There are two kinds of sort keys: those which are computed and those which are laid out
4 // as an indexed array. Computed sort keys are:
9 // Also, a different index table should be prepared for composite characters.
11 // Though it is possible to "compute" level 3 weights, they are still dumped
12 // to an array to avoid execution cost.
16 // * sortkey getter signature
18 // int GetSortKey (string s, int index, SortKeyBuffer buf)
19 // Stores sort key for corresponding character element into buf and
20 // returns the length of the consumed _source_ character element in s.
22 // * character length to consume
24 // If there are characters whose primary weight is 0, they are consumed
25 // and considered as a part of the character element.
30 using System.Collections;
31 using System.Globalization;
35 namespace Mono.Globalization.Unicode
37 internal class MSCompatSortKeyTableGenerator
// Program entry point: creates a generator instance and passes the
// command-line arguments to its Run method.
39 public static void Main (string [] args)
41 new MSCompatSortKeyTableGenerator ().Run (args);
44 const int DecompositionWide = 1; // fixed
45 const int DecompositionSub = 2; // fixed
46 const int DecompositionSmall = 3;
47 const int DecompositionIsolated = 4;
48 const int DecompositionInitial = 5;
49 const int DecompositionFinal = 6;
50 const int DecompositionMedial = 7;
51 const int DecompositionNoBreak = 8;
52 const int DecompositionVertical = 9;
53 const int DecompositionFraction = 0xA;
54 const int DecompositionFont = 0xB;
55 const int DecompositionSuper = 0xC; // fixed
56 const int DecompositionFull = 0xE;
57 const int DecompositionNarrow = 0xD;
58 const int DecompositionCircle = 0xF;
59 const int DecompositionSquare = 0x10;
60 const int DecompositionCompat = 0x11;
61 const int DecompositionCanonical = 0x12;
63 TextWriter Result = Console.Out;
65 byte [] fillIndex = new byte [256]; // by category
66 CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1];
68 char [] specialIgnore = new char [] {
69 '\u3099', '\u309A', '\u309B', '\u309C', '\u0BCD',
70 '\u0E47', '\u0E4C', '\uFF9E', '\uFF9F'
73 // FIXME: need more love (as always)
74 char [] alphabets = new char [] {'A', 'B', 'C', 'D', 'E', 'F',
75 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
76 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
77 '\u0292', '\u01BE', '\u0298'};
78 byte [] alphaWeights = new byte [] {
79 2, 9, 0xA, 0x1A, 0x21,
80 0x23, 0x25, 0x2C, 0x32, 0x35,
81 0x36, 0x48, 0x51, 0x70, 0x7C,
82 0x7E, 0x89, 0x8A, 0x91, 0x99,
83 0x9F, 0xA2, 0xA4, 0xA6, 0xA7,
84 0xA9, 0xAA, 0xB3, 0xB4};
86 bool [] isSmallCapital = new bool [char.MaxValue + 1];
87 bool [] isUppercase = new bool [char.MaxValue + 1];
89 byte [] decompType = new byte [char.MaxValue + 1];
90 int [] decompIndex = new int [char.MaxValue + 1];
91 int [] decompLength = new int [char.MaxValue + 1];
93 decimal [] decimalValue = new decimal [char.MaxValue + 1];
95 byte [] diacritical = new byte [char.MaxValue + 1];
97 string [] diacritics = new string [] {
99 "WITH ACUTE;", "WITH GRAVE;", " DOT ABOVE;", " MIDDLE DOT;",
100 "WITH CIRCUMFLEX;", "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;",
101 " DIALYTIKA AND TONOS;", "WITH MACRON;", "WITH TILDE;", " RING ABOVE;",
102 " OGONEK;", " CEDILLA;",
103 " DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;",
104 " STROKE;", " CIRCUMFLEX AND ACUTE;",
105 " DIAERESIS AND ACUTE;", "WITH CIRCUMFLEX AND GRAVE;", " L SLASH;",
106 " DIAERESIS AND GRAVE;",
108 " CARON AND DOT ABOVE;", " BREVE AND GRAVE;",
109 " MACRON AND ACUTE;",
110 " MACRON AND GRAVE;",
111 " DIAERESIS AND CARON", " DOT ABOVE AND MACRON", " TILDE AND ACUTE",
112 " RING ABOVE AND ACUTE",
113 " DIAERESIS AND MACRON", " CEDILLA AND ACUTE", " MACRON AND DIAERESIS",
114 " CIRCUMFLEX AND TILDE",
115 " TILDE AND DIAERESIS",
118 " CEDILLA AND BREVE",
119 " OGONEK AND MACRON",
120 " HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;",
123 " PRECEDED BY APOSTROPHE",
125 " LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE",
128 " RETROFLEX;", "DIAERESIS BELOW",
130 " CIRCUMFLEX BELOW", "HORN AND ACUTE",
131 " BREVE BELOW;", " HORN AND GRAVE",
133 " DOT BELOW AND DOT ABOVE",
134 " RIGHT HALF RING", " HORN AND TILDE",
135 " CIRCUMFLEX AND DOT BELOW",
136 " BREVE AND DOT BELOW",
137 " DOT BELOW AND MACRON",
138 " HORN AND HOOK ABOVE",
140 // CIRCLED, PARENTHESIZED and so on
141 "CIRCLED DIGIT", "CIRCLED NUMBER", "CIRCLED LATIN", "CIRCLED KATAKANA",
142 "PARENTHESIZED DIGIT", "PARENTHESIZED NUMBER", "PARENTHESIZED LATIN",
144 byte [] diacriticWeights = new byte [] {
146 0xE, 0xF, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
147 0x17, 0x19, 0x1A, 0x1B, 0x1C,
148 0x1D, 0x1D, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F,
149 0x20, 0x21, 0x22, 0x22, 0x23, 0x24,
150 0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28,
151 0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30,
152 0x43, 0x43, 0x43, 0x44, 0x46, 0x48,
153 0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x5A,
154 0x60, 0x60, 0x61, 0x61, 0x63, 0x68,
155 0x69, 0x69, 0x6A, 0x6D, 0x6E,
157 // CIRCLED, PARENTHESIZED and so on.
158 0xEE, 0xEE, 0xEE, 0xEE, 0xF3, 0xF3, 0xF3, 0xF3
161 int [] numberSecondaryWeightBounds = new int [] {
162 0x660, 0x680, 0x6F0, 0x700, 0x960, 0x970,
163 0x9E0, 0x9F0, 0x9F4, 0xA00, 0xA60, 0xA70,
164 0xAE0, 0xAF0, 0xB60, 0xB70, 0xBE0, 0xC00,
165 0xC60, 0xC70, 0xCE0, 0xCF0, 0xD60, 0xD70,
166 0xE50, 0xE60, 0xED0, 0xEE0
169 char [] orderedCyrillic;
170 char [] orderedGurmukhi;
171 char [] orderedGujarati;
172 char [] orderedGeorgian;
173 char [] orderedThaana;
175 static readonly char [] orderedTamilConsonants = new char [] {
176 // based on traditional Tamil consonants, except for
177 // Grantha (where Microsoft breaks traditionalism).
178 // http://www.angelfire.com/empire/thamizh/padanGaL
179 '\u0B99', '\u0B9A', '\u0B9E', '\u0B9F', '\u0BA3',
180 '\u0BA4', '\u0BA8', '\u0BAA', '\u0BAE', '\u0BAF',
181 '\u0BB0', '\u0BB2', '\u0BB5', '\u0BB4', '\u0BB3',
182 '\u0BB1', '\u0BA9', '\u0B9C', '\u0BB8', '\u0BB7',
185 // cp -> character name (only for some characters)
186 ArrayList sortableCharNames = new ArrayList ();
188 // cp -> arrow value (int)
189 ArrayList arrowValues = new ArrayList ();
191 // cp -> box value (int)
192 ArrayList boxValues = new ArrayList ();
194 // cp -> level1 value
195 Hashtable arabicLetterPrimaryValues = new Hashtable ();
198 Hashtable arabicNameMap = new Hashtable ();
200 // cp -> Hashtable [decompType] -> cp
201 Hashtable nfkdMap = new Hashtable ();
203 // Latin letter -> ArrayList [int]
204 Hashtable latinMap = new Hashtable ();
206 ArrayList jisJapanese = new ArrayList ();
207 ArrayList nonJisJapanese = new ArrayList ();
209 ushort [] cjkJA = new ushort [char.MaxValue +1];// - 0x4E00];
210 ushort [] cjkCHS = new ushort [char.MaxValue +1];// - 0x3100];
211 ushort [] cjkCHT = new ushort [char.MaxValue +1];// - 0x4E00];
212 ushort [] cjkKO = new ushort [char.MaxValue +1];// - 0x4E00];
213 byte [] cjkKOlv2 = new byte [char.MaxValue +1];// - 0x4E00];
215 byte [] ignorableFlags = new byte [char.MaxValue + 1];
217 static double [] unicodeAge = new double [char.MaxValue + 1];
219 ArrayList tailorings = new ArrayList ();
// Drives the tool. Parses the source data found under the directory named
// by args [0] (or "downloaded" when no argument is given), post-processes
// the parsed values, reports progress on stderr, and finally writes a
// per-code-point diagnostic log to "agelog.txt".
221 void Run (string [] args)
223 string dirname = args.Length == 0 ? "downloaded" : args [0];
224 ParseSources (dirname);
225 Console.Error.WriteLine ("parse done.");
227 ModifyParsedValues ();
229 Console.Error.WriteLine ("generation done.");
231 Console.Error.WriteLine ("serialization done.");
// Diagnostic log: one line per code point with its Unicode age, the
// ignorable flags, the Unicode category, and a '!' marker when
// IsIgnorable disagrees with the category-based expectation (shouldBe).
// NOTE(review): confirm that sw is closed/disposed so the log is flushed.
233 StreamWriter sw = new StreamWriter ("agelog.txt");
// NOTE(review): `i < char.MaxValue` stops before U+FFFF, whereas
// FillIgnorables iterates with `i <= char.MaxValue` - confirm intent.
234 for (int i = 0; i < char.MaxValue; i++) {
235 bool shouldBe = false;
// Format and unassigned code points are the ones expected to be ignorable.
236 switch (Char.GetUnicodeCategory ((char) i)) {
237 case UnicodeCategory.Format: case UnicodeCategory.OtherNotAssigned:
238 shouldBe = true; break;
240 if (unicodeAge [i] >= 3.1)
242 //if (IsIgnorable (i) != shouldBe)
243 sw.WriteLine ("{1} {2} {3} {0:X04} {4} {5}", i, unicodeAge [i], IsIgnorable (i), IsIgnorableSymbol (i), char.GetUnicodeCategory ((char) i), IsIgnorable (i) != shouldBe ? '!' : ' ');
// Typed convenience wrapper for byte tables: forwards to
// CodePointIndexer.CompressArray with typeof (byte) and the given indexer,
// then casts the result back to byte [].
249 byte [] CompressArray (byte [] source, CodePointIndexer i)
251 return (byte []) CodePointIndexer.CompressArray (
252 source, typeof (byte), i);
// Typed convenience wrapper for ushort tables: forwards to
// CodePointIndexer.CompressArray with typeof (ushort) and the given indexer,
// then casts the result back to ushort [].
255 ushort [] CompressArray (ushort [] source, CodePointIndexer i)
257 return (ushort []) CodePointIndexer.CompressArray (
258 source, typeof (ushort), i);
264 SerializeTailorings ();
266 byte [] categories = new byte [map.Length];
267 byte [] level1 = new byte [map.Length];
268 byte [] level2 = new byte [map.Length];
269 byte [] level3 = new byte [map.Length];
270 int [] widthCompat = new int [map.Length];
271 for (int i = 0; i < map.Length; i++) {
272 categories [i] = map [i].Category;
273 level1 [i] = map [i].Level1;
274 level2 [i] = map [i].Level2;
275 level3 [i] = ComputeLevel3Weight ((char) i);
276 switch (decompType [i]) {
277 case DecompositionNarrow:
278 case DecompositionWide:
279 case DecompositionSuper:
280 case DecompositionSub:
281 // they are always 1 char
282 widthCompat [i] = decompValues [decompIndex [i]];
288 ignorableFlags = CompressArray (ignorableFlags,
289 MSCompatUnicodeTableUtil.Ignorable);
290 categories = CompressArray (categories,
291 MSCompatUnicodeTableUtil.Category);
292 level1 = CompressArray (level1,
293 MSCompatUnicodeTableUtil.Level1);
294 level2 = CompressArray (level2,
295 MSCompatUnicodeTableUtil.Level2);
296 level3 = CompressArray (level3,
297 MSCompatUnicodeTableUtil.Level3);
298 widthCompat = (int []) CodePointIndexer.CompressArray (
299 widthCompat, typeof (int),
300 MSCompatUnicodeTableUtil.WidthCompat);
301 cjkCHS = CompressArray (cjkCHS,
302 MSCompatUnicodeTableUtil.CjkCHS);
303 cjkCHT = CompressArray (cjkCHT,
304 MSCompatUnicodeTableUtil.Cjk);
305 cjkJA = CompressArray (cjkJA,
306 MSCompatUnicodeTableUtil.Cjk);
307 cjkKO = CompressArray (cjkKO,
308 MSCompatUnicodeTableUtil.Cjk);
309 cjkKOlv2 = CompressArray (cjkKOlv2,
310 MSCompatUnicodeTableUtil.Cjk);
313 Result.WriteLine ("static byte [] ignorableFlags = new byte [] {");
314 for (int i = 0; i < ignorableFlags.Length; i++) {
315 byte value = ignorableFlags [i];
317 Result.Write ("{0},", value);
319 Result.Write ("0x{0:X02},", value);
320 if ((i & 0xF) == 0xF)
321 Result.WriteLine ("// {0:X04}", i - 0xF);
323 Result.WriteLine ("};");
327 Result.WriteLine ("static byte [] categories = new byte [] {");
328 for (int i = 0; i < categories.Length; i++) {
329 byte value = categories [i];
331 Result.Write ("{0},", value);
333 Result.Write ("0x{0:X02},", value);
334 if ((i & 0xF) == 0xF)
335 Result.WriteLine ("// {0:X04}", i - 0xF);
337 Result.WriteLine ("};");
340 // Primary weight value
341 Result.WriteLine ("static byte [] level1 = new byte [] {");
342 for (int i = 0; i < level1.Length; i++) {
343 byte value = level1 [i];
345 Result.Write ("{0},", value);
347 Result.Write ("0x{0:X02},", value);
348 if ((i & 0xF) == 0xF)
349 Result.WriteLine ("// {0:X04}", i - 0xF);
351 Result.WriteLine ("};");
355 Result.WriteLine ("static byte [] level2 = new byte [] {");
356 for (int i = 0; i < level2.Length; i++) {
357 int value = level2 [i];
359 Result.Write ("{0},", value);
361 Result.Write ("0x{0:X02},", value);
362 if ((i & 0xF) == 0xF)
363 Result.WriteLine ("// {0:X04}", i - 0xF);
365 Result.WriteLine ("};");
369 Result.WriteLine ("static byte [] level3 = new byte [] {");
370 for (int i = 0; i < level3.Length; i++) {
371 byte value = level3 [i];
373 Result.Write ("{0},", value);
375 Result.Write ("0x{0:X02},", value);
376 if ((i & 0xF) == 0xF)
377 Result.WriteLine ("// {0:X04}", i - 0xF);
379 Result.WriteLine ("};");
382 // Width insensitivity mappings
383 // (for now it is more lightweight than dumping the
384 // entire NFKD table).
385 Result.WriteLine ("static int [] widthCompat = new int [] {");
386 for (int i = 0; i < widthCompat.Length; i++) {
387 int value = widthCompat [i];
389 Result.Write ("{0},", value);
391 Result.Write ("0x{0:X02},", value);
392 if ((i & 0xF) == 0xF)
393 Result.WriteLine ("// {0:X04}", i - 0xF);
395 Result.WriteLine ("};");
399 SerializeCJK ("cjkCHS", cjkCHS, char.MaxValue);
400 SerializeCJK ("cjkCHT", cjkCHT, 0x9FB0);
401 SerializeCJK ("cjkJA", cjkJA, 0x9FB0);
402 SerializeCJK ("cjkKO", cjkKO, 0x9FB0);
403 SerializeCJK ("cjkKOlv2", cjkKOlv2, 0x9FB0);
// Writes a C# "static ushort []" array initializer called `name` to Result,
// emitting the elements of cjk and appending an index comment after every
// 16th element.
//   name - identifier used for the generated array.
//   cjk  - table to dump.
//   max  - index tested against i + offset inside the loop; presumably the
//          position at which dumping stops - TODO confirm.
406 void SerializeCJK (string name, ushort [] cjk, int max)
// offset is currently fixed at 0; the commented-out expression is the
// earlier formula.
408 int offset = 0;//char.MaxValue - cjk.Length;
409 Result.WriteLine ("static ushort [] {0} = new ushort [] {{", name);
410 for (int i = 0; i < cjk.Length; i++) {
411 if (i + offset == max)
413 ushort value = cjk [i];
415 Result.Write ("{0},", value);
417 Result.Write ("0x{0:X04},", value);
// End each row of 16 values with a comment giving the row's first index.
418 if ((i & 0xF) == 0xF)
419 Result.WriteLine ("// {0:X04}", i - 0xF + offset);
421 Result.WriteLine ("};");
// Byte-table counterpart of the ushort overload: writes a C#
// "static byte []" array initializer called `name` to Result, emitting the
// elements of cjk and appending an index comment after every 16th element.
//   name - identifier used for the generated array.
//   cjk  - table to dump.
//   max  - index tested against i + offset inside the loop; presumably the
//          position at which dumping stops - TODO confirm.
425 void SerializeCJK (string name, byte [] cjk, int max)
// offset is currently fixed at 0; the commented-out expression is the
// earlier formula.
427 int offset = 0;//char.MaxValue - cjk.Length;
428 Result.WriteLine ("static byte [] {0} = new byte [] {{", name);
429 for (int i = 0; i < cjk.Length; i++) {
430 if (i + offset == max)
432 byte value = cjk [i];
434 Result.Write ("{0},", value);
436 Result.Write ("0x{0:X02},", value);
// End each row of 16 values with a comment giving the row's first index.
437 if ((i & 0xF) == 0xF)
438 Result.WriteLine ("// {0:X04}", i - 0xF + offset);
440 Result.WriteLine ("};");
// Emits two generated arrays to Result:
//   1. "tailorings"     - the char data of every Tailoring, concatenated;
//   2. "tailoringInfos" - one TailoringInfo per Tailoring giving its LCID,
//      the start index and length of its data in "tailorings", and its
//      French-sort flag.
444 void SerializeTailorings ()
// LCID -> start offset into the concatenated char array.
446 Hashtable indexes = new Hashtable ();
// LCID -> number of chars contributed by that tailoring.
447 Hashtable counts = new Hashtable ();
448 Result.WriteLine ("static char [] tailorings = new char [] {");
450 foreach (Tailoring t in tailorings) {
453 Result.Write ("/*{0}*/", t.LCID);
454 indexes.Add (t.LCID, count);
455 char [] values = t.ItemToCharArray ();
456 counts.Add (t.LCID, values.Length);
457 foreach (char c in values) {
458 Result.Write ("'\\x{0:X}', ", (int) c);
// Break the output every 16 chars and note the row's first index.
459 if (++count % 16 == 0)
460 Result.WriteLine (" // {0:X04}", count - 16);
463 Result.WriteLine ("};");
465 Result.WriteLine ("static TailoringInfo [] tailoringInfos = new TailoringInfo [] {");
466 foreach (Tailoring t in tailorings) {
// An alias entry reuses the data (index and count) of the LCID it points to.
467 int target = t.Alias != 0 ? t.Alias : t.LCID;
468 if (!indexes.ContainsKey (target)) {
469 Console.Error.WriteLine ("WARNING: no corresponding definition for tailoring alias. From {0} to {1}", t.LCID, t.Alias);
472 int idx = (int) indexes [target];
473 int cnt = (int) counts [target];
474 bool french = t.FrenchSort;
// NOTE(review): this search matches t2.LCID against t.LCID (the entry
// itself) rather than `target`; if the intent is to inherit FrenchSort
// from the alias target, the comparison looks wrong - confirm.
476 foreach (Tailoring t2 in tailorings)
477 if (t2.LCID == t.LCID)
478 french = t2.FrenchSort;
479 Result.WriteLine ("new TailoringInfo ({0}, 0x{1:X}, {2}, {3}), ", t.LCID, idx, cnt, french ? "true" : "false");
481 Result.WriteLine ("};");
// Builds the paths of all input data files below dirname and runs the
// individual parsers over them.
//   dirname - directory containing the Unicode data files and the
//             "common/collation" XML files.
486 void ParseSources (string dirname)
489 dirname + "/UnicodeData.txt";
490 string derivedCoreProps =
491 dirname + "/DerivedCoreProperties.txt";
493 dirname + "/Scripts.txt";
495 dirname + "/CP932.TXT";
497 dirname + "/DerivedAge.txt";
498 string chXML = dirname + "/common/collation/zh.xml";
499 string jaXML = dirname + "/common/collation/ja.xml";
500 string koXML = dirname + "/common/collation/ko.xml";
502 ParseDerivedAge (derivedAge);
506 ParseJISOrder (cp932); // must run before ParseUnidata()
507 ParseUnidata (unidata);
508 ParseDerivedCoreProperties (derivedCoreProps);
509 ParseScripts (scripts);
510 ParseCJK (chXML, jaXML, koXML);
// The tailoring source is a fixed relative path, i.e. it is not looked up
// under dirname.
512 ParseTailorings ("mono-tailoring-source.txt");
// Reads the tailoring source file line by line and feeds each trimmed line
// to ProcessTailoringLine, which updates the current Tailoring (t) by
// reference. When processing throws, the offending line number is reported
// on stderr.
// NOTE(review): confirm whether the exception is rethrown after logging.
515 void ParseTailorings (string filename)
519 using (StreamReader sr = new StreamReader (filename)) {
521 while (sr.Peek () >= 0) {
523 ProcessTailoringLine (ref t,
524 sr.ReadLine ().Trim ());
526 } catch (Exception) {
527 Console.Error.WriteLine ("ERROR at line {0}", line);
533 // For now this is enough.
// Decodes a tailoring source token into a string: a "\uXXXX" escape (four
// hex digits) is converted to the corresponding char and appended to the
// result, which is returned as a string.
// NOTE(review): the escape test uses s.StartsWith and the fixed offsets
// 2..5 rather than the loop position i - verify that escapes which are not
// at the very start of s are decoded as intended.
534 string ParseTailoringSourceValue (string s)
536 StringBuilder sb = new StringBuilder ();
537 for (int i = 0; i < s.Length; i++) {
538 if (s.StartsWith ("\\u")) {
539 sb.Append ((char) int.Parse (
540 s.Substring (2, 4), NumberStyles.HexNumber),
547 return sb.ToString ();
// Interprets one line of the tailoring source and applies it to t.
//   t - the tailoring currently being built; replaced (by reference) when
//       the line starts a new tailoring.
//   s - the line text, already trimmed by the caller.
// Line forms recognised below: a header carrying an LCID (optionally with
// "=<number>"), "*FrenchSort", "*Diacritical <hex> -> <hex>",
// "<source> : <hex bytes>" and "<source> = <replacement>".
550 void ProcessTailoringLine (ref Tailoring t, string s)
// Strip a trailing '#' comment, then skip blank/comment-only lines.
552 int idx = s.IndexOf ('#');
554 s = s.Substring (0, idx).Trim ();
555 if (s.Length == 0 || s [0] == '#')
// Header line: the first character is skipped and the rest parsed as an
// integer LCID; with '=', two integers are parsed (presumably LCID and
// the alias LCID - TODO confirm).
558 idx = s.IndexOf ('=');
561 int.Parse (s.Substring (1, idx - 1)),
562 int.Parse (s.Substring (idx + 1)));
564 t = new Tailoring (int.Parse (s.Substring (1)));
568 if (s.StartsWith ("*FrenchSort")) {
// "*Diacritical XX -> YY": both sides are parsed as hex bytes.
572 string d = "*Diacritical";
573 if (s.StartsWith (d)) {
574 idx = s.IndexOf ("->");
575 t.AddDiacriticalMap (
576 byte.Parse (s.Substring (d.Length, idx - d.Length).Trim (),
577 NumberStyles.HexNumber),
578 byte.Parse (s.Substring (idx + 2).Trim (),
579 NumberStyles.HexNumber));
// "<source> : b0 b1 ...": up to five space-separated hex bytes form the
// sort key registered for the decoded source string.
582 idx = s.IndexOf (':');
584 string source = s.Substring (0, idx).Trim ();
585 string [] l = s.Substring (idx + 1).Trim ().Split (' ');
586 byte [] b = new byte [5];
587 for (int i = 0; i < 5; i++) {
591 b [i] = byte.Parse (l [i],
592 NumberStyles.HexNumber);
594 t.AddSortKeyMap (ParseTailoringSourceValue (source),
// "<source> = <replacement>": both sides are decoded with
// ParseTailoringSourceValue and registered as a replacement mapping.
597 idx = s.IndexOf ('=');
599 t.AddReplacementMap (
600 ParseTailoringSourceValue (
601 s.Substring (0, idx).Trim ()),
602 ParseTailoringSourceValue (
603 s.Substring (idx + 1).Trim ()));
// Reads DerivedAge.txt and records, for every code point in each listed
// range, the age value from that line in the static unicodeAge table.
// Each data line has the form "<cp>[..<cpEnd>] ; <value>" with an optional
// '#' comment.
606 void ParseDerivedAge (string filename)
608 using (StreamReader file =
609 new StreamReader (filename)) {
610 while (file.Peek () >= 0) {
611 string s = file.ReadLine ();
// Drop the trailing '#' comment, then split at the first ';'.
612 int idx = s.IndexOf ('#');
614 s = s.Substring (0, idx);
615 idx = s.IndexOf (';');
// cpspec is either a single hex code point or a "start..end" range.
619 string cpspec = s.Substring (0, idx);
620 idx = cpspec.IndexOf ("..");
621 NumberStyles nf = NumberStyles.HexNumber |
622 NumberStyles.AllowTrailingWhite;
623 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
624 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
625 string value = s.Substring (cpspec.Length + 1).Trim ();
// Only code points representable as a char are tracked.
628 if (cp > char.MaxValue)
// NOTE(review): double.Parse (value) uses the current culture; a value
// such as "3.1" would be misparsed under a comma-decimal culture -
// consider CultureInfo.InvariantCulture.
631 for (int i = cp; i <= cpEnd; i++)
632 unicodeAge [i] = double.Parse (value);
// U+0000 gets the maximum age so that it never counts as supported.
635 unicodeAge [0] = double.MaxValue; // never be supported
// Reads UnicodeData.txt line by line through ProcessUnidataLine, collecting
// decomposition code points in a temporary list that is finally stored in
// the decompValues field as an int []. When a line throws, its 1-based line
// number is reported on stderr.
// NOTE(review): confirm whether the exception is rethrown after logging.
638 void ParseUnidata (string filename)
// Local list; shadows the field of the same name until it is copied below.
640 ArrayList decompValues = new ArrayList ();
641 using (StreamReader unidata =
642 new StreamReader (filename)) {
643 for (int line = 1; unidata.Peek () >= 0; line++) {
645 ProcessUnidataLine (unidata.ReadLine (), decompValues);
646 } catch (Exception) {
647 Console.Error.WriteLine ("**** At line " + line);
652 this.decompValues = (int [])
653 decompValues.ToArray (typeof (int));
// Processes one line of UnicodeData.txt and fills the per-code-point tables
// derived from it: small-capital flags, the Latin letter map, arrow and box
// values, sortable character names, diacritical weights, Arabic letter
// primary values, non-JIS Japanese square letters, decomposition
// type/index/length (plus the reverse nfkdMap) and numeric values.
//   s            - the raw line; fields are ';'-separated, the first being
//                  the hex code point.
//   decompValues - shared list to which decomposition code points are
//                  appended; decompIndex [cp] is the start position in it.
// Note that most name tests search the whole line s (IndexOf (...) > 0),
// not just the name field.
656 void ProcessUnidataLine (string s, ArrayList decompValues)
// Drop a trailing '#' comment, then split off the code point.
658 int idx = s.IndexOf ('#');
660 s = s.Substring (0, idx);
661 idx = s.IndexOf (';');
664 int cp = int.Parse (s.Substring (0, idx), NumberStyles.HexNumber);
// values [0] is the character name; later indices are the following fields.
665 string [] values = s.Substring (idx + 1).Split (';');
668 if (cp > char.MaxValue)
670 if (IsIgnorable (cp))
673 string name = values [0];
676 if (s.IndexOf ("SMALL CAPITAL") > 0)
677 isSmallCapital [cp] = true;
679 // latin mapping by character name
680 if (s.IndexOf ("LATIN") > 0) {
681 int lidx = s.IndexOf ("LETTER DOTLESS ");
682 int offset = lidx + 15;
684 lidx = s.IndexOf ("LETTER TURNED ");
688 lidx = s.IndexOf ("LETTER ");
// c is the base letter following the matched "LETTER ..." phrase; it is
// accepted only when it is a single A-Z letter (end of line or followed
// by a space).
691 char c = lidx > 0 ? s [offset] : char.MinValue;
692 if ('A' <= c && c <= 'Z' &&
693 (s.Length == offset + 1 || s [offset + 1] == ' ')) {
694 ArrayList entry = (ArrayList) latinMap [c];
696 entry = new ArrayList ();
697 latinMap [c] = entry;
// Symbols in U+2000..U+2FFF: a value is taken from the hard-coded
// overrides below or derived from the name via arrowTargets, and is
// recorded in arrowValues.
704 if (0x2000 <= cp && cp < 0x3000) {
706 // SPECIAL CASES. FIXME: why?
708 case 0x21C5: value = -1; break; // E2
709 case 0x261D: value = 1; break;
710 case 0x27A6: value = 3; break;
711 case 0x21B0: value = 7; break;
712 case 0x21B1: value = 3; break;
713 case 0x21B2: value = 7; break;
714 case 0x21B4: value = 5; break;
715 case 0x21B5: value = 7; break;
716 case 0x21B9: value = -1; break; // E1
717 case 0x21CF: value = 7; break;
718 case 0x21D0: value = 3; break;
720 string [] arrowTargets = new string [] {
732 for (int i = 1; value == 0 && i < arrowTargets.Length; i++)
733 if (s.IndexOf (arrowTargets [i]) > 0 &&
734 s.IndexOf ("BARB " + arrowTargets [i]) < 0 &&
735 s.IndexOf (" OVER") < 0
739 arrowValues.Add (new DictionaryEntry (
// Box drawing / block elements in U+2500..U+25AF: a value is computed from
// direction keywords (box drawings) or fraction keywords (blocks) in the
// name, and is recorded in boxValues.
744 if (0x2500 <= cp && cp < 0x25B0) {
747 // up:1 down:2 right:4 left:8 vert:16 horiz:32
750 // [dr] [dl] [ur] [ul]
754 ArrayList flags = new ArrayList (new int [] {
757 4 + 2, 8 + 2, 4 + 1, 8 + 1,
758 16 + 4, 1 + 2 + 4, 16 + 8, 1 + 2 + 8,
759 32 + 2, 4 + 8 + 2, 32 + 1, 4 + 8 + 1,
760 16 + 32, 1 + 2 + 4 + 8, 4 + 8 + 16, 1 + 2 + 32
762 byte [] offsets = new byte [] {
769 if (s.IndexOf ("BOX DRAWINGS ") > 0) {
771 if (s.IndexOf (" UP") > 0)
773 if (s.IndexOf (" DOWN") > 0)
775 if (s.IndexOf (" RIGHT") > 0)
777 if (s.IndexOf (" LEFT") > 0)
779 if (s.IndexOf (" VERTICAL") > 0)
781 if (s.IndexOf (" HORIZONTAL") > 0)
// Map the combined direction flag to its offset; an unknown combination
// keeps the negative IndexOf result as the value.
784 int fidx = flags.IndexOf (flag);
785 value = fidx < 0 ? fidx : offsets [fidx];
786 } else if (s.IndexOf ("BLOCK") > 0) {
787 if (s.IndexOf ("ONE EIGHTH") > 0)
789 else if (s.IndexOf ("ONE QUARTER") > 0)
791 else if (s.IndexOf ("THREE EIGHTHS") > 0)
793 else if (s.IndexOf ("HALF") > 0)
795 else if (s.IndexOf ("FIVE EIGHTHS") > 0)
797 else if (s.IndexOf ("THREE QUARTERS") > 0)
799 else if (s.IndexOf ("SEVEN EIGHTHS") > 0)
805 boxValues.Add (new DictionaryEntry (
809 // For some characters store the name and sort later
810 // to determine sorting.
811 if (0x2100 <= cp && cp <= 0x213F &&
812 Char.IsSymbol ((char) cp))
813 sortableCharNames.Add (
814 new DictionaryEntry (cp, values [0]));
// For U+3380..U+33DD the first 7 characters of the name are dropped before
// the name is stored.
815 else if (0x3380 <= cp && cp <= 0x33DD)
816 sortableCharNames.Add (new DictionaryEntry (
817 cp, values [0].Substring (7)));
819 // diacritical weights by character name
// Every matching name fragment ORs its weight into diacritical [cp].
820 for (int d = 0; d < diacritics.Length; d++)
821 if (s.IndexOf (diacritics [d]) > 0)
822 diacritical [cp] |= diacriticWeights [d];
823 // Two-step grep required for it.
824 if (s.IndexOf ("FULL STOP") > 0 &&
825 (s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0))
826 diacritical [cp] |= 0xF4;
828 // Arabic letter name
829 if (0x0621 <= cp && cp <= 0x064A &&
830 Char.GetUnicodeCategory ((char) cp)
831 == UnicodeCategory.OtherLetter) {
// Default primary value: derived from the number of distinct letter names
// seen so far.
832 byte value = (byte) (arabicNameMap.Count * 4 + 0x0B);
837 // hamza, waw, yeh ... special cases.
842 value = 0x77; // special cases.
845 // Get primary letter name i.e.
846 // XXX part of ARABIC LETTER XXX yyy
847 // e.g. that of "TEH MARBUTA" is "TEH".
850 // 0x0640 is special: it does
851 // not start with ARABIC LETTER
853 values [0].Substring (14);
854 int tmpIdx = letterName.IndexOf (' ');
855 letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
856 //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
// A letter whose primary name was already seen reuses that letter's value.
857 if (arabicNameMap.ContainsKey (letterName))
858 value = (byte) arabicLetterPrimaryValues [arabicNameMap [letterName]];
860 arabicNameMap [letterName] = cp;
863 arabicLetterPrimaryValues [cp] = value;
866 // Japanese square letter
867 if (0x3300 <= cp && cp <= 0x3357)
869 nonJisJapanese.Add (new NonJISCharacter (cp, values [0]));
// Decomposition field: an optional "<tag>" selects the decomposition type;
// without a tag the decomposition is recorded as canonical.
872 string decomp = values [4];
873 idx = decomp.IndexOf ('<');
// NOTE(review): the length argument IndexOf ('>') - 1 equals the tag length
// only when '<' is at index 0 - confirm that always holds for this field.
875 switch (decomp.Substring (idx + 1, decomp.IndexOf ('>') - 1)) {
877 decompType [cp] = DecompositionFull;
880 decompType [cp] = DecompositionSub;
883 decompType [cp] = DecompositionSuper;
886 decompType [cp] = DecompositionSmall;
889 decompType [cp] = DecompositionIsolated;
892 decompType [cp] = DecompositionInitial;
895 decompType [cp] = DecompositionFinal;
898 decompType [cp] = DecompositionMedial;
901 decompType [cp] = DecompositionNoBreak;
904 decompType [cp] = DecompositionCompat;
907 decompType [cp] = DecompositionFraction;
910 decompType [cp] = DecompositionFont;
913 decompType [cp] = DecompositionCircle;
916 decompType [cp] = DecompositionSquare;
919 decompType [cp] = DecompositionWide;
922 decompType [cp] = DecompositionNarrow;
925 decompType [cp] = DecompositionVertical;
928 throw new Exception ("Support NFKD type : " + decomp);
932 decompType [cp] = DecompositionCanonical;
// Strip the "<tag> " prefix (if any), leaving the space-separated hex list.
933 decomp = idx < 0 ? decomp : decomp.Substring (decomp.IndexOf ('>') + 2);
934 if (decomp.Length > 0) {
936 string [] velems = decomp.Split (' ');
// The mapped code points are appended to the shared list; didx is where
// this character's run starts.
937 int didx = decompValues.Count;
938 decompIndex [cp] = didx;
939 foreach (string v in velems)
940 decompValues.Add (int.Parse (v, NumberStyles.HexNumber));
941 decompLength [cp] = velems.Length;
943 // [decompType] -> this_cp
944 int targetCP = (int) decompValues [didx];
945 // for "(x)" it specially maps to 'x' .
946 // FIXME: check if it is sane
947 if (velems.Length == 3 &&
948 (int) decompValues [didx] == '(' &&
949 (int) decompValues [didx + 2] == ')')
950 targetCP = (int) decompValues [didx + 1];
951 // special: 0x215F "1/"
952 else if (cp == 0x215F)
954 else if (velems.Length > 1 &&
955 (targetCP < 0x4C00 || 0x9FBB < targetCP))
956 // skip them, except for CJK ideograph compat
// Reverse map: target code point -> (decomposition type -> this cp).
960 Hashtable entry = (Hashtable) nfkdMap [targetCP];
962 entry = new Hashtable ();
963 nfkdMap [targetCP] = entry;
965 entry [(byte) decompType [cp]] = cp;
// Numeric value: the first non-empty of values [5], [6], [7] is stored in
// decimalValue [cp]; values [7] may be a fraction "a/b", parenthesized, or
// carry a trailing '.'.
// NOTE(review): decimal.Parse without a format provider uses the current
// culture - consider CultureInfo.InvariantCulture for this data file.
969 if (values [5].Length > 0)
970 decimalValue [cp] = decimal.Parse (values [5]);
971 else if (values [6].Length > 0)
972 decimalValue [cp] = decimal.Parse (values [6]);
973 else if (values [7].Length > 0) {
974 string decstr = values [7];
975 idx = decstr.IndexOf ('/');
976 if (cp == 0x215F) // special. "1/"
977 decimalValue [cp] = 0x1;
981 decimal.Parse (decstr.Substring (0, idx))
982 / decimal.Parse (decstr.Substring (idx + 1));
983 else if (decstr [0] == '(' &&
984 decstr [decstr.Length - 1] == ')')
987 decimal.Parse (decstr.Substring (1, decstr.Length - 2));
988 else if (decstr [decstr.Length - 1] == '.')
991 decimal.Parse (decstr.Substring (0, decstr.Length - 1));
993 decimalValue [cp] = decimal.Parse (decstr);
// Reads DerivedCoreProperties.txt line by line through
// ProcessDerivedCorePropLine. When a line throws, its 1-based line number
// is reported on stderr.
// NOTE(review): confirm whether the exception is rethrown after logging.
997 void ParseDerivedCoreProperties (string filename)
1000 using (StreamReader file =
1001 new StreamReader (filename)) {
1002 for (int line = 1; file.Peek () >= 0; line++) {
1004 ProcessDerivedCorePropLine (file.ReadLine ());
1005 } catch (Exception) {
1006 Console.Error.WriteLine ("**** At line " + line);
// Processes one "<cp>[..<cpEnd>] ; <property>" line of
// DerivedCoreProperties.txt and sets isUppercase for every code point in
// the range (presumably only when the property value selects uppercase -
// TODO confirm against the value check).
1013 void ProcessDerivedCorePropLine (string s)
// Drop the trailing '#' comment, then split at the first ';'.
1015 int idx = s.IndexOf ('#');
1017 s = s.Substring (0, idx);
1018 idx = s.IndexOf (';');
// cpspec is either a single hex code point or a "start..end" range.
1021 string cpspec = s.Substring (0, idx);
1022 idx = cpspec.IndexOf ("..");
1023 NumberStyles nf = NumberStyles.HexNumber |
1024 NumberStyles.AllowTrailingWhite;
1025 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1026 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
1027 string value = s.Substring (cpspec.Length + 1).Trim ();
// Only code points representable as a char are tracked.
1030 if (cp > char.MaxValue)
1035 for (int x = cp; x <= cpEnd; x++)
1036 isUppercase [x] = true;
// Reads Scripts.txt and collects the non-ignorable characters of five
// scripts (Cyrillic, Gurmukhi, Gujarati, Georgian, Thaana). Each list is
// then sorted with UCAComparer.Instance and stored in the matching
// ordered* field as a char [].
1041 void ParseScripts (string filename)
1043 ArrayList cyrillic = new ArrayList ();
1044 ArrayList gurmukhi = new ArrayList ();
1045 ArrayList gujarati = new ArrayList ();
1046 ArrayList georgian = new ArrayList ();
1047 ArrayList thaana = new ArrayList ();
1049 using (StreamReader file =
1050 new StreamReader (filename)) {
1051 while (file.Peek () >= 0) {
1052 string s = file.ReadLine ();
// Drop the trailing '#' comment, then split at the first ';'.
1053 int idx = s.IndexOf ('#');
1055 s = s.Substring (0, idx);
1056 idx = s.IndexOf (';');
// cpspec is either a single hex code point or a "start..end" range.
1060 string cpspec = s.Substring (0, idx);
1061 idx = cpspec.IndexOf ("..");
1062 NumberStyles nf = NumberStyles.HexNumber |
1063 NumberStyles.AllowTrailingWhite;
1064 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1065 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
1066 string value = s.Substring (cpspec.Length + 1).Trim ();
// Only code points representable as a char are tracked.
1069 if (cp > char.MaxValue)
// Each loop below adds the range's non-ignorable code points to one
// script's list (presumably selected by the script name in `value`).
1074 for (int x = cp; x <= cpEnd; x++)
1075 if (!IsIgnorable (x))
1076 cyrillic.Add ((char) x);
1079 for (int x = cp; x <= cpEnd; x++)
1080 if (!IsIgnorable (x))
1081 gurmukhi.Add ((char) x);
1084 for (int x = cp; x <= cpEnd; x++)
1085 if (!IsIgnorable (x))
1086 gujarati.Add ((char) x);
1089 for (int x = cp; x <= cpEnd; x++)
1090 if (!IsIgnorable (x))
1091 georgian.Add ((char) x);
1094 for (int x = cp; x <= cpEnd; x++)
1095 if (!IsIgnorable (x))
1096 thaana.Add ((char) x);
// Order every script's characters with the UCA comparer before storing.
1101 cyrillic.Sort (UCAComparer.Instance);
1102 gurmukhi.Sort (UCAComparer.Instance);
1103 gujarati.Sort (UCAComparer.Instance);
1104 georgian.Sort (UCAComparer.Instance);
1105 thaana.Sort (UCAComparer.Instance);
1106 orderedCyrillic = (char []) cyrillic.ToArray (typeof (char));
1107 orderedGurmukhi = (char []) gurmukhi.ToArray (typeof (char));
1108 orderedGujarati = (char []) gujarati.ToArray (typeof (char));
1109 orderedGeorgian = (char []) georgian.ToArray (typeof (char));
1110 orderedThaana = (char []) thaana.ToArray (typeof (char));
// Reads the CP932 mapping file and adds a JISCharacter (Unicode code point,
// JIS code) to jisJapanese for each mapping line. Both numbers are written
// with a "0x" prefix in the file and parsed as hex.
1113 void ParseJISOrder (string filename)
1115 using (StreamReader file =
1116 new StreamReader (filename)) {
1117 while (file.Peek () >= 0) {
1118 string s = file.ReadLine ();
// Drop the trailing '#' comment and surrounding whitespace.
1119 int idx = s.IndexOf ('#');
1121 s = s.Substring (0, idx).Trim ();
1124 idx = s.IndexOf (' ');
1127 // They start with "0x" so cut them out.
// NOTE(review): Substring (2, idx) takes idx characters starting at
// offset 2, which reaches two characters past the separator found at
// idx - confirm the intended length is not idx - 2.
1128 int jis = int.Parse (s.Substring (2, idx), NumberStyles.HexNumber);
1129 int cp = int.Parse (s.Substring (idx + 3).Trim (), NumberStyles.HexNumber);
1130 jisJapanese.Add (new JISCharacter (cp, jis));
// Fills the CJK weight tables from the collation XML files. For the first
// three sections the characters listed in a "rules/pc" node are numbered
// consecutively in document order; the Korean section instead places
// ideographs relative to the Hangul syllable named by each "reset" element.
//   zhXML, jaXML, koXML - paths of the Chinese, Japanese and Korean
//                         collation files.
1135 void ParseCJK (string zhXML, string jaXML, string koXML)
1137 XmlDocument doc = new XmlDocument ();
// With a null resolver the document does not resolve external resources.
1138 doc.XmlResolver = null;
1145 // Chinese Simplified
1148 offset = 0;//char.MaxValue - arr.Length;
1150 s = doc.SelectSingleNode ("/ldml/collations/collation[@type='pinyin']/rules/pc").InnerText;
1152 foreach (char c in s) {
1154 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
// Assign the next sequential weight to this character.
1156 arr [(int) c - offset] = (ushort) v++;
1162 // Chinese Traditional
1165 offset = 0;//char.MaxValue - arr.Length;
1166 s = doc.SelectSingleNode ("/ldml/collations/collation[@type='stroke']/rules/pc").InnerText;
1168 foreach (char c in s) {
1170 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1172 arr [(int) c - offset] = (ushort) v++;
// Third table (presumably Japanese, from jaXML - TODO confirm): uses the
// first collation's "rules/pc" node without a type filter.
1181 offset = 0;//char.MaxValue - arr.Length;
1183 s = doc.SelectSingleNode ("/ldml/collations/collation/rules/pc").InnerText;
1185 foreach (char c in s) {
1187 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1189 arr [(int) c - offset] = (ushort) v++;
1196 // Korean weight is somewhat complex. It first shifts
1197 // Hangul category from 52-x to 80-x (they are anyways
1198 // computed). CJK ideographs are placed at secondary
1199 // weight, like XX YY 01 zz 01, where XX and YY are
1200 // corresponding "reset" value and zz is 41,43,45...
1202 // Unlike chs,cht and ja, Korean value is a combined
1203 // ushort which is computed as category
1207 offset = 0;//char.MaxValue - arr.Length;
1209 foreach (XmlElement reset in doc.SelectNodes ("/ldml/collations/collation/rules/reset")) {
1210 XmlElement sc = (XmlElement) reset.NextSibling;
1211 // compute "category" and "level 1" for the
1212 // target "reset" Hangul syllable
1213 char rc = reset.InnerText [0];
// 1-based position of the syllable counted from U+AC00.
1214 int ri = ((int) rc - 0xAC00) + 1;
1216 ((ri / 254) * 256 + (ri % 254) + 2);
1217 // Place the characters after the target.
1220 foreach (char c in s) {
// Every ideograph shares the reset syllable's combined value p and gets
// its own secondary weight v in cjkKOlv2.
1221 arr [(int) c - offset] = p;
1222 cjkKOlv2 [(int) c - offset] = (byte) v;
// Populates ignorableFlags for every UTF-16 code unit value (0..0xFFFF) as
// a bit set:
//   bit 1 - IsIgnorable,
//   bit 2 - IsIgnorableSymbol,
//   bit 4 - IsIgnorableNonSpacing.
// Unassigned code points are tested first and handled separately.
1232 void FillIgnorables ()
1234 for (int i = 0; i <= char.MaxValue; i++) {
1235 if (Char.GetUnicodeCategory ((char) i) ==
1236 UnicodeCategory.OtherNotAssigned)
1238 if (IsIgnorable (i))
1239 ignorableFlags [i] |= 1;
1240 if (IsIgnorableSymbol (i))
1241 ignorableFlags [i] |= 2;
1242 if (IsIgnorableNonSpacing (i))
1243 ignorableFlags [i] |= 4;
// Post-processes the tables produced by the parsers: assigns secondary
// weights to script-specific digits, clears the decomposition data of
// U+FE31 and U+FE32, sets fixed diacritical weights for two U+32xx ranges,
// and renames or removes some entries of sortableCharNames.
1247 void ModifyParsedValues ()
1249 // number, secondary weights
// numarr holds [start, end) pairs; the numeric characters of each
// successive pair receive the next weight value.
1251 int [] numarr = numberSecondaryWeightBounds;
1252 for (int i = 0; i < numarr.Length; i += 2, weight++)
1253 for (int cp = numarr [i]; cp < numarr [i + 1]; cp++)
1254 if (Char.IsNumber ((char) cp))
1255 diacritical [cp] = weight;
1257 // Modify some decomposition equivalence
1258 decompType [0xFE31] = 0;
1259 decompIndex [0xFE31] = 0;
1260 decompLength [0xFE31] = 0;
1261 decompType [0xFE32] = 0;
1262 decompIndex [0xFE32] = 0;
1263 decompLength [0xFE32] = 0;
1265 // Korean parens numbers
1266 for (int i = 0x3200; i <= 0x321C; i++)
1267 diacritical [i] = 0xA;
1268 for (int i = 0x3260; i <= 0x327B; i++)
1269 diacritical [i] = 0xC;
1271 // Update name part of named characters
1272 for (int i = 0; i < sortableCharNames.Count; i++) {
1273 DictionaryEntry de =
1274 (DictionaryEntry) sortableCharNames [i];
1275 int cp = (int) de.Key;
1276 string renamed = null;
// Replacement names; the entries are sorted by name later, so these
// strings presumably steer the resulting order - TODO confirm.
1278 case 0x2101: renamed = "A_1"; break;
1279 case 0x33C3: renamed = "A_2"; break;
1280 case 0x2105: renamed = "C_1"; break;
1281 case 0x2106: renamed = "C_2"; break;
1282 case 0x211E: renamed = "R1"; break;
1283 case 0x211F: renamed = "R2"; break;
1284 // Remove some of them!
// NOTE(review): removing at index i while iterating by index skips the
// following element unless i is adjusted afterwards - confirm.
1295 sortableCharNames.RemoveAt (i);
1299 if (renamed != null)
1300 sortableCharNames [i] =
1301 new DictionaryEntry (cp, renamed);
// Builds the sort-key map by visiting character groups in a fixed order.
// AddCharMap and AddCharMapGroup refuse characters that are ignorable or
// already defined, so for those helpers an earlier pass wins over a later one.
1305 void GenerateCore ()
1309 #region Specially ignored // 01
1310 // This will raise "Defined" flag up.
// (an all-zero entry is stored for each character in specialIgnore)
1311 foreach (char c in specialIgnore)
1312 map [(int) c] = new CharMapEntry (0, 0, 0);
1316 #region Variable weights
1317 // Controls : 06 03 - 06 3D
1319 for (int i = 0; i < 65536; i++) {
1320 if (IsIgnorable (i))
1323 uc = Char.GetUnicodeCategory (c);
1324 // NEL is whitespace but not ignored here.
// Note that && binds tighter than ||: U+0085 is added regardless of the
// category test on the left-hand side.
1325 if (uc == UnicodeCategory.Control &&
1326 !Char.IsWhiteSpace (c) || c == '\u0085')
1327 AddCharMap (c, 6, 1);
1331 fillIndex [6] = 0x80;
1332 AddCharMapGroup ('\'', 6, 1, 0);
1333 AddCharMap ('\uFE63', 6, 1);
1335 // Hyphen/Dash : 06 81 - 06 90
// NOTE(review): this loop stops before char.MaxValue (U+FFFF is not
// visited), unlike the 65536-bounded loop above -- presumably harmless.
1336 for (int i = 0; i < char.MaxValue; i++) {
1337 if (!IsIgnorable (i) &&
1338 Char.GetUnicodeCategory ((char) i) ==
1339 UnicodeCategory.DashPunctuation) {
1340 AddCharMapGroup2 ((char) i, 6, 1, 0);
1342 // SPECIAL: add 2027 and 2043
1343 // Maybe they are regarded the
1344 // same hyphens in "central"
1346 AddCharMap ('\u2027', 6, 1);
1347 AddCharMap ('\u2043', 6, 1);
1352 // Arabic variable weight chars 06 A0 -
1353 fillIndex [6] = 0xA0;
1355 for (int i = 0x64B; i <= 0x650; i++)
1356 AddArabicCharMap ((char) i);
1358 AddCharMapGroup ('\u0652', 6, 1, 0);
1360 AddCharMapGroup ('\u0651', 6, 1, 0);
// Nonspacing marks go to category 1. For that category AddCharMap stores
// the running fillIndex in the third CharMapEntry slot rather than the second.
1364 #region Nonspacing marks // 01
1365 // FIXME: 01 03 - 01 B6 ... annoyance :(
1367 // Combining diacritical marks: 01 DC -
1369 fillIndex [0x1] = 0x41;
1370 for (int i = 0x030E; i <= 0x0326; i++)
1371 if (!IsIgnorable (i))
1372 AddCharMap ((char) i, 0x1, 1);
1373 for (int i = 0x0329; i <= 0x0334; i++)
1374 if (!IsIgnorable (i))
1375 AddCharMap ((char) i, 0x1, 1);
1376 for (int i = 0x0339; i <= 0x0341; i++)
1377 if (!IsIgnorable (i))
1378 AddCharMap ((char) i, 0x1, 1);
1379 fillIndex [0x1] = 0x72;
1380 for (int i = 0x0346; i <= 0x0348; i++)
1381 if (!IsIgnorable (i))
1382 AddCharMap ((char) i, 0x1, 1);
1383 for (int i = 0x02BE; i <= 0x02BF; i++)
1384 if (!IsIgnorable (i))
1385 AddCharMap ((char) i, 0x1, 1);
1386 for (int i = 0x02C1; i <= 0x02C5; i++)
1387 if (!IsIgnorable (i))
1388 AddCharMap ((char) i, 0x1, 1);
1389 for (int i = 0x02CE; i <= 0x02CF; i++)
1390 if (!IsIgnorable (i))
1391 AddCharMap ((char) i, 0x1, 1);
1392 for (int i = 0x02D1; i <= 0x02D3; i++)
1393 if (!IsIgnorable (i))
1394 AddCharMap ((char) i, 0x1, 1);
1395 AddCharMap ('\u02DE', 0x1, 1);
1396 for (int i = 0x02E4; i <= 0x02E9; i++)
1397 if (!IsIgnorable (i))
1398 AddCharMap ((char) i, 0x1, 1);
1400 // LAMESPEC: It should not stop at '\u20E1'. There are
1401 // a few more characters (that however results in
1402 // overflow of level 2 unless we start before 0xDD).
// Starting at 0xDC, the 18 characters U+20D0..U+20E1 end at 0xED.
1403 fillIndex [0x1] = 0xDC;
1404 for (int i = 0x20d0; i <= 0x20e1; i++)
1405 AddCharMap ((char) i, 0x1, 1);
// Whitespace (category 07), then symbol categories 09 and 0A.
1409 #region Whitespaces // 07 03 -
1410 fillIndex [0x7] = 0x2;
1411 AddCharMap (' ', 0x7, 2);
1412 AddCharMap ('\u00A0', 0x7, 1);
1413 for (int i = 9; i <= 0xD; i++)
1414 AddCharMap ((char) i, 0x7, 1);
1415 for (int i = 0x2000; i <= 0x200B; i++)
1416 AddCharMap ((char) i, 0x7, 1);
1418 fillIndex [0x7] = 0x17;
1419 AddCharMapGroup ('\u2028', 0x7, 1, 0);
1420 AddCharMapGroup ('\u2029', 0x7, 1, 0);
1422 // Characters which used to represent layout control.
1423 // LAMESPEC: Windows developers seem to have thought
1424 // that those characters are kind of whitespaces,
1425 // while they aren't.
1426 AddCharMap ('\u2422', 0x7, 1, 0); // blank symbol
1427 AddCharMap ('\u2423', 0x7, 1, 0); // open box
1430 // FIXME: 09 should be more complete.
1431 fillIndex [0x9] = 2;
1433 for (int cp = 0x2300; cp <= 0x237A; cp++)
1434 AddCharMap ((char) cp, 0x9, 1, 0);
// Arrows: the level 1 value is 0xD8 plus the index stored in arrowValues,
// and the second weight is looked up in arrowLv2 by that same index.
1437 byte [] arrowLv2 = new byte [] {0, 3, 3, 3, 3, 3, 3, 3, 3};
1438 foreach (DictionaryEntry de in arrowValues) {
1439 int idx = (int) de.Value;
1440 int cp = (int) de.Key;
1441 if (map [cp].Defined)
1443 fillIndex [0x9] = (byte) (0xD8 + idx);
1444 AddCharMapGroup ((char) cp, 0x9, 0, arrowLv2 [idx]);
// Box characters: same scheme with base 0xE5 and the boxLv2 table (the
// statement that fills boxLv2 is not visible in this excerpt).
1448 byte [] boxLv2 = new byte [128];
1449 for (int i = 0; i < boxLv2.Length; i++)
1451 foreach (DictionaryEntry de in boxValues) {
1452 int cp = (int) de.Key;
1453 int idx = (int) de.Value;
1454 if (map [cp].Defined)
1456 fillIndex [0x9] = (byte) (0xE5 + idx);
1457 AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [idx]);
1460 // Some special characters (slanted)
1461 fillIndex [0x9] = 0xF4;
1462 AddCharMap ('\u2571', 0x9, 3);
1463 AddCharMap ('\u2572', 0x9, 3);
1464 AddCharMap ('\u2573', 0x9, 3);
1466 // FIXME: implement 0A
1468 fillIndex [0xA] = 2;
1469 // byte currency symbols
// ("byte" here means code points below 0x100)
1470 for (int cp = 0; cp < 0x100; cp++) {
1471 uc = Char.GetUnicodeCategory ((char) cp);
1472 if (!IsIgnorable (cp) &&
1473 uc == UnicodeCategory.CurrencySymbol &&
1475 AddCharMapGroup ((char) cp, 0xA, 1, 0);
1477 // byte other symbols
1478 for (int cp = 0; cp < 0x100; cp++) {
1480 continue; // SPECIAL: skip FIXME: why?
1481 uc = Char.GetUnicodeCategory ((char) cp);
1482 if (!IsIgnorable (cp) &&
1483 uc == UnicodeCategory.OtherSymbol)
1484 AddCharMapGroup ((char) cp, 0xA, 1, 0);
1487 fillIndex [0xA] = 0x2F; // FIXME: it won't be needed
1488 for (int cp = 0x2600; cp <= 0x2613; cp++)
1489 AddCharMap ((char) cp, 0xA, 1, 0);
1491 for (int cp = 0x2620; cp <= 0x2770; cp++)
1492 if (Char.IsSymbol ((char) cp))
1493 AddCharMap ((char) cp, 0xA, 1, 0);
1495 for (int i = 0x2440; i < 0x2460; i++)
1496 AddCharMap ((char) i, 0xA, 1, 0);
// Numbers (category 0C): number characters are collected, sorted by their
// decimal value, and added in that order; several numeral families are
// inserted by code point arithmetic relative to the current digit.
1500 #region Numbers // 0C 02 - 0C E1
1501 fillIndex [0xC] = 2;
1503 // 9F8 : Bengali "one less than the denominator"
1504 AddCharMap ('\u09F8', 0xC, 1);
1506 ArrayList numbers = new ArrayList ();
1507 for (int i = 0; i < 65536; i++)
1508 if (!IsIgnorable (i) &&
1509 Char.IsNumber ((char) i) &&
1510 (i < 0x3190 || 0x32C0 < i)) // they are CJK characters
1513 ArrayList numberValues = new ArrayList ();
1514 foreach (int i in numbers)
1515 numberValues.Add (new DictionaryEntry (i, decimalValue [(char) i]));
1516 numberValues.Sort (DecimalDictionaryValueComparer.Instance);
1518 //foreach (DictionaryEntry de in numberValues)
1519 //Console.Error.WriteLine ("****** number {0:X04} : {1} {2}", de.Key, de.Value, decompType [(int) de.Key]);
1521 decimal prevValue = -1;
1522 foreach (DictionaryEntry de in numberValues) {
1523 int cp = (int) de.Key;
1524 decimal currValue = (decimal) de.Value;
1525 bool addnew = false;
// "prevValue - (int) prevValue == 0" tests that the previous value is integral.
1526 if (prevValue < currValue &&
1527 prevValue - (int) prevValue == 0 &&
1531 // Process Hangzhou and Roman numbers
1533 // There are some SPECIAL cases.
1534 if (currValue != 4) // no increment for 4
// The three blocks start at U+2170, U+2160 and U+3021; the numeral for
// prevValue is at (block start + prevValue - 1).
1538 xcp = (int) prevValue + 0x2170 - 1;
1539 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1540 xcp = (int) prevValue + 0x2160 - 1;
1541 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1542 fillIndex [0xC] += 2;
1543 xcp = (int) prevValue + 0x3021 - 1;
1544 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1547 if (prevValue < currValue)
1548 prevValue = currValue;
1549 if (map [cp].Defined)
1551 // Hangzhou and Roman numerals are added later
1553 else if (0x3021 <= cp && cp < 0x302A
1554 || 0x2160 <= cp && cp < 0x216A
1555 || 0x2170 <= cp && cp < 0x217A)
1558 if (cp == 0x215B) // FIXME: why?
1559 fillIndex [0xC] += 2;
1560 else if (cp == 0x3021) // FIXME: why?
1562 AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp]);
1564 if (addnew || cp <= '9') {
// cp - 0x31 is the offset from the ASCII digit '1'; it indexes into the
// blocks starting at U+2776, U+2780, U+278A (values 1..10) and at
// U+2460, U+2474, U+2488 (values 1..20).
1566 if (1 <= currValue && currValue <= 10) {
1567 xcp = cp - 0x31 + 0x2776;
1568 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1569 xcp = cp - 0x31 + 0x2780;
1570 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1571 xcp = cp - 0x31 + 0x278A;
1572 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1574 if (1 <= currValue && currValue <= 20) {
1575 xcp = cp - 0x31 + 0x2460;
1576 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1577 xcp = cp - 0x31 + 0x2474;
1578 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1579 xcp = cp - 0x31 + 0x2488;
1580 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1584 if (cp != 0x09E7 && cp != 0x09EA)
1587 // Add special cases that are not regarded as
1588 // numbers in UnicodeCategory speak.
1591 AddCharMapGroup ('\u01BD', 0xC, 0, 0);
1592 AddCharMapGroup ('\u01BC', 0xC, 1, 0);
1594 else if (cp == '6') // FIXME: why?
// U+221E is placed at the very end of the category (0C FF).
1599 fillIndex [0xC] = 0xFF;
1600 AddCharMap ('\u221E', 0xC, 1);
// Letters, one category per script: 0E Latin, 0F Greek, 10 Cyrillic,
// 11 Armenian, 12 Hebrew, 13 Arabic, 14 Devanagari. Nonspacing marks met on
// the way are added to category 01.
1603 #region Letters and NonSpacing Marks (general)
1605 // ASCII Latin alphabets
1606 for (int i = 0; i < alphabets.Length; i++)
1607 AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]);
1610 // non-ASCII Latin alphabets
1611 // FIXME: there are no such characters that are placed
1612 // *after* "alphabets" array items. This is nothing
1613 // more than a hack that creates dummy weight for
1614 // primary characters.
1615 for (int i = 0x0080; i < 0x0300; i++) {
1616 if (!Char.IsLetter ((char) i))
1618 // Latin letters that have an NFKD decomposition are
1619 // not added as independent primary characters.
1620 if (decompIndex [i] != 0)
1623 // 1.some alphabets have primarily
1624 // equivalent ASCII alphabets.
1625 // 2.some have independent primary weights,
1626 // but inside a-to-z range.
1627 // 3.there are some expanded characters that
1628 // are not part of Unicode Standard NFKD.
1630 // 1. skipping them does not make sense
1631 // case 0xD0: case 0xF0: case 0x131: case 0x138:
1632 // case 0x184: case 0x185: case 0x186: case 0x189:
1633 // case 0x18D: case 0x18E: case 0x18F: case 0x190:
1634 // case 0x194: case 0x195: case 0x196: case 0x19A:
1635 // case 0x19B: case 0x19C:
1636 // 2. skipping them does not make sense
1637 // case 0x14A: // Ng
1638 // case 0x14B: // ng
1642 case 0xDE: // Icelandic Thorn
1643 case 0xFE: // Icelandic Thorn
1644 case 0xDF: // German ss
1645 case 0xFF: // German ss (NOTE(review): U+00FF is y with diaeresis, not sharp s -- confirm intent)
1646 // not classified yet
1647 // case 0x1A6: case 0x1A7: case 0x1A8: case 0x1A9:
1648 // case 0x1AA: case 0x1B1: case 0x1B7: case 0x1B8:
1649 // case 0x1B9: case 0x1BA: case 0x1BB: case 0x1BF:
1650 // case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3:
1654 AddCharMapGroup ((char) i, 0xE, 1, 0);
// Category 0F: letters in U+0380..U+03FF, visited in three sub-ranges
// (the index is set to 02 before each of the first two).
1658 fillIndex [0xF] = 02;
1659 for (int i = 0x0380; i < 0x0390; i++)
1660 if (Char.IsLetter ((char) i))
1661 AddLetterMap ((char) i, 0xF, 1);
1662 fillIndex [0xF] = 02;
1663 for (int i = 0x0391; i < 0x03CF; i++)
1664 if (Char.IsLetter ((char) i))
1665 AddLetterMap ((char) i, 0xF, 1);
1666 fillIndex [0xF] = 0x40;
1667 for (int i = 0x03D0; i < 0x0400; i++)
1668 if (Char.IsLetter ((char) i))
1669 AddLetterMap ((char) i, 0xF, 1);
1671 // Cyrillic - UCA order w/ some modification
1672 fillIndex [0x10] = 0x3;
1673 // table which is mostly from UCA DUCET.
1674 for (int i = 0; i < orderedCyrillic.Length; i++) {
1675 char c = orderedCyrillic [i];
1676 if (Char.IsLetter (c))
1677 AddLetterMap (c, 0x10, 3);
1679 for (int i = 0x0460; i < 0x0481; i++) {
1680 if (Char.IsLetter ((char) i))
1681 AddLetterMap ((char) i, 0x10, 3);
// Category 11: letters in U+0531..U+0585.
1685 fillIndex [0x11] = 0x3;
1686 for (int i = 0x0531; i < 0x0586; i++)
1687 if (Char.IsLetter ((char) i))
1688 AddLetterMap ((char) i, 0x11, 1);
// Category 12: letters in U+05D0..U+05FE; U+0591..U+05C2 go to category 01.
1692 fillIndex [0x12] = 0x3;
1693 for (int i = 0x05D0; i < 0x05FF; i++)
1694 if (Char.IsLetter ((char) i))
1695 AddLetterMap ((char) i, 0x12, 1);
1697 fillIndex [0x1] = 0x3;
1698 for (int i = 0x0591; i <= 0x05C2; i++)
1700 AddCharMap ((char) i, 0x1, 1);
// Category 13: in U+0621..U+064A, characters that are not OtherLetter go to
// category 01; letters take their level 1 from arabicLetterPrimaryValues.
1703 fillIndex [0x1] = 0x8E;
1704 fillIndex [0x13] = 0x3;
1705 for (int i = 0x0621; i <= 0x064A; i++) {
1707 if (Char.GetUnicodeCategory ((char) i)
1708 != UnicodeCategory.OtherLetter) {
1709 // FIXME: arabic nonspacing marks are
1710 // in different order.
1711 AddCharMap ((char) i, 0x1, 1);
1714 // map [i] = new CharMapEntry (0x13,
1715 // (byte) arabicLetterPrimaryValues [i], 1);
1717 (byte) arabicLetterPrimaryValues [i];
1718 AddLetterMap ((char) i, 0x13, 0);
1720 fillIndex [0x13] = 0x84;
1721 for (int i = 0x0674; i < 0x06D6; i++)
1722 if (Char.IsLetter ((char) i))
1723 AddLetterMap ((char) i, 0x13, 1);
1726 // FIXME: it does seem to be a straight codepoint mapping.
1727 fillIndex [0x14] = 04;
1728 for (int i = 0x0901; i < 0x0905; i++)
1729 if (!IsIgnorable (i))
1730 AddLetterMap ((char) i, 0x14, 2);
1731 fillIndex [0x14] = 0xB;
1732 for (int i = 0x0905; i < 0x093A; i++)
1733 if (Char.IsLetter ((char) i))
1734 AddLetterMap ((char) i, 0x14, 4);
1735 for (int i = 0x093E; i < 0x094F; i++)
1736 if (!IsIgnorable (i))
1737 AddLetterMap ((char) i, 0x14, 2);
// Letters, continued: categories 15 through 21, covering the Indic scripts
// from U+0980 up to U+0D60, Thai, U+0E80..U+0EDE and Georgian.
1741 fillIndex [0x15] = 02;
1742 for (int i = 0x0980; i < 0x9FF; i++) {
1743 if (IsIgnorable (i))
1746 fillIndex [0x15] = 0x3B;
1747 switch (Char.GetUnicodeCategory ((char) i)) {
1748 case UnicodeCategory.NonSpacingMark:
1749 case UnicodeCategory.DecimalDigitNumber:
1750 case UnicodeCategory.OtherNumber:
1753 AddLetterMap ((char) i, 0x15, 1);
1756 fillIndex [0x1] = 0x3;
1757 for (int i = 0x0981; i < 0x0A00; i++)
1758 if (Char.GetUnicodeCategory ((char) i) ==
1759 UnicodeCategory.NonSpacingMark)
1760 AddCharMap ((char) i, 0x1, 1);
1762 // Gurmukhi. orderedGurmukhi is from UCA
1763 // FIXME: it does not look equivalent to UCA.
1764 fillIndex [0x1] = 03;
1765 fillIndex [0x16] = 02;
1766 for (int i = 0; i < orderedGurmukhi.Length; i++) {
1767 char c = orderedGurmukhi [i];
1768 if (IsIgnorable ((int) c))
1770 if (!Char.IsLetter (c)) {
1771 AddLetterMap (c, 0x1, 1);
1774 if (c == '\u0A3C' || c == '\u0A4D' ||
1775 '\u0A66' <= c && c <= '\u0A71')
1777 AddLetterMap (c, 0x16, 4);
1780 // Gujarati. orderedGujarati is from UCA
1781 fillIndex [0x17] = 02;
1782 for (int i = 0; i < orderedGujarati.Length; i++)
1783 AddLetterMap (orderedGujarati [i], 0x17, 4);
1786 fillIndex [0x18] = 02;
1787 for (int i = 0x0B00; i < 0x0B7F; i++) {
1788 switch (Char.GetUnicodeCategory ((char) i)) {
1789 case UnicodeCategory.NonSpacingMark:
1790 case UnicodeCategory.DecimalDigitNumber:
1793 AddLetterMap ((char) i, 0x18, 1);
1797 fillIndex [0x19] = 2;
1798 AddCharMap ('\u0BD7', 0x19, 0);
1799 fillIndex [0x19] = 0xA;
// NOTE(review): the start value 0x0BD7 is greater than the bound 0x0B94, so
// as written this loop body never executes -- confirm the intended range.
1801 for (int i = 0x0BD7; i < 0x0B94; i++)
1802 if (Char.IsLetter ((char) i))
1803 AddCharMap ((char) i, 0x19, 2);
1805 fillIndex [0x19] = 0x24;
1806 AddCharMap ('\u0B94', 0x19, 0);
1807 fillIndex [0x19] = 0x26;
1808 // The array for Tamil consonants is a constant.
1809 // Windows have almost similar sequence to TAM from
1810 // tamilnet but a bit different in Grantha.
1811 for (int i = 0; i < orderedTamilConsonants.Length; i++)
1812 AddLetterMap (orderedTamilConsonants [i], 0x19, 4);
1814 fillIndex [0x19] = 0x82;
1815 for (int i = 0x0BBE; i < 0x0BCD; i++)
1816 if (Char.GetUnicodeCategory ((char) i) ==
1817 UnicodeCategory.SpacingCombiningMark
1819 AddLetterMap ((char) i, 0x19, 2);
// Category 1A: U+0C00..U+0C61; U+0C60 and U+0C61 are added right after
// U+0C0B and U+0C0C respectively.
1822 fillIndex [0x1A] = 0x4;
1823 for (int i = 0x0C00; i < 0x0C62; i++) {
1824 if (i == 0x0C55 || i == 0x0C56)
1826 AddCharMap ((char) i, 0x1A, 3);
1827 char supp = (i == 0x0C0B) ? '\u0C60':
1828 i == 0x0C0C ? '\u0C61' : char.MinValue;
1829 if (supp == char.MinValue)
1831 AddCharMap (supp, 0x1A, 3);
1835 fillIndex [0x1B] = 4;
1836 for (int i = 0x0C80; i < 0x0CE5; i++) {
1837 if (i == 0x0CD5 || i == 0x0CD6)
1839 AddCharMap ((char) i, 0x1B, 3);
1843 fillIndex [0x1C] = 2;
1844 for (int i = 0x0D02; i < 0x0D61; i++)
1845 // FIXME: I avoided MSCompatUnicodeTable usage
1846 // here (it results in recursion). So check if
1847 // using NonSpacingMark makes sense or not.
1848 if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark)
1849 // if (!MSCompatUnicodeTable.IsIgnorable ((char) i))
1850 AddCharMap ((char) i, 0x1C, 1);
1852 // Thai ... note that it breaks 0x1E wall after E2B!
1853 // Also, all Thai characters have level 2 value 3.
1854 fillIndex [0x1E] = 2;
1855 for (int i = 0xE44; i < 0xE48; i++)
1856 AddCharMap ((char) i, 0x1E, 1, 3);
1857 for (int i = 0xE01; i < 0xE2B; i++)
1858 AddCharMap ((char) i, 0x1E, 6, 0);
1859 fillIndex [0x1F] = 5;
1860 for (int i = 0xE2B; i < 0xE30; i++)
1861 AddCharMap ((char) i, 0x1F, 6, 0);
1862 for (int i = 0xE30; i < 0xE3B; i++)
1863 AddCharMap ((char) i, 0x1F, 1, 3);
1864 // some Thai characters remain.
1865 char [] specialThai = new char [] {'\u0E45', '\u0E46',
1866 '\u0E4E', '\u0E4F', '\u0E5A', '\u0E5B'};
1867 foreach (char c in specialThai)
1868 AddCharMap (c, 0x1F, 1);
// U+0E80..U+0EDE letters reuse category 1F, restarting its index at 2.
1871 fillIndex [0x1F] = 2;
1872 for (int i = 0xE80; i < 0xEDF; i++)
1873 if (Char.IsLetter ((char) i))
1874 AddCharMap ((char) i, 0x1F, 1);
1876 // Georgian. orderedGeorgian is from UCA DUCET.
1877 fillIndex [0x21] = 5;
1878 for (int i = 0; i < orderedGeorgian.Length; i++)
1879 AddLetterMap (orderedGeorgian [i], 0x21, 5);
// Category 22 (kana and Japanese square characters), category 23
// (U+3105..U+312C) and category 24 (Syriac, then Thaana).
1882 fillIndex [0x22] = 2;
1883 int kanaOffset = 0x3041;
// kanaLines [gyo] is the number of consecutive code points each kana of
// that row occupies; it is the step between columns and is passed to
// AddKanaMap as the voice count.
1884 byte [] kanaLines = new byte [] {2, 2, 2, 2, 1, 3, 1, 2, 1};
1886 for (int gyo = 0; gyo < 9; gyo++) {
1887 for (int dan = 0; dan < 5; dan++) {
1888 if (gyo == 7 && dan % 2 == 1) {
1891 kanaOffset -= 2; // There is no space for yi and ye.
1894 int cp = kanaOffset + dan * kanaLines [gyo];
1895 // small lines (a-gyo, ya-gyo)
1896 if (gyo == 0 || gyo == 7) {
1897 AddKanaMap (cp, 1); // small
1898 AddKanaMap (cp + 1, 1);
1901 AddKanaMap (cp, kanaLines [gyo]);
1905 // add small 'Tsu' (before normal one)
1906 AddKanaMap (0x3063, 1);
1910 fillIndex [0x22] += 3;
1911 kanaOffset += 5 * kanaLines [gyo];
1914 // Wa-gyo is almost special, so I just manually add.
// Each call pair adds a code point and the one 0x60 above it.
1915 AddLetterMap ((char) 0x308E, 0x22, 0);
1916 AddLetterMap ((char) (0x308E + 0x60), 0x22, 0);
1917 AddLetterMap ((char) 0x308F, 0x22, 0);
1918 AddLetterMap ((char) (0x308F + 0x60), 0x22, 0);
1920 AddLetterMap ((char) 0x3090, 0x22, 0);
1921 AddLetterMap ((char) (0x3090 + 0x60), 0x22, 0);
1922 fillIndex [0x22] += 2;
1923 // no "Wu" in Japanese.
1924 AddLetterMap ((char) 0x3091, 0x22, 0);
1925 AddLetterMap ((char) (0x3091 + 0x60), 0x22, 0);
1927 AddLetterMap ((char) 0x3092, 0x22, 0);
1928 AddLetterMap ((char) (0x3092 + 0x60), 0x22, 0);
1930 fillIndex [0x22] = 0x80;
1931 AddLetterMap ((char) 0x3093, 0x22, 0);
1932 AddLetterMap ((char) (0x3093 + 0x60), 0x22, 0);
1934 // JIS Japanese square chars.
1935 fillIndex [0x22] = 0x97;
1936 jisJapanese.Sort (JISComparer.Instance);
1937 foreach (JISCharacter j in jisJapanese)
1938 AddCharMap ((char) j.CP, 0x22, 1);
1939 // non-JIS Japanese square chars.
1940 nonJisJapanese.Sort (NonJISComparer.Instance);
1941 foreach (NonJISCharacter j in nonJisJapanese)
1942 AddCharMap ((char) j.CP, 0x22, 1);
1945 fillIndex [0x23] = 0x02;
1946 for (int i = 0x3105; i <= 0x312C; i++)
1947 AddCharMap ((char) i, 0x23, 1);
1949 // Estrangela: ancient Syriac
1950 fillIndex [0x24] = 0x0B;
1951 // FIXME: is 0x71E really alternative form?
1952 ArrayList syriacAlternatives = new ArrayList (
1953 new int [] {0x714, 0x716, 0x71C, 0x71E, 0x724, 0x727});
1954 for (int i = 0x0710; i <= 0x072C; i++) {
1955 if (i == 0x0711) // NonSpacingMark
1957 if (syriacAlternatives.Contains (i))
1959 AddCharMap ((char) i, 0x24, 4);
// The alternatives are written directly into map with a level 1 value
// two above that of the preceding code point (cp - 1).
1964 foreach (int cp in syriacAlternatives)
1965 map [cp] = new CharMapEntry (0x24,
1966 (byte) (map [cp - 1].Level1 + 2),
1970 // FIXME: it turned out that it does not look like UCA
1971 fillIndex [0x24] = 0x6E;
1972 for (int i = 0; i < orderedThaana.Length; i++) {
// NOTE(review): i is the array index here, not the code point; the check
// presumably should test orderedThaana [i] -- confirm.
1973 if (IsIgnorableNonSpacing (i))
1975 AddCharMap (orderedThaana [i], 0x24, 2);
1979 // FIXME: Add more culture-specific letters (that are
1980 // not supported in Windows collation) here.
1982 // Surrogate ... they are computed.
1987 // Unlike UCA Windows Hangul sequence mixes Jongseong
1988 // with Choseong sequence as well as Jungseong,
1989 // adjusted to have the same primary weight for the
1990 // same base character. So it is impossible to compute
1993 // Here I introduce an ordered sequence of mixed
1994 // 'commands' and 'characters' that is similar to
1996 // - ',' increases primary weight.
1997 // - [A B] means a range, increasing index
1998 // - {A B} means a range, without increasing index
1999 // - '=' is no operation (it means the characters
2000 // of both sides have the same weight).
2001 // - '>' inserts a Hangul Syllable block that
2002 // contains 0x251 characters.
// NOTE(review): the '>' handler below adds 0x15 * 0x1C = 0x24C syllables
// per block; the 0x251 figure above does not match that count -- confirm.
2003 // - '<' decreases the index
2004 // - '0'-'9' means skip count
2005 // - whitespaces are ignored
2008 string hangulSequence =
2009 + "\u1100=\u11A8 > \u1101=\u11A9 >"
2010 + "\u11C3, \u11AA, \u11C4, \u1102=\u11AB >"
2011 + "<{\u1113 \u1116}, \u3165,"
2012 + "\u11C5, \u11C6=\u3166,, \u11C7, \u11C8,"
2013 + "\u11AC, \u11C9, \u11AD, \u1103=\u11AE >"
2014 + "<\u1117, \u11CA, \u1104, \u11CB > \u1105 >"
2015 + "<{\u1118 \u111B}, \u11B0, [\u11CC \u11D0], \u11B1,"
2016 + "[\u11D1 \u11D2], \u11B2,"
2017 + "[\u11D3 \u11D5], \u11B3,"
2018 + "[\u11D6 \u11D7], \u11B4, \u11B5,"
2019 + "\u11B6=\u11D8, \u3140,, \u11D9, \u1106=\u11B7 >"
2020 + "<{\u111C \u111D}, [\u11DA \u11E2], \u1107=\u11B8 >"
2021 + "<{\u111E \u1120}, \u3172,, \u3173, \u11E3, \u1108 >"
2022 + "<{\u1121 \u112C}, \u11B9,,,,,,,,, [\u11E4 \u11E6],,"
2023 + "\u1109=\u11BA,,, \u3214=\u3274 <>"
2024 + "<{\u112D \u1133}, \u11E7,, [\u11E8 \u11E9],,"
2025 + "\u11EA,, \u110A=\u11BB,,, >"
2026 + "<{\u1134 \u1140}, \u317E,,,,,,, \u11EB,"
2027 + "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >"
2028 + "<{\u1141 \u114C}, \u11EE, \u11EC, \u11ED,,,,, "
2029 + "\u11F1,, \u11F2,,,"
2030 + "\u11EF,,, \u11F0, \u110C=\u11BD,, >"
2031 + "<\u114D, \u110D,, >"
2032 + "<{\u114E \u1151},, \u110E=\u11BE,, >"
2033 + "<{\u1152 \u1155},,, \u110F=\u11BF >"
2034 + "\u1110=\u11C0 > \u1111=\u11C1 >"
2035 + "<\u1156=\u1157, \u11F3, \u11F4, \u1112=\u11C2 >"
2036 + "<\u1158=\u1159=\u115F, \u3185, \u11F9,"
// Hangul weights start at category 0x52, index 2; IncrementSequentialIndex
// takes the category by reference while advancing the index.
2040 byte hangulCat = 0x52;
2041 fillIndex [hangulCat] = 0x2;
2043 int syllableBlock = 0;
2044 for (int n = 0; n < hangulSequence.Length; n++) {
2045 char c = hangulSequence [n];
2047 if (Char.IsWhiteSpace (c))
2053 IncrementSequentialIndex (ref hangulCat);
2056 if (fillIndex [hangulCat] == 2)
2057 throw new Exception ("FIXME: handle it correctly (yes it is hacky, it is really unfortunate).");
2058 fillIndex [hangulCat]--;
2061 IncrementSequentialIndex (ref hangulCat);
// One syllable block: 0x15 x 0x1C consecutive code points starting at
// 0xAC00 + syllableBlock * 0x1C * 0x15.
2062 for (int l = 0; l < 0x15; l++)
2063 for (int v = 0; v < 0x1C; v++) {
2065 (char) (0xAC00 + syllableBlock * 0x1C * 0x15 + l * 0x1C + v), hangulCat, 0);
2066 IncrementSequentialIndex (ref hangulCat);
// Range operators: the two end points sit at n + 1 and n + 3 in the sequence.
2071 start = hangulSequence [n + 1];
2072 end = hangulSequence [n + 3];
2073 for (int i = start; i <= end; i++) {
2074 AddCharMap ((char) i, hangulCat, 0);
2076 IncrementSequentialIndex (ref hangulCat);
2078 n += 4; // consumes 5 characters for this operation
2081 start = hangulSequence [n + 1];
2082 end = hangulSequence [n + 3];
2083 for (int i = start; i <= end; i++)
2084 AddCharMap ((char) i, hangulCat, 0);
2085 n += 4; // consumes 5 characters for this operation
2088 AddCharMap (c, hangulCat, 0);
// U+3200..U+32FF: undefined characters take the category of one character
// of their decomposition and that character's level 1 plus one.
2094 for (int i = 0x3200; i < 0x3300; i++) {
2095 if (IsIgnorable (i) || map [i].Defined)
2099 if (decompLength [i] == 4 &&
2100 decompValues [decompIndex [i]] == '(')
2101 ch = decompIndex [i] + 1;
2103 else if (decompLength [i] == 2 &&
2104 decompValues [decompIndex [i] + 1] == '\u1161')
2105 ch = decompIndex [i];
2106 else if (decompLength [i] == 1)
2107 ch = decompIndex [i];
// Up to here ch is an index into decompValues; from here on it is the character.
2110 ch = decompValues [ch];
// && binds tighter than ||: the test matches ch below 0x1100, strictly
// between 0x1200 and 0xAC00, or above 0xD800.
2111 if (ch < 0x1100 || 0x1200 < ch &&
2112 ch < 0xAC00 || 0xD800 < ch)
2114 map [i] = new CharMapEntry (map [ch].Category,
2115 (byte) (map [ch].Level1 + 1),
2117 // Console.Error.WriteLine ("Jamo {0:X04} -> {1:X04}", i, decompValues [decompIndex [i] + 1]);
2123 // Letterlike characters and CJK compatibility square
// The named characters are sorted by name and counted per initial letter
// (A-Z); each group is then placed immediately below the weight of the
// following letter (alphaWeights [a + 1] - counts [a]).
2124 sortableCharNames.Sort (StringDictionaryValueComparer.Instance);
2125 int [] counts = new int ['Z' - 'A' + 1];
2126 char [] namedChars = new char [sortableCharNames.Count];
2128 foreach (DictionaryEntry de in sortableCharNames) {
2129 counts [((string) de.Value) [0] - 'A']++;
2130 namedChars [nCharNames++] = (char) ((int) de.Key);
2132 nCharNames = 0; // reset
2133 for (int a = 0; a < counts.Length; a++) {
2134 fillIndex [0xE] = (byte) (alphaWeights [a + 1] - counts [a]);
2135 for (int i = 0; i < counts [a]; i++)
2136 //Console.Error.WriteLine ("---- {0:X04} : {1:x02} / {2} {3}", (int) namedChars [nCharNames], fillIndex [0xE], ((DictionaryEntry) sortableCharNames [nCharNames]).Value, Char.GetUnicodeCategory (namedChars [nCharNames]));
2137 AddCharMap (namedChars [nCharNames++], 0xE, 1);
2140 // CJK unified ideograph.
2142 fillIndex [cjkCat] = 0x2;
2143 for (int cp = 0x4E00; cp <= 0x9FBB; cp++)
2144 if (!IsIgnorable (cp))
2145 AddCharMapGroupCJK ((char) cp, ref cjkCat);
2146 // CJK Extensions go here.
2147 // LAMESPEC: With this Windows style CJK layout, it is
2148 // impossible to add more CJK ideograph i.e. 0x9FA6-
2149 // 0x9FBB can never be added w/o breaking compat.
2150 for (int cp = 0xF900; cp <= 0xFA2D; cp++)
2151 if (!IsIgnorable (cp))
2152 AddCharMapGroupCJK ((char) cp, ref cjkCat);
2154 // PrivateUse ... computed.
2155 // remaining Surrogate ... computed.
2157 #region Special "biggest" area (FF FF)
// All of these share the single weight FF FF (the index is never advanced).
2158 fillIndex [0xFF] = 0xFF;
2159 char [] specialBiggest = new char [] {
2160 '\u3005', '\u3031', '\u3032', '\u309D',
2161 '\u309E', '\u30FC', '\u30FD', '\u30FE',
2162 '\uFE7C', '\uFE7D', '\uFF70'};
2163 foreach (char c in specialBiggest)
2164 AddCharMap (c, 0xFF, 0);
// Category 07 (ASCII punctuation, then other punctuation) and category 08
// (math symbols that are still undefined).
2167 #region 07 - ASCII non-alphanumeric + 3001, 3002 // 07
2168 // non-alphanumeric ASCII except for: + - < = > '
2169 for (int i = 0x21; i < 0x7F; i++) {
2170 if (Char.IsLetterOrDigit ((char) i)
2171 || "+-<=>'".IndexOf ((char) i) >= 0)
2172 continue; // they are not added here.
2173 AddCharMapGroup2 ((char) i, 0x7, 1, 0);
2174 // Insert 3001 after ',' and 3002 after '.'
2176 AddCharMapGroup2 ('\u3001', 0x7, 1, 0);
2177 else if (i == 0x2E) {
2179 AddCharMapGroup2 ('\u3002', 0x7, 1, 0);
2182 AddCharMap ('\uFE30', 0x7, 1, 0);
2186 #region 07 - Punctuations and something else
2187 for (int i = 0xA0; i < char.MaxValue; i++) {
2188 if (IsIgnorable (i))
2200 switch (Char.GetUnicodeCategory ((char) i)) {
2201 case UnicodeCategory.OtherPunctuation:
2202 case UnicodeCategory.ClosePunctuation:
2203 case UnicodeCategory.OpenPunctuation:
2204 case UnicodeCategory.InitialQuotePunctuation:
2205 case UnicodeCategory.FinalQuotePunctuation:
2206 case UnicodeCategory.ModifierSymbol:
2207 // SPECIAL CASES: // 0xA
2208 if (0x2020 <= i && i <= 0x2042)
2210 AddCharMapGroup ((char) i, 0x7, 1, 0);
// U+00A6 is routed through the punctuation branch above even though its
// own category led to a different case.
2213 if (i == 0xA6) // SPECIAL CASE. FIXME: why?
2214 goto case UnicodeCategory.OtherPunctuation;
2219 for (int i = 0x2400; i <= 0x2421; i++)
2220 AddCharMap ((char) i, 0x7, 1, 0);
2223 // FIXME: for 07 xx we need more love.
2225 // FIXME: 08 should be more complete.
2226 fillIndex [0x8] = 2;
2227 for (int cp = 0; cp < char.MaxValue; cp++)
2228 if (!map [cp].Defined &&
2229 Char.GetUnicodeCategory ((char) cp) ==
2230 UnicodeCategory.MathSymbol)
2231 AddCharMapGroup ((char) cp, 0x8, 1, 0);
2233 // Characters w/ diacritical marks (NFKD)
// A still undefined character with a decomposition inherits the category
// and level 1 of its primary character; the diacritical values of the
// remaining decomposition characters are summed into "secondary".
2234 for (int i = 0; i <= char.MaxValue; i++) {
2235 if (map [i].Defined || IsIgnorable (i))
2237 if (decompIndex [i] == 0)
2240 int start = decompIndex [i];
2241 int primaryChar = decompValues [start];
2244 int length = decompLength [i];
2245 // special processing for parenthesized ones.
2247 decompValues [start] == '(' &&
2248 decompValues [start + 2] == ')') {
2249 primaryChar = decompValues [start + 1];
2253 if (map [primaryChar].Level1 == 0)
2256 for (int l = 1; l < length; l++) {
2257 int c = decompValues [start + l];
2258 if (map [c].Level1 != 0)
2260 secondary += diacritical [c];
2264 map [i] = new CharMapEntry (
2265 map [primaryChar].Category,
2266 map [primaryChar].Level1,
2271 #region Level2 adjustment
// Fixed diacritical values for seven Arabic code points, set before the
// per-category pass below reads them.
2273 diacritical [0x624] = 0x5;
2274 diacritical [0x626] = 0x7;
2275 diacritical [0x622] = 0x9;
2276 diacritical [0x623] = 0xA;
2277 diacritical [0x625] = 0xB;
2278 diacritical [0x649] = 0x5; // 'alif maqs.uurah
2279 diacritical [0x64A] = 0x7; // Yaa'
// Each entry is rebuilt with the same category and level 1 and with "mod"
// as its third value; how "mod" is chosen depends on the category.
2282 for (int i = 0; i < char.MaxValue; i++) {
2284 byte cat = map [i].Category;
2286 case 0xE: // Latin diacritics
2287 case 0x22: // Japanese: circled characters
2288 mod = diacritical [i];
2290 case 0x13: // Arabic
2291 if (diacritical [i] == 0)
2292 mod = 0x8; // default for arabic
2295 if (0x52 <= cat && cat <= 0x7F) // Hangul
2296 mod = diacritical [i];
2298 map [i] = new CharMapEntry (
2299 cat, map [i].Level1, mod);
2303 // FIXME: this is a hack, but characters which are
2304 // NonSpacingMark and still undefined
2305 // are likely to be nonspacing.
2306 for (int i = 0; i < char.MaxValue; i++)
2307 if (!map [i].Defined &&
2309 Char.GetUnicodeCategory ((char) i) ==
2310 UnicodeCategory.NonSpacingMark)
2311 AddCharMap ((char) i, 1, 1);
// Advances fillIndex [hangulCat] by one. When the byte wraps around to 0
// the index restarts at 0x2.
// NOTE(review): hangulCat is passed by reference; presumably the overflow
// branch also moves on to the next category -- that line is not visible here.
2314 private void IncrementSequentialIndex (ref byte hangulCat)
2316 fillIndex [hangulCat]++;
2317 if (fillIndex [hangulCat] == 0) { // overflown
2319 fillIndex [hangulCat] = 0x2;
2323 // Reset fillIndex to fixed value and call AddLetterMap().
// c is mapped at alphaWeight first; then every code point listed for c in
// latinMap is mapped with update count 0, i.e. at the same weight.
2324 private void AddAlphaMap (char c, byte category, byte alphaWeight)
2326 fillIndex [category] = alphaWeight;
2327 AddLetterMap (c, category, 0);
2329 ArrayList al = latinMap [c] as ArrayList;
2333 foreach (int cp in al)
2334 AddLetterMap ((char) cp, category, 0);
// Maps "voices" consecutive code points starting at i into category 0x22,
// each together with the code point 0x60 above it. The first one gets
// level2 0, the b-th following one gets b + 2. fillIndex is not advanced
// here (update count 0).
2337 private void AddKanaMap (int i, byte voices)
2339 for (byte b = 0; b < voices; b++) {
2340 char c = (char) (i + b);
2341 byte arg = (byte) (b > 0 ? b + 2 : 0);
2343 AddLetterMapCore (c, 0x22, 0, arg);
2345 AddLetterMapCore ((char) (c + 0x60), 0x22, 0, arg);
// Convenience overload: AddLetterMapCore with level2 = 0.
2349 private void AddLetterMap (char c, byte category, byte updateCount)
2351 AddLetterMapCore (c, category, updateCount, 0);
// Adds a letter together with its related forms, in this order: the small
// form (ToSmallForm), the invariant lower-case form (recursively, with
// update count 0), then c itself. fillIndex [category] is advanced by
// updateCount at the end; the conditions guarding some of these steps are
// on lines not visible in this excerpt.
2354 private void AddLetterMapCore (char c, byte category, byte updateCount, byte level2)
2357 // <small> updates index
2358 c2 = ToSmallForm (c);
2360 AddCharMapGroup (c2, category, updateCount, level2);
2361 c2 = Char.ToLower (c, CultureInfo.InvariantCulture);
2362 if (c2 != c && !map [(int) c2].Defined)
2363 AddLetterMapCore (c2, category, 0, level2);
2364 bool doUpdate = true;
2365 if (IsIgnorable ((int) c) || map [(int) c].Defined)
2368 AddCharMapGroup (c, category, 0, level2);
2370 fillIndex [category] += updateCount;
// Convenience overload: AddCharMap with alt = 0.
2373 private bool AddCharMap (char c, byte category, byte increment)
2375 return AddCharMap (c, category, increment, 0);
// Stores a map entry for c unless c is ignorable or already defined (then
// it returns false and changes nothing). The entry normally holds
// (category, fillIndex [category], alt); for category 1 the last two are
// swapped, so the running index ends up in the third slot. Afterwards
// fillIndex [category] is advanced by increment.
2378 private bool AddCharMap (char c, byte category, byte increment, byte alt)
2380 if (IsIgnorable ((int) c) || map [(int) c].Defined)
2381 return false; // do nothing
2382 map [(int) c] = new CharMapEntry (category,
2383 category == 1 ? alt : fillIndex [category],
2384 category == 1 ? fillIndex [category] : alt);
2385 fillIndex [category] += increment;
// Adds the small form of c (ToSmallFormTail), then c itself, then recurses
// on the full-width form (ToFullWidthTail). Every AddCharMap call uses
// updateCount as its increment. The conditions guarding each step are on
// lines not visible in this excerpt.
2389 private void AddCharMapGroupTail (char c, byte category, byte updateCount)
2391 char c2 = ToSmallFormTail (c);
2393 AddCharMap (c2, category, updateCount, 0);
2395 AddCharMap (c, category, updateCount, 0);
2397 c2 = ToFullWidthTail (c);
2399 AddCharMapGroupTail (c2, category, updateCount);
2403 // Adds characters to table in the order below
2404 // (+ increases weight):
2408 // <full> | <super> | <sub>
2409 // <circle> | <wide> (| <narrow>)
2413 // level2 is fixed (does not increase).
// Decomposition types whose variants are added at the same weight as the
// base character (increment 0) by AddCharMapGroup.
2414 int [] sameWeightItems = new int [] {
2415 DecompositionFraction,
2419 DecompositionCircle,
2421 DecompositionNarrow,
// Adds c and the variants recorded for it in nfkdMap: first the <small>
// variant (which advances the index by updateCount), then c and the
// sameWeightItems variants at one shared weight, then the index is advanced
// by updateCount, and finally the <vertical> variant is added. Does nothing
// further when c is already defined (the early exit itself is on a line
// not visible here).
2423 private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2)
2425 if (map [(int) c].Defined)
2428 char small = char.MinValue;
2429 char vertical = char.MinValue;
2430 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
2432 object smv = nfkd [(byte) DecompositionSmall];
2434 small = (char) ((int) smv);
2435 object vv = nfkd [(byte) DecompositionVertical];
2437 vertical = (char) ((int) vv);
2440 // <small> updates index
2441 if (small != char.MinValue)
2442 AddCharMap (small, category, updateCount);
2445 AddCharMap (c, category, 0, level2);
2448 foreach (int weight in sameWeightItems) {
2449 object wv = nfkd [(byte) weight];
2451 AddCharMap ((char) ((int) wv), category, 0, level2);
2455 // update index here.
2456 fillIndex [category] += updateCount;
2458 if (vertical != char.MinValue)
2459 AddCharMap (vertical, category, updateCount, level2);
// Adds c at the current position of the (by-reference) category and then
// advances the sequential index by one; the position 9E F9 is stepped over
// with one extra increment.
2462 private void AddCharMapCJK (char c, ref byte category)
2464 AddCharMap (c, category, 0, 0);
2465 IncrementSequentialIndex (ref category);
2467 // Special. I wonder why but Windows skips 9E F9.
2468 if (category == 0x9E && fillIndex [category] == 0xF9)
2469 IncrementSequentialIndex (ref category);
// Adds a CJK character and then the characters recorded for it in nfkdMap
// (decomposition types 0 through 0x12), each at its own sequential position.
// A few characters in the U+3200 block are inserted by hand after specific
// ideographs and are therefore singled out in the generic loop.
2472 private void AddCharMapGroupCJK (char c, ref byte category)
2474 AddCharMapCJK (c, ref category);
2476 // LAMESPEC: see below.
2477 if (c == '\u52DE') {
2478 AddCharMapCJK ('\u3298', ref category);
2479 AddCharMapCJK ('\u3238', ref category);
2482 AddCharMapCJK ('\u32A2', ref category);
2484 // Especially this mapping order totally does
2485 // not make sense to me.
2486 AddCharMapCJK ('\u32A9', ref category);
2488 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
2491 for (byte weight = 0; weight <= 0x12; weight++) {
2492 object wv = nfkd [weight];
2497 // Special: they are ignored in this area.
2498 // FIXME: check if it is sane
2499 if (0xF900 <= w && w <= 0xFAD9)
2501 // LAMESPEC: on Windows some of CJK characters
2502 // in 3200-32B0 are incorrectly mapped. They
2503 // mix Chinese and Japanese Kanji when
2504 // ordering those characters.
2506 case 0x32A2: case 0x3298: case 0x3238: case 0x32A9:
2510 AddCharMapCJK ((char) w, ref category);
// Variant of AddCharMapGroup in which the character itself and every related
// character are added with `updateCount` (rather than 0).
2514 // For now it is only for 0x7 category.
// NOTE(review): null guards around the nfkd lookups and the tail of the
// switch below are not visible in this listing; confirm against the full file.
2515 private void AddCharMapGroup2 (char c, byte category, byte updateCount, byte level2)
// char.MinValue is used as the "no such variant" marker for both locals.
2517 char small = char.MinValue;
2518 char vertical = char.MinValue;
2519 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
2521 object smv = nfkd [(byte) DecompositionSmall];
2523 small = (char) ((int) smv);
2524 object vv = nfkd [(byte) DecompositionVertical];
2526 vertical = (char) ((int) vv);
2529 // <small> updates index
2530 if (small != char.MinValue)
2531 // SPECIAL CASE excluded (FIXME: why?)
2532 if (small != '\u2024')
2533 AddCharMap (small, category, updateCount);
2536 AddCharMap (c, category, updateCount, level2);
2538 // Since nfkdMap is problematic to have two or more
2539 // NFKD to an identical character, here I iterate all.
// The loop bound is exclusive, so U+FFFF itself is never examined.
2540 for (int c2 = 0; c2 < char.MaxValue; c2++) {
// Candidates are characters whose single-element decomposition is `c`.
2541 if (decompLength [c2] == 1 &&
2542 (int) (decompValues [decompIndex [c2]]) == (int) c) {
2543 switch (decompType [c2]) {
2544 case DecompositionCompat:
2545 AddCharMap ((char) c2, category, updateCount, level2);
2551 if (vertical != char.MinValue)
2552 // SPECIAL CASE excluded (FIXME: why?)
2553 if (vertical != '\uFE33' && vertical != '\uFE34')
2554 AddCharMap (vertical, category, updateCount, level2);
// Adds `c` and every character whose decomposition ends in `c`, then advances
// fillIndex for the category by updateCount (1).
// NOTE(review): the declarations of `category` and `level2`, the statement
// under the `decompLength == 0` test, and the remaining AddCharMap arguments
// are not visible in this listing.
2557 private void AddArabicCharMap (char c)
2560 byte updateCount = 1;
// The character itself is added with an update count of 0.
2564 AddCharMap (c, category, 0, level2);
2566 // Since nfkdMap is problematic to have two or more
2567 // NFKD to an identical character, here I iterate all.
// The loop bound is exclusive, so U+FFFF itself is never examined.
2568 for (int c2 = 0; c2 < char.MaxValue; c2++) {
2569 if (decompLength [c2] == 0)
// idx addresses the last element of c2's decomposition.
2571 int idx = decompIndex [c2] + decompLength [c2] - 1;
2572 if ((int) (decompValues [idx]) == (int) c)
2573 AddCharMap ((char) c2, category,
2576 fillIndex [category] += updateCount;
// Resolves `c` through ToDecomposed for the DecompositionFull type, using the
// non-tail lookup (tail = false).
char ToFullWidth (char c)
{
	char resolved = ToDecomposed (c, DecompositionFull, false);
	return resolved;
}
// Resolves `c` through ToDecomposed for the DecompositionFull type, using the
// tail lookup (tail = true).
char ToFullWidthTail (char c)
{
	char resolved = ToDecomposed (c, DecompositionFull, true);
	return resolved;
}
// Resolves `c` through ToDecomposed for the DecompositionSmall type, using the
// non-tail lookup (tail = false).
char ToSmallForm (char c)
{
	char resolved = ToDecomposed (c, DecompositionSmall, false);
	return resolved;
}
// Resolves `c` through ToDecomposed for the DecompositionSmall type, using the
// tail lookup (tail = true).
char ToSmallFormTail (char c)
{
	char resolved = ToDecomposed (c, DecompositionSmall, true);
	return resolved;
}
// Looks up a decomposition value of `c`, provided its decomposition type
// equals `d`.
// NOTE(review): two lines are missing from this listing - presumably the
// result of the type-mismatch branch and an `if (tail)` guard in front of the
// index adjustment; confirm against the complete file.
2599 char ToDecomposed (char c, byte d, bool tail)
2601 if (decompType [(int) c] != d)
// Start of c's decomposition within decompValues.
2603 int idx = decompIndex [(int) c];
// Moves idx to the last element of c's decomposition.
2605 idx += decompLength [(int) c] - 1;
2606 return (char) decompValues [idx];
// Iterates over the jisJapanese collection of JISCharacter entries for `cp`.
// NOTE(review): the loop body and the return statements are not visible in
// this listing; presumably it reports whether an entry for `cp` exists.
2609 bool ExistsJIS (int cp)
2611 foreach (JISCharacter j in jisJapanese)
2619 #region Level 3 properties (Case/Width)
// Returns the level-3 weight for `c`: the raw weight shifted up by 2 when it
// is non-zero, and 0 unchanged otherwise.
private byte ComputeLevel3Weight (char c)
{
	byte raw = ComputeLevel3WeightRaw (c);
	if (raw > 0)
		return (byte) (raw + 2);
	return raw;
}
// Computes the raw level-3 weight of `c` from hard-coded code point ranges,
// the decomposition type, and the small-capital / uppercase tables.
// ComputeLevel3Weight adds 2 to every non-zero result of this method.
// NOTE(review): the statements under most of the range tests below (and some
// further tests) are not visible in this listing, so the weight returned for
// each range is not documented here.
2627 private byte ComputeLevel3WeightRaw (char c) // add 2 for sortkey value
2630 if ('\u11A8' <= c && c <= '\u11F9')
2632 if ('\uFFA0' <= c && c <= '\uFFDC')
2634 if ('\u3130' <= c && c <= '\u3164')
2637 if ('\u2776' <= c && c <= '\u277F')
2639 if ('\u2780' <= c && c <= '\u2789')
// This range overlaps the two tests above; presumably it only matters for
// the code points they did not already handle.
2641 if ('\u2776' <= c && c <= '\u2793')
2643 if ('\u2160' <= c && c <= '\u216F')
2645 if ('\u2181' <= c && c <= '\u2182')
2648 if ('\u2135' <= c && c <= '\u2138')
// NOTE(review): the upper bound is exclusive and stops at U+FE8E, well short
// of the end of the Arabic Presentation Forms-B block; confirm that such a
// narrow range is intended.
2650 if ('\uFE80' <= c && c < '\uFE8E') {
2651 // 2(Isolated)/8(Final)/0x18(Medial)
2652 switch (decompType [(int) c]) {
2653 case DecompositionIsolated:
2655 case DecompositionFinal:
2657 case DecompositionMedial:
2662 // actually I dunno the reason why they have weights.
2685 switch (decompType [(int) c]) {
// For these three types the numeric value of the decomposition type itself
// is OR-ed into the result (their constants are marked "fixed" where declared).
2686 case DecompositionWide: // <wide>
2687 case DecompositionSub: // <sub>
2688 case DecompositionSuper: // <super>
2689 ret |= decompType [(int) c];
2692 if (isSmallCapital [(int) c]) // grep "SMALL CAPITAL"
2694 if (isUppercase [(int) c]) // DerivedCoreProperties
// Variant of IsIgnorable that consults the unicodeAge table and the
// character's UnicodeCategory.
// NOTE(review): another IsIgnorable (int) with the same signature follows in
// this file; presumably the two are selected by conditional compilation that
// is not visible in this listing. The return statements are not visible
// either, so the result of each test is not documented here.
2704 static bool IsIgnorable (int i)
2706 if (unicodeAge [i] >= 3.1)
2708 switch (char.GetUnicodeCategory ((char) i)) {
2709 case UnicodeCategory.OtherNotAssigned:
2710 case UnicodeCategory.Format:
2717 // FIXME: In the future use DerivedAge.txt to examine character
2718 // versions and set those ones that have higher version than
2719 // 1.0 as ignorable.
// Classifies code point `i` for the "ignorable" test using, in order: an
// explicit list of code points, a second list of exceptional code points, a
// long list of code point ranges, and finally the character's UnicodeCategory.
// NOTE(review): the switch/if headers and the return statements are not
// visible in this listing, so the result of each section is not documented here.
2720 static bool IsIgnorable (int i)
2724 // I guess, those characters are added between
2725 // Unicode 1.0 (LCMapString) and Unicode 3.1
2726 // (UnicodeCategory), so they used to be
2727 // something like OtherNotAssigned as of Unicode 1.1.
2728 case 0x2df: case 0x387:
2729 case 0x3d7: case 0x3d8: case 0x3d9:
2730 case 0x3f3: case 0x3f4: case 0x3f5: case 0x3f6:
2731 case 0x400: case 0x40d: case 0x450: case 0x45d:
2732 case 0x587: case 0x58a: case 0x5c4: case 0x640:
2733 case 0x653: case 0x654: case 0x655: case 0x66d:
2735 case 0x1e9b: case 0x202f: case 0x20ad:
2736 case 0x20ae: case 0x20af:
2737 case 0x20e2: case 0x20e3:
2738 case 0x2139: case 0x213a: case 0x2183:
2739 case 0x2425: case 0x2426: case 0x2619:
2740 case 0x2670: case 0x2671: case 0x3007:
2741 case 0x3190: case 0x3191:
2742 case 0xfffc: case 0xfffd:
2744 // exceptional characters filtered by the
2745 // following conditions. Originally those exceptional
2746 // ranges are incorrect (they should not be ignored)
2747 // and most of those characters are unfortunately in
2749 case 0x4d8: case 0x4d9:
2750 case 0x4e8: case 0x4e9:
2751 case 0x3036: case 0x303f:
2752 case 0x337b: case 0xfb1e:
2757 // The whole Sinhala characters.
2758 0x0D82 <= i && i <= 0x0DF4
2759 // The whole Tibetan characters.
2760 || 0x0F00 <= i && i <= 0x0FD1
2761 // The whole Myanmar characters.
2762 || 0x1000 <= i && i <= 0x1059
2763 // The whole Ethiopic, Cherokee,
2764 // Canadian Syllabics, Ogham, Runic,
2765 // Tagalog, Hanunoo, Philippine,
2766 // Buhid, Tagbanwa, Khmer and Mongolian
2768 || 0x1200 <= i && i <= 0x1DFF
2769 // Greek extension characters.
2770 || 0x1F00 <= i && i <= 0x1FFF
2771 // The whole Braille characters.
2772 || 0x2800 <= i && i <= 0x28FF
2773 // CJK radical characters.
2774 || 0x2E80 <= i && i <= 0x2EF3
2775 // Kangxi radical characters.
2776 || 0x2F00 <= i && i <= 0x2FD5
2777 // Ideographic description characters.
2778 || 0x2FF0 <= i && i <= 0x2FFB
2779 // Bopomofo letter and final
2780 || 0x31A0 <= i && i <= 0x31B7
2781 // White square with quadrant characters.
2782 || 0x25F0 <= i && i <= 0x25F7
2783 // Ideographic telegraph symbols.
2784 || 0x32C0 <= i && i <= 0x32CB
2785 || 0x3358 <= i && i <= 0x3370
2786 || 0x33E0 <= i && i <= 0x33FF
2787 // The whole YI characters.
2788 || 0xA000 <= i && i <= 0xA48C
2789 || 0xA490 <= i && i <= 0xA4C6
2790 // Armenian small ligatures
2791 || 0xFB13 <= i && i <= 0xFB17
2792 // hebrew, arabic, variation selector.
2793 || 0xFB1D <= i && i <= 0xFE2F
2794 // Arabic ligatures.
2795 || 0xFEF5 <= i && i <= 0xFEFC
2796 // FIXME: why are they excluded?
2797 || 0x01F6 <= i && i <= 0x01F9
2798 || 0x0218 <= i && i <= 0x0233
2799 || 0x02A9 <= i && i <= 0x02AD
2800 || 0x02EA <= i && i <= 0x02EE
2801 || 0x0349 <= i && i <= 0x036F
2802 || 0x0488 <= i && i <= 0x048F
2803 || 0x04D0 <= i && i <= 0x04FF
2804 || 0x0500 <= i && i <= 0x050F // actually it matters only for 2.0
2805 || 0x06D6 <= i && i <= 0x06ED
2806 || 0x06FA <= i && i <= 0x06FE
2807 || 0x2048 <= i && i <= 0x204D
2808 || 0x20e4 <= i && i <= 0x20ea
2809 || 0x213C <= i && i <= 0x214B
2810 || 0x21EB <= i && i <= 0x21FF
2811 || 0x22F2 <= i && i <= 0x22FF
2812 || 0x237B <= i && i <= 0x239A
2813 || 0x239B <= i && i <= 0x23CF
2814 || 0x24EB <= i && i <= 0x24FF
2815 || 0x2596 <= i && i <= 0x259F
2816 || 0x25F8 <= i && i <= 0x25FF
2817 || 0x2672 <= i && i <= 0x2689
2818 || 0x2768 <= i && i <= 0x2775
2819 || 0x27d0 <= i && i <= 0x27ff
2820 || 0x2900 <= i && i <= 0x2aff
2821 || 0x3033 <= i && i <= 0x303F
2822 || 0x31F0 <= i && i <= 0x31FF
2823 || 0x3250 <= i && i <= 0x325F
2824 || 0x32B1 <= i && i <= 0x32BF
2825 || 0x3371 <= i && i <= 0x337B
2826 || 0xFA30 <= i && i <= 0xFA6A
// Anything not matched above is classified by its Unicode general category.
2830 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
2832 case UnicodeCategory.PrivateUse:
2833 case UnicodeCategory.Surrogate:
2835 // ignored by nature
2836 case UnicodeCategory.Format:
2837 case UnicodeCategory.OtherNotAssigned:
2844 // To check IsIgnorable sanity, try the driver below under MS.NET.
2847 public static void Main ()
2849 for (int i = 0; i <= char.MaxValue; i++)
2850 Dump (i, IsIgnorable (i));
2853 static void Dump (int i, bool ignore)
2855 switch (Char.GetUnicodeCategory ((char) i)) {
2856 case UnicodeCategory.PrivateUse:
2857 case UnicodeCategory.Surrogate:
2858 return; // check nothing
2862 string s2 = new string ((char) i, 10);
2863 int ret = CultureInfo.InvariantCulture.CompareInfo.Compare (s1, s2, CompareOptions.IgnoreCase);
2864 if ((ret == 0) == ignore)
2866 Console.WriteLine ("{0} : {1:x} {2}", ignore ? "o" : "x", i, Char.GetUnicodeCategory ((char) i));
2869 #endregion // IsIgnorable
2871 #region IsIgnorableSymbol
// Classifies code point `i` for the "ignorable symbol" test. IsIgnorable (i)
// is consulted first; after that come explicit code point lists, a few code
// point ranges, the UnicodeCategory, and finally the Char.Is* predicates.
// NOTE(review): the switch headers and most return statements are not visible
// in this listing, so the result of each section is not documented here.
2872 static bool IsIgnorableSymbol (int i)
2874 if (IsIgnorable (i))
2879 case 0x00b5: case 0x01C0: case 0x01C1:
2880 case 0x01C2: case 0x01C3: case 0x01F6:
2881 case 0x01F7: case 0x01F8: case 0x01F9:
2882 case 0x02D0: case 0x02EE: case 0x037A:
2883 case 0x03D7: case 0x03F3:
2884 case 0x0400: case 0x040d:
2885 case 0x0450: case 0x045d:
2886 case 0x048C: case 0x048D:
2887 case 0x048E: case 0x048F:
2888 case 0x0587: case 0x0640: case 0x06E5:
2889 case 0x06E6: case 0x06FA: case 0x06FB:
2890 case 0x06FC: case 0x093D: case 0x0950:
2891 case 0x1E9B: case 0x2139: case 0x3006:
2892 case 0x3033: case 0x3034: case 0x3035:
2893 case 0xFE7E: case 0xFE7F:
2895 case 0x16EE: case 0x16EF: case 0x16F0:
2897 case 0x2183: // ROMAN NUMERAL REVERSED ONE HUNDRED
2898 case 0x3007: // IDEOGRAPHIC NUMBER ZERO
2899 case 0x3038: // HANGZHOU NUMERAL TEN
2900 case 0x3039: // HANGZHOU NUMERAL TWENTY
2901 case 0x303a: // HANGZHOU NUMERAL THIRTY
2907 case 0x02B9: case 0x02BA: case 0x02C2:
2908 case 0x02C3: case 0x02C4: case 0x02C5:
2909 case 0x02C8: case 0x02CC: case 0x02CD:
2910 case 0x02CE: case 0x02CF: case 0x02D2:
2911 case 0x02D3: case 0x02D4: case 0x02D5:
2912 case 0x02D6: case 0x02D7: case 0x02DE:
2913 case 0x02E5: case 0x02E6: case 0x02E7:
2914 case 0x02E8: case 0x02E9:
2915 case 0x309B: case 0x309C:
2917 case 0x055A: // Armenian apostrophe
2918 case 0x05C0: // Hebrew Punct
2919 case 0x0E4F: // Thai FONGMAN
2920 case 0x0E5A: // Thai ANGKHANKHU
2921 case 0x0E5B: // Thai KHOMUT
2923 case 0x09F2: // Bengali Rupee Mark
2924 case 0x09F3: // Bengali Rupee Sign
2926 case 0x221e: // INF.
// Note the mixed bounds: the first and third ranges exclude their upper limit.
2935 if (0xFE70 <= i && i < 0xFE7C // ARABIC LIGATURES B
2937 || 0x0501 <= i && i <= 0x0510 // CYRILLIC KOMI
2938 || 0xFA30 <= i && i < 0xFA70 // CJK COMPAT
2943 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
2945 case UnicodeCategory.Surrogate:
2946 return false; // inconsistent
2948 case UnicodeCategory.SpacingCombiningMark:
2949 case UnicodeCategory.EnclosingMark:
2950 case UnicodeCategory.NonSpacingMark:
2951 case UnicodeCategory.PrivateUse:
2953 if (0x064B <= i && i <= 0x0652) // Arabic
2957 case UnicodeCategory.Format:
2958 case UnicodeCategory.OtherNotAssigned:
2965 // latin in a circle
2966 0x249A <= i && i <= 0x24E9
2967 || 0x2100 <= i && i <= 0x2132
2969 || 0x3196 <= i && i <= 0x31A0
2971 || 0x3200 <= i && i <= 0x321C
2973 || 0x322A <= i && i <= 0x3243
2975 || 0x3260 <= i && i <= 0x32B0
2976 || 0x32D0 <= i && i <= 0x3357
2977 || 0x337B <= i && i <= 0x33DD
// Within the ranges above, `use` is set for everything that is not a letter
// or digit.
2979 use = !Char.IsLetterOrDigit ((char) i);
2983 // This "Digit" rule is a mystery.
2984 // It filters some symbols out.
2985 if (Char.IsLetterOrDigit ((char) i))
2987 if (Char.IsNumber ((char) i))
2989 if (Char.IsControl ((char) i)
2990 || Char.IsSeparator ((char) i)
2991 || Char.IsPunctuation ((char) i))
2993 if (Char.IsSymbol ((char) i))
2996 // FIXME: should check more
3001 // To check IsIgnorableSymbol sanity, try the driver below under MS.NET.
3003 public static void Main ()
3005 CompareInfo ci = CultureInfo.InvariantCulture.CompareInfo;
3006 for (int i = 0; i <= char.MaxValue; i++) {
3007 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3008 if (uc == UnicodeCategory.Surrogate)
3011 bool ret = IsIgnorableSymbol (i);
3013 string s1 = "TEST ";
3014 string s2 = "TEST " + (char) i;
3016 int result = ci.Compare (s1, s2, CompareOptions.IgnoreSymbols);
3018 if (ret != (result == 0))
3019 Console.WriteLine ("{0} : {1:x}[{2}]({3})",
3020 ret ? "should not ignore" :
// Classifies code point `i` for the "ignorable non-spacing" test.
// IsIgnorable (i) is consulted first, then two explicit code point lists and
// two groups of ranges; anything left over is decided by whether its
// UnicodeCategory is NonSpacingMark.
// NOTE(review): the switch header and the return statements of the earlier
// sections are not visible in this listing, so their results are not
// documented here.
3029 static bool IsIgnorableNonSpacing (int i)
3031 if (IsIgnorable (i))
3035 case 0x02C8: case 0x02DE: case 0x0559: case 0x055A:
3036 case 0x05C0: case 0x0ABD: case 0x0CD5: case 0x0CD6:
3037 case 0x309B: case 0x309C: case 0xFF9E: case 0xFF9F:
3039 case 0x02D0: case 0x0670: case 0x0901: case 0x0902:
3040 case 0x094D: case 0x0962: case 0x0963: case 0x0A41:
3041 case 0x0A42: case 0x0A47: case 0x0A48: case 0x0A4B:
3042 case 0x0A4C: case 0x0A81: case 0x0A82: case 0x0B82:
3043 case 0x0BC0: case 0x0CBF: case 0x0CC6: case 0x0CCC:
3044 case 0x0CCD: case 0x0E4E:
3048 if (0x02b9 <= i && i <= 0x02c5
3049 || 0x02cc <= i && i <= 0x02d7
3050 || 0x02e4 <= i && i <= 0x02ef
3051 || 0x20DD <= i && i <= 0x20E0
// 0x00652 is the same value as 0x0652; only the literal is written with an
// extra leading zero.
3055 if (0x064B <= i && i <= 0x00652
3056 || 0x0941 <= i && i <= 0x0948
3057 || 0x0AC1 <= i && i <= 0x0ACD
3058 || 0x0C3E <= i && i <= 0x0C4F
3059 || 0x0E31 <= i && i <= 0x0E3F
// Fallback: true exactly when the character is a non-spacing mark.
3063 return Char.GetUnicodeCategory ((char) i) ==
3064 UnicodeCategory.NonSpacingMark;
3067 // We can reuse IsIgnorableSymbol testcode
3068 // for IsIgnorableNonSpacing.
// Members of CharMapEntry, one entry of the `map` table.
// NOTE(review): the type header, a field between Category and Level2
// (presumably Level1), and most of the constructor body are not visible in
// this listing.
3074 public byte Category;
3076 public byte Level2; // It is always single byte.
// Tested by AddCharMapGroup to detect characters that already have an entry.
3077 public bool Defined;
3079 public CharMapEntry (byte category, byte level1, byte level2)
3081 Category = category;
// Members of JISCharacter: a pair of integer codes for one character.
// NOTE(review): the type header and the constructor body are not visible in
// this listing; presumably CP is the Unicode code point and JIS the
// corresponding JIS code, assigned from `cp` and `cpJIS`.
3090 public readonly int CP;
// Sort key used by JISComparer.
3091 public readonly int JIS;
3093 public JISCharacter (int cp, int cpJIS)
// Orders JISCharacter instances by ascending JIS code.
class JISComparer : IComparer
{
	// Shared instance; the comparer holds no state.
	public static readonly JISComparer Instance =
		new JISComparer ();

	// Returns a negative value when o1 sorts before o2, zero when both have
	// the same JIS code, and a positive value otherwise. Both arguments must
	// be JISCharacter instances (InvalidCastException otherwise).
	public int Compare (object o1, object o2)
	{
		JISCharacter j1 = (JISCharacter) o1;
		JISCharacter j2 = (JISCharacter) o2;
		// The operands used to be subtracted in the opposite order
		// (j2 - j1), which inverted the IComparer contract and sorted
		// in descending order. CompareTo also avoids subtraction overflow.
		return j1.JIS.CompareTo (j2.JIS);
	}
}
// Pairs an integer code with a name for one character.
// NOTE(review): the constructor body is not visible in this listing;
// presumably it assigns CP and Name from `cp` and `name`.
3113 class NonJISCharacter
3115 public readonly int CP;
// Sort key used by NonJISComparer (ordinal comparison).
3116 public readonly string Name;
3118 public NonJISCharacter (int cp, string name)
// Orders NonJISCharacter instances by the ordinal comparison of their names.
class NonJISComparer : IComparer
{
	// Shared instance; the comparer holds no state.
	public static readonly NonJISComparer Instance =
		new NonJISComparer ();

	// Both arguments must be NonJISCharacter instances.
	public int Compare (object o1, object o2)
	{
		NonJISCharacter left = (NonJISCharacter) o1;
		NonJISCharacter right = (NonJISCharacter) o2;
		string leftName = left.Name;
		string rightName = right.Name;
		return string.CompareOrdinal (leftName, rightName);
	}
}
// Compares two DictionaryEntry items by their decimal Value; the int keys
// are extracted as well.
// NOTE(review): the lines after the Decimal.Compare call and after the key
// extraction are not visible in this listing; presumably the keys serve as a
// tie-breaker when the values are equal.
3138 class DecimalDictionaryValueComparer : IComparer
// Singleton: the constructor is private, so Instance is the only instance.
3140 public static readonly DecimalDictionaryValueComparer Instance
3141 = new DecimalDictionaryValueComparer ();
3143 private DecimalDictionaryValueComparer ()
// Both arguments must be boxed DictionaryEntry values with decimal Value and
// int Key (the casts throw otherwise).
3147 public int Compare (object o1, object o2)
3149 DictionaryEntry e1 = (DictionaryEntry) o1;
3150 DictionaryEntry e2 = (DictionaryEntry) o2;
3151 // FIXME: in case of 0, compare decomposition categories
3152 int ret = Decimal.Compare ((decimal) e1.Value, (decimal) e2.Value);
3155 int i1 = (int) e1.Key;
3156 int i2 = (int) e2.Key;
// Compares two DictionaryEntry items by their string Value; the int keys are
// extracted as well.
// NOTE(review): the lines after the String.Compare call and after the key
// extraction are not visible in this listing; presumably the keys serve as a
// tie-breaker when the values are equal.
3161 class StringDictionaryValueComparer : IComparer
// Singleton: the constructor is private, so Instance is the only instance.
3163 public static readonly StringDictionaryValueComparer Instance
3164 = new StringDictionaryValueComparer ();
3166 private StringDictionaryValueComparer ()
// Both arguments must be boxed DictionaryEntry values with string Value and
// int Key (the casts throw otherwise).
3170 public int Compare (object o1, object o2)
3172 DictionaryEntry e1 = (DictionaryEntry) o1;
3173 DictionaryEntry e2 = (DictionaryEntry) o2;
// NOTE(review): String.Compare (string, string) is culture-sensitive, so the
// resulting order can depend on the current culture of the machine that runs
// this tool; confirm that this is intended.
3174 int ret = String.Compare ((string) e1.Value, (string) e2.Value);
3177 int i1 = (int) e1.Key;
3178 int i2 = (int) e2.Key;
// Compares two boxed chars by the sort keys that CollationElementTable
// reports for them, level by level (Primary, Secondary, Thirtiary,
// Quarternary) and key by key.
// NOTE(review): the lines following each difference computation and the
// final return are not visible in this listing; presumably the first
// non-zero difference is returned.
3183 class UCAComparer : IComparer
// Singleton: the constructor is private, so Instance is the only instance.
3185 public static readonly UCAComparer Instance
3186 = new UCAComparer ();
3188 private UCAComparer ()
// Both arguments must be boxed char values (the casts throw otherwise).
3192 public int Compare (object o1, object o2)
3194 char i1 = (char) o1;
3195 char i2 = (char) o2;
3197 int l1 = CollationElementTable.GetSortKeyCount (i1);
3198 int l2 = CollationElementTable.GetSortKeyCount (i2);
// Only the keys that both characters have are compared pairwise.
3199 int l = l1 > l2 ? l2 : l1;
3201 for (int i = 0; i < l; i++) {
3202 SortKeyValue k1 = CollationElementTable.GetSortKey (i1, i);
3203 SortKeyValue k2 = CollationElementTable.GetSortKey (i2, i);
3204 int v = k1.Primary - k2.Primary;
3207 v = k1.Secondary - k2.Secondary;
3210 v = k1.Thirtiary - k2.Thirtiary;
3213 v = k1.Quarternary - k2.Quarternary;
// Members of Tailoring: the list of collected map items, the constructors,
// simple accessors, and the methods that append items.
// NOTE(review): the type header, the declarations of the lcid, alias and
// frenchSort fields, the constructor bodies, and the property headers of the
// two getters below are not visible in this listing.
// Holds the ITailoringMap items in the order they were added.
3226 ArrayList items = new ArrayList ();
3228 public Tailoring (int lcid)
3233 public Tailoring (int lcid, int alias)
// Getter returning the lcid field.
3240 get { return lcid; }
// Getter returning the alias field.
3244 get { return alias; }
// Read/write flag backed by the frenchSort field.
3247 public bool FrenchSort {
3248 get { return frenchSort; }
3249 set { frenchSort = value; }
// Appends a DiacriticalMap built from the two bytes.
3252 public void AddDiacriticalMap (byte target, byte replace)
3254 items.Add (new DiacriticalMap (target, replace));
// Appends a SortKeyMap built from the source string and sort key bytes.
3257 public void AddSortKeyMap (string source, byte [] sortkey)
3259 items.Add (new SortKeyMap (source, sortkey));
// Appends a ReplacementMap built from the two strings.
3262 public void AddReplacementMap (string source, string replace)
3264 items.Add (new ReplacementMap (source, replace));
// Concatenates the char-array encoding of every collected item, in the order
// the items were added, into a single char array.
public char [] ItemToCharArray ()
{
	ArrayList chars = new ArrayList ();
	foreach (ITailoringMap entry in items) {
		char [] encoded = entry.ToCharArray ();
		chars.AddRange (encoded);
	}
	Array flattened = chars.ToArray (typeof (char));
	return flattened as char [];
}
// Implemented by every tailoring item that can be encoded as a char array.
3275 interface ITailoringMap
// Returns the item's encoding; in the implementations visible in this file
// the first element is a numeric kind marker.
3277 char [] ToCharArray ();
// Tailoring item holding a (Target, Replace) byte pair.
// NOTE(review): the constructor body and the end of ToCharArray are not
// visible in this listing; presumably the constructor assigns both fields
// and ToCharArray returns `ret`.
3280 class DiacriticalMap : ITailoringMap
3282 public readonly byte Target;
3283 public readonly byte Replace;
3285 public DiacriticalMap (byte target, byte replace)
// Encoding: three chars - kind marker 2, Target, Replace.
3291 public char [] ToCharArray ()
3293 char [] ret = new char [3];
3294 ret [0] = (char) 02; // kind:DiacriticalMap
3295 ret [1] = (char) Target;
3296 ret [2] = (char) Replace;
// Tailoring item holding a source string and the sort key bytes for it.
// NOTE(review): the constructor body and the end of ToCharArray are not
// visible in this listing; presumably the constructor assigns both fields
// and ToCharArray returns `ret`.
3301 class SortKeyMap : ITailoringMap
3303 public readonly string Source;
3304 public readonly byte [] SortKey;
3306 public SortKeyMap (string source, byte [] sortkey)
// Encoding (Source.Length + 7 chars):
//   [0]                      kind marker 1
//   [1 .. Length]            the chars of Source
//   [Length + 1]             never written, so it keeps the default '\0'
//   [Length + 2 .. Length + 6]  SortKey [0] .. SortKey [4]
3312 public char [] ToCharArray ()
3314 char [] ret = new char [Source.Length + 7];
3315 ret [0] = (char) 01; // kind:SortKeyMap
3316 for (int i = 0; i < Source.Length; i++)
3317 ret [i + 1] = Source [i];
// NOTE(review): exactly five SortKey elements are read without a length
// check, and they fill the array up to its last slot; confirm that every
// caller passes at least five bytes.
3319 for (int i = 0; i < 5; i++)
3320 ret [i + Source.Length + 2] = (char) SortKey [i];
3325 class ReplacementMap : ITailoringMap
3327 public readonly string Source;
3328 public readonly string Replace;
3330 public ReplacementMap (string source, string replace)
3336 public char [] ToCharArray ()
3338 char [] ret = new char [Source.Length + Replace.Length + 3];
3339 ret [0] = (char) 03; // kind:ReplaceMap
3341 for (int i = 0; i < Source.Length; i++)
3342 ret [pos++] = Source [i];
3345 for (int i = 0; i < Replace.Length; i++)
3346 ret [pos++] = Replace [i];