3 // There are two kinds of sort keys: those which are computed and those which are
4 // laid out as an indexed array. Computed sort keys are:
9 // Also, for composite characters it should prepare a different index table.
11 // Though it is possible to "compute" level 3 weights, they are still dumped
12 // to an array to avoid execution cost.
16 // * sortkey getter signature
18 // int GetSortKey (string s, int index, SortKeyBuffer buf)
19 // Stores sort key for corresponding character element into buf and
20 // returns the length of the consumed _source_ character element in s.
22 // * character length to consume
24 // If there are characters whose primary weight is 0, they are consumed
25 // and considered as a part of the character element.
30 using System.Collections;
31 using System.Globalization;
35 namespace Mono.Globalization.Unicode
37 internal class MSCompatSortKeyTableGenerator
// Command-line entry point: builds a generator instance and hands the
// raw argument vector to its Run method.
public static void Main (string [] args)
{
	MSCompatSortKeyTableGenerator generator =
		new MSCompatSortKeyTableGenerator ();
	generator.Run (args);
}
// Decomposition-type codes. ProcessUnidataLine stores one of these per
// code point in decompType []; the Narrow/Wide/Super/Sub codes are also
// tested when the widthCompat table is filled. The "fixed" markers come
// from the original author.
// NOTE(review): DecompositionFull (0xE) and DecompositionNarrow (0xD)
// are declared out of numeric order; the values themselves are distinct.
44 const int DecompositionWide = 1; // fixed
45 const int DecompositionSub = 2; // fixed
46 const int DecompositionSmall = 3;
47 const int DecompositionIsolated = 4;
48 const int DecompositionInitial = 5;
49 const int DecompositionFinal = 6;
50 const int DecompositionMedial = 7;
51 const int DecompositionNoBreak = 8;
52 const int DecompositionVertical = 9;
53 const int DecompositionFraction = 0xA;
54 const int DecompositionFont = 0xB;
55 const int DecompositionSuper = 0xC; // fixed
56 const int DecompositionFull = 0xE;
57 const int DecompositionNarrow = 0xD;
58 const int DecompositionCircle = 0xF;
59 const int DecompositionSquare = 0x10;
60 const int DecompositionCompat = 0x11;
61 const int DecompositionCanonical = 0x12;
63 TextWriter Result = Console.Out;
65 byte [] fillIndex = new byte [256]; // by category
66 CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1];
68 char [] specialIgnore = new char [] {
69 '\u3099', '\u309A', '\u309B', '\u309C', '\u0BCD',
70 '\u0E47', '\u0E4C', '\uFF9E', '\uFF9F'
73 // FIXME: need more love (as always)
74 char [] alphabets = new char [] {'A', 'B', 'C', 'D', 'E', 'F',
75 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
76 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
77 '\u0292', '\u01BE', '\u0298'};
78 byte [] alphaWeights = new byte [] {
79 2, 9, 0xA, 0x1A, 0x21,
80 0x23, 0x25, 0x2C, 0x32, 0x35,
81 0x36, 0x48, 0x51, 0x70, 0x7C,
82 0x7E, 0x89, 0x8A, 0x91, 0x99,
83 0x9F, 0xA2, 0xA4, 0xA6, 0xA7,
84 0xA9, 0xAA, 0xB3, 0xB4};
86 bool [] isSmallCapital = new bool [char.MaxValue + 1];
87 bool [] isUppercase = new bool [char.MaxValue + 1];
89 byte [] decompType = new byte [char.MaxValue + 1];
90 int [] decompIndex = new int [char.MaxValue + 1];
91 int [] decompLength = new int [char.MaxValue + 1];
93 decimal [] decimalValue = new decimal [char.MaxValue + 1];
95 byte [] diacritical = new byte [char.MaxValue + 1];
97 string [] diacritics = new string [] {
99 "WITH ACUTE;", "WITH GRAVE;", " DOT ABOVE;", " MIDDLE DOT;",
100 "WITH CIRCUMFLEX;", "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;",
101 " DIALYTIKA AND TONOS;", "WITH MACRON;", "WITH TILDE;", " RING ABOVE;",
102 " OGONEK;", " CEDILLA;",
103 " DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;",
104 " STROKE;", " CIRCUMFLEX AND ACUTE;",
105 " DIAERESIS AND ACUTE;", "WITH CIRCUMFLEX AND GRAVE;", " L SLASH;",
106 " DIAERESIS AND GRAVE;",
108 " CARON AND DOT ABOVE;", " BREVE AND GRAVE;",
109 " MACRON AND ACUTE;",
110 " MACRON AND GRAVE;",
111 " DIAERESIS AND CARON", " DOT ABOVE AND MACRON", " TILDE AND ACUTE",
112 " RING ABOVE AND ACUTE",
113 " DIAERESIS AND MACRON", " CEDILLA AND ACUTE", " MACRON AND DIAERESIS",
114 " CIRCUMFLEX AND TILDE",
115 " TILDE AND DIAERESIS",
118 " CEDILLA AND BREVE",
119 " OGONEK AND MACRON",
120 " HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;",
123 " PRECEDED BY APOSTROPHE",
125 " LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE",
128 " RETROFLEX;", "DIAERESIS BELOW",
130 " CIRCUMFLEX BELOW", "HORN AND ACUTE",
131 " BREVE BELOW;", " HORN AND GRAVE",
133 " DOT BELOW AND DOT ABOVE",
134 " RIGHT HALF RING", " HORN AND TILDE",
135 " CIRCUMFLEX AND DOT BELOW",
136 " BREVE AND DOT BELOW",
137 " DOT BELOW AND MACRON",
138 " HORN AND HOOK ABOVE",
140 // CIRCLED, PARENTHESIZED and so on
141 "CIRCLED DIGIT", "CIRCLED NUMBER", "CIRCLED LATIN", "CIRCLED KATAKANA",
142 "PARENTHESIZED DIGIT", "PARENTHESIZED NUMBER", "PARENTHESIZED LATIN",
144 byte [] diacriticWeights = new byte [] {
146 0xE, 0xF, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
147 0x17, 0x19, 0x1A, 0x1B, 0x1C,
148 0x1D, 0x1D, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F,
149 0x20, 0x21, 0x22, 0x22, 0x23, 0x24,
150 0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28,
151 0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30,
152 0x43, 0x43, 0x43, 0x44, 0x46, 0x48,
153 0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x5A,
154 0x60, 0x60, 0x61, 0x61, 0x63, 0x68,
155 0x69, 0x69, 0x6A, 0x6D, 0x6E,
157 // CIRCLED, PARENTHESIZED and so on.
158 0xEE, 0xEE, 0xEE, 0xEE, 0xF3, 0xF3, 0xF3, 0xF3
161 int [] numberSecondaryWeightBounds = new int [] {
162 0x660, 0x680, 0x6F0, 0x700, 0x960, 0x970,
163 0x9E0, 0x9F0, 0x9F4, 0xA00, 0xA60, 0xA70,
164 0xAE0, 0xAF0, 0xB60, 0xB70, 0xBE0, 0xC00,
165 0xC60, 0xC70, 0xCE0, 0xCF0, 0xD60, 0xD70,
166 0xE50, 0xE60, 0xED0, 0xEE0
169 char [] orderedCyrillic;
170 char [] orderedGurmukhi;
171 char [] orderedGujarati;
172 char [] orderedGeorgian;
173 char [] orderedThaana;
175 static readonly char [] orderedTamilConsonants = new char [] {
176 // based on traditional Tamil consonants, except for
177 // Grantha (where Microsoft breaks traditionalism).
178 // http://www.angelfire.com/empire/thamizh/padanGaL
179 '\u0B99', '\u0B9A', '\u0B9E', '\u0B9F', '\u0BA3',
180 '\u0BA4', '\u0BA8', '\u0BAA', '\u0BAE', '\u0BAF',
181 '\u0BB0', '\u0BB2', '\u0BB5', '\u0BB4', '\u0BB3',
182 '\u0BB1', '\u0BA9', '\u0B9C', '\u0BB8', '\u0BB7',
185 // cp -> character name (only for some characters)
186 ArrayList sortableCharNames = new ArrayList ();
188 // cp -> arrow value (int)
189 ArrayList arrowValues = new ArrayList ();
191 // cp -> box value (int)
192 ArrayList boxValues = new ArrayList ();
194 // cp -> level1 value
195 Hashtable arabicLetterPrimaryValues = new Hashtable ();
198 Hashtable arabicNameMap = new Hashtable ();
200 // cp -> Hashtable [decompType] -> cp
201 Hashtable nfkdMap = new Hashtable ();
203 // Latin letter -> ArrayList [int]
204 Hashtable latinMap = new Hashtable ();
206 ArrayList jisJapanese = new ArrayList ();
207 ArrayList nonJisJapanese = new ArrayList ();
209 ushort [] cjkJA = new ushort [char.MaxValue +1];// - 0x4E00];
210 ushort [] cjkCHS = new ushort [char.MaxValue +1];// - 0x3100];
211 ushort [] cjkCHT = new ushort [char.MaxValue +1];// - 0x4E00];
212 ushort [] cjkKO = new ushort [char.MaxValue +1];// - 0x4E00];
213 byte [] cjkKOlv2 = new byte [char.MaxValue +1];// - 0x4E00];
215 byte [] ignorableFlags = new byte [char.MaxValue + 1];
217 static double [] unicodeAge = new double [char.MaxValue + 1];
219 ArrayList tailorings = new ArrayList ();
// Drives the whole generator. args [0], when present, names the directory
// holding the input data files; otherwise "downloaded" is used. The method
// parses the sources, post-processes the parsed values, reports progress
// on stderr and finally writes a per-code-point diagnostic log to
// "agelog.txt".
// NOTE(review): several short lines of this method are not visible in this
// view (notably between the progress messages), so the calls that perform
// the generation and serialization steps cannot be confirmed from here.
221 void Run (string [] args)
223 string dirname = args.Length == 0 ? "downloaded" : args [0];
224 ParseSources (dirname);
225 Console.Error.WriteLine ("parse done.");
227 ModifyParsedValues ();
229 Console.Error.WriteLine ("generation done.");
231 Console.Error.WriteLine ("serialization done.");
// Diagnostic dump of age/ignorable information for each code point.
// NOTE(review): the disposal of this writer is not visible in this view.
233 StreamWriter sw = new StreamWriter ("agelog.txt");
// NOTE(review): the bound "i < char.MaxValue" leaves out U+FFFF - confirm
// that this is intended.
234 for (int i = 0; i < char.MaxValue; i++) {
// shouldBe is set for Format and OtherNotAssigned code points.
235 bool shouldBe = false;
236 switch (Char.GetUnicodeCategory ((char) i)) {
237 case UnicodeCategory.Format: case UnicodeCategory.OtherNotAssigned:
238 shouldBe = true; break;
// The statement guarded by this age test is not visible in this view.
240 if (unicodeAge [i] >= 3.1)
242 //if (IsIgnorable (i) != shouldBe)
// Columns: age, IsIgnorable, IsIgnorableSymbol, code point (hex), Unicode
// category, and '!' when IsIgnorable disagrees with shouldBe.
243 sw.WriteLine ("{1} {2} {3} {0:X04} {4} {5}", i, unicodeAge [i], IsIgnorable (i), IsIgnorableSymbol (i), char.GetUnicodeCategory ((char) i), IsIgnorable (i) != shouldBe ? '!' : ' ');
// Byte-table overload: forwards the table to
// CodePointIndexer.CompressArray with element type byte and casts the
// result back to byte [].
byte [] CompressArray (byte [] source, CodePointIndexer i)
{
	object compressed = CodePointIndexer.CompressArray (
		source, typeof (byte), i);
	return (byte []) compressed;
}
// UInt16-table overload: forwards the table to
// CodePointIndexer.CompressArray with element type ushort and casts the
// result back to ushort [].
ushort [] CompressArray (ushort [] source, CodePointIndexer i)
{
	object compressed = CodePointIndexer.CompressArray (
		source, typeof (ushort), i);
	return (ushort []) compressed;
}
264 SerializeTailorings ();
266 byte [] categories = new byte [map.Length];
267 byte [] level1 = new byte [map.Length];
268 byte [] level2 = new byte [map.Length];
269 byte [] level3 = new byte [map.Length];
270 int [] widthCompat = new int [map.Length];
271 for (int i = 0; i < map.Length; i++) {
272 categories [i] = map [i].Category;
273 level1 [i] = map [i].Level1;
274 level2 [i] = map [i].Level2;
275 level3 [i] = ComputeLevel3Weight ((char) i);
276 switch (decompType [i]) {
277 case DecompositionNarrow:
278 case DecompositionWide:
279 case DecompositionSuper:
280 case DecompositionSub:
281 // they are always 1 char
282 widthCompat [i] = decompValues [decompIndex [i]];
288 ignorableFlags = CompressArray (ignorableFlags,
289 MSCompatUnicodeTableUtil.Ignorable);
290 categories = CompressArray (categories,
291 MSCompatUnicodeTableUtil.Category);
292 level1 = CompressArray (level1,
293 MSCompatUnicodeTableUtil.Level1);
294 level2 = CompressArray (level2,
295 MSCompatUnicodeTableUtil.Level2);
296 level3 = CompressArray (level3,
297 MSCompatUnicodeTableUtil.Level3);
298 widthCompat = (int []) CodePointIndexer.CompressArray (
299 widthCompat, typeof (int),
300 MSCompatUnicodeTableUtil.WidthCompat);
301 cjkCHS = CompressArray (cjkCHS,
302 MSCompatUnicodeTableUtil.CjkCHS);
303 cjkCHT = CompressArray (cjkCHT,
304 MSCompatUnicodeTableUtil.Cjk);
305 cjkJA = CompressArray (cjkJA,
306 MSCompatUnicodeTableUtil.Cjk);
307 cjkKO = CompressArray (cjkKO,
308 MSCompatUnicodeTableUtil.Cjk);
309 cjkKOlv2 = CompressArray (cjkKOlv2,
310 MSCompatUnicodeTableUtil.Cjk);
313 Result.WriteLine ("static byte [] ignorableFlags = new byte [] {");
314 for (int i = 0; i < ignorableFlags.Length; i++) {
315 byte value = ignorableFlags [i];
317 Result.Write ("{0},", value);
319 Result.Write ("0x{0:X02},", value);
320 if ((i & 0xF) == 0xF)
321 Result.WriteLine ("// {0:X04}", i - 0xF);
323 Result.WriteLine ("};");
327 Result.WriteLine ("static byte [] categories = new byte [] {");
328 for (int i = 0; i < categories.Length; i++) {
329 byte value = categories [i];
331 Result.Write ("{0},", value);
333 Result.Write ("0x{0:X02},", value);
334 if ((i & 0xF) == 0xF)
335 Result.WriteLine ("// {0:X04}", i - 0xF);
337 Result.WriteLine ("};");
340 // Primary weight value
341 Result.WriteLine ("static byte [] level1 = new byte [] {");
342 for (int i = 0; i < level1.Length; i++) {
343 byte value = level1 [i];
345 Result.Write ("{0},", value);
347 Result.Write ("0x{0:X02},", value);
348 if ((i & 0xF) == 0xF)
349 Result.WriteLine ("// {0:X04}", i - 0xF);
351 Result.WriteLine ("};");
355 Result.WriteLine ("static byte [] level2 = new byte [] {");
356 for (int i = 0; i < level2.Length; i++) {
357 int value = level2 [i];
359 Result.Write ("{0},", value);
361 Result.Write ("0x{0:X02},", value);
362 if ((i & 0xF) == 0xF)
363 Result.WriteLine ("// {0:X04}", i - 0xF);
365 Result.WriteLine ("};");
369 Result.WriteLine ("static byte [] level3 = new byte [] {");
370 for (int i = 0; i < level3.Length; i++) {
371 byte value = level3 [i];
373 Result.Write ("{0},", value);
375 Result.Write ("0x{0:X02},", value);
376 if ((i & 0xF) == 0xF)
377 Result.WriteLine ("// {0:X04}", i - 0xF);
379 Result.WriteLine ("};");
382 // Width insensitivity mappings
383 // (for now it is more lightweight than dumping the
384 // entire NFKD table).
385 Result.WriteLine ("static int [] widthCompat = new int [] {");
386 for (int i = 0; i < widthCompat.Length; i++) {
387 int value = widthCompat [i];
389 Result.Write ("{0},", value);
391 Result.Write ("0x{0:X02},", value);
392 if ((i & 0xF) == 0xF)
393 Result.WriteLine ("// {0:X04}", i - 0xF);
395 Result.WriteLine ("};");
399 SerializeCJK ("cjkCHS", cjkCHS, char.MaxValue);
400 SerializeCJK ("cjkCHT", cjkCHT, 0x9FB0);
401 SerializeCJK ("cjkJA", cjkJA, 0x9FB0);
402 SerializeCJK ("cjkKO", cjkKO, 0x9FB0);
403 SerializeCJK ("cjkKOlv2", cjkKOlv2, 0x9FB0);
// Writes the ushort table cjk to Result as the C# source of a static
// ushort array called `name`. After every 16th element a comment holding
// the index of the first element of that row is emitted.
//   name - identifier used for the generated array.
//   cjk  - table to dump.
//   max  - index compared against the current position on each iteration;
//          the statement it guards is not visible in this view
//          (presumably it ends the dump - TODO confirm).
406 void SerializeCJK (string name, ushort [] cjk, int max)
// offset is always 0 now; the original offset computation is commented out.
408 int offset = 0;//char.MaxValue - cjk.Length;
409 Result.WriteLine ("static ushort [] {0} = new ushort [] {{", name);
410 for (int i = 0; i < cjk.Length; i++) {
411 if (i + offset == max)
413 ushort value = cjk [i];
// Two output formats exist (plain decimal and 4-digit hex); the condition
// choosing between them is not visible in this view.
415 Result.Write ("{0},", value);
417 Result.Write ("0x{0:X04},", value);
// End of a 16-element row: emit the row's starting index as a comment.
418 if ((i & 0xF) == 0xF)
419 Result.WriteLine ("// {0:X04}", i - 0xF + offset);
421 Result.WriteLine ("};");
// Byte-table counterpart of the ushort overload: writes cjk to Result as
// the C# source of a static byte array called `name`, with a row-index
// comment after every 16th element.
//   name - identifier used for the generated array.
//   cjk  - table to dump.
//   max  - index compared against the current position on each iteration;
//          the statement it guards is not visible in this view
//          (presumably it ends the dump - TODO confirm).
425 void SerializeCJK (string name, byte [] cjk, int max)
// offset is always 0 now; the original offset computation is commented out.
427 int offset = 0;//char.MaxValue - cjk.Length;
428 Result.WriteLine ("static byte [] {0} = new byte [] {{", name);
429 for (int i = 0; i < cjk.Length; i++) {
430 if (i + offset == max)
432 byte value = cjk [i];
// Two output formats exist (plain decimal and 2-digit hex); the condition
// choosing between them is not visible in this view.
434 Result.Write ("{0},", value);
436 Result.Write ("0x{0:X02},", value);
// End of a 16-element row: emit the row's starting index as a comment.
437 if ((i & 0xF) == 0xF)
438 Result.WriteLine ("// {0:X04}", i - 0xF + offset);
440 Result.WriteLine ("};");
// Emits two generated tables to Result:
//   1. "tailorings": the concatenated char data of every Tailoring
//      (as returned by ItemToCharArray), 16 chars per row.
//   2. "tailoringInfos": one TailoringInfo per Tailoring, holding its
//      LCID, the start index and length of its data in the first table,
//      and the FrenchSort flag.
// The start index and length of each LCID are remembered in two hash
// tables while the first table is written.
444 void SerializeTailorings ()
// LCID -> start index into the tailorings char array.
446 Hashtable indexes = new Hashtable ();
// LCID -> number of chars written for that LCID.
447 Hashtable counts = new Hashtable ();
448 Result.WriteLine ("static char [] tailorings = new char [] {");
// NOTE(review): the declaration of `count` and two lines at the top of the
// loop body are not visible in this view.
450 foreach (Tailoring t in tailorings) {
453 Result.Write ("/*{0}*/", t.LCID);
454 indexes.Add (t.LCID, count);
455 char [] values = t.ItemToCharArray ();
456 counts.Add (t.LCID, values.Length);
457 foreach (char c in values) {
458 Result.Write ("'\\x{0:X}', ", (int) c);
// Row break after every 16 chars, annotated with the row's first index.
459 if (++count % 16 == 0)
460 Result.WriteLine (" // {0:X04}", count - 16);
463 Result.WriteLine ("};");
465 Result.WriteLine ("static TailoringInfo [] tailoringInfos = new TailoringInfo [] {");
466 foreach (Tailoring t in tailorings) {
// An alias entry reuses the data of the LCID it points at.
467 int target = t.Alias != 0 ? t.Alias : t.LCID;
468 if (!indexes.ContainsKey (target)) {
469 Console.Error.WriteLine ("WARNING: no corresponding definition for tailoring alias. From {0} to {1}", t.LCID, t.Alias);
472 int idx = (int) indexes [target];
473 int cnt = (int) counts [target];
474 bool french = t.FrenchSort;
// NOTE(review): this search matches on t.LCID rather than on `target`;
// a guarding line just above is not visible in this view - confirm which
// entry's FrenchSort is meant to win for aliases.
476 foreach (Tailoring t2 in tailorings)
477 if (t2.LCID == t.LCID)
478 french = t2.FrenchSort;
479 Result.WriteLine ("new TailoringInfo ({0}, 0x{1:X}, {2}, {3}), ", t.LCID, idx, cnt, french ? "true" : "false");
481 Result.WriteLine ("};");
// Builds the paths of all input data files below dirname and runs the
// individual parsers on them. The tailoring source is read from
// "mono-tailoring-source.txt" relative to the current directory, not from
// dirname.
// NOTE(review): the left-hand sides of several path declarations (short
// lines) are not visible in this view; only their continuation lines are.
486 void ParseSources (string dirname)
489 dirname + "/UnicodeData.txt";
490 string derivedCoreProps =
491 dirname + "/DerivedCoreProperties.txt";
493 dirname + "/Scripts.txt";
495 dirname + "/CP932.TXT";
497 dirname + "/DerivedAge.txt";
498 string chXML = dirname + "/common/collation/zh.xml";
499 string jaXML = dirname + "/common/collation/ja.xml";
500 string koXML = dirname + "/common/collation/ko.xml";
// DerivedAge is parsed first; ProcessUnidataLine consults IsIgnorable,
// which presumably depends on the age table - TODO confirm.
502 ParseDerivedAge (derivedAge);
506 ParseJISOrder (cp932); // prior to ParseUnidata()
507 ParseUnidata (unidata);
508 ParseDerivedCoreProperties (derivedCoreProps);
509 ParseScripts (scripts);
510 ParseCJK (chXML, jaXML, koXML);
512 ParseTailorings ("mono-tailoring-source.txt");
// Reads the tailoring source file line by line and feeds each trimmed
// line to ProcessTailoringLine, which updates the "current" Tailoring
// through the ref parameter. When processing a line throws, the line
// number is reported on stderr.
// NOTE(review): the declarations of `t` and `line`, the try keyword and
// whatever follows the error message (e.g. a rethrow) are not visible in
// this view.
515 void ParseTailorings (string filename)
519 using (StreamReader sr = new StreamReader (filename)) {
521 while (sr.Peek () >= 0) {
523 ProcessTailoringLine (ref t,
524 sr.ReadLine ().Trim ());
526 } catch (Exception) {
527 Console.Error.WriteLine ("ERROR at line {0}", line);
533 // For now this is enough.
// Decodes a tailoring source token. Every "\uXXXX" escape (a backslash,
// 'u' and exactly four hex digits) is replaced by the UTF-16 code unit it
// denotes; every other character is copied unchanged.
//
// The escape is detected at the current scan position. The previous
// implementation tested s.StartsWith ("\\u") and always decoded
// s.Substring (2, 4), so an escape that was not at index 0 was left
// undecoded and a second escape repeated the first one.
//
//   s       - raw token text from the tailoring source file.
//   returns - the decoded string.
//   throws  - FormatException when an escape is truncated or its digits
//             are not valid hexadecimal.
string ParseTailoringSourceValue (string s)
{
	StringBuilder sb = new StringBuilder (s.Length);
	int i = 0;
	while (i < s.Length) {
		// Ordinal comparison of the two characters at position i with "\u".
		if (string.CompareOrdinal (s, i, "\\u", 0, 2) == 0) {
			if (i + 6 > s.Length)
				throw new FormatException (
					"Truncated \\u escape in tailoring source value: " + s);
			sb.Append ((char) int.Parse (
				s.Substring (i + 2, 4),
				NumberStyles.HexNumber,
				CultureInfo.InvariantCulture));
			i += 6; // skip the backslash, 'u' and four digits
		} else {
			sb.Append (s [i]);
			i++;
		}
	}
	return sb.ToString ();
}
// Interprets one (already trimmed) line of the tailoring source file and
// applies it to the current Tailoring t. Forms recognisable from the
// visible code:
//   - everything from '#' onwards is a comment and is dropped;
//   - a line that creates a new Tailoring, either from "<c>LCID=ALIAS" or
//     from "<c>LCID" (the marker character <c> is skipped via
//     Substring (1); the test for it is not visible in this view);
//   - "*FrenchSort ...";
//   - "*Diacritical XX -> YY" with two hex byte values;
//   - "source : b0 b1 b2 b3 b4" with hex byte values (sort key map);
//   - "source = replacement" (replacement map).
// NOTE(review): the conditions selecting between these forms sit on short
// lines that are not visible here, so their exact order and guards are
// unconfirmed.
550 void ProcessTailoringLine (ref Tailoring t, string s)
552 int idx = s.IndexOf ('#');
554 s = s.Substring (0, idx).Trim ();
555 if (s.Length == 0 || s [0] == '#')
558 idx = s.IndexOf ('=');
// LCID and alias are parsed as decimal integers.
561 int.Parse (s.Substring (1, idx - 1)),
562 int.Parse (s.Substring (idx + 1)));
564 t = new Tailoring (int.Parse (s.Substring (1)));
568 if (s.StartsWith ("*FrenchSort")) {
572 string d = "*Diacritical";
573 if (s.StartsWith (d)) {
574 idx = s.IndexOf ("->");
575 t.AddDiacriticalMap (
576 byte.Parse (s.Substring (d.Length, idx - d.Length).Trim (),
577 NumberStyles.HexNumber),
578 byte.Parse (s.Substring (idx + 2).Trim (),
579 NumberStyles.HexNumber));
582 idx = s.IndexOf (':');
584 string source = s.Substring (0, idx).Trim ();
// The right-hand side is split on single spaces into hex byte values.
585 string [] l = s.Substring (idx + 1).Trim ().Split (' ');
586 byte [] b = new byte [5];
587 for (int i = 0; i < 5; i++) {
591 b [i] = byte.Parse (l [i],
592 NumberStyles.HexNumber);
594 t.AddSortKeyMap (ParseTailoringSourceValue (source),
597 idx = s.IndexOf ('=');
599 t.AddReplacementMap (
600 ParseTailoringSourceValue (
601 s.Substring (0, idx).Trim ()),
602 ParseTailoringSourceValue (
603 s.Substring (idx + 1).Trim ()));
// Reads a DerivedAge-style file ("XXXX[..YYYY] ; version # comment") and
// stores the version number of every listed code point in unicodeAge [].
// Entry 0 is afterwards forced to double.MaxValue.
// NOTE(review): the guards around the '#' and ';' handling and the
// statement following the "cp > char.MaxValue" test are not visible in
// this view.
606 void ParseDerivedAge (string filename)
608 using (StreamReader file =
609 new StreamReader (filename)) {
610 while (file.Peek () >= 0) {
611 string s = file.ReadLine ();
// Cut off the trailing comment.
612 int idx = s.IndexOf ('#');
614 s = s.Substring (0, idx);
615 idx = s.IndexOf (';');
// cpspec is either a single hex code point or a "first..last" range.
619 string cpspec = s.Substring (0, idx);
620 idx = cpspec.IndexOf ("..");
621 NumberStyles nf = NumberStyles.HexNumber |
622 NumberStyles.AllowTrailingWhite;
623 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
624 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
625 string value = s.Substring (cpspec.Length + 1).Trim ();
628 if (cp > char.MaxValue)
// NOTE(review): double.Parse uses the current culture here; a culture with
// ',' as decimal separator would misread "3.1" - confirm and consider
// CultureInfo.InvariantCulture. Also, no visible line clamps cpEnd to
// char.MaxValue.
631 for (int i = cp; i <= cpEnd; i++)
632 unicodeAge [i] = double.Parse (value);
635 unicodeAge [0] = double.MaxValue; // never to be supported
// Reads the UnicodeData file line by line through ProcessUnidataLine,
// collecting all decomposition code points in one list. When a line
// throws, its line number is reported on stderr. The collected list is
// finally stored in this.decompValues as an int array.
// NOTE(review): the try keyword and whatever follows the error message
// (e.g. a rethrow) are not visible in this view.
638 void ParseUnidata (string filename)
640 ArrayList decompValues = new ArrayList ();
641 using (StreamReader unidata =
642 new StreamReader (filename)) {
643 for (int line = 1; unidata.Peek () >= 0; line++) {
645 ProcessUnidataLine (unidata.ReadLine (), decompValues);
646 } catch (Exception) {
647 Console.Error.WriteLine ("**** At line " + line);
652 this.decompValues = (int [])
653 decompValues.ToArray (typeof (int));
// Processes one line of UnicodeData.txt. From the code point and its
// ';'-separated fields it fills, as far as visible here: isSmallCapital,
// latinMap, arrowValues, boxValues, sortableCharNames, diacritical,
// arabicNameMap / arabicLetterPrimaryValues, nonJisJapanese,
// decompType / decompIndex / decompLength, nfkdMap and decimalValue.
//   s            - the raw input line.
//   decompValues - shared list receiving the decomposition code points;
//                  decompIndex [cp] is the position of the first one.
// Most name-based tests call IndexOf on the whole line s, not only on the
// name field.
// NOTE(review): many short lines (guards, else branches, case labels,
// local declarations) are not visible in this view; comments below only
// describe what the visible lines establish.
656 void ProcessUnidataLine (string s, ArrayList decompValues)
658 int idx = s.IndexOf ('#');
660 s = s.Substring (0, idx);
661 idx = s.IndexOf (';');
// Field 0 is the hex code point; values [] holds the remaining fields, so
// values [0] is the character name.
664 int cp = int.Parse (s.Substring (0, idx), NumberStyles.HexNumber);
665 string [] values = s.Substring (idx + 1).Split (';');
668 if (cp > char.MaxValue)
670 if (IsIgnorable (cp))
673 string name = values [0];
676 if (s.IndexOf ("SMALL CAPITAL") > 0)
677 isSmallCapital [cp] = true;
679 // latin mapping by character name
680 if (s.IndexOf ("LATIN") > 0) {
// 15 is the length of "LETTER DOTLESS "; offset points at the letter that
// follows the matched prefix.
681 int lidx = s.IndexOf ("LETTER DOTLESS ");
682 int offset = lidx + 15;
684 lidx = s.IndexOf ("LETTER TURNED ");
688 lidx = s.IndexOf ("LETTER ");
// Only a single capital letter followed by a space or the end of the line
// qualifies as a base letter.
691 char c = lidx > 0 ? s [offset] : char.MinValue;
692 if ('A' <= c && c <= 'Z' &&
693 (s.Length == offset + 1 || s [offset + 1] == ' ')) {
694 ArrayList entry = (ArrayList) latinMap [c];
696 entry = new ArrayList ();
697 latinMap [c] = entry;
704 if (0x2000 <= cp && cp < 0x3000) {
706 // SPECIAL CASES. FIXME: why?
708 case 0x21C5: value = -1; break; // E2
709 case 0x261D: value = 1; break;
710 case 0x27A6: value = 3; break;
711 case 0x21B0: value = 7; break;
712 case 0x21B1: value = 3; break;
713 case 0x21B2: value = 7; break;
714 case 0x21B4: value = 5; break;
715 case 0x21B5: value = 7; break;
716 case 0x21B9: value = -1; break; // E1
717 case 0x21CF: value = 7; break;
718 case 0x21D0: value = 3; break;
720 string [] arrowTargets = new string [] {
// Runs only while no special-case value was assigned above; index 0 of
// arrowTargets is skipped.
732 for (int i = 1; value == 0 && i < arrowTargets.Length; i++)
733 if (s.IndexOf (arrowTargets [i]) > 0 &&
734 s.IndexOf ("BARB " + arrowTargets [i]) < 0 &&
735 s.IndexOf (" OVER") < 0
739 arrowValues.Add (new DictionaryEntry (
744 if (0x2500 <= cp && cp < 0x25B0) {
747 // up:1 down:2 right:4 left:8 vert:16 horiz:32
750 // [dr] [dl] [ur] [ul]
// Known direction-flag combinations; the index found in this list selects
// the entry of offsets [] used as the box value.
754 ArrayList flags = new ArrayList (new int [] {
757 4 + 2, 8 + 2, 4 + 1, 8 + 1,
758 16 + 4, 1 + 2 + 4, 16 + 8, 1 + 2 + 8,
759 32 + 2, 4 + 8 + 2, 32 + 1, 4 + 8 + 1,
760 16 + 32, 1 + 2 + 4 + 8, 4 + 8 + 16, 1 + 2 + 32
762 byte [] offsets = new byte [] {
769 if (s.IndexOf ("BOX DRAWINGS ") > 0) {
771 if (s.IndexOf (" UP") > 0)
773 if (s.IndexOf (" DOWN") > 0)
775 if (s.IndexOf (" RIGHT") > 0)
777 if (s.IndexOf (" LEFT") > 0)
779 if (s.IndexOf (" VERTICAL") > 0)
781 if (s.IndexOf (" HORIZONTAL") > 0)
// An unknown combination yields the negative IndexOf result as the value.
784 int fidx = flags.IndexOf (flag);
785 value = fidx < 0 ? fidx : offsets [fidx];
786 } else if (s.IndexOf ("BLOCK") > 0) {
787 if (s.IndexOf ("ONE EIGHTH") > 0)
789 else if (s.IndexOf ("ONE QUARTER") > 0)
791 else if (s.IndexOf ("THREE EIGHTHS") > 0)
793 else if (s.IndexOf ("HALF") > 0)
795 else if (s.IndexOf ("FIVE EIGHTHS") > 0)
797 else if (s.IndexOf ("THREE QUARTERS") > 0)
799 else if (s.IndexOf ("SEVEN EIGHTHS") > 0)
805 boxValues.Add (new DictionaryEntry (
809 // For some characters store the name and sort later
810 // to determine sorting.
811 if (0x2100 <= cp && cp <= 0x213F &&
812 Char.IsSymbol ((char) cp))
813 sortableCharNames.Add (
814 new DictionaryEntry (cp, values [0]));
// For U+3380..U+33DD the first 7 characters of the name are dropped
// (presumably the "SQUARE " prefix - TODO confirm).
815 else if (0x3380 <= cp && cp <= 0x33DD)
816 sortableCharNames.Add (new DictionaryEntry (
817 cp, values [0].Substring (7)));
819 // diacritical weights by character name
// Every matching pattern ORs its weight into diacritical [cp].
820 for (int d = 0; d < diacritics.Length; d++)
821 if (s.IndexOf (diacritics [d]) > 0)
822 diacritical [cp] |= diacriticWeights [d];
823 // Two-step grep required for it.
824 if (s.IndexOf ("FULL STOP") > 0 &&
825 (s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0))
826 diacritical [cp] |= 0xF4;
828 // Arabic letter name
829 if (0x0621 <= cp && cp <= 0x064A &&
830 Char.GetUnicodeCategory ((char) cp)
831 == UnicodeCategory.OtherLetter) {
// Default primary value: depends on how many distinct letter names have
// been seen so far.
832 byte value = (byte) (arabicNameMap.Count * 4 + 0x0B);
837 // hamza, waw, yeh ... special cases.
842 value = 0x77; // special cases.
845 // Get primary letter name i.e.
846 // XXX part of ARABIC LETTER XXX yyy
847 // e.g. that of "TEH MARBUTA" is "TEH".
850 // 0x0640 is special: it does
851 // not start with ARABIC LETTER
// 14 is the length of "ARABIC LETTER ".
853 values [0].Substring (14);
854 int tmpIdx = letterName.IndexOf (' ');
855 letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
856 //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
// A letter name seen before reuses the primary value of its first holder.
857 if (arabicNameMap.ContainsKey (letterName))
858 value = (byte) arabicLetterPrimaryValues [arabicNameMap [letterName]];
860 arabicNameMap [letterName] = cp;
863 arabicLetterPrimaryValues [cp] = value;
866 // Japanese square letter
867 if (0x3300 <= cp && cp <= 0x3357)
869 nonJisJapanese.Add (new NonJISCharacter (cp, values [0]));
// values [4] is the decomposition field; an optional "<tag>" in it names
// the compatibility decomposition type.
872 string decomp = values [4];
873 idx = decomp.IndexOf ('<');
// NOTE(review): the length argument IndexOf ('>') - 1 yields the tag text
// only when '<' is at index 0 - confirm that this always holds.
875 switch (decomp.Substring (idx + 1, decomp.IndexOf ('>') - 1)) {
877 decompType [cp] = DecompositionFull;
880 decompType [cp] = DecompositionSub;
883 decompType [cp] = DecompositionSuper;
886 decompType [cp] = DecompositionSmall;
889 decompType [cp] = DecompositionIsolated;
892 decompType [cp] = DecompositionInitial;
895 decompType [cp] = DecompositionFinal;
898 decompType [cp] = DecompositionMedial;
901 decompType [cp] = DecompositionNoBreak;
904 decompType [cp] = DecompositionCompat;
907 decompType [cp] = DecompositionFraction;
910 decompType [cp] = DecompositionFont;
913 decompType [cp] = DecompositionCircle;
916 decompType [cp] = DecompositionSquare;
919 decompType [cp] = DecompositionWide;
922 decompType [cp] = DecompositionNarrow;
925 decompType [cp] = DecompositionVertical;
928 throw new Exception ("Support NFKD type : " + decomp);
932 decompType [cp] = DecompositionCanonical;
// Drop the "<tag> " prefix (the '>' plus one following character).
933 decomp = idx < 0 ? decomp : decomp.Substring (decomp.IndexOf ('>') + 2);
934 if (decomp.Length > 0) {
936 string [] velems = decomp.Split (' ');
937 int didx = decompValues.Count;
938 decompIndex [cp] = didx;
939 foreach (string v in velems)
940 decompValues.Add (int.Parse (v, NumberStyles.HexNumber));
941 decompLength [cp] = velems.Length;
943 // [decompType] -> this_cp
944 int targetCP = (int) decompValues [didx];
945 // for "(x)" it specially maps to 'x' .
946 // FIXME: check if it is sane
947 if (velems.Length == 3 &&
948 (int) decompValues [didx] == '(' &&
949 (int) decompValues [didx + 2] == ')')
950 targetCP = (int) decompValues [didx + 1];
951 // special: 0x215F "1/"
952 else if (cp == 0x215F)
954 else if (velems.Length > 1 &&
955 (targetCP < 0x4C00 || 0x9FBB < targetCP))
956 // skip them, except for CJK ideograph compat
// nfkdMap: target code point -> (decomposition type -> source code point).
960 Hashtable entry = (Hashtable) nfkdMap [targetCP];
962 entry = new Hashtable ();
963 nfkdMap [targetCP] = entry;
965 entry [(byte) decompType [cp]] = cp;
// Numeric value: the first non-empty field among values [5], [6], [7]
// wins. NOTE(review): decimal.Parse uses the current culture here - confirm
// and consider CultureInfo.InvariantCulture.
969 if (values [5].Length > 0)
970 decimalValue [cp] = decimal.Parse (values [5]);
971 else if (values [6].Length > 0)
972 decimalValue [cp] = decimal.Parse (values [6]);
973 else if (values [7].Length > 0) {
974 string decstr = values [7];
975 idx = decstr.IndexOf ('/');
976 if (cp == 0x215F) // special. "1/"
977 decimalValue [cp] = 0x1;
// Fraction "a/b": numerator divided by denominator.
981 decimal.Parse (decstr.Substring (0, idx))
982 / decimal.Parse (decstr.Substring (idx + 1));
// "(n)" form: the value between the parentheses.
983 else if (decstr [0] == '(' &&
984 decstr [decstr.Length - 1] == ')')
987 decimal.Parse (decstr.Substring (1, decstr.Length - 2));
// "n." form: the value before the trailing dot.
988 else if (decstr [decstr.Length - 1] == '.')
991 decimal.Parse (decstr.Substring (0, decstr.Length - 1));
993 decimalValue [cp] = decimal.Parse (decstr);
// Reads the DerivedCoreProperties file line by line through
// ProcessDerivedCorePropLine. When a line throws, its line number is
// reported on stderr.
// NOTE(review): the try keyword and whatever follows the error message
// (e.g. a rethrow) are not visible in this view.
997 void ParseDerivedCoreProperties (string filename)
1000 using (StreamReader file =
1001 new StreamReader (filename)) {
1002 for (int line = 1; file.Peek () >= 0; line++) {
1004 ProcessDerivedCorePropLine (file.ReadLine ());
1005 } catch (Exception) {
1006 Console.Error.WriteLine ("**** At line " + line);
// Processes one "XXXX[..YYYY] ; Property # comment" line and marks the
// listed code points in isUppercase [].
// NOTE(review): the test on `value` that selects which property sets
// isUppercase, and the statement following "cp > char.MaxValue", are not
// visible in this view.
1013 void ProcessDerivedCorePropLine (string s)
// Cut off the trailing comment.
1015 int idx = s.IndexOf ('#');
1017 s = s.Substring (0, idx);
1018 idx = s.IndexOf (';');
// cpspec is either a single hex code point or a "first..last" range.
1021 string cpspec = s.Substring (0, idx);
1022 idx = cpspec.IndexOf ("..");
1023 NumberStyles nf = NumberStyles.HexNumber |
1024 NumberStyles.AllowTrailingWhite;
1025 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1026 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
1027 string value = s.Substring (cpspec.Length + 1).Trim ();
1030 if (cp > char.MaxValue)
1035 for (int x = cp; x <= cpEnd; x++)
1036 isUppercase [x] = true;
// Reads a Scripts-style file ("XXXX[..YYYY] ; Script # comment") and
// collects the non-ignorable code points of five scripts into separate
// lists. Each list is then sorted with UCAComparer.Instance and stored in
// the matching ordered* char array.
// NOTE(review): the switch on the script name (case labels and breaks) is
// not visible in this view; the mapping of the loops below to scripts is
// taken from the list each loop fills.
1041 void ParseScripts (string filename)
1043 ArrayList cyrillic = new ArrayList ();
1044 ArrayList gurmukhi = new ArrayList ();
1045 ArrayList gujarati = new ArrayList ();
1046 ArrayList georgian = new ArrayList ();
1047 ArrayList thaana = new ArrayList ();
1049 using (StreamReader file =
1050 new StreamReader (filename)) {
1051 while (file.Peek () >= 0) {
1052 string s = file.ReadLine ();
// Cut off the trailing comment.
1053 int idx = s.IndexOf ('#');
1055 s = s.Substring (0, idx);
1056 idx = s.IndexOf (';');
// cpspec is either a single hex code point or a "first..last" range.
1060 string cpspec = s.Substring (0, idx);
1061 idx = cpspec.IndexOf ("..");
1062 NumberStyles nf = NumberStyles.HexNumber |
1063 NumberStyles.AllowTrailingWhite;
1064 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1065 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
1066 string value = s.Substring (cpspec.Length + 1).Trim ();
1069 if (cp > char.MaxValue)
1074 for (int x = cp; x <= cpEnd; x++)
1075 if (!IsIgnorable (x))
1076 cyrillic.Add ((char) x);
1079 for (int x = cp; x <= cpEnd; x++)
1080 if (!IsIgnorable (x))
1081 gurmukhi.Add ((char) x);
1084 for (int x = cp; x <= cpEnd; x++)
1085 if (!IsIgnorable (x))
1086 gujarati.Add ((char) x);
1089 for (int x = cp; x <= cpEnd; x++)
1090 if (!IsIgnorable (x))
1091 georgian.Add ((char) x);
1094 for (int x = cp; x <= cpEnd; x++)
1095 if (!IsIgnorable (x))
1096 thaana.Add ((char) x);
// Order every script's characters with the UCA comparer before freezing
// them into arrays.
1101 cyrillic.Sort (UCAComparer.Instance);
1102 gurmukhi.Sort (UCAComparer.Instance);
1103 gujarati.Sort (UCAComparer.Instance);
1104 georgian.Sort (UCAComparer.Instance);
1105 thaana.Sort (UCAComparer.Instance);
1106 orderedCyrillic = (char []) cyrillic.ToArray (typeof (char));
1107 orderedGurmukhi = (char []) gurmukhi.ToArray (typeof (char));
1108 orderedGujarati = (char []) gujarati.ToArray (typeof (char));
1109 orderedGeorgian = (char []) georgian.ToArray (typeof (char));
1110 orderedThaana = (char []) thaana.ToArray (typeof (char));
// Reads the CP932 mapping file and appends one JISCharacter (Unicode code
// point, JIS code) per usable line to jisJapanese.
// NOTE(review): the guards after the '#' and ' ' searches are not visible
// in this view.
1113 void ParseJISOrder (string filename)
1115 using (StreamReader file =
1116 new StreamReader (filename)) {
1117 while (file.Peek () >= 0) {
1118 string s = file.ReadLine ();
// Cut off the trailing comment.
1119 int idx = s.IndexOf ('#');
1121 s = s.Substring (0, idx).Trim ();
1124 idx = s.IndexOf (' ');
1127 // They start with "0x" so cut them out.
// NOTE(review): Substring (2, idx) takes idx characters starting at index
// 2, i.e. two characters past the separator found at idx; "idx - 2" looks
// like the intended length - confirm against the real file format before
// changing.
1128 int jis = int.Parse (s.Substring (2, idx), NumberStyles.HexNumber);
1129 int cp = int.Parse (s.Substring (idx + 3).Trim (), NumberStyles.HexNumber);
1130 jisJapanese.Add (new JISCharacter (cp, jis));
// Fills the CJK weight tables from the LDML collation files. For Chinese
// (pinyin and stroke collations) and for the third section, the
// characters of the <pc> rule text receive consecutive values in their
// order of appearance. For Korean, the characters following each "reset"
// element share a value computed from the reset Hangul syllable, with a
// per-character secondary value stored in cjkKOlv2.
// NOTE(review): the lines loading each XML file, selecting the target
// array (`arr`), initialising `v` / `category` and guarding the warning
// are not visible in this view.
1135 void ParseCJK (string zhXML, string jaXML, string koXML)
// External entity resolution is disabled for the document.
1137 XmlDocument doc = new XmlDocument ();
1138 doc.XmlResolver = null;
1145 // Chinese Simplified
1148 offset = 0;//char.MaxValue - arr.Length;
1150 s = doc.SelectSingleNode ("/ldml/collations/collation[@type='pinyin']/rules/pc").InnerText;
1152 foreach (char c in s) {
1154 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1156 arr [(int) c - offset] = (ushort) v++;
1162 // Chinese Traditional
1165 offset = 0;//char.MaxValue - arr.Length;
1166 s = doc.SelectSingleNode ("/ldml/collations/collation[@type='stroke']/rules/pc").InnerText;
1168 foreach (char c in s) {
1170 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1172 arr [(int) c - offset] = (ushort) v++;
// Third section: uses the first collation's <pc> text of the currently
// loaded document (presumably the Japanese file - TODO confirm).
1181 offset = 0;//char.MaxValue - arr.Length;
1183 s = doc.SelectSingleNode ("/ldml/collations/collation/rules/pc").InnerText;
1185 foreach (char c in s) {
1187 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1189 arr [(int) c - offset] = (ushort) v++;
1196 // Korean weight is somewhat complex. It first shifts
1197 // Hangul category from 52-x to 80-x (they are anyways
1198 // computed). CJK ideographs are placed at secondary
1199 // weight, like XX YY 01 zz 01, where XX and YY are
1200 // corresponding "reset" value and zz is 41,43,45...
1202 // Unlike chs,cht and ja, Korean value is a combined
1203 // ushort which is computed as category
1207 offset = 0;//char.MaxValue - arr.Length;
1209 foreach (XmlElement reset in doc.SelectNodes ("/ldml/collations/collation/rules/reset")) {
// The element right after each <reset> carries the characters to place.
1210 XmlElement sc = (XmlElement) reset.NextSibling;
1211 // compute "category" and "level 1" for the
1212 // target "reset" Hangul syllable
1213 char rc = reset.InnerText [0];
// 1-based index of the reset syllable counted from U+AC00.
1214 int ri = ((int) rc - 0xAC00) + 1;
1216 ((ri / 254) * 256 + (ri % 254) + 2);
1217 // Place the characters after the target.
1220 foreach (char c in s) {
1221 arr [(int) c - offset] = p;
1222 cjkKOlv2 [(int) c - offset] = (byte) v;
// Fills ignorableFlags [] for every UTF-16 code unit (0..char.MaxValue):
//   bit 1 - IsIgnorable
//   bit 2 - IsIgnorableSymbol
//   bit 4 - IsIgnorableNonSpacing
// NOTE(review): the statement guarded by the OtherNotAssigned test is not
// visible in this view (presumably such code points are skipped - TODO
// confirm).
1232 void FillIgnorables ()
1234 for (int i = 0; i <= char.MaxValue; i++) {
1235 if (Char.GetUnicodeCategory ((char) i) ==
1236 UnicodeCategory.OtherNotAssigned)
1238 if (IsIgnorable (i))
1239 ignorableFlags [i] |= 1;
1240 if (IsIgnorableSymbol (i))
1241 ignorableFlags [i] |= 2;
1242 if (IsIgnorableNonSpacing (i))
1243 ignorableFlags [i] |= 4;
// Post-processes the tables filled by the parsers:
//   - overwrites diacritical [] for the number characters inside the
//     ranges listed in numberSecondaryWeightBounds;
//   - assigns fixed diacritical values to U+3200..U+321C and
//     U+3260..U+327B;
//   - renames, or removes, selected entries of sortableCharNames.
// NOTE(review): the declaration of `weight`, the switch header and the
// case labels leading to the removal are not visible in this view.
1247 void ModifyParsedValues ()
1249 // number, secondary weights
// numarr holds [start, end) pairs; every pair gets the next weight.
1251 int [] numarr = numberSecondaryWeightBounds;
1252 for (int i = 0; i < numarr.Length; i += 2, weight++)
1253 for (int cp = numarr [i]; cp < numarr [i + 1]; cp++)
1254 if (Char.IsNumber ((char) cp))
1255 diacritical [cp] = weight;
1257 // Korean parens numbers
1258 for (int i = 0x3200; i <= 0x321C; i++)
1259 diacritical [i] = 0xA;
1260 for (int i = 0x3260; i <= 0x327B; i++)
1261 diacritical [i] = 0xC;
1263 // Update name part of named characters
1264 for (int i = 0; i < sortableCharNames.Count; i++) {
1265 DictionaryEntry de =
1266 (DictionaryEntry) sortableCharNames [i];
1267 int cp = (int) de.Key;
1268 string renamed = null;
1270 case 0x2101: renamed = "A_1"; break;
1271 case 0x33C3: renamed = "A_2"; break;
1272 case 0x2105: renamed = "C_1"; break;
1273 case 0x2106: renamed = "C_2"; break;
1274 case 0x211E: renamed = "R1"; break;
1275 case 0x211F: renamed = "R2"; break;
1276 // Remove some of them!
// NOTE(review): removing while iterating by index requires stepping i
// back; the lines following this call are not visible in this view.
1287 sortableCharNames.RemoveAt (i);
// Replace the entry with one carrying the new name.
1291 if (renamed != null)
1292 sortableCharNames [i] =
1293 new DictionaryEntry (cp, renamed);
// Builds the main character table (map) by assigning a category and weights
// to characters, one script / character class at a time.
// fillIndex [category] holds the next weight to hand out for that category;
// the AddCharMap / AddLetterMap / AddCharMapGroup helpers read and advance it.
// Most helpers skip characters that are already defined, so the order of the
// sections below matters.
1297 void GenerateCore ()
1301 #region Specially ignored // 01
1302 // This will raise "Defined" flag up.
// These characters get category 0 and zero weights.
1303 foreach (char c in specialIgnore)
1304 map [(int) c] = new CharMapEntry (0, 0, 0);
1308 #region Variable weights
1309 // Controls : 06 03 - 06 3D
1311 for (int i = 0; i < 65536; i++) {
1312 if (IsIgnorable (i))
1315 uc = Char.GetUnicodeCategory (c);
1316 // NEL is whitespace but not ignored here.
// && binds tighter than ||: the test is
// (Control and not whitespace) or U+0085.
1317 if (uc == UnicodeCategory.Control &&
1318 !Char.IsWhiteSpace (c) || c == '\u0085')
1319 AddCharMap (c, 6, 1);
1323 fillIndex [6] = 0x80;
1324 AddCharMapGroup ('\'', 6, 1, 0);
1325 AddCharMap ('\uFE63', 6, 1);
1327 // Hyphen/Dash : 06 81 - 06 90
// This loop stops before char.MaxValue, so U+FFFF itself is not visited.
1328 for (int i = 0; i < char.MaxValue; i++) {
1329 if (Char.GetUnicodeCategory ((char) i)
1330 == UnicodeCategory.DashPunctuation)
1331 // AddCharMapGroupTail ((char) i, 6, 1);
1332 AddCharMapGroup ((char) i, 6, 1, 0);
1335 // Arabic variable weight chars 06 A0 -
1336 fillIndex [6] = 0xA0;
1338 for (int i = 0x64B; i <= 0x650; i++)
1339 AddCharMapGroupTail ((char) i, 6, 1);
// U+0652 is added before U+0651 on purpose of ordering: each call advances
// fillIndex [6] by one.
1341 AddCharMapGroup ('\u0652', 6, 1, 0);
1343 AddCharMapGroup ('\u0651', 6, 1, 0);
// Section of GenerateCore: nonspacing marks (category 01).
// For category 1, AddCharMap stores fillIndex in the third CharMapEntry
// argument and its "alt" argument in the second one (the reverse of the
// other categories). AddCharMap itself also rejects ignorable or already
// defined characters, so the explicit IsIgnorable tests below are an
// additional guard only.
1347 #region Nonspacing marks // 01
1348 // FIXME: 01 03 - 01 B6 ... annoyance :(
1350 // Combining diacritical marks: 01 DC -
1352 fillIndex [0x1] = 0x41;
1353 for (int i = 0x030E; i <= 0x0326; i++)
1354 if (!IsIgnorable (i))
1355 AddCharMap ((char) i, 0x1, 1);
1356 for (int i = 0x0329; i <= 0x0334; i++)
1357 if (!IsIgnorable (i))
1358 AddCharMap ((char) i, 0x1, 1);
1359 for (int i = 0x0339; i <= 0x0341; i++)
1360 if (!IsIgnorable (i))
1361 AddCharMap ((char) i, 0x1, 1);
1362 fillIndex [0x1] = 0x72;
1363 for (int i = 0x0346; i <= 0x0348; i++)
1364 if (!IsIgnorable (i))
1365 AddCharMap ((char) i, 0x1, 1);
1366 for (int i = 0x02BE; i <= 0x02BF; i++)
1367 if (!IsIgnorable (i))
1368 AddCharMap ((char) i, 0x1, 1);
1369 for (int i = 0x02C1; i <= 0x02C5; i++)
1370 if (!IsIgnorable (i))
1371 AddCharMap ((char) i, 0x1, 1);
1372 for (int i = 0x02CE; i <= 0x02CF; i++)
1373 if (!IsIgnorable (i))
1374 AddCharMap ((char) i, 0x1, 1);
1375 for (int i = 0x02D1; i <= 0x02D3; i++)
1376 if (!IsIgnorable (i))
1377 AddCharMap ((char) i, 0x1, 1);
1378 AddCharMap ('\u02DE', 0x1, 1);
1379 for (int i = 0x02E4; i <= 0x02E9; i++)
1380 if (!IsIgnorable (i))
1381 AddCharMap ((char) i, 0x1, 1);
1383 // LAMESPEC: It should not stop at '\u20E1'. There are
1384 // a few more characters (that however results in
1385 // overflow of level 2 unless we start before 0xDD).
1386 fillIndex [0x1] = 0xDC;
1387 for (int i = 0x20d0; i <= 0x20e1; i++)
1388 AddCharMap ((char) i, 0x1, 1);
// Section of GenerateCore: whitespace (category 07), then the symbol
// categories 09 and 0A.
1392 #region Whitespaces // 07 03 -
1393 fillIndex [0x7] = 0x2;
// The space advances the index by 2; the following entries by 1 each.
1394 AddCharMap (' ', 0x7, 2);
1395 AddCharMap ('\u00A0', 0x7, 1);
1396 for (int i = 9; i <= 0xD; i++)
1397 AddCharMap ((char) i, 0x7, 1);
1398 for (int i = 0x2000; i <= 0x200B; i++)
1399 AddCharMap ((char) i, 0x7, 1);
1401 fillIndex [0x7] = 0x17;
1402 AddCharMapGroup ('\u2028', 0x7, 1, 0);
1403 AddCharMapGroup ('\u2029', 0x7, 1, 0);
1405 // Characters which used to represent layout control.
1406 // LAMESPEC: Windows developers seem to have thought
1407 // that those characters are kind of whitespaces,
1408 // while they aren't.
1409 AddCharMap ('\u2422', 0x7, 1, 0); // blank symbol
1410 AddCharMap ('\u2423', 0x7, 1, 0); // open box
1413 // FIXME: 09 should be more complete.
1414 fillIndex [0x9] = 2;
1416 for (int cp = 0x2300; cp <= 0x237A; cp++)
1417 AddCharMap ((char) cp, 0x9, 1, 0);
// Arrows: arrowValues maps code point -> index; the index selects both the
// level 1 weight (0xD8 + idx) and the level 2 value (arrowLv2 [idx]).
// NOTE(review): the statement guarded by "if (map [cp].Defined)" is not
// visible in this listing (presumably "continue") - confirm.
1420 byte [] arrowLv2 = new byte [] {0, 3, 3, 3, 3, 3, 3, 3, 3};
1421 foreach (DictionaryEntry de in arrowValues) {
1422 int idx = (int) de.Value;
1423 int cp = (int) de.Key;
1424 if (map [cp].Defined)
1426 fillIndex [0x9] = (byte) (0xD8 + idx);
1427 AddCharMapGroup ((char) cp, 0x9, 0, arrowLv2 [idx]);
// Box drawing: same scheme as the arrows, with level 1 weight 0xE5 + idx.
// The statement that initializes each boxLv2 element is not visible here.
1431 byte [] boxLv2 = new byte [128];
1432 for (int i = 0; i < boxLv2.Length; i++)
1434 foreach (DictionaryEntry de in boxValues) {
1435 int cp = (int) de.Key;
1436 int idx = (int) de.Value;
1437 if (map [cp].Defined)
1439 fillIndex [0x9] = (byte) (0xE5 + idx);
1440 AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [idx]);
1443 // Some special characters (slanted)
1444 fillIndex [0x9] = 0xF4;
1445 AddCharMap ('\u2571', 0x9, 3);
1446 AddCharMap ('\u2572', 0x9, 3);
1447 AddCharMap ('\u2573', 0x9, 3);
1449 // FIXME: implement 0A
1451 fillIndex [0xA] = 2;
1452 // byte currency symbols
// NOTE(review): the last operand of the condition below is missing from
// this listing.
1453 for (int cp = 0; cp < 0x100; cp++) {
1454 uc = Char.GetUnicodeCategory ((char) cp);
1455 if (!IsIgnorable (cp) &&
1456 uc == UnicodeCategory.CurrencySymbol &&
1458 AddCharMapGroup ((char) cp, 0xA, 1, 0);
1460 // byte other symbols
// NOTE(review): the condition that guards the "continue" below is missing
// from this listing.
1461 for (int cp = 0; cp < 0x100; cp++) {
1463 continue; // SPECIAL: skip FIXME: why?
1464 uc = Char.GetUnicodeCategory ((char) cp);
1465 if (!IsIgnorable (cp) &&
1466 uc == UnicodeCategory.OtherSymbol)
1467 AddCharMapGroup ((char) cp, 0xA, 1, 0);
1470 fillIndex [0xA] = 0x2F; // FIXME: it won't be needed
1471 for (int cp = 0x2600; cp <= 0x2613; cp++)
1472 AddCharMap ((char) cp, 0xA, 1, 0);
1474 for (int cp = 0x2620; cp <= 0x2770; cp++)
1475 if (Char.IsSymbol ((char) cp))
1476 AddCharMap ((char) cp, 0xA, 1, 0);
// 0x2440 .. 0x245F (upper bound is exclusive).
1478 for (int i = 0x2440; i < 0x2460; i++)
1479 AddCharMap ((char) i, 0xA, 1, 0);
// Section of GenerateCore: numbers (category 0C).
// Number characters are collected, sorted by their decimal value
// (decimalValue []), and then added in that order; diacritical [cp] supplies
// the level 2 value. Several statements of this section are missing from
// this listing, so the control flow below is only partially visible.
1483 #region Numbers // 0C 02 - 0C E1
1484 fillIndex [0xC] = 2;
1486 // 9F8 : Bengali "one less than the denominator"
1487 AddCharMap ('\u09F8', 0xC, 1);
// Collect candidates: non-ignorable number characters outside
// 0x3190 .. 0x32C0. The statement that adds i to "numbers" is not visible.
1489 ArrayList numbers = new ArrayList ();
1490 for (int i = 0; i < 65536; i++)
1491 if (!IsIgnorable (i) &&
1492 Char.IsNumber ((char) i) &&
1493 (i < 0x3190 || 0x32C0 < i)) // they are CJK characters
// Pair each code point with its decimal value and sort by that value.
1496 ArrayList numberValues = new ArrayList ();
1497 foreach (int i in numbers)
1498 numberValues.Add (new DictionaryEntry (i, decimalValue [(char) i]));
1499 numberValues.Sort (DecimalDictionaryValueComparer.Instance);
1501 //foreach (DictionaryEntry de in numberValues)
1502 //Console.Error.WriteLine ("****** number {0:X04} : {1} {2}", de.Key, de.Value, decompType [(int) de.Key]);
// prevValue tracks the largest value handled so far; -1 means "none yet".
1504 decimal prevValue = -1;
1505 foreach (DictionaryEntry de in numberValues) {
1506 int cp = (int) de.Key;
1507 decimal currValue = (decimal) de.Value;
1508 bool addnew = false;
// "prevValue - (int) prevValue == 0" tests that prevValue is integral.
// The remaining operands of this condition are not visible.
1509 if (prevValue < currValue &&
1510 prevValue - (int) prevValue == 0 &&
1514 // Process Hangzhou and Roman numbers
1516 // There are some SPECIAL cases.
1517 if (currValue != 4) // no increment for 4
// The three characters for prevValue are looked up by offset from
// 0x2170, 0x2160 and 0x3021 (value 1 maps to the base code point).
1521 xcp = (int) prevValue + 0x2170 - 1;
1522 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1523 xcp = (int) prevValue + 0x2160 - 1;
1524 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1525 fillIndex [0xC] += 2;
1526 xcp = (int) prevValue + 0x3021 - 1;
1527 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1530 if (prevValue < currValue)
1531 prevValue = currValue;
1532 if (map [cp].Defined)
1534 // Hangzhou and Roman are added later
1536 else if (0x3021 <= cp && cp < 0x302A
1537 || 0x2160 <= cp && cp < 0x216A
1538 || 0x2170 <= cp && cp < 0x217A)
1541 if (cp == 0x215B) // FIXME: why?
1542 fillIndex [0xC] += 2;
1543 else if (cp == 0x3021) // FIXME: why?
1545 AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp]);
1547 if (addnew || cp <= '9') {
// "cp - 0x31" is the offset of cp from the ASCII digit '1'; it indexes the
// parallel ranges starting at 0x2776, 0x2780, 0x278A (values 1 .. 10) and
// 0x2460, 0x2474, 0x2488 (values 1 .. 20).
1549 if (1 <= currValue && currValue <= 10) {
1550 xcp = cp - 0x31 + 0x2776;
1551 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1552 xcp = cp - 0x31 + 0x2780;
1553 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1554 xcp = cp - 0x31 + 0x278A;
1555 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1557 if (1 <= currValue && currValue <= 20) {
1558 xcp = cp - 0x31 + 0x2460;
1559 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1560 xcp = cp - 0x31 + 0x2474;
1561 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1562 xcp = cp - 0x31 + 0x2488;
1563 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1567 if (cp != 0x09E7 && cp != 0x09EA)
1570 // Add special cases that are not regarded as
1571 // numbers in UnicodeCategory speak.
1574 AddCharMapGroup ('\u01BD', 0xC, 0, 0);
1575 AddCharMapGroup ('\u01BC', 0xC, 1, 0);
1577 else if (cp == '6') // FIXME: why?
// U+221E (infinity) gets the last slot of the category.
1582 fillIndex [0xC] = 0xFF;
1583 AddCharMap ('\u221E', 0xC, 1);
// Section of GenerateCore: Latin letters (category 0E).
1586 #region Letters and NonSpacing Marks (general)
1588 // ASCII Latin alphabets
// alphabets [] and alphaWeights [] are parallel arrays: each letter is
// placed at its fixed weight by AddAlphaMap.
1589 for (int i = 0; i < alphabets.Length; i++)
1590 AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]);
1593 // non-ASCII Latin alphabets
1594 // FIXME: there are no characters that are placed
1595 // *after* "alphabets" array items. This is nothing
1596 // more than a hack that creates dummy weight for
1597 // primary characters.
1598 for (int i = 0x0080; i < 0x0300; i++) {
1599 if (!Char.IsLetter ((char) i))
1601 // Latin letters that have an NFKD decomposition are
1602 // not added as independent primary characters.
1603 if (decompIndex [i] != 0)
1606 // 1.some alphabets have primarily
1607 // equivalent ASCII alphabets.
1608 // 2.some have independent primary weights,
1609 // but inside a-to-z range.
1610 // 3.there are some expanded characters that
1611 // are not part of Unicode Standard NFKD.
1613 // 1. skipping them does not make sense
1614 // case 0xD0: case 0xF0: case 0x131: case 0x138:
1615 // case 0x184: case 0x185: case 0x186: case 0x189:
1616 // case 0x18D: case 0x18E: case 0x18F: case 0x190:
1617 // case 0x194: case 0x195: case 0x196: case 0x19A:
1618 // case 0x19B: case 0x19C:
1619 // 2. skipping them does not make sense
1620 // case 0x14A: // Ng
1621 // case 0x14B: // ng
// NOTE(review): the switch header and the action taken for the cases below
// are not visible in this listing.
1625 case 0xDE: // Icelandic Thorn
1626 case 0xFE: // Icelandic Thorn
1627 case 0xDF: // German ss
1628 case 0xFF: // U+00FF is y with diaeresis (the original comment said "German ss")
1629 // not classified yet
1630 // case 0x1A6: case 0x1A7: case 0x1A8: case 0x1A9:
1631 // case 0x1AA: case 0x1B1: case 0x1B7: case 0x1B8:
1632 // case 0x1B9: case 0x1BA: case 0x1BB: case 0x1BF:
1633 // case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3:
1637 AddCharMapGroup ((char) i, 0xE, 1, 0);
// Section of GenerateCore: letters of categories 0F .. 15, added by code
// point range (or by an ordered table). The ranges correspond to the Unicode
// blocks Greek (0F), Cyrillic (10), Armenian (11), Hebrew (12), Arabic (13),
// Devanagari (14) and Bengali (15). The last AddLetterMap argument is how
// far fillIndex advances per letter.
1641 fillIndex [0xF] = 02;
1642 for (int i = 0x0380; i < 0x0390; i++)
1643 if (Char.IsLetter ((char) i))
1644 AddLetterMap ((char) i, 0xF, 1);
// The index is reset to 2 again, so 0x0391 .. 0x03CE share the weight range
// used by the loop above.
1645 fillIndex [0xF] = 02;
1646 for (int i = 0x0391; i < 0x03CF; i++)
1647 if (Char.IsLetter ((char) i))
1648 AddLetterMap ((char) i, 0xF, 1);
1649 fillIndex [0xF] = 0x40;
1650 for (int i = 0x03D0; i < 0x0400; i++)
1651 if (Char.IsLetter ((char) i))
1652 AddLetterMap ((char) i, 0xF, 1);
1654 // Cyrillic - UCA order w/ some modification
1655 fillIndex [0x10] = 0x3;
1656 // table which is mostly from UCA DUCET.
1657 for (int i = 0; i < orderedCyrillic.Length; i++) {
1658 char c = orderedCyrillic [i];
1659 if (Char.IsLetter (c))
1660 AddLetterMap (c, 0x10, 3);
1662 for (int i = 0x0460; i < 0x0481; i++) {
1663 if (Char.IsLetter ((char) i))
1664 AddLetterMap ((char) i, 0x10, 3);
1668 fillIndex [0x11] = 0x3;
1669 for (int i = 0x0531; i < 0x0586; i++)
1670 if (Char.IsLetter ((char) i))
1671 AddLetterMap ((char) i, 0x11, 1);
1675 fillIndex [0x12] = 0x3;
1676 for (int i = 0x05D0; i < 0x05FF; i++)
1677 if (Char.IsLetter ((char) i))
1678 AddLetterMap ((char) i, 0x12, 1);
// 0x0591 .. 0x05C2 go to category 01 (nonspacing). The condition between
// the loop header and AddCharMap is not visible in this listing.
1680 fillIndex [0x1] = 0x3;
1681 for (int i = 0x0591; i <= 0x05C2; i++)
1683 AddCharMap ((char) i, 0x1, 1);
1686 fillIndex [0x1] = 0x8E;
1687 fillIndex [0x13] = 0x3;
// 0x0621 .. 0x064A: characters that are not OtherLetter go to category 01;
// letters take their weight from arabicLetterPrimaryValues.
1688 for (int i = 0x0621; i <= 0x064A; i++) {
1690 if (Char.GetUnicodeCategory ((char) i)
1691 != UnicodeCategory.OtherLetter) {
1692 // FIXME: arabic nonspacing marks are
1693 // in different order.
1694 AddCharMap ((char) i, 0x1, 1);
1697 // map [i] = new CharMapEntry (0x13,
1698 // (byte) arabicLetterPrimaryValues [i], 1);
// NOTE(review): the left-hand side of the assignment on the next line is
// missing from this listing (presumably fillIndex [0x13]) - confirm.
1700 (byte) arabicLetterPrimaryValues [i];
1701 AddLetterMap ((char) i, 0x13, 0);
1703 fillIndex [0x13] = 0x84;
1704 for (int i = 0x0674; i < 0x06D6; i++)
1705 if (Char.IsLetter ((char) i))
1706 AddLetterMap ((char) i, 0x13, 1);
1709 // FIXME: it does seem straight codepoint mapping.
1710 fillIndex [0x14] = 04;
1711 for (int i = 0x0901; i < 0x0905; i++)
1712 if (!IsIgnorable (i))
1713 AddLetterMap ((char) i, 0x14, 2);
1714 fillIndex [0x14] = 0xB;
1715 for (int i = 0x0905; i < 0x093A; i++)
1716 if (Char.IsLetter ((char) i))
1717 AddLetterMap ((char) i, 0x14, 4);
1718 for (int i = 0x093E; i < 0x094F; i++)
1719 if (!IsIgnorable (i))
1720 AddLetterMap ((char) i, 0x14, 2);
1724 fillIndex [0x15] = 02;
// 0x9FF below is the same value as 0x09FF.
1725 for (int i = 0x0980; i < 0x9FF; i++) {
1726 if (IsIgnorable (i))
// NOTE(review): the condition guarding the next assignment, and the action
// for the three categories listed in the switch, are not visible here.
1729 fillIndex [0x15] = 0x3B;
1730 switch (Char.GetUnicodeCategory ((char) i)) {
1731 case UnicodeCategory.NonSpacingMark:
1732 case UnicodeCategory.DecimalDigitNumber:
1733 case UnicodeCategory.OtherNumber:
1736 AddLetterMap ((char) i, 0x15, 1);
// Nonspacing marks of 0x0981 .. 0x09FF go to category 01.
1739 fillIndex [0x1] = 0x3;
1740 for (int i = 0x0981; i < 0x0A00; i++)
1741 if (Char.GetUnicodeCategory ((char) i) ==
1742 UnicodeCategory.NonSpacingMark)
1743 AddCharMap ((char) i, 0x1, 1);
// Section of GenerateCore: categories 16 .. 21. The source comments name
// Gurmukhi (16), Gujarati (17), Tamil (19), Thai (1E/1F) and Georgian (21);
// the unlabelled ranges fall in the Unicode blocks Oriya (18), Telugu (1A),
// Kannada (1B), Malayalam (1C) and Lao (1F).
1745 // Gurmukhi. orderedGurmukhi is from UCA
1746 // FIXME: it does not look equivalent to UCA.
1747 fillIndex [0x1] = 03;
1748 fillIndex [0x16] = 02;
1749 for (int i = 0; i < orderedGurmukhi.Length; i++) {
1750 char c = orderedGurmukhi [i];
1751 if (IsIgnorable ((int) c))
// Non-letters of the table are added to category 01 instead of 16.
1753 if (!Char.IsLetter (c)) {
1754 AddLetterMap (c, 0x1, 1);
// NOTE(review): the action for the characters matched below is not visible
// in this listing.
1757 if (c == '\u0A3C' || c == '\u0A4D' ||
1758 '\u0A66' <= c && c <= '\u0A71')
1760 AddLetterMap (c, 0x16, 4);
1763 // Gujarati. orderedGujarati is from UCA
1764 fillIndex [0x17] = 02;
1765 for (int i = 0; i < orderedGujarati.Length; i++)
1766 AddLetterMap (orderedGujarati [i], 0x17, 4);
1769 fillIndex [0x18] = 02;
1770 for (int i = 0x0B00; i < 0x0B7F; i++) {
1771 switch (Char.GetUnicodeCategory ((char) i)) {
1772 case UnicodeCategory.NonSpacingMark:
1773 case UnicodeCategory.DecimalDigitNumber:
1776 AddLetterMap ((char) i, 0x18, 1);
1780 fillIndex [0x19] = 2;
1781 AddCharMap ('\u0BD7', 0x19, 0);
1782 fillIndex [0x19] = 0xA;
// NOTE(review): this loop never executes as written, because the start
// value 0x0BD7 is already greater than the bound 0x0B94. The intended
// start is presumably lower than 0x0B94 - confirm against the original.
1784 for (int i = 0x0BD7; i < 0x0B94; i++)
1785 if (Char.IsLetter ((char) i))
1786 AddCharMap ((char) i, 0x19, 2);
1788 fillIndex [0x19] = 0x24;
1789 AddCharMap ('\u0B94', 0x19, 0);
1790 fillIndex [0x19] = 0x26;
1791 // The array for Tamil consonants is a constant.
1792 // Windows have almost similar sequence to TAM from
1793 // tamilnet but a bit different in Grantha.
1794 for (int i = 0; i < orderedTamilConsonants.Length; i++)
1795 AddLetterMap (orderedTamilConsonants [i], 0x19, 4);
1797 fillIndex [0x19] = 0x82;
// The rest of the condition below is missing from this listing.
1798 for (int i = 0x0BBE; i < 0x0BCD; i++)
1799 if (Char.GetUnicodeCategory ((char) i) ==
1800 UnicodeCategory.SpacingCombiningMark
1802 AddLetterMap ((char) i, 0x19, 2);
1805 fillIndex [0x1A] = 0x4;
1806 for (int i = 0x0C00; i < 0x0C62; i++) {
1807 if (i == 0x0C55 || i == 0x0C56)
1809 AddCharMap ((char) i, 0x1A, 3);
// U+0C60 / U+0C61 are inserted right after U+0C0B / U+0C0C respectively;
// char.MinValue means "no supplementary character for this i".
1810 char supp = (i == 0x0C0B) ? '\u0C60':
1811 i == 0x0C0C ? '\u0C61' : char.MinValue;
1812 if (supp == char.MinValue)
1814 AddCharMap (supp, 0x1A, 3);
1818 fillIndex [0x1B] = 4;
1819 for (int i = 0x0C80; i < 0x0CE5; i++) {
1820 if (i == 0x0CD5 || i == 0x0CD6)
1822 AddCharMap ((char) i, 0x1B, 3);
1826 fillIndex [0x1C] = 2;
1827 for (int i = 0x0D02; i < 0x0D61; i++)
1828 // FIXME: I avoided MSCompatUnicodeTable usage
1829 // here (it results in recursion). So check if
1830 // using NonSpacingMark makes sense or not.
1831 if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark)
1832 // if (!MSCompatUnicodeTable.IsIgnorable ((char) i))
1833 AddCharMap ((char) i, 0x1C, 1);
1835 // Thai ... note that it breaks 0x1E wall after E2B!
1836 // Also, all Thai characters have level 2 value 3.
// NOTE(review): the two consonant loops below pass 0, not 3, as the fourth
// AddCharMap argument; only the vowel ranges pass 3 here.
1837 fillIndex [0x1E] = 2;
1838 for (int i = 0xE44; i < 0xE48; i++)
1839 AddCharMap ((char) i, 0x1E, 1, 3);
1840 for (int i = 0xE01; i < 0xE2B; i++)
1841 AddCharMap ((char) i, 0x1E, 6, 0);
1842 fillIndex [0x1F] = 5;
1843 for (int i = 0xE2B; i < 0xE30; i++)
1844 AddCharMap ((char) i, 0x1F, 6, 0);
1845 for (int i = 0xE30; i < 0xE3B; i++)
1846 AddCharMap ((char) i, 0x1F, 1, 3);
1847 // some Thai characters remain.
// U+0E45 and U+0E46 were already visited by the 0xE44 .. 0xE47 loop above,
// so AddCharMap does nothing for them here (it skips ignorable or already
// defined characters and then leaves fillIndex unchanged).
1848 char [] specialThai = new char [] {'\u0E45', '\u0E46',
1849 '\u0E4E', '\u0E4F', '\u0E5A', '\u0E5B'};
1850 foreach (char c in specialThai)
1851 AddCharMap (c, 0x1F, 1);
// Letters of 0x0E80 .. 0x0EDE reuse category 1F, restarting at index 2.
1854 fillIndex [0x1F] = 2;
1855 for (int i = 0xE80; i < 0xEDF; i++)
1856 if (Char.IsLetter ((char) i))
1857 AddCharMap ((char) i, 0x1F, 1);
1859 // Georgian. orderedGeorgian is from UCA DUCET.
1860 fillIndex [0x21] = 5;
1861 for (int i = 0; i < orderedGeorgian.Length; i++)
1862 AddLetterMap (orderedGeorgian [i], 0x21, 5);
// Section of GenerateCore: Japanese kana and square characters (category
// 22), followed by the range 0x3105 .. 0x312C (category 23).
// The kana table is walked by "gyo" (row, 9 of them) and "dan" (column, 5 per
// row). kanaLines [gyo] is the number of consecutive code points per kana in
// that row; it is passed to AddKanaMap as the "voices" count. kanaOffset is
// the code point of the first kana of the current row.
1865 fillIndex [0x22] = 2;
1866 int kanaOffset = 0x3041;
1867 byte [] kanaLines = new byte [] {2, 2, 2, 2, 1, 3, 1, 2, 1};
1869 for (int gyo = 0; gyo < 9; gyo++) {
1870 for (int dan = 0; dan < 5; dan++) {
// Row 7 has no kana in the odd columns (see the comment below); the
// remaining statements of this branch are not visible in this listing.
1871 if (gyo == 7 && dan % 2 == 1) {
1874 kanaOffset -= 2; // There is no space for yi and ye.
1877 int cp = kanaOffset + dan * kanaLines [gyo];
1878 // small lines (a-gyo, ya-gyo)
1879 if (gyo == 0 || gyo == 7) {
1880 AddKanaMap (cp, 1); // small
1881 AddKanaMap (cp + 1, 1);
1884 AddKanaMap (cp, kanaLines [gyo]);
1888 // add small 'Tsu' (before normal one)
1889 AddKanaMap (0x3063, 1);
1893 fillIndex [0x22] += 3;
1894 kanaOffset += 5 * kanaLines [gyo];
1897 // Wa-gyo is almost special, so I just manually add.
// Each pair below adds code point X and X + 0x60 at the same weight.
1898 AddLetterMap ((char) 0x308E, 0x22, 0);
1899 AddLetterMap ((char) (0x308E + 0x60), 0x22, 0);
1900 AddLetterMap ((char) 0x308F, 0x22, 0);
1901 AddLetterMap ((char) (0x308F + 0x60), 0x22, 0);
1903 AddLetterMap ((char) 0x3090, 0x22, 0);
1904 AddLetterMap ((char) (0x3090 + 0x60), 0x22, 0);
1905 fillIndex [0x22] += 2;
1906 // no "Wu" in Japanese.
1907 AddLetterMap ((char) 0x3091, 0x22, 0);
1908 AddLetterMap ((char) (0x3091 + 0x60), 0x22, 0);
1910 AddLetterMap ((char) 0x3092, 0x22, 0);
1911 AddLetterMap ((char) (0x3092 + 0x60), 0x22, 0);
1913 fillIndex [0x22] = 0x80;
1914 AddLetterMap ((char) 0x3093, 0x22, 0);
1915 AddLetterMap ((char) (0x3093 + 0x60), 0x22, 0);
1917 // JIS Japanese square chars.
1918 fillIndex [0x22] = 0x97;
1919 jisJapanese.Sort (JISComparer.Instance);
1920 foreach (JISCharacter j in jisJapanese)
1921 AddCharMap ((char) j.CP, 0x22, 1);
1922 // non-JIS Japanese square chars.
1923 nonJisJapanese.Sort (NonJISComparer.Instance);
1924 foreach (NonJISCharacter j in nonJisJapanese)
1925 AddCharMap ((char) j.CP, 0x22, 1);
1928 fillIndex [0x23] = 0x02;
1929 for (int i = 0x3105; i <= 0x312C; i++)
1930 AddCharMap ((char) i, 0x23, 1);
// Section of GenerateCore: Syriac and Thaana (both in category 24).
1932 // Estrangela: ancient Syriac
1933 fillIndex [0x24] = 0x0B;
1934 // FIXME: is 0x71E really alternative form?
1935 ArrayList syriacAlternatives = new ArrayList (
1936 new int [] {0x714, 0x716, 0x71C, 0x71E, 0x724, 0x727});
1937 for (int i = 0x0710; i <= 0x072C; i++) {
// NOTE(review): the statements guarded by the two tests below are not
// visible in this listing (presumably "continue") - confirm.
1938 if (i == 0x0711) // NonSpacingMark
1940 if (syriacAlternatives.Contains (i))
1942 AddCharMap ((char) i, 0x24, 4);
// Each alternative form is placed relative to the preceding code point:
// level 1 of (cp - 1) plus 2. The last constructor argument is missing
// from this listing.
1947 foreach (int cp in syriacAlternatives)
1948 map [cp] = new CharMapEntry (0x24,
1949 (byte) (map [cp - 1].Level1 + 2),
1953 // FIXME: it turned out that it does not look like UCA
1954 fillIndex [0x24] = 0x6E;
1955 for (int i = 0; i < orderedThaana.Length; i++) {
// NOTE(review): this tests the loop index i, not the character
// orderedThaana [i]; that looks unintended - confirm.
1956 if (IsIgnorableNonSpacing (i))
1958 AddCharMap (orderedThaana [i], 0x24, 2);
1962 // FIXME: Add more culture-specific letters (that are
1963 // not supported in Windows collation) here.
1965 // Surrogate ... they are computed.
// Section of GenerateCore: Hangul (categories starting at 0x52).
// hangulSequence is a small command language that is interpreted by the
// loop below; the commands are described in the following comment.
1970 // Unlike UCA Windows Hangul sequence mixes Jongseong
1971 // with Choseong sequence as well as Jungseong,
1972 // adjusted to have the same primary weight for the
1973 // same base character. So it is impossible to compute
1976 // Here I introduce an ordered sequence of mixed
1977 // 'commands' and 'characters' that is similar to
1979 // - ',' increases primary weight.
1980 // - [A B] means a range, increasing index
1981 // - {A B} means a range, without increasing index
1982 // - '=' is no operation (it means the characters
1983 // of both sides have the same weight).
1984 // - '>' inserts a Hangul Syllable block that
1985 // contains 0x251 characters.
// NOTE(review): the loops below add 0x15 * 0x1C = 0x24C characters per
// block, which does not match the 0x251 stated above - confirm.
1986 // - '<' decreases the index
1987 // - '0'-'9' means skip count
1988 // - whitespaces are ignored
1991 string hangulSequence =
1992 + "\u1100=\u11A8 > \u1101=\u11A9 >"
1993 + "\u11C3, \u11AA, \u11C4, \u1102=\u11AB >"
1994 + "<{\u1113 \u1116}, \u3165,"
1995 + "\u11C5, \u11C6=\u3166,, \u11C7, \u11C8,"
1996 + "\u11AC, \u11C9, \u11AD, \u1103=\u11AE >"
1997 + "<\u1117, \u11CA, \u1104, \u11CB > \u1105 >"
1998 + "<{\u1118 \u111B}, \u11B0, [\u11CC \u11D0], \u11B1,"
1999 + "[\u11D1 \u11D2], \u11B2,"
2000 + "[\u11D3 \u11D5], \u11B3,"
2001 + "[\u11D6 \u11D7], \u11B4, \u11B5,"
2002 + "\u11B6=\u11D8, \u3140,, \u11D9, \u1106=\u11B7 >"
2003 + "<{\u111C \u111D}, [\u11DA \u11E2], \u1107=\u11B8 >"
2004 + "<{\u111E \u1120}, \u3172,, \u3173, \u11E3, \u1108 >"
2005 + "<{\u1121 \u112C}, \u11B9,,,,,,,,, [\u11E4 \u11E6],,"
2006 + "\u1109=\u11BA,,, \u3214=\u3274 <>"
2007 + "<{\u112D \u1133}, \u11E7,, [\u11E8 \u11E9],,"
2008 + "\u11EA,, \u110A=\u11BB,,, >"
2009 + "<{\u1134 \u1140}, \u317E,,,,,,, \u11EB,"
2010 + "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >"
2011 + "<{\u1141 \u114C}, \u11EE, \u11EC, \u11ED,,,,, "
2012 + "\u11F1,, \u11F2,,,"
2013 + "\u11EF,,, \u11F0, \u110C=\u11BD,, >"
2014 + "<\u114D, \u110D,, >"
2015 + "<{\u114E \u1151},, \u110E=\u11BE,, >"
2016 + "<{\u1152 \u1155},,, \u110F=\u11BF >"
2017 + "\u1110=\u11C0 > \u1111=\u11C1 >"
2018 + "<\u1156=\u1157, \u11F3, \u11F4, \u1112=\u11C2 >"
2019 + "<\u1158=\u1159=\u115F, \u3185, \u11F9,"
// Interpreter state: hangulCat is the current category (it moves on when
// the index of a category overflows, see IncrementSequentialIndex) and
// syllableBlock counts the syllable blocks inserted so far.
2023 byte hangulCat = 0x52;
2024 fillIndex [hangulCat] = 0x2;
2026 int syllableBlock = 0;
// NOTE(review): the switch/case labels that dispatch on c are not visible
// in this listing; the grouping below follows the command list above.
2027 for (int n = 0; n < hangulSequence.Length; n++) {
2028 char c = hangulSequence [n];
2030 if (Char.IsWhiteSpace (c))
2036 IncrementSequentialIndex (ref hangulCat);
// Decreasing the index is refused when it is at its minimum value 2.
2039 if (fillIndex [hangulCat] == 2)
2040 throw new Exception ("FIXME: handle it correctly (yes it is hacky, it is really unfortunate).");
2041 fillIndex [hangulCat]--;
// Syllable block: 0x15 x 0x1C consecutive syllables starting at
// 0xAC00 + syllableBlock * 0x1C * 0x15, each at its own index.
2044 IncrementSequentialIndex (ref hangulCat);
2045 for (int l = 0; l < 0x15; l++)
2046 for (int v = 0; v < 0x1C; v++) {
2048 (char) (0xAC00 + syllableBlock * 0x1C * 0x15 + l * 0x1C + v), hangulCat, 0);
2049 IncrementSequentialIndex (ref hangulCat);
// Range with increasing index: characters at n + 1 and n + 3 are the
// inclusive bounds.
2054 start = hangulSequence [n + 1];
2055 end = hangulSequence [n + 3];
2056 for (int i = start; i <= end; i++) {
2057 AddCharMap ((char) i, hangulCat, 0);
2059 IncrementSequentialIndex (ref hangulCat);
2061 n += 4; // consumes 5 characters for this operation
// Range without increasing index: all characters share the current index.
2064 start = hangulSequence [n + 1];
2065 end = hangulSequence [n + 3];
2066 for (int i = start; i <= end; i++)
2067 AddCharMap ((char) i, hangulCat, 0);
2068 n += 4; // consumes 5 characters for this operation
// Any other character is added at the current index.
2071 AddCharMap (c, hangulCat, 0);
// Section of GenerateCore: named letterlike characters (category 0E), CJK
// ideographs and the special "biggest" area (FF FF).
2078 // Letterlike characters and CJK compatibility square
// The names are sorted, then counted per initial letter ('A' .. 'Z');
// namedChars keeps the code points in the sorted order. The declaration of
// nCharNames is not visible in this listing.
2079 sortableCharNames.Sort (StringDictionaryValueComparer.Instance);
2080 int [] counts = new int ['Z' - 'A' + 1];
2081 char [] namedChars = new char [sortableCharNames.Count];
2083 foreach (DictionaryEntry de in sortableCharNames) {
2084 counts [((string) de.Value) [0] - 'A']++;
2085 namedChars [nCharNames++] = (char) ((int) de.Key);
2087 nCharNames = 0; // reset
// The characters named with letter a are placed in the counts [a] slots
// that end just before alphaWeights [a + 1].
// alphaWeights [a + 1] is read for a up to 25, so alphaWeights presumably
// has at least 27 entries - confirm.
2088 for (int a = 0; a < counts.Length; a++) {
2089 fillIndex [0xE] = (byte) (alphaWeights [a + 1] - counts [a]);
2090 for (int i = 0; i < counts [a]; i++)
2091 //Console.Error.WriteLine ("---- {0:X04} : {1:x02} / {2} {3}", (int) namedChars [nCharNames], fillIndex [0xE], ((DictionaryEntry) sortableCharNames [nCharNames]).Value, Char.GetUnicodeCategory (namedChars [nCharNames]));
2092 AddCharMap (namedChars [nCharNames++], 0xE, 1);
2095 // CJK unified ideograph.
// cjkCat is declared on a line that is not visible in this listing; it is
// passed by reference because the category advances as indexes overflow.
2097 fillIndex [cjkCat] = 0x2;
2098 for (int cp = 0x4E00; cp <= 0x9FBB; cp++)
2099 if (!IsIgnorable (cp))
2100 AddCharMapGroupCJK ((char) cp, ref cjkCat);
2101 // CJK Extensions goes here.
2102 // LAMESPEC: With this Windows style CJK layout, it is
2103 // impossible to add more CJK ideograph i.e. 0x9FA6-
2104 // 0x9FBB can never be added w/o breaking compat.
2105 for (int cp = 0xF900; cp <= 0xFA2D; cp++)
2106 if (!IsIgnorable (cp))
2107 AddCharMapGroupCJK ((char) cp, ref cjkCat);
2109 // PrivateUse ... computed.
2110 // remaining Surrogate ... computed.
2112 #region Special "biggest" area (FF FF)
// All of these share category FF, level 1 FF (the increment is 0).
2113 fillIndex [0xFF] = 0xFF;
2114 char [] specialBiggest = new char [] {
2115 '\u3005', '\u3031', '\u3032', '\u309D',
2116 '\u309E', '\u30FC', '\u30FD', '\u30FE',
2117 '\uFE7C', '\uFE7D', '\uFF70'};
2118 foreach (char c in specialBiggest)
2119 AddCharMap (c, 0xFF, 0);
// Section of GenerateCore: punctuation (category 07) and math symbols
// (category 08). fillIndex [0x7] continues from the whitespace section.
2122 #region 07 - ASCII non-alphanumeric + 3001, 3002 // 07
2123 // non-alphanumeric ASCII except for: + - < = > '
2124 for (int i = 0x21; i < 0x7F; i++) {
2125 if (Char.IsLetterOrDigit ((char) i)
2126 || "+-<=>'".IndexOf ((char) i) >= 0)
2127 continue; // they are not added here.
2128 AddCharMapGroup2 ((char) i, 0x7, 1, 0);
2129 // Insert 3001 after ',' and 3002 after '.'
// NOTE(review): the test for ',' and part of the '.' branch are not visible
// in this listing.
2131 AddCharMapGroup2 ('\u3001', 0x7, 1, 0);
2132 else if (i == 0x2E) {
2134 AddCharMapGroup2 ('\u3002', 0x7, 1, 0);
2137 AddCharMap ('\uFE30', 0x7, 1, 0);
2141 #region 07 - Punctuations and something else
// Stops before char.MaxValue, so U+FFFF itself is not visited.
2142 for (int i = 0xA0; i < char.MaxValue; i++) {
2143 if (IsIgnorable (i))
2155 switch (Char.GetUnicodeCategory ((char) i)) {
2156 case UnicodeCategory.OtherPunctuation:
2157 case UnicodeCategory.ClosePunctuation:
2158 case UnicodeCategory.OpenPunctuation:
2159 case UnicodeCategory.InitialQuotePunctuation:
2160 case UnicodeCategory.FinalQuotePunctuation:
2161 case UnicodeCategory.ModifierSymbol:
2162 // SPECIAL CASES: // 0xA
// The action taken for 0x2020 .. 0x2042 is not visible in this listing.
2163 if (0x2020 <= i && i <= 0x2042)
2165 AddCharMapGroup ((char) i, 0x7, 1, 0);
// U+00A6 is routed to the punctuation handling above although it falls
// under a different case label (the label itself is not visible here).
2168 if (i == 0xA6) // SPECIAL CASE. FIXME: why?
2169 goto case UnicodeCategory.OtherPunctuation;
2174 for (int i = 0x2400; i <= 0x2421; i++)
2175 AddCharMap ((char) i, 0x7, 1, 0);
2178 // FIXME: for 07 xx we need more love.
2180 // FIXME: 08 should be more complete.
2181 fillIndex [0x8] = 2;
// Every still-undefined MathSymbol goes to category 08, in code point order.
2182 for (int cp = 0; cp < char.MaxValue; cp++)
2183 if (!map [cp].Defined &&
2184 Char.GetUnicodeCategory ((char) cp) ==
2185 UnicodeCategory.MathSymbol)
2186 AddCharMapGroup ((char) cp, 0x8, 1, 0);
// Final section of GenerateCore: entries derived from decompositions, the
// level 2 adjustment pass, and a fallback for remaining nonspacing marks.
2188 // Characters w/ diacritical marks (NFKD)
// For each still-undefined character with a decomposition, the entry copies
// the category and level 1 of its primary (first) decomposed character, and
// accumulates diacritical [] of the following characters into "secondary".
// Several guarded statements (presumably "continue" / "break") and the last
// constructor argument are not visible in this listing.
2189 for (int i = 0; i <= char.MaxValue; i++) {
2190 if (map [i].Defined || IsIgnorable (i))
2192 if (decompIndex [i] == 0)
2195 int start = decompIndex [i];
2196 int primaryChar = decompValues [start];
2199 int length = decompLength [i];
2200 // special processing for parenthesized ones.
// "(X)": the primary character is the one between the parentheses.
2202 decompValues [start] == '(' &&
2203 decompValues [start + 2] == ')') {
2204 primaryChar = decompValues [start + 1];
2208 if (map [primaryChar].Level1 == 0)
2211 for (int l = 1; l < length; l++) {
2212 int c = decompValues [start + l];
2213 if (map [c].Level1 != 0)
2215 secondary += diacritical [c];
2219 map [i] = new CharMapEntry (
2220 map [primaryChar].Category,
2221 map [primaryChar].Level1,
2226 #region Level2 adjustment
// Fixed level 2 values for some Arabic letters.
2228 diacritical [0x624] = 0x5;
2229 diacritical [0x626] = 0x7;
2230 diacritical [0x622] = 0x9;
2231 diacritical [0x623] = 0xA;
2232 diacritical [0x625] = 0xB;
2233 diacritical [0x649] = 0x5; // 'alif maqs.uurah
2234 diacritical [0x64A] = 0x7; // Yaa'
// Rewrites the third CharMapEntry value ("mod") of existing entries depending
// on their category. The declaration of "mod", the switch header and the
// default label are not visible in this listing.
2237 for (int i = 0; i < char.MaxValue; i++) {
2239 byte cat = map [i].Category;
2241 case 0xE: // Latin diacritics
2242 case 0x22: // Japanese: circled characters
2243 mod = diacritical [i];
2245 case 0x13: // Arabic
2246 if (diacritical [i] == 0)
2247 mod = 0x8; // default for arabic
2250 if (0x52 <= cat && cat <= 0x7F) // Hangul
2251 mod = diacritical [i];
2253 map [i] = new CharMapEntry (
2254 cat, map [i].Level1, mod);
2258 // FIXME: this is hack but those which are
2259 // NonSpacingMark characters and still undefined
2260 // are likely to be nonspacing.
// One operand of the condition below is missing from this listing.
2261 for (int i = 0; i < char.MaxValue; i++)
2262 if (!map [i].Defined &&
2264 Char.GetUnicodeCategory ((char) i) ==
2265 UnicodeCategory.NonSpacingMark)
2266 AddCharMap ((char) i, 1, 1);
// Advances fillIndex of the given category by one. When the byte wraps
// around to 0 the index is restarted at 2.
// hangulCat is passed by reference; the statement between the overflow test
// and the reset is not visible in this listing (presumably it moves on to
// the next category) - confirm.
2269 private void IncrementSequentialIndex (ref byte hangulCat)
2271 fillIndex [hangulCat]++;
2272 if (fillIndex [hangulCat] == 0) { // overflown
2274 fillIndex [hangulCat] = 0x2;
2278 // Reset fillIndex to fixed value and call AddLetterMap().
// c: the letter to add; category: target category; alphaWeight: the weight
// that fillIndex [category] is set to before adding.
// The characters listed for c in latinMap (a list of int code points) are
// added afterwards without advancing the index, so they share the weight.
2279 private void AddAlphaMap (char c, byte category, byte alphaWeight)
2281 fillIndex [category] = alphaWeight;
2282 AddLetterMap (c, category, 0);
// "as" yields null when latinMap has no ArrayList for c; the null check that
// presumably precedes the loop is not visible in this listing.
2284 ArrayList al = latinMap [c] as ArrayList;
2288 foreach (int cp in al)
2289 AddLetterMap ((char) cp, category, 0);
// Adds "voices" consecutive code points starting at i to category 0x22,
// each together with the code point 0x60 above it (the callers in this file
// pass code points from 0x3041 upwards; +0x60 is presumably the katakana
// counterpart of a hiragana - confirm).
// The fourth AddLetterMapCore argument is 0 for the first code point and
// b + 2 for the b-th following one. fillIndex is not advanced here.
2292 private void AddKanaMap (int i, byte voices)
2294 for (byte b = 0; b < voices; b++) {
2295 char c = (char) (i + b);
2296 byte arg = (byte) (b > 0 ? b + 2 : 0);
2298 AddLetterMapCore (c, 0x22, 0, arg);
2300 AddLetterMapCore ((char) (c + 0x60), 0x22, 0, arg);
// Convenience overload of AddLetterMapCore with level2 = 0.
2304 private void AddLetterMap (char c, byte category, byte updateCount)
2306 AddLetterMapCore (c, category, updateCount, 0);
// Adds a letter and its related forms to the map.
// c: the letter; category: target category; updateCount: how far
// fillIndex [category] advances; level2: value passed on to AddCharMapGroup.
// Steps visible here: the small form of c (ToSmallForm), then the lower-case
// form of c (recursively, without advancing the index), then c itself.
// Several guards and the declaration of c2 are not visible in this listing.
2309 private void AddLetterMapCore (char c, byte category, byte updateCount, byte level2)
2312 // <small> updates index
2313 c2 = ToSmallForm (c);
2315 AddCharMapGroup (c2, category, updateCount, level2);
// Invariant-culture lower-casing keeps the table independent of the current
// culture.
2316 c2 = Char.ToLower (c, CultureInfo.InvariantCulture);
2317 if (c2 != c && !map [(int) c2].Defined)
2318 AddLetterMapCore (c2, category, 0, level2);
// NOTE(review): the statement guarded by the test below is not visible
// (presumably doUpdate = false) - confirm.
2319 bool doUpdate = true;
2320 if (IsIgnorable ((int) c) || map [(int) c].Defined)
2323 AddCharMapGroup (c, category, 0, level2);
2325 fillIndex [category] += updateCount;
// Convenience overload of AddCharMap with alt = 0.
2328 private bool AddCharMap (char c, byte category, byte increment)
2330 return AddCharMap (c, category, increment, 0);
// Adds a single character to the map at the current fillIndex [category]
// and then advances that index by "increment".
// Returns false, leaving map and fillIndex untouched, when c is ignorable or
// already defined. The return statement of the success path is not visible
// in this listing (presumably "return true").
// For category 1 the roles are swapped: "alt" becomes the second
// CharMapEntry argument and fillIndex the third; for every other category
// fillIndex is the second argument and "alt" the third.
2333 private bool AddCharMap (char c, byte category, byte increment, byte alt)
2335 if (IsIgnorable ((int) c) || map [(int) c].Defined)
2336 return false; // do nothing
2337 map [(int) c] = new CharMapEntry (category,
2338 category == 1 ? alt : fillIndex [category],
2339 category == 1 ? fillIndex [category] : alt);
2340 fillIndex [category] += increment;
// Variant of the group add that looks at the last character of a
// decomposition (the "Tail" lookups): adds the small form of c, then c, then
// recurses into the full-width form of c. Every AddCharMap call here uses
// updateCount as its increment and 0 as the alt value.
// The guards around the c2 uses are not visible in this listing.
2344 private void AddCharMapGroupTail (char c, byte category, byte updateCount)
2346 char c2 = ToSmallFormTail (c);
2348 AddCharMap (c2, category, updateCount, 0);
2350 AddCharMap (c, category, updateCount, 0);
2352 c2 = ToFullWidthTail (c);
2354 AddCharMapGroupTail (c2, category, updateCount);
2358 // Adds characters to table in the order below
2359 // (+ increases weight):
2363 // <full> | <super> | <sub>
2364 // <circle> | <wide> (| <narrow>)
2368 // level2 is fixed (does not increase).
// Decomposition kinds whose characters AddCharMapGroup adds at the same
// weight as the base character. Some entries of the initializer are not
// visible in this listing.
2369 int [] sameWeightItems = new int [] {
2370 DecompositionFraction,
2374 DecompositionCircle,
2376 DecompositionNarrow,
// Adds c together with the characters that decompose to it, using nfkdMap
// (per character: decomposition kind as byte -> code point as int).
// Order: the <small> form (advances the index), then c and all
// sameWeightItems forms at one shared weight, then the index is advanced by
// updateCount, then the <vertical> form.
// Does nothing more when c is already defined (the guarded statement is not
// visible in this listing, presumably "return"). Null checks on nfkd and on
// the looked-up values are likewise not visible.
2378 private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2)
2380 if (map [(int) c].Defined)
// char.MinValue is used as "no such form".
2383 char small = char.MinValue;
2384 char vertical = char.MinValue;
2385 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
2387 object smv = nfkd [(byte) DecompositionSmall];
2389 small = (char) ((int) smv);
2390 object vv = nfkd [(byte) DecompositionVertical];
2392 vertical = (char) ((int) vv);
2395 // <small> updates index
2396 if (small != char.MinValue)
2397 AddCharMap (small, category, updateCount);
2400 AddCharMap (c, category, 0, level2);
2403 foreach (int weight in sameWeightItems) {
2404 object wv = nfkd [(byte) weight];
2406 AddCharMap ((char) ((int) wv), category, 0, level2);
2410 // update index here.
2411 fillIndex [category] += updateCount;
2413 if (vertical != char.MinValue)
2414 AddCharMap (vertical, category, updateCount, level2);
// Adds one CJK character at the current index of "category" and advances
// the sequential index (category is by reference because
// IncrementSequentialIndex takes it by reference).
// Note that the index is advanced even when AddCharMap rejects c.
2417 private void AddCharMapCJK (char c, ref byte category)
2419 AddCharMap (c, category, 0, 0);
2420 IncrementSequentialIndex (ref category);
2422 // Special. I wonder why but Windows skips 9E F9.
2423 if (category == 0x9E && fillIndex [category] == 0xF9)
2424 IncrementSequentialIndex (ref category);
// Adds a CJK ideograph followed by the characters that decompose to it
// (looked up in nfkdMap for every decomposition kind 0 .. 0x12), each at its
// own sequential index.
// Four characters (U+3298, U+3238, U+32A2, U+32A9) are placed explicitly
// after specific ideographs and excluded from the generic loop. Only the
// test for U+52DE is visible here; the tests selecting the positions of
// U+32A2 and U+32A9 are not.
2427 private void AddCharMapGroupCJK (char c, ref byte category)
2429 AddCharMapCJK (c, ref category);
2431 // LAMESPEC: see below.
2432 if (c == '\u52DE') {
2433 AddCharMapCJK ('\u3298', ref category);
2434 AddCharMapCJK ('\u3238', ref category);
2437 AddCharMapCJK ('\u32A2', ref category);
2439 // Especially this mapping order totally does
2440 // not make sense to me.
2441 AddCharMapCJK ('\u32A9', ref category);
2443 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
2446 for (byte weight = 0; weight <= 0x12; weight++) {
2447 object wv = nfkd [weight];
// The declaration of w (the looked-up code point) is not visible here.
2452 // Special: they are ignored in this area.
2453 // FIXME: check if it is sane
2454 if (0xF900 <= w && w <= 0xFAD9)
2456 // LAMESPEC: on Windows some of CJK characters
2457 // in 3200-32B0 are incorrectly mapped. They
2458 // mix Chinese and Japanese Kanji when
2459 // ordering those characters.
2461 case 0x32A2: case 0x3298: case 0x3238: case 0x32A9:
2465 AddCharMapCJK ((char) w, ref category);
2469 // For now it is only for 0x7 category.
// Variant of AddCharMapGroup in which every added character advances the
// index by updateCount: the <small> form, then c, then every character whose
// single-character decomposition is c (of the kinds listed in the switch),
// then the <vertical> form.
// Unlike AddCharMapGroup there is no visible early exit for an already
// defined c. Null checks on nfkd and the looked-up values are not visible in
// this listing.
2470 private void AddCharMapGroup2 (char c, byte category, byte updateCount, byte level2)
2472 char small = char.MinValue;
2473 char vertical = char.MinValue;
2474 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
2476 object smv = nfkd [(byte) DecompositionSmall];
2478 small = (char) ((int) smv);
2479 object vv = nfkd [(byte) DecompositionVertical];
2481 vertical = (char) ((int) vv);
2484 // <small> updates index
2485 if (small != char.MinValue)
2486 // SPECIAL CASE excluded (FIXME: why?)
2487 if (small != '\u2024')
2488 AddCharMap (small, category, updateCount);
2491 AddCharMap (c, category, updateCount, level2);
2493 // Since nfkdMap is problematic to have two or more
2494 // NFKD to an identical character, here I iterate all.
// This scans every code unit below char.MaxValue on each call.
2495 for (int c2 = 0; c2 < char.MaxValue; c2++) {
2496 if (decompLength [c2] == 1 &&
2497 (int) (decompValues [decompIndex [c2]]) == (int) c) {
2498 switch (decompType [c2]) {
2499 case DecompositionCompat:
2500 AddCharMap ((char) c2, category, updateCount, level2);
2506 if (vertical != char.MinValue)
2507 // SPECIAL CASE excluded (FIXME: why?)
2508 if (vertical != '\uFE33' && vertical != '\uFE34')
2509 AddCharMap (vertical, category, updateCount, level2);
// Asks ToDecomposed for the DecompositionFull mapping of 'c',
// without selecting the tail element.
char ToFullWidth (char c)
{
	const bool useTail = false;
	return ToDecomposed (c, DecompositionFull, useTail);
}
// Asks ToDecomposed for the DecompositionFull mapping of 'c',
// selecting the tail element.
char ToFullWidthTail (char c)
{
	const bool useTail = true;
	return ToDecomposed (c, DecompositionFull, useTail);
}
// Asks ToDecomposed for the DecompositionSmall mapping of 'c',
// without selecting the tail element.
char ToSmallForm (char c)
{
	const bool useTail = false;
	return ToDecomposed (c, DecompositionSmall, useTail);
}
// Asks ToDecomposed for the DecompositionSmall mapping of 'c',
// selecting the tail element.
char ToSmallFormTail (char c)
{
	const bool useTail = true;
	return ToDecomposed (c, DecompositionSmall, useTail);
}
// Returns one character of the decomposition of 'c' when the
// decomposition type recorded for 'c' equals 'd'; otherwise 'c'
// is returned unchanged.
//   c    - character to look up.
//   d    - required decomposition type (one of the Decomposition* values).
//   tail - false picks the first decomposition value, true the last one.
char ToDecomposed (char c, byte d, bool tail)
{
	int cp = (int) c;
	if (decompType [cp] != d)
		return c;

	int first = decompIndex [cp];
	// The last value sits decompLength - 1 slots after the first one.
	int pos = tail ? first + (decompLength [cp] - 1) : first;
	return (char) decompValues [pos];
}
// Walks the jisJapanese collection looking for code point 'cp'.
// NOTE(review): the loop body and the return statements are not visible
// in this excerpt; presumably each entry's CP is compared with 'cp' —
// confirm against the full file.
2542 bool ExistsJIS (int cp)
2544 foreach (JISCharacter j in jisJapanese)
2552 #region Level 3 properties (Case/Width)
// Level 3 weight as stored in sort keys: the raw weight shifted up
// by 2, except that a raw weight of zero stays zero.
private byte ComputeLevel3Weight (char c)
{
	byte raw = ComputeLevel3WeightRaw (c);
	if (raw == 0)
		return raw;
	return (byte) (raw + 2);
}
// Computes the raw level 3 (case/width) weight of 'c' from a series of
// code point range tests, the decomposition type of 'c', and the
// isSmallCapital / isUppercase tables. ComputeLevel3Weight adds 2 to any
// non-zero result.
// NOTE(review): the value produced by each branch is on lines that are
// not visible in this excerpt, so individual weights are not listed here.
2560 private byte ComputeLevel3WeightRaw (char c) // add 2 for sortkey value
2563 if ('\u11A8' <= c && c <= '\u11F9')
2565 if ('\uFFA0' <= c && c <= '\uFFDC')
2567 if ('\u3130' <= c && c <= '\u3164')
2570 if ('\u2776' <= c && c <= '\u277F')
2572 if ('\u2780' <= c && c <= '\u2789')
// This range overlaps the two tests directly above; presumably it only
// matters for the code points they did not already handle — confirm.
2574 if ('\u2776' <= c && c <= '\u2793')
2576 if ('\u2160' <= c && c <= '\u216F')
2578 if ('\u2181' <= c && c <= '\u2182')
2581 if ('\u2135' <= c && c <= '\u2138')
// Upper bound is exclusive here: U+FE8E itself is not matched.
2583 if ('\uFE80' <= c && c < '\uFE8E') {
2584 // 2(Isolated)/8(Final)/0x18(Medial)
2585 switch (decompType [(int) c]) {
2586 case DecompositionIsolated:
2588 case DecompositionFinal:
2590 case DecompositionMedial:
2595 // actually I dunno the reason why they have weights.
2618 switch (decompType [(int) c]) {
// For these three kinds the decomposition type value itself (1, 2 or
// 0xC per the constants at the top of the file) is OR-ed into the result.
2619 case DecompositionWide: // <wide>
2620 case DecompositionSub: // <sub>
2621 case DecompositionSuper: // <super>
2622 ret |= decompType [(int) c];
2625 if (isSmallCapital [(int) c]) // grep "SMALL CAPITAL"
2627 if (isUppercase [(int) c]) // DerivedCoreProperties
// Variant of IsIgnorable that consults unicodeAge [i] (threshold 3.1)
// and the character's UnicodeCategory.
// NOTE(review): another IsIgnorable (int) with the same signature follows
// in this file, so presumably only one of the two is compiled — confirm.
// The results returned by each branch are not visible in this excerpt.
2637 static bool IsIgnorable (int i)
2639 if (unicodeAge [i] >= 3.1)
2641 switch (char.GetUnicodeCategory ((char) i)) {
2642 case UnicodeCategory.OtherNotAssigned:
2643 case UnicodeCategory.Format:
// Classifies code point 'i' using, in order: an explicit list of code
// points, a second list of exceptions, a long chain of code point ranges,
// and finally the character's UnicodeCategory.
// NOTE(review): the statements carrying each branch's result (and the
// switch/if headers) are not visible in this excerpt; consult the full
// file for which groups mean "ignorable" and which mean "not ignorable".
2650 // FIXME: In the future use DerivedAge.txt to examine character
2651 // versions and set those ones that have higher version than
2652 // 1.0 as ignorable.
2653 static bool IsIgnorable (int i)
2657 // I guess, those characters are added between
2658 // Unicode 1.0 (LCMapString) and Unicode 3.1
2659 // (UnicodeCategory), so they used to be
2660 // something like OtherNotAssigned as of Unicode 1.1.
2661 case 0x2df: case 0x387:
2662 case 0x3d7: case 0x3d8: case 0x3d9:
2663 case 0x3f3: case 0x3f4: case 0x3f5: case 0x3f6:
2664 case 0x400: case 0x40d: case 0x450: case 0x45d:
2665 case 0x587: case 0x58a: case 0x5c4: case 0x640:
2666 case 0x653: case 0x654: case 0x655: case 0x66d:
2668 case 0x1e9b: case 0x202f: case 0x20ad:
2669 case 0x20ae: case 0x20af:
2670 case 0x20e2: case 0x20e3:
2671 case 0x2139: case 0x213a: case 0x2183:
2672 case 0x2425: case 0x2426: case 0x2619:
2673 case 0x2670: case 0x2671: case 0x3007:
2674 case 0x3190: case 0x3191:
2675 case 0xfffc: case 0xfffd:
2677 // exceptional characters filtered by the
2678 // following conditions. Originally those exceptional
2679 // ranges are incorrect (they should not be ignored)
2680 // and most of those characters are unfortunately in
2682 case 0x4d8: case 0x4d9:
2683 case 0x4e8: case 0x4e9:
2684 case 0x3036: case 0x303f:
2685 case 0x337b: case 0xfb1e:
2690 // The whole Sinhala characters.
2691 0x0D82 <= i && i <= 0x0DF4
2692 // The whole Tibetan characters.
2693 || 0x0F00 <= i && i <= 0x0FD1
2694 // The whole Myanmar characters.
2695 || 0x1000 <= i && i <= 0x1059
2696 // The whole Ethiopic, Cherokee,
2697 // Canadian Syllabic, Ogham, Runic,
2698 // Tagalog, Hanunoo, Philippine,
2699 // Buhid, Tagbanwa, Khmer and Mongolian
2701 || 0x1200 <= i && i <= 0x1DFF
2702 // Greek extension characters.
2703 || 0x1F00 <= i && i <= 0x1FFF
2704 // The whole Braille characters.
2705 || 0x2800 <= i && i <= 0x28FF
2706 // CJK radical characters.
2707 || 0x2E80 <= i && i <= 0x2EF3
2708 // Kangxi radical characters.
2709 || 0x2F00 <= i && i <= 0x2FD5
2710 // Ideographic description characters.
2711 || 0x2FF0 <= i && i <= 0x2FFB
2712 // Bopomofo letter and final
2713 || 0x31A0 <= i && i <= 0x31B7
2714 // White square with quadrant characters.
2715 || 0x25F0 <= i && i <= 0x25F7
2716 // Ideographic telegraph symbols.
2717 || 0x32C0 <= i && i <= 0x32CB
2718 || 0x3358 <= i && i <= 0x3370
2719 || 0x33E0 <= i && i <= 0x33FF
2720 // The whole YI characters.
2721 || 0xA000 <= i && i <= 0xA48C
2722 || 0xA490 <= i && i <= 0xA4C6
2723 // Armenian small ligatures
2724 || 0xFB13 <= i && i <= 0xFB17
2725 // hebrew, arabic, variation selector.
2726 || 0xFB1D <= i && i <= 0xFE2F
2727 // Arabic ligatures.
2728 || 0xFEF5 <= i && i <= 0xFEFC
2729 // FIXME: why are they excluded?
2730 || 0x01F6 <= i && i <= 0x01F9
2731 || 0x0218 <= i && i <= 0x0233
2732 || 0x02A9 <= i && i <= 0x02AD
2733 || 0x02EA <= i && i <= 0x02EE
2734 || 0x0349 <= i && i <= 0x036F
2735 || 0x0488 <= i && i <= 0x048F
2736 || 0x04D0 <= i && i <= 0x04FF
2737 || 0x0500 <= i && i <= 0x050F // actually it matters only for 2.0
2738 || 0x06D6 <= i && i <= 0x06ED
2739 || 0x06FA <= i && i <= 0x06FE
2740 || 0x2048 <= i && i <= 0x204D
2741 || 0x20e4 <= i && i <= 0x20ea
2742 || 0x213C <= i && i <= 0x214B
2743 || 0x21EB <= i && i <= 0x21FF
2744 || 0x22F2 <= i && i <= 0x22FF
2745 || 0x237B <= i && i <= 0x239A
2746 || 0x239B <= i && i <= 0x23CF
2747 || 0x24EB <= i && i <= 0x24FF
2748 || 0x2596 <= i && i <= 0x259F
2749 || 0x25F8 <= i && i <= 0x25FF
2750 || 0x2672 <= i && i <= 0x2689
2751 || 0x2768 <= i && i <= 0x2775
2752 || 0x27d0 <= i && i <= 0x27ff
2753 || 0x2900 <= i && i <= 0x2aff
2754 || 0x3033 <= i && i <= 0x303F
2755 || 0x31F0 <= i && i <= 0x31FF
2756 || 0x3250 <= i && i <= 0x325F
2757 || 0x32B1 <= i && i <= 0x32BF
2758 || 0x3371 <= i && i <= 0x337B
2759 || 0xFA30 <= i && i <= 0xFA6A
// Last resort: decide from the general category of the character.
2763 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
2765 case UnicodeCategory.PrivateUse:
2766 case UnicodeCategory.Surrogate:
2768 // ignored by nature
2769 case UnicodeCategory.Format:
2770 case UnicodeCategory.OtherNotAssigned:
2777 // To check IsIgnorable sanity, try the driver below under MS.NET.
2780 public static void Main ()
2782 for (int i = 0; i <= char.MaxValue; i++)
2783 Dump (i, IsIgnorable (i));
2786 static void Dump (int i, bool ignore)
2788 switch (Char.GetUnicodeCategory ((char) i)) {
2789 case UnicodeCategory.PrivateUse:
2790 case UnicodeCategory.Surrogate:
2791 return; // check nothing
2795 string s2 = new string ((char) i, 10);
2796 int ret = CultureInfo.InvariantCulture.CompareInfo.Compare (s1, s2, CompareOptions.IgnoreCase);
2797 if ((ret == 0) == ignore)
2799 Console.WriteLine ("{0} : {1:x} {2}", ignore ? "o" : "x", i, Char.GetUnicodeCategory ((char) i));
2802 #endregion // IsIgnorable
2804 #region IsIgnorableSymbol
// Classifies code point 'i' for symbol-ignoring comparison. It first
// consults IsIgnorable (i), then several explicit code point lists, a few
// ranges, the UnicodeCategory of the character, and finally the
// Char.IsLetterOrDigit / IsNumber / IsControl / IsSeparator /
// IsPunctuation / IsSymbol predicates.
// NOTE(review): the statements carrying each branch's result are not
// visible in this excerpt, so the outcome per group is not stated here.
2805 static bool IsIgnorableSymbol (int i)
2807 if (IsIgnorable (i))
2812 case 0x00b5: case 0x01C0: case 0x01C1:
2813 case 0x01C2: case 0x01C3: case 0x01F6:
2814 case 0x01F7: case 0x01F8: case 0x01F9:
2815 case 0x02D0: case 0x02EE: case 0x037A:
2816 case 0x03D7: case 0x03F3:
2817 case 0x0400: case 0x040d:
2818 case 0x0450: case 0x045d:
2819 case 0x048C: case 0x048D:
2820 case 0x048E: case 0x048F:
2821 case 0x0587: case 0x0640: case 0x06E5:
2822 case 0x06E6: case 0x06FA: case 0x06FB:
2823 case 0x06FC: case 0x093D: case 0x0950:
2824 case 0x1E9B: case 0x2139: case 0x3006:
2825 case 0x3033: case 0x3034: case 0x3035:
2826 case 0xFE7E: case 0xFE7F:
2828 case 0x16EE: case 0x16EF: case 0x16F0:
2830 case 0x2183: // ROMAN NUMERAL REVERSED ONE HUNDRED
2831 case 0x3007: // IDEOGRAPHIC NUMBER ZERO
2832 case 0x3038: // HANGZHOU NUMERAL TEN
2833 case 0x3039: // HANGZHOU NUMERAL TWENTY
2834 case 0x303a: // HANGZHOU NUMERAL THIRTY
2840 case 0x02B9: case 0x02BA: case 0x02C2:
2841 case 0x02C3: case 0x02C4: case 0x02C5:
2842 case 0x02C8: case 0x02CC: case 0x02CD:
2843 case 0x02CE: case 0x02CF: case 0x02D2:
2844 case 0x02D3: case 0x02D4: case 0x02D5:
2845 case 0x02D6: case 0x02D7: case 0x02DE:
2846 case 0x02E5: case 0x02E6: case 0x02E7:
2847 case 0x02E8: case 0x02E9:
2848 case 0x309B: case 0x309C:
2850 case 0x055A: // Armenian Apos
2851 case 0x05C0: // Hebrew Punct
2852 case 0x0E4F: // Thai FONGMAN
2853 case 0x0E5A: // Thai ANGKHANKHU
2854 case 0x0E5B: // Thai KHOMUT
2856 case 0x09F2: // Bengali Rupee Mark
2857 case 0x09F3: // Bengali Rupee Sign
2859 case 0x221e: // INF.
// Note the mixed bounds below: the first and third ranges exclude their
// upper limit ('<'), the second includes it ('<=').
2868 if (0xFE70 <= i && i < 0xFE7C // ARABIC LIGATURES B
2870 || 0x0501 <= i && i <= 0x0510 // CYRILLIC KOMI
2871 || 0xFA30 <= i && i < 0xFA70 // CJK COMPAT
2876 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
2878 case UnicodeCategory.Surrogate:
2879 return false; // inconsistent
2881 case UnicodeCategory.SpacingCombiningMark:
2882 case UnicodeCategory.EnclosingMark:
2883 case UnicodeCategory.NonSpacingMark:
2884 case UnicodeCategory.PrivateUse:
2886 if (0x064B <= i && i <= 0x0652) // Arabic
2890 case UnicodeCategory.Format:
2891 case UnicodeCategory.OtherNotAssigned:
2898 // latin in a circle
2899 0x249A <= i && i <= 0x24E9
2900 || 0x2100 <= i && i <= 0x2132
2902 || 0x3196 <= i && i <= 0x31A0
2904 || 0x3200 <= i && i <= 0x321C
2906 || 0x322A <= i && i <= 0x3243
2908 || 0x3260 <= i && i <= 0x32B0
2909 || 0x32D0 <= i && i <= 0x3357
2910 || 0x337B <= i && i <= 0x33DD
2912 use = !Char.IsLetterOrDigit ((char) i);
2916 // This "Digit" rule is mystery.
2917 // It filters some symbols out.
2918 if (Char.IsLetterOrDigit ((char) i))
2920 if (Char.IsNumber ((char) i))
2922 if (Char.IsControl ((char) i)
2923 || Char.IsSeparator ((char) i)
2924 || Char.IsPunctuation ((char) i))
2926 if (Char.IsSymbol ((char) i))
2929 // FIXME: should check more
2934 // To check IsIgnorableSymbol sanity, try the driver below under MS.NET.
2936 public static void Main ()
2938 CompareInfo ci = CultureInfo.InvariantCulture.CompareInfo;
2939 for (int i = 0; i <= char.MaxValue; i++) {
2940 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
2941 if (uc == UnicodeCategory.Surrogate)
2944 bool ret = IsIgnorableSymbol (i);
2946 string s1 = "TEST ";
2947 string s2 = "TEST " + (char) i;
2949 int result = ci.Compare (s1, s2, CompareOptions.IgnoreSymbols);
2951 if (ret != (result == 0))
2952 Console.WriteLine ("{0} : {1:x}[{2}]({3})",
2953 ret ? "should not ignore" :
// Classifies code point 'i' for nonspacing-mark-ignoring comparison.
// It consults IsIgnorable (i), two explicit code point lists and two
// groups of ranges; anything left over is decided by whether the
// character's UnicodeCategory is NonSpacingMark.
// NOTE(review): the result returned for each list/range group is on
// lines that are not visible in this excerpt.
2962 static bool IsIgnorableNonSpacing (int i)
2964 if (IsIgnorable (i))
2968 case 0x02C8: case 0x02DE: case 0x0559: case 0x055A:
2969 case 0x05C0: case 0x0ABD: case 0x0CD5: case 0x0CD6:
2970 case 0x309B: case 0x309C: case 0xFF9E: case 0xFF9F:
2972 case 0x02D0: case 0x0670: case 0x0901: case 0x0902:
2973 case 0x094D: case 0x0962: case 0x0963: case 0x0A41:
2974 case 0x0A42: case 0x0A47: case 0x0A48: case 0x0A4B:
2975 case 0x0A4C: case 0x0A81: case 0x0A82: case 0x0B82:
2976 case 0x0BC0: case 0x0CBF: case 0x0CC6: case 0x0CCC:
2977 case 0x0CCD: case 0x0E4E:
2981 if (0x02b9 <= i && i <= 0x02c5
2982 || 0x02cc <= i && i <= 0x02d7
2983 || 0x02e4 <= i && i <= 0x02ef
2984 || 0x20DD <= i && i <= 0x20E0
// The literal 0x00652 below is simply 0x0652 written with an extra
// leading zero.
2988 if (0x064B <= i && i <= 0x00652
2989 || 0x0941 <= i && i <= 0x0948
2990 || 0x0AC1 <= i && i <= 0x0ACD
2991 || 0x0C3E <= i && i <= 0x0C4F
2992 || 0x0E31 <= i && i <= 0x0E3F
// Fallback: true exactly when the character is a NonSpacingMark.
2996 return Char.GetUnicodeCategory ((char) i) ==
2997 UnicodeCategory.NonSpacingMark;
3000 // We can reuse IsIgnorableSymbol testcode
3001 // for IsIgnorableNonSpacing.
// Members of CharMapEntry, the element type of the generator's 'map'
// array: a category byte, a level 2 byte and a 'Defined' flag, plus a
// constructor taking (category, level1, level2).
// NOTE(review): the type header, the field that receives 'level1' and
// most constructor assignments are not visible in this excerpt.
3007 public byte Category;
3009 public byte Level2; // It is always single byte.
3010 public bool Defined;
3012 public CharMapEntry (byte category, byte level1, byte level2)
3014 Category = category;
// Pairs a Unicode code point (CP) with its JIS code (JIS).
class JISCharacter
{
	public readonly int CP;
	public readonly int JIS;

	public JISCharacter (int cp, int cpJIS)
	{
		CP = cp;
		JIS = cpJIS;
	}
}

// Orders JISCharacter instances by ascending JIS code.
class JISComparer : IComparer
{
	public static readonly JISComparer Instance =
		new JISComparer ();

	// Returns a negative value when o1 has the smaller JIS code, zero
	// when both are equal, and a positive value otherwise.
	// Both arguments must be JISCharacter instances.
	public int Compare (object o1, object o2)
	{
		JISCharacter j1 = (JISCharacter) o1;
		JISCharacter j2 = (JISCharacter) o2;
		// The operands used to be swapped (j2 - j1), which made every
		// sort using this comparer come out in descending JIS order.
		// CompareTo also avoids the overflow a raw subtraction can hit.
		return j1.JIS.CompareTo (j2.JIS);
	}
}
// Pairs a code point (CP) with a name string (Name).
class NonJISCharacter
{
	public readonly int CP;
	public readonly string Name;

	public NonJISCharacter (int cp, string name)
	{
		this.CP = cp;
		this.Name = name;
	}
}
// Orders NonJISCharacter instances by ordinal comparison of their names.
class NonJISComparer : IComparer
{
	public static readonly NonJISComparer Instance = new NonJISComparer ();

	// Both arguments must be NonJISCharacter instances.
	public int Compare (object o1, object o2)
	{
		NonJISCharacter left = (NonJISCharacter) o1;
		NonJISCharacter right = (NonJISCharacter) o2;
		string leftName = left.Name;
		string rightName = right.Name;
		return string.CompareOrdinal (leftName, rightName);
	}
}
// Singleton IComparer for DictionaryEntry items whose Value is a decimal
// and whose Key is an int. Values are compared with Decimal.Compare.
// NOTE(review): the lines that use 'ret', 'i1' and 'i2' are not visible
// in this excerpt; presumably the keys break ties between equal values.
3071 class DecimalDictionaryValueComparer : IComparer
3073 public static readonly DecimalDictionaryValueComparer Instance
3074 = new DecimalDictionaryValueComparer ();
// Private constructor: instances are only reachable through Instance.
3076 private DecimalDictionaryValueComparer ()
3080 public int Compare (object o1, object o2)
3082 DictionaryEntry e1 = (DictionaryEntry) o1;
3083 DictionaryEntry e2 = (DictionaryEntry) o2;
3084 // FIXME: in case of 0, compare decomposition categories
3085 int ret = Decimal.Compare ((decimal) e1.Value, (decimal) e2.Value);
3088 int i1 = (int) e1.Key;
3089 int i2 = (int) e2.Key;
// Singleton IComparer for DictionaryEntry items whose Value is a string
// and whose Key is an int. Values are compared with String.Compare.
// NOTE(review): String.Compare (string, string) is culture-sensitive, so
// the resulting order can depend on the current culture — confirm this is
// intended for a table generator.
// NOTE(review): the lines that use 'ret', 'i1' and 'i2' are not visible
// in this excerpt; presumably the keys break ties between equal values.
3094 class StringDictionaryValueComparer : IComparer
3096 public static readonly StringDictionaryValueComparer Instance
3097 = new StringDictionaryValueComparer ();
// Private constructor: instances are only reachable through Instance.
3099 private StringDictionaryValueComparer ()
3103 public int Compare (object o1, object o2)
3105 DictionaryEntry e1 = (DictionaryEntry) o1;
3106 DictionaryEntry e2 = (DictionaryEntry) o2;
3107 int ret = String.Compare ((string) e1.Value, (string) e2.Value);
3110 int i1 = (int) e1.Key;
3111 int i2 = (int) e2.Key;
// Singleton IComparer for boxed chars based on CollationElementTable.
// The sort keys of both characters are walked pairwise, up to the
// shorter of the two key counts, comparing Primary, Secondary, Thirtiary
// and Quarternary weights in that order.
// NOTE(review): the checks on 'v' between the comparisons and whatever
// happens after the loop are not visible in this excerpt.
3116 class UCAComparer : IComparer
3118 public static readonly UCAComparer Instance
3119 = new UCAComparer ();
// Private constructor: instances are only reachable through Instance.
3121 private UCAComparer ()
3125 public int Compare (object o1, object o2)
3127 char i1 = (char) o1;
3128 char i2 = (char) o2;
3130 int l1 = CollationElementTable.GetSortKeyCount (i1);
3131 int l2 = CollationElementTable.GetSortKeyCount (i2);
// l = min (l1, l2): only positions present in both key lists are compared.
3132 int l = l1 > l2 ? l2 : l1;
3134 for (int i = 0; i < l; i++) {
3135 SortKeyValue k1 = CollationElementTable.GetSortKey (i1, i);
3136 SortKeyValue k2 = CollationElementTable.GetSortKey (i2, i);
3137 int v = k1.Primary - k2.Primary;
3140 v = k1.Secondary - k2.Secondary;
3143 v = k1.Thirtiary - k2.Thirtiary;
3146 v = k1.Quarternary - k2.Quarternary;
// Members of Tailoring: a list of tailoring map items plus the
// lcid / alias / frenchSort values exposed through properties.
// NOTE(review): the class header, the backing field declarations, the
// constructor bodies and the headers of the two read-only properties
// are not visible in this excerpt.
// Holds the DiacriticalMap / SortKeyMap / ReplacementMap items in the
// order they were added.
3159 ArrayList items = new ArrayList ();
3161 public Tailoring (int lcid)
3166 public Tailoring (int lcid, int alias)
3173 get { return lcid; }
3177 get { return alias; }
3180 public bool FrenchSort {
3181 get { return frenchSort; }
3182 set { frenchSort = value; }
// Appends a DiacriticalMap (target, replace) item.
3185 public void AddDiacriticalMap (byte target, byte replace)
3187 items.Add (new DiacriticalMap (target, replace));
// Appends a SortKeyMap (source, sortkey) item.
3190 public void AddSortKeyMap (string source, byte [] sortkey)
3192 items.Add (new SortKeyMap (source, sortkey));
// Appends a ReplacementMap (source, replace) item.
3195 public void AddReplacementMap (string source, string replace)
3197 items.Add (new ReplacementMap (source, replace));
// Concatenates ToCharArray () of every item, in insertion order, into a
// single char array.
3200 public char [] ItemToCharArray ()
3202 ArrayList al = new ArrayList ();
3203 foreach (ITailoringMap m in items)
3204 al.AddRange (m.ToCharArray ());
3205 return al.ToArray (typeof (char)) as char [];
// Implemented by every tailoring item that can be serialized into a
// char sequence.
interface ITailoringMap
{
	// Returns the item encoded as chars.
	char [] ToCharArray ();
}
// Tailoring item holding a (Target, Replace) byte pair.
class DiacriticalMap : ITailoringMap
{
	public readonly byte Target;
	public readonly byte Replace;

	public DiacriticalMap (byte target, byte replace)
	{
		this.Target = target;
		this.Replace = replace;
	}

	// Encodes the item as three chars: kind marker, Target, Replace.
	public char [] ToCharArray ()
	{
		return new char [] {
			(char) 02, // kind:DiacriticalMap
			(char) Target,
			(char) Replace
		};
	}
}
// Tailoring item that pairs a source string with sort key bytes.
class SortKeyMap : ITailoringMap
{
	public readonly string Source;
	public readonly byte [] SortKey;

	public SortKeyMap (string source, byte [] sortkey)
	{
		this.Source = source;
		this.SortKey = sortkey;
	}

	// Encodes the item as: kind marker, the chars of Source, one char
	// left at its default '\0', then the first five SortKey bytes.
	public char [] ToCharArray ()
	{
		int sourceLength = Source.Length;
		char [] encoded = new char [sourceLength + 7];
		int pos = 0;

		encoded [pos++] = (char) 01; // kind:SortKeyMap
		foreach (char ch in Source)
			encoded [pos++] = ch;

		// Skip one slot; it keeps the default '\0' as terminator.
		pos++;

		for (int k = 0; k < 5; k++)
			encoded [pos++] = (char) SortKey [k];
		return encoded;
	}
}
3258 class ReplacementMap : ITailoringMap
3260 public readonly string Source;
3261 public readonly string Replace;
3263 public ReplacementMap (string source, string replace)
3269 public char [] ToCharArray ()
3271 char [] ret = new char [Source.Length + Replace.Length + 3];
3272 ret [0] = (char) 03; // kind:ReplaceMap
3274 for (int i = 0; i < Source.Length; i++)
3275 ret [pos++] = Source [i];
3278 for (int i = 0; i < Replace.Length; i++)
3279 ret [pos++] = Replace [i];