3 // There are two kinds of sort keys: those which are computed and those which are laid out
4 // as an indexed array. Computed sort keys are:
9 // Also, for composite characters it should prepare a different index table.
11 // Though it is possible to "compute" level 3 weights, they are still dumped
12 // to an array to avoid execution cost.
16 // * sortkey getter signature
18 // int GetSortKey (string s, int index, SortKeyBuffer buf)
19 // Stores sort key for corresponding character element into buf and
20 // returns the length of the consumed _source_ character element in s.
22 // * character length to consume
24 // If there are characters whose primary weight is 0, they are consumed
25 // and considered as a part of the character element.
30 using System.Collections;
31 using System.Globalization;
35 namespace Mono.Globalization.Unicode
37 internal class MSCompatSortKeyTableGenerator
// Command-line entry point: creates a generator instance and passes the
// command-line arguments on to its Run method.
public static void Main (string [] args)
{
	MSCompatSortKeyTableGenerator generator =
		new MSCompatSortKeyTableGenerator ();
	generator.Run (args);
}
44 const int DecompositionWide = 1; // fixed
45 const int DecompositionSub = 2; // fixed
46 const int DecompositionSmall = 3;
47 const int DecompositionIsolated = 4;
48 const int DecompositionInitial = 5;
49 const int DecompositionFinal = 6;
50 const int DecompositionMedial = 7;
51 const int DecompositionNoBreak = 8;
52 const int DecompositionVertical = 9;
53 const int DecompositionFraction = 0xA;
54 const int DecompositionFont = 0xB;
55 const int DecompositionSuper = 0xC; // fixed
56 const int DecompositionFull = 0xE;
57 const int DecompositionNarrow = 0xD;
58 const int DecompositionCircle = 0xF;
59 const int DecompositionSquare = 0x10;
60 const int DecompositionCompat = 0x11;
61 const int DecompositionCanonical = 0x12;
63 TextWriter Result = Console.Out;
65 byte [] fillIndex = new byte [256]; // by category
66 CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1];
68 char [] specialIgnore = new char [] {
69 '\u3099', '\u309A', '\u309B', '\u309C', '\u0BCD',
70 '\u0E47', '\u0E4C', '\uFF9E', '\uFF9F'
73 // FIXME: need more love (as always)
74 char [] alphabets = new char [] {'A', 'B', 'C', 'D', 'E', 'F',
75 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
76 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
77 '\u0292', '\u01BE', '\u0298'};
78 byte [] alphaWeights = new byte [] {
79 2, 9, 0xA, 0x1A, 0x21,
80 0x23, 0x25, 0x2C, 0x32, 0x35,
81 0x36, 0x48, 0x51, 0x70, 0x7C,
82 0x7E, 0x89, 0x8A, 0x91, 0x99,
83 0x9F, 0xA2, 0xA4, 0xA6, 0xA7,
84 0xA9, 0xAA, 0xB3, 0xB4};
86 bool [] isSmallCapital = new bool [char.MaxValue + 1];
87 bool [] isUppercase = new bool [char.MaxValue + 1];
89 byte [] decompType = new byte [char.MaxValue + 1];
90 int [] decompIndex = new int [char.MaxValue + 1];
91 int [] decompLength = new int [char.MaxValue + 1];
93 decimal [] decimalValue = new decimal [char.MaxValue + 1];
95 byte [] diacritical = new byte [char.MaxValue + 1];
97 string [] diacritics = new string [] {
99 "WITH ACUTE;", "WITH GRAVE;", " DOT ABOVE;", " MIDDLE DOT;",
100 "WITH CIRCUMFLEX;", "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;",
101 " DIALYTIKA AND TONOS;", "WITH MACRON;", "WITH TILDE;", " RING ABOVE;",
102 " OGONEK;", " CEDILLA;",
103 " DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;",
104 " STROKE;", " CIRCUMFLEX AND ACUTE;",
105 " DIAERESIS AND ACUTE;", "WITH CIRCUMFLEX AND GRAVE;", " L SLASH;",
106 " DIAERESIS AND GRAVE;",
108 " CARON AND DOT ABOVE;", " BREVE AND GRAVE;",
109 " MACRON AND ACUTE;",
110 " MACRON AND GRAVE;",
111 " DIAERESIS AND CARON", " DOT ABOVE AND MACRON", " TILDE AND ACUTE",
112 " RING ABOVE AND ACUTE",
113 " DIAERESIS AND MACRON", " CEDILLA AND ACUTE", " MACRON AND DIAERESIS",
114 " CIRCUMFLEX AND TILDE",
115 " TILDE AND DIAERESIS",
118 " CEDILLA AND BREVE",
119 " OGONEK AND MACRON",
120 " HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;",
123 " PRECEDED BY APOSTROPHE",
125 " LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE",
128 " RETROFLEX;", "DIAERESIS BELOW",
130 " CIRCUMFLEX BELOW", "HORN AND ACUTE",
131 " BREVE BELOW;", " HORN AND GRAVE",
133 " DOT BELOW AND DOT ABOVE",
134 " RIGHT HALF RING", " HORN AND TILDE",
135 " CIRCUMFLEX AND DOT BELOW",
136 " BREVE AND DOT BELOW",
137 " DOT BELOW AND MACRON",
138 " HORN AND HOOK ABOVE",
140 // CIRCLED, PARENTHESIZED and so on
141 "CIRCLED DIGIT", "CIRCLED NUMBER", "CIRCLED LATIN", "CIRCLED KATAKANA",
142 "PARENTHESIZED DIGIT", "PARENTHESIZED NUMBER", "PARENTHESIZED LATIN",
144 byte [] diacriticWeights = new byte [] {
146 0xE, 0xF, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
147 0x17, 0x19, 0x1A, 0x1B, 0x1C,
148 0x1D, 0x1D, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F,
149 0x20, 0x21, 0x22, 0x22, 0x23, 0x24,
150 0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28,
151 0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30,
152 0x43, 0x43, 0x43, 0x44, 0x46, 0x48,
153 0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x5A,
154 0x60, 0x60, 0x61, 0x61, 0x63, 0x68,
155 0x69, 0x69, 0x6A, 0x6D, 0x6E,
157 // CIRCLED, PARENTHESIZED and so on.
158 0xEE, 0xEE, 0xEE, 0xEE, 0xF3, 0xF3, 0xF3, 0xF3
161 int [] numberSecondaryWeightBounds = new int [] {
162 0x660, 0x680, 0x6F0, 0x700, 0x960, 0x970,
163 0x9E0, 0x9F0, 0x9F4, 0xA00, 0xA60, 0xA70,
164 0xAE0, 0xAF0, 0xB60, 0xB70, 0xBE0, 0xC00,
165 0xC60, 0xC70, 0xCE0, 0xCF0, 0xD60, 0xD70,
166 0xE50, 0xE60, 0xED0, 0xEE0
169 char [] orderedCyrillic;
170 char [] orderedGurmukhi;
171 char [] orderedGujarati;
172 char [] orderedGeorgian;
173 char [] orderedThaana;
175 static readonly char [] orderedTamilConsonants = new char [] {
176 // based on traditional Tamil consonants, except for
177 // Grantha (where Microsoft breaks traditionalism).
178 // http://www.angelfire.com/empire/thamizh/padanGaL
179 '\u0B99', '\u0B9A', '\u0B9E', '\u0B9F', '\u0BA3',
180 '\u0BA4', '\u0BA8', '\u0BAA', '\u0BAE', '\u0BAF',
181 '\u0BB0', '\u0BB2', '\u0BB5', '\u0BB4', '\u0BB3',
182 '\u0BB1', '\u0BA9', '\u0B9C', '\u0BB8', '\u0BB7',
185 // cp -> character name (only for some characters)
186 ArrayList sortableCharNames = new ArrayList ();
188 // cp -> arrow value (int)
189 ArrayList arrowValues = new ArrayList ();
191 // cp -> box value (int)
192 ArrayList boxValues = new ArrayList ();
194 // cp -> level1 value
195 Hashtable arabicLetterPrimaryValues = new Hashtable ();
198 Hashtable arabicNameMap = new Hashtable ();
200 // cp -> Hashtable [decompType] -> cp
201 Hashtable nfkdMap = new Hashtable ();
203 // Latin letter -> ArrayList [int]
204 Hashtable latinMap = new Hashtable ();
206 ArrayList jisJapanese = new ArrayList ();
207 ArrayList nonJisJapanese = new ArrayList ();
209 ushort [] cjkJA = new ushort [char.MaxValue +1];// - 0x4E00];
210 ushort [] cjkCHS = new ushort [char.MaxValue +1];// - 0x3100];
211 ushort [] cjkCHT = new ushort [char.MaxValue +1];// - 0x4E00];
212 ushort [] cjkKO = new ushort [char.MaxValue +1];// - 0x4E00];
213 byte [] cjkKOlv2 = new byte [char.MaxValue +1];// - 0x4E00];
215 byte [] ignorableFlags = new byte [char.MaxValue + 1];
217 static double [] unicodeAge = new double [char.MaxValue + 1];
219 ArrayList tailorings = new ArrayList ();
// Drives the generator. args [0], when present, names the directory that
// holds the source data files; otherwise "downloaded" is used. Progress
// messages go to stderr, and a per-code-point diagnostic log is written to
// "agelog.txt" in the current directory.
221 void Run (string [] args)
223 string dirname = args.Length == 0 ? "downloaded" : args [0];
224 ParseSources (dirname);
225 Console.Error.WriteLine ("parse done.");
227 ModifyParsedValues ();
229 Console.Error.WriteLine ("generation done.");
231 Console.Error.WriteLine ("serialization done.");
// Diagnostic dump comparing IsIgnorable () with an expectation derived
// from each code point's Unicode category and age.
233 StreamWriter sw = new StreamWriter ("agelog.txt");
// NOTE(review): the bound stops before char.MaxValue, so U+FFFF is never
// logged; confirm that leaving it out is intended.
234 for (int i = 0; i < char.MaxValue; i++) {
235 bool shouldBe = false;
236 switch (Char.GetUnicodeCategory ((char) i)) {
237 case UnicodeCategory.Format: case UnicodeCategory.OtherNotAssigned:
238 shouldBe = true; break;
240 if (unicodeAge [i] >= 3.1)
242 //if (IsIgnorable (i) != shouldBe)
// Columns: age, IsIgnorable, IsIgnorableSymbol, code point (hex), Unicode
// category, and '!' when IsIgnorable disagrees with shouldBe.
243 sw.WriteLine ("{1} {2} {3} {0:X04} {4} {5}", i, unicodeAge [i], IsIgnorable (i), IsIgnorableSymbol (i), char.GetUnicodeCategory ((char) i), IsIgnorable (i) != shouldBe ? '!' : ' ');
// Byte-table convenience wrapper: forwards to
// CodePointIndexer.CompressArray with byte as the element type and casts
// the result back to byte [].
byte [] CompressArray (byte [] source, CodePointIndexer i)
{
	object packed = CodePointIndexer.CompressArray (
		source, typeof (byte), i);
	return (byte []) packed;
}
// ushort-table convenience wrapper: forwards to
// CodePointIndexer.CompressArray with ushort as the element type and casts
// the result back to ushort [].
ushort [] CompressArray (ushort [] source, CodePointIndexer i)
{
	object packed = CodePointIndexer.CompressArray (
		source, typeof (ushort), i);
	return (ushort []) packed;
}
264 SerializeTailorings ();
266 byte [] categories = new byte [map.Length];
267 byte [] level1 = new byte [map.Length];
268 byte [] level2 = new byte [map.Length];
269 byte [] level3 = new byte [map.Length];
270 int [] widthCompat = new int [map.Length];
271 for (int i = 0; i < map.Length; i++) {
272 categories [i] = map [i].Category;
273 level1 [i] = map [i].Level1;
274 level2 [i] = map [i].Level2;
275 level3 [i] = ComputeLevel3Weight ((char) i);
276 switch (decompType [i]) {
277 case DecompositionNarrow:
278 case DecompositionWide:
279 case DecompositionSuper:
280 case DecompositionSub:
281 // they are always 1 char
282 widthCompat [i] = decompValues [decompIndex [i]];
288 ignorableFlags = CompressArray (ignorableFlags,
289 MSCompatUnicodeTableUtil.Ignorable);
290 categories = CompressArray (categories,
291 MSCompatUnicodeTableUtil.Category);
292 level1 = CompressArray (level1,
293 MSCompatUnicodeTableUtil.Level1);
294 level2 = CompressArray (level2,
295 MSCompatUnicodeTableUtil.Level2);
296 level3 = CompressArray (level3,
297 MSCompatUnicodeTableUtil.Level3);
298 widthCompat = (int []) CodePointIndexer.CompressArray (
299 widthCompat, typeof (int),
300 MSCompatUnicodeTableUtil.WidthCompat);
301 cjkCHS = CompressArray (cjkCHS,
302 MSCompatUnicodeTableUtil.CjkCHS);
303 cjkCHT = CompressArray (cjkCHT,
304 MSCompatUnicodeTableUtil.Cjk);
305 cjkJA = CompressArray (cjkJA,
306 MSCompatUnicodeTableUtil.Cjk);
307 cjkKO = CompressArray (cjkKO,
308 MSCompatUnicodeTableUtil.Cjk);
309 cjkKOlv2 = CompressArray (cjkKOlv2,
310 MSCompatUnicodeTableUtil.Cjk);
313 Result.WriteLine ("static byte [] ignorableFlags = new byte [] {");
314 for (int i = 0; i < ignorableFlags.Length; i++) {
315 byte value = ignorableFlags [i];
317 Result.Write ("{0},", value);
319 Result.Write ("0x{0:X02},", value);
320 if ((i & 0xF) == 0xF)
321 Result.WriteLine ("// {0:X04}", i - 0xF);
323 Result.WriteLine ("};");
327 Result.WriteLine ("static byte [] categories = new byte [] {");
328 for (int i = 0; i < categories.Length; i++) {
329 byte value = categories [i];
331 Result.Write ("{0},", value);
333 Result.Write ("0x{0:X02},", value);
334 if ((i & 0xF) == 0xF)
335 Result.WriteLine ("// {0:X04}", i - 0xF);
337 Result.WriteLine ("};");
340 // Primary weight value
341 Result.WriteLine ("static byte [] level1 = new byte [] {");
342 for (int i = 0; i < level1.Length; i++) {
343 byte value = level1 [i];
345 Result.Write ("{0},", value);
347 Result.Write ("0x{0:X02},", value);
348 if ((i & 0xF) == 0xF)
349 Result.WriteLine ("// {0:X04}", i - 0xF);
351 Result.WriteLine ("};");
355 Result.WriteLine ("static byte [] level2 = new byte [] {");
356 for (int i = 0; i < level2.Length; i++) {
357 int value = level2 [i];
359 Result.Write ("{0},", value);
361 Result.Write ("0x{0:X02},", value);
362 if ((i & 0xF) == 0xF)
363 Result.WriteLine ("// {0:X04}", i - 0xF);
365 Result.WriteLine ("};");
369 Result.WriteLine ("static byte [] level3 = new byte [] {");
370 for (int i = 0; i < level3.Length; i++) {
371 byte value = level3 [i];
373 Result.Write ("{0},", value);
375 Result.Write ("0x{0:X02},", value);
376 if ((i & 0xF) == 0xF)
377 Result.WriteLine ("// {0:X04}", i - 0xF);
379 Result.WriteLine ("};");
382 // Width insensitivity mappings
383 // (for now it is more lightweight than dumping the
384 // entire NFKD table).
385 Result.WriteLine ("static int [] widthCompat = new int [] {");
386 for (int i = 0; i < widthCompat.Length; i++) {
387 int value = widthCompat [i];
389 Result.Write ("{0},", value);
391 Result.Write ("0x{0:X02},", value);
392 if ((i & 0xF) == 0xF)
393 Result.WriteLine ("// {0:X04}", i - 0xF);
395 Result.WriteLine ("};");
399 SerializeCJK ("cjkCHS", cjkCHS, char.MaxValue);
400 SerializeCJK ("cjkCHT", cjkCHT, 0x9FB0);
401 SerializeCJK ("cjkJA", cjkJA, 0x9FB0);
402 SerializeCJK ("cjkKO", cjkKO, 0x9FB0);
403 SerializeCJK ("cjkKOlv2", cjkKOlv2, 0x9FB0);
// Writes the ushort table "cjk" to Result as the source text of a static
// ushort array field called "name". Entries are written either as plain
// numbers or as 4-digit 0x-prefixed hex, and after every 16th entry a
// trailing comment with the index of the row's first entry ends the line.
406 void SerializeCJK (string name, ushort [] cjk, int max)
// offset is always 0 here; the former computation is kept as a comment.
408 int offset = 0;//char.MaxValue - cjk.Length;
409 Result.WriteLine ("static ushort [] {0} = new ushort [] {{", name);
410 for (int i = 0; i < cjk.Length; i++) {
// NOTE(review): presumably output stops once index "max" is reached --
// confirm against the statement guarded by this test.
411 if (i + offset == max)
413 ushort value = cjk [i];
415 Result.Write ("{0},", value);
417 Result.Write ("0x{0:X04},", value);
418 if ((i & 0xF) == 0xF)
419 Result.WriteLine ("// {0:X04}", i - 0xF + offset);
421 Result.WriteLine ("};");
// Byte-table counterpart of the ushort overload: writes "cjk" to Result as
// the source text of a static byte array field called "name". Entries are
// written either as plain numbers or as 2-digit 0x-prefixed hex, and after
// every 16th entry a trailing comment with the index of the row's first
// entry ends the line.
425 void SerializeCJK (string name, byte [] cjk, int max)
// offset is always 0 here; the former computation is kept as a comment.
427 int offset = 0;//char.MaxValue - cjk.Length;
428 Result.WriteLine ("static byte [] {0} = new byte [] {{", name);
429 for (int i = 0; i < cjk.Length; i++) {
// NOTE(review): presumably output stops once index "max" is reached --
// confirm against the statement guarded by this test.
430 if (i + offset == max)
432 byte value = cjk [i];
434 Result.Write ("{0},", value);
436 Result.Write ("0x{0:X02},", value);
437 if ((i & 0xF) == 0xF)
438 Result.WriteLine ("// {0:X04}", i - 0xF + offset);
440 Result.WriteLine ("};");
// Writes two generated tables to Result:
//  - "tailorings": the characters returned by ItemToCharArray () for each
//    Tailoring, written back to back, each group preceded by a comment
//    holding its LCID;
//  - "tailoringInfos": one TailoringInfo per Tailoring, holding its LCID,
//    the start index and length of its character run, and the FrenchSort
//    flag.
444 void SerializeTailorings ()
// LCID -> start index of the tailoring's run inside the char table.
446 Hashtable indexes = new Hashtable ();
// LCID -> number of chars in that run.
447 Hashtable counts = new Hashtable ();
448 Result.WriteLine ("static char [] tailorings = new char [] {");
450 foreach (Tailoring t in tailorings) {
453 Result.Write ("/*{0}*/", t.LCID);
454 indexes.Add (t.LCID, count);
455 char [] values = t.ItemToCharArray ();
456 counts.Add (t.LCID, values.Length);
457 foreach (char c in values) {
458 Result.Write ("'\\x{0:X}', ", (int) c);
// Break the line after every 16th char and note the row's first index.
459 if (++count % 16 == 0)
460 Result.WriteLine (" // {0:X04}", count - 16);
463 Result.WriteLine ("};");
465 Result.WriteLine ("static TailoringInfo [] tailoringInfos = new TailoringInfo [] {");
466 foreach (Tailoring t in tailorings) {
// An alias entry (Alias != 0) reuses the character run of its target LCID.
467 int target = t.Alias != 0 ? t.Alias : t.LCID;
468 if (!indexes.ContainsKey (target)) {
469 Console.Error.WriteLine ("WARNING: no corresponding definition for tailoring alias. From {0} to {1}", t.LCID, t.Alias);
472 int idx = (int) indexes [target];
473 int cnt = (int) counts [target];
474 bool french = t.FrenchSort;
// NOTE(review): this lookup matches t2.LCID against t.LCID rather than
// against the alias target; confirm that is the intended comparison.
476 foreach (Tailoring t2 in tailorings)
477 if (t2.LCID == t.LCID)
478 french = t2.FrenchSort;
479 Result.WriteLine ("new TailoringInfo ({0}, 0x{1:X}, {2}, {3}), ", t.LCID, idx, cnt, french ? "true" : "false");
481 Result.WriteLine ("};");
// Builds the paths of the input data files below "dirname" and runs the
// individual parsers over them. The tailoring source is read from
// "mono-tailoring-source.txt", resolved against the current directory
// rather than "dirname".
486 void ParseSources (string dirname)
489 dirname + "/UnicodeData.txt";
490 string derivedCoreProps =
491 dirname + "/DerivedCoreProperties.txt";
493 dirname + "/Scripts.txt";
495 dirname + "/CP932.TXT";
497 dirname + "/DerivedAge.txt";
498 string chXML = dirname + "/common/collation/zh.xml";
499 string jaXML = dirname + "/common/collation/ja.xml";
500 string koXML = dirname + "/common/collation/ko.xml";
502 ParseDerivedAge (derivedAge);
506 ParseJISOrder (cp932); // must run before ParseUnidata ()
507 ParseUnidata (unidata);
508 ParseDerivedCoreProperties (derivedCoreProps);
509 ParseScripts (scripts);
510 ParseCJK (chXML, jaXML, koXML);
512 ParseTailorings ("mono-tailoring-source.txt");
// Reads the tailoring source file line by line and hands every trimmed
// line to ProcessTailoringLine, which receives the tailoring currently
// being built by reference. When an exception is caught, the current line
// number is reported on stderr.
515 void ParseTailorings (string filename)
519 using (StreamReader sr = new StreamReader (filename)) {
521 while (sr.Peek () >= 0) {
523 ProcessTailoringLine (ref t,
524 sr.ReadLine ().Trim ());
526 } catch (Exception) {
527 Console.Error.WriteLine ("ERROR at line {0}", line);
533 // For now this is enough.
// Converts a value from the tailoring source into the string it denotes,
// turning "\uXXXX" escapes (four hex digits) into the corresponding char.
// Returns the accumulated string.
534 string ParseTailoringSourceValue (string s)
536 StringBuilder sb = new StringBuilder ();
537 for (int i = 0; i < s.Length; i++) {
// NOTE(review): StartsWith and Substring (2, 4) look at the start of "s",
// not at position i, so this relies on how the handled escape is consumed
// afterwards -- confirm that escapes after the first char are handled.
538 if (s.StartsWith ("\\u")) {
539 sb.Append ((char) int.Parse (
540 s.Substring (2, 4), NumberStyles.HexNumber),
547 return sb.ToString ();
// Interprets one line of the tailoring source and applies it to "t", the
// tailoring currently being built (replaced through the ref parameter when
// a line starts a new tailoring).
//
// Line forms handled by the visible code:
//  - text after '#' is a comment; empty lines are ignored;
//  - a tailoring header: one marker character followed by an LCID, or by
//    "LCID=number" (the two-integer constructor form);
//  - "*FrenchSort";
//  - "*Diacritical XX -> YY" with hex byte values, added via
//    AddDiacriticalMap;
//  - "source : b0 b1 ..." with up to five hex bytes, added via
//    AddSortKeyMap;
//  - "source = replacement", added via AddReplacementMap.
// NOTE(review): which marker character selects the header forms is not
// shown by these lines; verify before relying on the description above.
550 void ProcessTailoringLine (ref Tailoring t, string s)
552 int idx = s.IndexOf ('#');
554 s = s.Substring (0, idx).Trim ();
555 if (s.Length == 0 || s [0] == '#')
558 idx = s.IndexOf ('=');
// Substring (1, ...) skips the leading marker character.
561 int.Parse (s.Substring (1, idx - 1)),
562 int.Parse (s.Substring (idx + 1)));
564 t = new Tailoring (int.Parse (s.Substring (1)));
568 if (s.StartsWith ("*FrenchSort")) {
572 string d = "*Diacritical";
573 if (s.StartsWith (d)) {
574 idx = s.IndexOf ("->");
575 t.AddDiacriticalMap (
576 byte.Parse (s.Substring (d.Length, idx - d.Length).Trim (),
577 NumberStyles.HexNumber),
578 byte.Parse (s.Substring (idx + 2).Trim (),
579 NumberStyles.HexNumber));
582 idx = s.IndexOf (':');
584 string source = s.Substring (0, idx).Trim ();
585 string [] l = s.Substring (idx + 1).Trim ().Split (' ');
// Sort key bytes for the mapped source, parsed as hex.
586 byte [] b = new byte [5];
587 for (int i = 0; i < 5; i++) {
591 b [i] = byte.Parse (l [i],
592 NumberStyles.HexNumber);
594 t.AddSortKeyMap (ParseTailoringSourceValue (source),
597 idx = s.IndexOf ('=');
599 t.AddReplacementMap (
600 ParseTailoringSourceValue (
601 s.Substring (0, idx).Trim ()),
602 ParseTailoringSourceValue (
603 s.Substring (idx + 1).Trim ()));
// Reads DerivedAge.txt and stores, for every BMP code point it lists, the
// age value of its entry in unicodeAge.
//
// A data line has the form "XXXX ; age" or "XXXX..YYYY ; age"; text after
// '#' is a comment. Lines without ';' are skipped, as are entries starting
// above U+FFFF. The age is parsed with the invariant culture so that a
// value such as "3.1" means the same on every machine, whatever the
// current locale's decimal separator is.
void ParseDerivedAge (string filename)
{
	NumberStyles nf = NumberStyles.HexNumber |
		NumberStyles.AllowTrailingWhite;
	using (StreamReader file =
		new StreamReader (filename)) {
		while (file.Peek () >= 0) {
			string s = file.ReadLine ();
			int idx = s.IndexOf ('#');
			if (idx >= 0)
				s = s.Substring (0, idx);
			idx = s.IndexOf (';');
			if (idx < 0)
				continue;

			string cpspec = s.Substring (0, idx);
			idx = cpspec.IndexOf ("..");
			int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
			int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
			string value = s.Substring (cpspec.Length + 1).Trim ();

			// Only the BMP is tracked.
			if (cp > char.MaxValue)
				continue;
			// Keep a range that crosses the BMP boundary inside the table.
			if (cpEnd > char.MaxValue)
				cpEnd = char.MaxValue;

			// Parse once per line, independent of the current culture.
			double age = double.Parse (value, CultureInfo.InvariantCulture);
			for (int i = cp; i <= cpEnd; i++)
				unicodeAge [i] = age;
		}
	}
	unicodeAge [0] = double.MaxValue; // never be supported
}
// Reads the UnicodeData.txt file line by line through ProcessUnidataLine,
// which appends decomposition code points to a shared list. When an
// exception is caught the 1-based line number is reported on stderr.
// Finally the collected decomposition values are stored in the
// decompValues field as an int array.
638 void ParseUnidata (string filename)
// Local list shadowing the field of the same name until parsing is done.
640 ArrayList decompValues = new ArrayList ();
641 using (StreamReader unidata =
642 new StreamReader (filename)) {
643 for (int line = 1; unidata.Peek () >= 0; line++) {
645 ProcessUnidataLine (unidata.ReadLine (), decompValues);
646 } catch (Exception) {
647 Console.Error.WriteLine ("**** At line " + line);
652 this.decompValues = (int [])
653 decompValues.ToArray (typeof (int));
// Processes one line of UnicodeData.txt (fields separated by ';', text
// after '#' dropped) and records what later stages need about the code
// point "cp" in the first field:
//  - isSmallCapital and latinMap, derived from the character name;
//  - arrowValues (U+2000..U+2FFF) and boxValues (U+2500..U+25AF);
//  - sortableCharNames for some symbol ranges;
//  - diacritical weights looked up by name fragments;
//  - Arabic letter primary values (U+0621..U+064A);
//  - nonJisJapanese for the square letters U+3300..U+3357;
//  - decompType / decompIndex / decompLength / nfkdMap from the
//    decomposition field, appending the code points to "decompValues";
//  - decimalValue from the numeric fields.
// NOTE(review): most name tests search the whole line "s" with
// IndexOf (...) > 0, not just the name field; confirm no other field can
// match.
656 void ProcessUnidataLine (string s, ArrayList decompValues)
658 int idx = s.IndexOf ('#');
660 s = s.Substring (0, idx);
661 idx = s.IndexOf (';');
664 int cp = int.Parse (s.Substring (0, idx), NumberStyles.HexNumber);
// values [0] is the character name; later indexes follow the file's
// field order, shifted by one because the code point field was cut off.
665 string [] values = s.Substring (idx + 1).Split (';');
668 if (cp > char.MaxValue)
670 if (IsIgnorable (cp))
673 string name = values [0];
676 if (s.IndexOf ("SMALL CAPITAL") > 0)
677 isSmallCapital [cp] = true;
679 // latin mapping by character name
680 if (s.IndexOf ("LATIN") > 0) {
681 int lidx = s.IndexOf ("LETTER DOTLESS ");
// 15 is the length of "LETTER DOTLESS ".
682 int offset = lidx + 15;
684 lidx = s.IndexOf ("LETTER TURNED ");
688 lidx = s.IndexOf ("LETTER ");
// The base letter counts only when it is a single A-Z character followed
// by a space or the end of the line.
691 char c = lidx > 0 ? s [offset] : char.MinValue;
692 if ('A' <= c && c <= 'Z' &&
693 (s.Length == offset + 1 || s [offset + 1] == ' ')) {
694 ArrayList entry = (ArrayList) latinMap [c];
696 entry = new ArrayList ();
697 latinMap [c] = entry;
704 if (0x2000 <= cp && cp < 0x3000) {
706 // SPECIAL CASES. FIXME: why?
708 case 0x21C5: value = -1; break; // E2
709 case 0x261D: value = 1; break;
710 case 0x27A6: value = 3; break;
711 case 0x21B0: value = 7; break;
712 case 0x21B1: value = 3; break;
713 case 0x21B2: value = 7; break;
714 case 0x21B4: value = 5; break;
715 case 0x21B5: value = 7; break;
716 case 0x21B9: value = -1; break; // E1
717 case 0x21CF: value = 7; break;
718 case 0x21D0: value = 3; break;
720 string [] arrowTargets = new string [] {
// The search starts at index 1 and runs only while no special-case value
// was assigned above.
732 for (int i = 1; value == 0 && i < arrowTargets.Length; i++)
733 if (s.IndexOf (arrowTargets [i]) > 0 &&
734 s.IndexOf ("BARB " + arrowTargets [i]) < 0 &&
735 s.IndexOf (" OVER") < 0
739 arrowValues.Add (new DictionaryEntry (
744 if (0x2500 <= cp && cp < 0x25B0) {
747 // up:1 down:2 right:4 left:8 vert:16 horiz:32
750 // [dr] [dl] [ur] [ul]
754 ArrayList flags = new ArrayList (new int [] {
757 4 + 2, 8 + 2, 4 + 1, 8 + 1,
758 16 + 4, 1 + 2 + 4, 16 + 8, 1 + 2 + 8,
759 32 + 2, 4 + 8 + 2, 32 + 1, 4 + 8 + 1,
760 16 + 32, 1 + 2 + 4 + 8, 4 + 8 + 16, 1 + 2 + 32
762 byte [] offsets = new byte [] {
769 if (s.IndexOf ("BOX DRAWINGS ") > 0) {
771 if (s.IndexOf (" UP") > 0)
773 if (s.IndexOf (" DOWN") > 0)
775 if (s.IndexOf (" RIGHT") > 0)
777 if (s.IndexOf (" LEFT") > 0)
779 if (s.IndexOf (" VERTICAL") > 0)
781 if (s.IndexOf (" HORIZONTAL") > 0)
// A flag combination missing from "flags" yields the negative IndexOf
// result itself as the value.
784 int fidx = flags.IndexOf (flag);
785 value = fidx < 0 ? fidx : offsets [fidx];
786 } else if (s.IndexOf ("BLOCK") > 0) {
787 if (s.IndexOf ("ONE EIGHTH") > 0)
789 else if (s.IndexOf ("ONE QUARTER") > 0)
791 else if (s.IndexOf ("THREE EIGHTHS") > 0)
793 else if (s.IndexOf ("HALF") > 0)
795 else if (s.IndexOf ("FIVE EIGHTHS") > 0)
797 else if (s.IndexOf ("THREE QUARTERS") > 0)
799 else if (s.IndexOf ("SEVEN EIGHTHS") > 0)
805 boxValues.Add (new DictionaryEntry (
809 // For some characters store the name and sort later
810 // to determine sorting.
811 if (0x2100 <= cp && cp <= 0x213F &&
812 Char.IsSymbol ((char) cp))
813 sortableCharNames.Add (
814 new DictionaryEntry (cp, values [0]));
815 else if (0x3380 <= cp && cp <= 0x33DD)
// Substring (7) drops the first seven characters of the name.
816 sortableCharNames.Add (new DictionaryEntry (
817 cp, values [0].Substring (7)));
819 // diacritical weights by character name
// Every matching name fragment ORs its weight into the entry.
820 for (int d = 0; d < diacritics.Length; d++)
821 if (s.IndexOf (diacritics [d]) > 0)
822 diacritical [cp] |= diacriticWeights [d];
823 // Two-step grep required for it.
824 if (s.IndexOf ("FULL STOP") > 0 &&
825 (s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0))
826 diacritical [cp] |= 0xF4;
828 // Arabic letter name
829 if (0x0621 <= cp && cp <= 0x064A &&
830 Char.GetUnicodeCategory ((char) cp)
831 == UnicodeCategory.OtherLetter) {
// Default primary value: four apart per distinct letter name seen so far,
// starting at 0x0B.
832 byte value = (byte) (arabicNameMap.Count * 4 + 0x0B);
837 // hamza, waw, yeh ... special cases.
842 value = 0x77; // special cases.
845 // Get primary letter name i.e.
846 // XXX part of ARABIC LETTER XXX yyy
847 // e.g. that of "TEH MARBUTA" is "TEH".
850 // 0x0640 is special: it does
851 // not start with ARABIC LETTER
// 14 is the length of "ARABIC LETTER ".
853 values [0].Substring (14);
854 int tmpIdx = letterName.IndexOf (' ');
855 letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
856 //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
// A letter whose primary name was seen before reuses that letter's value.
857 if (arabicNameMap.ContainsKey (letterName))
858 value = (byte) arabicLetterPrimaryValues [arabicNameMap [letterName]];
860 arabicNameMap [letterName] = cp;
863 arabicLetterPrimaryValues [cp] = value;
866 // Japanese square letter
867 if (0x3300 <= cp && cp <= 0x3357)
869 nonJisJapanese.Add (new NonJISCharacter (cp, values [0]));
// Decomposition field: an optional "<type>" tag followed by the
// space-separated hex code points.
872 string decomp = values [4];
873 idx = decomp.IndexOf ('<');
875 switch (decomp.Substring (idx + 1, decomp.IndexOf ('>') - 1)) {
877 decompType [cp] = DecompositionFull;
880 decompType [cp] = DecompositionSub;
883 decompType [cp] = DecompositionSuper;
886 decompType [cp] = DecompositionSmall;
889 decompType [cp] = DecompositionIsolated;
892 decompType [cp] = DecompositionInitial;
895 decompType [cp] = DecompositionFinal;
898 decompType [cp] = DecompositionMedial;
901 decompType [cp] = DecompositionNoBreak;
904 decompType [cp] = DecompositionCompat;
907 decompType [cp] = DecompositionFraction;
910 decompType [cp] = DecompositionFont;
913 decompType [cp] = DecompositionCircle;
916 decompType [cp] = DecompositionSquare;
919 decompType [cp] = DecompositionWide;
922 decompType [cp] = DecompositionNarrow;
925 decompType [cp] = DecompositionVertical;
928 throw new Exception ("Support NFKD type : " + decomp);
932 decompType [cp] = DecompositionCanonical;
// Strip the "<type> " prefix (the tag plus the following space) if any.
933 decomp = idx < 0 ? decomp : decomp.Substring (decomp.IndexOf ('>') + 2);
934 if (decomp.Length > 0) {
936 string [] velems = decomp.Split (' ');
// This code point's decomposition starts at the current end of the list.
937 int didx = decompValues.Count;
938 decompIndex [cp] = didx;
939 foreach (string v in velems)
940 decompValues.Add (int.Parse (v, NumberStyles.HexNumber));
941 decompLength [cp] = velems.Length;
943 // [decmpType] -> this_cp
944 int targetCP = (int) decompValues [didx];
945 // for "(x)" it specially maps to 'x' .
946 // FIXME: check if it is sane
947 if (velems.Length == 3 &&
948 (int) decompValues [didx] == '(' &&
949 (int) decompValues [didx + 2] == ')')
950 targetCP = (int) decompValues [didx + 1];
951 // special: 0x215F "1/"
952 else if (cp == 0x215F)
954 else if (velems.Length > 1 &&
955 (targetCP < 0x4C00 || 0x9FBB < targetCP))
956 // skip them, except for CJK ideograph compat
960 Hashtable entry = (Hashtable) nfkdMap [targetCP];
962 entry = new Hashtable ();
963 nfkdMap [targetCP] = entry;
965 entry [(byte) decompType [cp]] = cp;
// Numeric value: the first non-empty of values [5], [6], [7] is used.
// NOTE(review): decimal.Parse is called without a format provider, so
// parsing follows the current culture -- confirm this is acceptable for
// the data file.
969 if (values [5].Length > 0)
970 decimalValue [cp] = decimal.Parse (values [5]);
971 else if (values [6].Length > 0)
972 decimalValue [cp] = decimal.Parse (values [6]);
973 else if (values [7].Length > 0) {
974 string decstr = values [7];
975 idx = decstr.IndexOf ('/');
976 if (cp == 0x215F) // special. "1/"
977 decimalValue [cp] = 0x1;
// Fractions "a/b" are evaluated; "(n)" and "n." lose their decoration.
981 decimal.Parse (decstr.Substring (0, idx))
982 / decimal.Parse (decstr.Substring (idx + 1));
983 else if (decstr [0] == '(' &&
984 decstr [decstr.Length - 1] == ')')
987 decimal.Parse (decstr.Substring (1, decstr.Length - 2));
988 else if (decstr [decstr.Length - 1] == '.')
991 decimal.Parse (decstr.Substring (0, decstr.Length - 1));
993 decimalValue [cp] = decimal.Parse (decstr);
// Reads the DerivedCoreProperties.txt file line by line, passing every
// line to ProcessDerivedCorePropLine. When an exception is caught the
// 1-based line number is reported on stderr.
997 void ParseDerivedCoreProperties (string filename)
1000 using (StreamReader file =
1001 new StreamReader (filename)) {
1002 for (int line = 1; file.Peek () >= 0; line++) {
1004 ProcessDerivedCorePropLine (file.ReadLine ());
1005 } catch (Exception) {
1006 Console.Error.WriteLine ("**** At line " + line);
// Processes one line of DerivedCoreProperties.txt. A data line has the form
// "XXXX ; property" or "XXXX..YYYY ; property"; text after '#' is dropped.
// For matching entries every code point of the range is flagged in
// isUppercase.
// NOTE(review): presumably only the "Uppercase" property is acted upon;
// confirm against the test on "value".
1013 void ProcessDerivedCorePropLine (string s)
1015 int idx = s.IndexOf ('#');
1017 s = s.Substring (0, idx);
1018 idx = s.IndexOf (';');
1021 string cpspec = s.Substring (0, idx);
1022 idx = cpspec.IndexOf ("..");
1023 NumberStyles nf = NumberStyles.HexNumber |
1024 NumberStyles.AllowTrailingWhite;
// A single code point is treated as a range of length one.
1025 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1026 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
1027 string value = s.Substring (cpspec.Length + 1).Trim ();
1030 if (cp > char.MaxValue)
1035 for (int x = cp; x <= cpEnd; x++)
1036 isUppercase [x] = true;
// Reads the Scripts.txt file and collects the non-ignorable characters of
// five scripts into separate lists, then sorts every list with
// UCAComparer.Instance and stores the results in orderedCyrillic,
// orderedGurmukhi, orderedGujarati, orderedGeorgian and orderedThaana.
// A data line has the form "XXXX ; script" or "XXXX..YYYY ; script"; text
// after '#' is dropped.
// NOTE(review): presumably each list is filled for the script of the same
// name; confirm against the selection on "value".
1041 void ParseScripts (string filename)
1043 ArrayList cyrillic = new ArrayList ();
1044 ArrayList gurmukhi = new ArrayList ();
1045 ArrayList gujarati = new ArrayList ();
1046 ArrayList georgian = new ArrayList ();
1047 ArrayList thaana = new ArrayList ();
1049 using (StreamReader file =
1050 new StreamReader (filename)) {
1051 while (file.Peek () >= 0) {
1052 string s = file.ReadLine ();
1053 int idx = s.IndexOf ('#');
1055 s = s.Substring (0, idx);
1056 idx = s.IndexOf (';');
1060 string cpspec = s.Substring (0, idx);
1061 idx = cpspec.IndexOf ("..");
1062 NumberStyles nf = NumberStyles.HexNumber |
1063 NumberStyles.AllowTrailingWhite;
// A single code point is treated as a range of length one.
1064 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1065 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
1066 string value = s.Substring (cpspec.Length + 1).Trim ();
1069 if (cp > char.MaxValue)
1074 for (int x = cp; x <= cpEnd; x++)
1075 if (!IsIgnorable (x))
1076 cyrillic.Add ((char) x);
1079 for (int x = cp; x <= cpEnd; x++)
1080 if (!IsIgnorable (x))
1081 gurmukhi.Add ((char) x);
1084 for (int x = cp; x <= cpEnd; x++)
1085 if (!IsIgnorable (x))
1086 gujarati.Add ((char) x);
1089 for (int x = cp; x <= cpEnd; x++)
1090 if (!IsIgnorable (x))
1091 georgian.Add ((char) x);
1094 for (int x = cp; x <= cpEnd; x++)
1095 if (!IsIgnorable (x))
1096 thaana.Add ((char) x);
// Order each script's characters with the UCA comparer before freezing
// them into arrays.
1101 cyrillic.Sort (UCAComparer.Instance);
1102 gurmukhi.Sort (UCAComparer.Instance);
1103 gujarati.Sort (UCAComparer.Instance);
1104 georgian.Sort (UCAComparer.Instance);
1105 thaana.Sort (UCAComparer.Instance);
1106 orderedCyrillic = (char []) cyrillic.ToArray (typeof (char));
1107 orderedGurmukhi = (char []) gurmukhi.ToArray (typeof (char));
1108 orderedGujarati = (char []) gujarati.ToArray (typeof (char));
1109 orderedGeorgian = (char []) georgian.ToArray (typeof (char));
1110 orderedThaana = (char []) thaana.ToArray (typeof (char));
// Reads the CP932 mapping table and appends one JISCharacter (Unicode code
// point, JIS code) to jisJapanese for every mapping line.
//
// A mapping line is expected to look like "0xJJJJ 0xUUUU"; text after '#'
// is a comment, and lines that are empty or hold no separator are skipped.
// NOTE(review): only a space is recognised as the column separator, so
// tab-separated lines are skipped entirely -- confirm against the layout
// of the CP932.TXT actually used.
void ParseJISOrder (string filename)
{
	using (StreamReader file =
		new StreamReader (filename)) {
		while (file.Peek () >= 0) {
			string s = file.ReadLine ();
			int idx = s.IndexOf ('#');
			if (idx >= 0)
				s = s.Substring (0, idx).Trim ();
			if (s.Length == 0)
				continue;
			idx = s.IndexOf (' ');
			if (idx < 0)
				continue;
			// Both columns start with "0x", so cut that out. The first
			// column ends right before the separator at idx, which makes
			// its digits idx - 2 characters long (not idx, which would run
			// into the second column).
			int jis = int.Parse (s.Substring (2, idx - 2), NumberStyles.HexNumber);
			// idx + 3 skips the separator and the second "0x".
			int cp = int.Parse (s.Substring (idx + 3).Trim (), NumberStyles.HexNumber);
			jisJapanese.Add (new JISCharacter (cp, jis));
		}
	}
}
// Fills the CJK weight tables from the given collation XML files. For the
// Chinese (pinyin and stroke) and the following rule set, the characters of
// the selected "pc" rule text receive consecutive values in the order they
// appear. For Korean, the ideographs attached to each "reset" Hangul
// syllable share a value derived from that syllable and get their
// individual value in cjkKOlv2.
// NOTE(review): which table each section writes to ("arr") and the
// starting values are assigned outside the lines shown here; verify before
// relying on the summary above.
1135 void ParseCJK (string zhXML, string jaXML, string koXML)
1137 XmlDocument doc = new XmlDocument ();
// No resolver: external references in the documents are not fetched.
1138 doc.XmlResolver = null;
1145 // Chinese Simplified
1148 offset = 0;//char.MaxValue - arr.Length;
1150 s = doc.SelectSingleNode ("/ldml/collations/collation[@type='pinyin']/rules/pc").InnerText;
1152 foreach (char c in s) {
1154 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1156 arr [(int) c - offset] = (ushort) v++;
1162 // Chinese Traditional
1165 offset = 0;//char.MaxValue - arr.Length;
1166 s = doc.SelectSingleNode ("/ldml/collations/collation[@type='stroke']/rules/pc").InnerText;
1168 foreach (char c in s) {
1170 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1172 arr [(int) c - offset] = (ushort) v++;
1181 offset = 0;//char.MaxValue - arr.Length;
1183 s = doc.SelectSingleNode ("/ldml/collations/collation/rules/pc").InnerText;
1185 foreach (char c in s) {
1187 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1189 arr [(int) c - offset] = (ushort) v++;
1196 // Korean weight is somewhat complex. It first shifts
1197 // Hangul category from 52-x to 80-x (they are anyways
1198 // computed). CJK ideographs are placed at secondary
1199 // weight, like XX YY 01 zz 01, where XX and YY are
1200 // corresponding "reset" value and zz is 41,43,45...
1202 // Unlike chs,cht and ja, Korean value is a combined
1203 // ushort which is computed as category
1207 offset = 0;//char.MaxValue - arr.Length;
1209 foreach (XmlElement reset in doc.SelectNodes ("/ldml/collations/collation/rules/reset")) {
// The element right after each "reset" holds the characters to place.
1210 XmlElement sc = (XmlElement) reset.NextSibling;
1211 // compute "category" and "level 1" for the
1212 // target "reset" Hangul syllable
1213 char rc = reset.InnerText [0];
// 1-based position of the syllable counted from U+AC00.
1214 int ri = ((int) rc - 0xAC00) + 1;
1216 ((ri / 254) * 256 + (ri % 254) + 2);
1217 // Place the characters after the target.
1220 foreach (char c in s) {
1221 arr [(int) c - offset] = p;
1222 cjkKOlv2 [(int) c - offset] = (byte) v;
// Fills ignorableFlags for every code point U+0000..U+FFFF as a bit set:
//   1 - IsIgnorable, 2 - IsIgnorableSymbol, 4 - IsIgnorableNonSpacing.
// NOTE(review): presumably unassigned code points (OtherNotAssigned) are
// skipped and keep a flag value of 0 -- confirm.
1232 void FillIgnorables ()
1234 for (int i = 0; i <= char.MaxValue; i++) {
1235 if (Char.GetUnicodeCategory ((char) i) ==
1236 UnicodeCategory.OtherNotAssigned)
1238 if (IsIgnorable (i))
1239 ignorableFlags [i] |= 1;
1240 if (IsIgnorableSymbol (i))
1241 ignorableFlags [i] |= 2;
1242 if (IsIgnorableNonSpacing (i))
1243 ignorableFlags [i] |= 4;
// Post-processes the tables filled by the parsers:
//  - numeric characters inside each [start, end) pair of
//    numberSecondaryWeightBounds get a per-pair weight in "diacritical";
//  - the decomposition data of U+FE31 and U+FE32 is cleared;
//  - U+3200..U+321C and U+3260..U+327B get fixed "diacritical" values;
//  - some entries of sortableCharNames are renamed, others removed.
1247 void ModifyParsedValues ()
1249 // number, secondary weights
1251 int [] numarr = numberSecondaryWeightBounds;
// The array holds pairs: inclusive start, exclusive end. The weight grows
// by one from pair to pair.
1252 for (int i = 0; i < numarr.Length; i += 2, weight++)
1253 for (int cp = numarr [i]; cp < numarr [i + 1]; cp++)
1254 if (Char.IsNumber ((char) cp))
1255 diacritical [cp] = weight;
1257 // Modify some decomposition equivalence
1258 decompType [0xFE31] = 0;
1259 decompIndex [0xFE31] = 0;
1260 decompLength [0xFE31] = 0;
1261 decompType [0xFE32] = 0;
1262 decompIndex [0xFE32] = 0;
1263 decompLength [0xFE32] = 0;
1265 // Korean parens numbers
1266 for (int i = 0x3200; i <= 0x321C; i++)
1267 diacritical [i] = 0xA;
1268 for (int i = 0x3260; i <= 0x327B; i++)
1269 diacritical [i] = 0xC;
1271 // Update name part of named characters
1272 for (int i = 0; i < sortableCharNames.Count; i++) {
1273 DictionaryEntry de =
1274 (DictionaryEntry) sortableCharNames [i];
1275 int cp = (int) de.Key;
1276 string renamed = null;
1278 case 0x2101: renamed = "A_1"; break;
1279 case 0x33C3: renamed = "A_2"; break;
1280 case 0x2105: renamed = "C_1"; break;
1281 case 0x2106: renamed = "C_2"; break;
1282 case 0x211E: renamed = "R1"; break;
1283 case 0x211F: renamed = "R2"; break;
1284 // Remove some of them!
// NOTE(review): removing while iterating by index shifts later entries
// down; confirm the index is adjusted after RemoveAt.
1295 sortableCharNames.RemoveAt (i);
// Entries with a replacement name are rewritten in place.
1299 if (renamed != null)
1300 sortableCharNames [i] =
1301 new DictionaryEntry (cp, renamed);
// Populates map[] group by group. Most entries are written through
// AddCharMap and its wrappers, which take fillIndex [category] as the next
// weight and leave ignorable or already-defined characters untouched, so
// the order of the sections below decides which assignment wins.
1305 void GenerateCore ()
1309 #region Specially ignored // 01
1310 // This will raise "Defined" flag up.
1311 foreach (char c in specialIgnore)
1312 map [(int) c] = new CharMapEntry (0, 0, 0);
1316 #region Variable weights
1317 // Controls : 06 03 - 06 3D
1319 for (int i = 0; i < 65536; i++) {
1320 if (IsIgnorable (i))
1323 uc = Char.GetUnicodeCategory (c);
1324 // NEL is whitespace but not ignored here.
// && binds tighter than ||, so U+0085 is added regardless of the
// category / whitespace tests.
1325 if (uc == UnicodeCategory.Control &&
1326 !Char.IsWhiteSpace (c) || c == '\u0085')
1327 AddCharMap (c, 6, 1);
1331 fillIndex [6] = 0x80;
1332 AddCharMapGroup ('\'', 6, 1, 0);
1333 AddCharMap ('\uFE63', 6, 1);
1335 // Hyphen/Dash : 06 81 - 06 90
// The bound is "< char.MaxValue", so U+FFFF itself is not visited.
1336 for (int i = 0; i < char.MaxValue; i++) {
1337 if (!IsIgnorable (i) &&
1338 Char.GetUnicodeCategory ((char) i) ==
1339 UnicodeCategory.DashPunctuation) {
1340 AddCharMapGroup2 ((char) i, 6, 1, 0);
1342 // SPECIAL: add 2027 and 2043
1343 // Maybe they are regarded the
1344 // same hyphens in "central"
1346 AddCharMap ('\u2027', 6, 1);
1347 AddCharMap ('\u2043', 6, 1);
1352 // Arabic variable weight chars 06 A0 -
1353 fillIndex [6] = 0xA0;
1355 for (int i = 0x64B; i <= 0x650; i++)
1356 AddArabicCharMap ((char) i);
1358 AddCharMapGroup ('\u0652', 6, 1, 0);
1360 AddCharMapGroup ('\u0651', 6, 1, 0);
1364 #region Nonspacing marks // 01
1365 // FIXME: 01 03 - 01 B6 ... annoyance :(
1367 // Combining diacritical marks: 01 DC -
// For category 1, AddCharMap stores fillIndex [1] as the third
// CharMapEntry argument and "alt" as the second.
1369 fillIndex [0x1] = 0x41;
1370 for (int i = 0x030E; i <= 0x0326; i++)
1371 if (!IsIgnorable (i))
1372 AddCharMap ((char) i, 0x1, 1);
1373 for (int i = 0x0329; i <= 0x0334; i++)
1374 if (!IsIgnorable (i))
1375 AddCharMap ((char) i, 0x1, 1);
1376 for (int i = 0x0339; i <= 0x0341; i++)
1377 if (!IsIgnorable (i))
1378 AddCharMap ((char) i, 0x1, 1);
1379 fillIndex [0x1] = 0x72;
1380 for (int i = 0x0346; i <= 0x0348; i++)
1381 if (!IsIgnorable (i))
1382 AddCharMap ((char) i, 0x1, 1);
1383 for (int i = 0x02BE; i <= 0x02BF; i++)
1384 if (!IsIgnorable (i))
1385 AddCharMap ((char) i, 0x1, 1);
1386 for (int i = 0x02C1; i <= 0x02C5; i++)
1387 if (!IsIgnorable (i))
1388 AddCharMap ((char) i, 0x1, 1);
1389 for (int i = 0x02CE; i <= 0x02CF; i++)
1390 if (!IsIgnorable (i))
1391 AddCharMap ((char) i, 0x1, 1);
1392 for (int i = 0x02D1; i <= 0x02D3; i++)
1393 if (!IsIgnorable (i))
1394 AddCharMap ((char) i, 0x1, 1);
1395 AddCharMap ('\u02DE', 0x1, 1);
1396 for (int i = 0x02E4; i <= 0x02E9; i++)
1397 if (!IsIgnorable (i))
1398 AddCharMap ((char) i, 0x1, 1);
1400 // LAMESPEC: It should not stop at '\u20E1'. There are
1401 // a few more characters (that however results in
1402 // overflow of level 2 unless we start before 0xDD).
1403 fillIndex [0x1] = 0xDC;
1404 for (int i = 0x20d0; i <= 0x20e1; i++)
1405 AddCharMap ((char) i, 0x1, 1);
1409 #region Whitespaces // 07 03 -
1410 fillIndex [0x7] = 0x2;
1411 AddCharMap (' ', 0x7, 2);
1412 AddCharMap ('\u00A0', 0x7, 1);
1413 for (int i = 9; i <= 0xD; i++)
1414 AddCharMap ((char) i, 0x7, 1);
1415 for (int i = 0x2000; i <= 0x200B; i++)
1416 AddCharMap ((char) i, 0x7, 1);
1418 fillIndex [0x7] = 0x17;
1419 AddCharMapGroup ('\u2028', 0x7, 1, 0);
1420 AddCharMapGroup ('\u2029', 0x7, 1, 0);
1422 // Characters which used to represent layout control.
1423 // LAMESPEC: Windows developers seem to have thought
1424 // that those characters are kind of whitespaces,
1425 // while they aren't.
1426 AddCharMap ('\u2422', 0x7, 1, 0); // blank symbol
1427 AddCharMap ('\u2423', 0x7, 1, 0); // open box
1430 // FIXME: 09 should be more complete.
1431 fillIndex [0x9] = 2;
1433 for (int cp = 0x2300; cp <= 0x237A; cp++)
1434 AddCharMap ((char) cp, 0x9, 1, 0);
1437 byte [] arrowLv2 = new byte [] {0, 3, 3, 3, 3, 3, 3, 3, 3};
// Each arrow gets level 1 = 0xD8 + idx and level 2 = arrowLv2 [idx].
1438 foreach (DictionaryEntry de in arrowValues) {
1439 int idx = (int) de.Value;
1440 int cp = (int) de.Key;
1441 if (map [cp].Defined)
1443 fillIndex [0x9] = (byte) (0xD8 + idx);
1444 AddCharMapGroup ((char) cp, 0x9, 0, arrowLv2 [idx]);
1448 byte [] boxLv2 = new byte [128];
1449 for (int i = 0; i < boxLv2.Length; i++)
// Each box-drawing character gets level 1 = 0xE5 + idx and level 2 = boxLv2 [idx].
1451 foreach (DictionaryEntry de in boxValues) {
1452 int cp = (int) de.Key;
1453 int idx = (int) de.Value;
1454 if (map [cp].Defined)
1456 fillIndex [0x9] = (byte) (0xE5 + idx);
1457 AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [idx]);
1460 // Some special characters (slanted)
1461 fillIndex [0x9] = 0xF4;
1462 AddCharMap ('\u2571', 0x9, 3);
1463 AddCharMap ('\u2572', 0x9, 3);
1464 AddCharMap ('\u2573', 0x9, 3);
1466 // FIXME: implement 0A
1468 fillIndex [0xA] = 2;
1469 // byte currency symbols
1470 for (int cp = 0; cp < 0x100; cp++) {
1471 uc = Char.GetUnicodeCategory ((char) cp);
1472 if (!IsIgnorable (cp) &&
1473 uc == UnicodeCategory.CurrencySymbol &&
1475 AddCharMapGroup ((char) cp, 0xA, 1, 0);
1477 // byte other symbols
1478 for (int cp = 0; cp < 0x100; cp++) {
1480 continue; // SPECIAL: skip FIXME: why?
1481 uc = Char.GetUnicodeCategory ((char) cp);
1482 if (!IsIgnorable (cp) &&
1483 uc == UnicodeCategory.OtherSymbol)
1484 AddCharMapGroup ((char) cp, 0xA, 1, 0);
1487 fillIndex [0xA] = 0x2F; // FIXME: it won't be needed
1488 for (int cp = 0x2600; cp <= 0x2613; cp++)
1489 AddCharMap ((char) cp, 0xA, 1, 0);
1491 for (int cp = 0x2620; cp <= 0x2770; cp++)
1492 if (Char.IsSymbol ((char) cp))
1493 AddCharMap ((char) cp, 0xA, 1, 0);
1495 for (int i = 0x2440; i < 0x2460; i++)
1496 AddCharMap ((char) i, 0xA, 1, 0);
1500 #region Numbers // 0C 02 - 0C E1
1501 fillIndex [0xC] = 2;
1503 // 9F8 : Bengali "one less than the denominator"
1504 AddCharMap ('\u09F8', 0xC, 1);
1506 ArrayList numbers = new ArrayList ();
1507 for (int i = 0; i < 65536; i++)
1508 if (!IsIgnorable (i) &&
1509 Char.IsNumber ((char) i) &&
1510 (i < 0x3190 || 0x32C0 < i)) // they are CJK characters
1513 ArrayList numberValues = new ArrayList ();
// Pair every collected number with its decimalValue entry, then order the
// pairs with DecimalDictionaryValueComparer.
1514 foreach (int i in numbers)
1515 numberValues.Add (new DictionaryEntry (i, decimalValue [(char) i]));
1516 numberValues.Sort (DecimalDictionaryValueComparer.Instance);
1518 //foreach (DictionaryEntry de in numberValues)
1519 //Console.Error.WriteLine ("****** number {0:X04} : {1} {2}", de.Key, de.Value, decompType [(int) de.Key]);
1521 decimal prevValue = -1;
1522 foreach (DictionaryEntry de in numberValues) {
1523 int cp = (int) de.Key;
1524 decimal currValue = (decimal) de.Value;
1525 bool addnew = false;
1526 if (prevValue < currValue &&
1527 prevValue - (int) prevValue == 0 &&
1531 // Process Hangzhou and Roman numbers
1533 // There are some SPECIAL cases.
1534 if (currValue != 4) // no increment for 4
1538 xcp = (int) prevValue + 0x2170 - 1;
1539 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1540 xcp = (int) prevValue + 0x2160 - 1;
1541 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1542 fillIndex [0xC] += 2;
1543 xcp = (int) prevValue + 0x3021 - 1;
1544 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1547 if (prevValue < currValue)
1548 prevValue = currValue;
1549 if (map [cp].Defined)
1551 // HangZhou and Roman are added later
1553 else if (0x3021 <= cp && cp < 0x302A
1554 || 0x2160 <= cp && cp < 0x216A
1555 || 0x2170 <= cp && cp < 0x217A)
1558 if (cp == 0x215B) // FIXME: why?
1559 fillIndex [0xC] += 2;
1560 else if (cp == 0x3021) // FIXME: why?
1562 AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp]);
1564 if (addnew || cp <= '9') {
// The offsets below are taken relative to '1' (0x31).
1566 if (1 <= currValue && currValue <= 10) {
1567 xcp = cp - 0x31 + 0x2776;
1568 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1569 xcp = cp - 0x31 + 0x2780;
1570 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1571 xcp = cp - 0x31 + 0x278A;
1572 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1574 if (1 <= currValue && currValue <= 20) {
1575 xcp = cp - 0x31 + 0x2460;
1576 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1577 xcp = cp - 0x31 + 0x2474;
1578 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1579 xcp = cp - 0x31 + 0x2488;
1580 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1584 if (cp != 0x09E7 && cp != 0x09EA)
1587 // Add special cases that are not regarded as
1588 // numbers in UnicodeCategory speak.
1591 AddCharMapGroup ('\u01BD', 0xC, 0, 0);
1592 AddCharMapGroup ('\u01BC', 0xC, 1, 0);
1594 else if (cp == '6') // FIXME: why?
1599 fillIndex [0xC] = 0xFF;
1600 AddCharMap ('\u221E', 0xC, 1);
1603 #region Letters and NonSpacing Marks (general)
1605 // ASCII Latin alphabets
1606 for (int i = 0; i < alphabets.Length; i++)
1607 AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]);
1610 // non-ASCII Latin alphabets
1611 // FIXME: there is no such characters that are placed
1612 // *after* "alphabets" array items. This is nothing
1613 // more than a hack that creates dummy weight for
1614 // primary characters.
1615 for (int i = 0x0080; i < 0x0300; i++) {
1616 if (!Char.IsLetter ((char) i))
1618 // For those Latin Letters which has NFKD are
1619 // not added as independent primary character.
1620 if (decompIndex [i] != 0)
1623 // 1.some alphabets have primarily
1624 // equivalent ASCII alphabets.
1625 // 2.some have independent primary weights,
1626 // but inside a-to-z range.
1627 // 3.there are some expanded characters that
1628 // are not part of Unicode Standard NFKD.
1630 // 1. skipping them does not make sense
1631 // case 0xD0: case 0xF0: case 0x131: case 0x138:
1632 // case 0x184: case 0x185: case 0x186: case 0x189:
1633 // case 0x18D: case 0x18E: case 0x18F: case 0x190:
1634 // case 0x194: case 0x195: case 0x196: case 0x19A:
1635 // case 0x19B: case 0x19C:
1636 // 2. skipping them does not make sense
1637 // case 0x14A: // Ng
1638 // case 0x14B: // ng
1642 case 0xDE: // Icelandic Thorn
1643 case 0xFE: // Icelandic Thorn
1644 case 0xDF: // German ss
1645 case 0xFF: // German ss
1646 // not classified yet
1647 // case 0x1A6: case 0x1A7: case 0x1A8: case 0x1A9:
1648 // case 0x1AA: case 0x1B1: case 0x1B7: case 0x1B8:
1649 // case 0x1B9: case 0x1BA: case 0x1BB: case 0x1BF:
1650 // case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3:
1654 AddCharMapGroup ((char) i, 0xE, 1, 0);
1658 fillIndex [0xF] = 02;
1659 for (int i = 0x0380; i < 0x0390; i++)
1660 if (Char.IsLetter ((char) i))
1661 AddLetterMap ((char) i, 0xF, 1);
1662 fillIndex [0xF] = 02;
1663 for (int i = 0x0391; i < 0x03CF; i++)
1664 if (Char.IsLetter ((char) i))
1665 AddLetterMap ((char) i, 0xF, 1);
1666 fillIndex [0xF] = 0x40;
1667 for (int i = 0x03D0; i < 0x0400; i++)
1668 if (Char.IsLetter ((char) i))
1669 AddLetterMap ((char) i, 0xF, 1);
1671 // Cyrillic - UCA order w/ some modification
1672 fillIndex [0x10] = 0x3;
1673 // table which is mostly from UCA DUCET.
1674 for (int i = 0; i < orderedCyrillic.Length; i++) {
1675 char c = orderedCyrillic [i];
1676 if (Char.IsLetter (c))
1677 AddLetterMap (c, 0x10, 3);
1679 for (int i = 0x0460; i < 0x0481; i++) {
1680 if (Char.IsLetter ((char) i))
1681 AddLetterMap ((char) i, 0x10, 3);
1685 fillIndex [0x11] = 0x3;
1686 for (int i = 0x0531; i < 0x0586; i++)
1687 if (Char.IsLetter ((char) i))
1688 AddLetterMap ((char) i, 0x11, 1);
1692 fillIndex [0x12] = 0x3;
1693 for (int i = 0x05D0; i < 0x05FF; i++)
1694 if (Char.IsLetter ((char) i))
1695 AddLetterMap ((char) i, 0x12, 1);
1697 fillIndex [0x1] = 0x3;
1698 for (int i = 0x0591; i <= 0x05C2; i++)
1700 AddCharMap ((char) i, 0x1, 1);
1703 fillIndex [0x1] = 0x8E;
1704 fillIndex [0x13] = 0x3;
1705 for (int i = 0x0621; i <= 0x064A; i++) {
1707 if (Char.GetUnicodeCategory ((char) i)
1708 != UnicodeCategory.OtherLetter) {
1709 // FIXME: arabic nonspacing marks are
1710 // in different order.
1711 AddCharMap ((char) i, 0x1, 1);
1714 // map [i] = new CharMapEntry (0x13,
1715 // (byte) arabicLetterPrimaryValues [i], 1);
1717 (byte) arabicLetterPrimaryValues [i];
1718 AddLetterMap ((char) i, 0x13, 0);
1720 fillIndex [0x13] = 0x84;
1721 for (int i = 0x0674; i < 0x06D6; i++)
1722 if (Char.IsLetter ((char) i))
1723 AddLetterMap ((char) i, 0x13, 1);
1726 // FIXME: it does seem straight codepoint mapping.
1727 fillIndex [0x14] = 04;
1728 for (int i = 0x0901; i < 0x0905; i++)
1729 if (!IsIgnorable (i))
1730 AddLetterMap ((char) i, 0x14, 2);
1731 fillIndex [0x14] = 0xB;
1732 for (int i = 0x0905; i < 0x093A; i++)
1733 if (Char.IsLetter ((char) i))
1734 AddLetterMap ((char) i, 0x14, 4);
1735 for (int i = 0x093E; i < 0x094F; i++)
1736 if (!IsIgnorable (i))
1737 AddLetterMap ((char) i, 0x14, 2);
1741 fillIndex [0x15] = 02;
1742 for (int i = 0x0980; i < 0x9FF; i++) {
1743 if (IsIgnorable (i))
1746 fillIndex [0x15] = 0x3B;
1747 switch (Char.GetUnicodeCategory ((char) i)) {
1748 case UnicodeCategory.NonSpacingMark:
1749 case UnicodeCategory.DecimalDigitNumber:
1750 case UnicodeCategory.OtherNumber:
1753 AddLetterMap ((char) i, 0x15, 1);
// fillIndex [0x1] is reset to 0x3 here (as it was before the
// U+0591..U+05C2 loop above), so these marks are numbered from 3 again.
1756 fillIndex [0x1] = 0x3;
1757 for (int i = 0x0981; i < 0x0A00; i++)
1758 if (Char.GetUnicodeCategory ((char) i) ==
1759 UnicodeCategory.NonSpacingMark)
1760 AddCharMap ((char) i, 0x1, 1);
1762 // Gurmukhi. orderedGurmukhi is from UCA
1763 // FIXME: it does not look equivalent to UCA.
1764 fillIndex [0x1] = 03;
1765 fillIndex [0x16] = 02;
1766 for (int i = 0; i < orderedGurmukhi.Length; i++) {
1767 char c = orderedGurmukhi [i];
1768 if (IsIgnorable ((int) c))
1770 if (!Char.IsLetter (c)) {
1771 AddLetterMap (c, 0x1, 1);
1774 if (c == '\u0A3C' || c == '\u0A4D' ||
1775 '\u0A66' <= c && c <= '\u0A71')
1777 AddLetterMap (c, 0x16, 4);
1780 // Gujarati. orderedGujarati is from UCA
1781 fillIndex [0x17] = 02;
1782 for (int i = 0; i < orderedGujarati.Length; i++)
1783 AddLetterMap (orderedGujarati [i], 0x17, 4);
1786 fillIndex [0x18] = 02;
1787 for (int i = 0x0B00; i < 0x0B7F; i++) {
1788 switch (Char.GetUnicodeCategory ((char) i)) {
1789 case UnicodeCategory.NonSpacingMark:
1790 case UnicodeCategory.DecimalDigitNumber:
1793 AddLetterMap ((char) i, 0x18, 1);
1797 fillIndex [0x19] = 2;
1798 AddCharMap ('\u0BD7', 0x19, 0);
1799 fillIndex [0x19] = 0xA;
// NOTE(review): the start value 0x0BD7 is greater than the bound 0x0B94,
// so this loop body never runs as written. The intended start of the
// range is not visible here - confirm and correct the lower bound.
1801 for (int i = 0x0BD7; i < 0x0B94; i++)
1802 if (Char.IsLetter ((char) i))
1803 AddCharMap ((char) i, 0x19, 2);
1805 fillIndex [0x19] = 0x24;
1806 AddCharMap ('\u0B94', 0x19, 0);
1807 fillIndex [0x19] = 0x26;
1808 // The array for Tamil consonants is a constant.
1809 // Windows have almost similar sequence to TAM from
1810 // tamilnet but a bit different in Grantha.
1811 for (int i = 0; i < orderedTamilConsonants.Length; i++)
1812 AddLetterMap (orderedTamilConsonants [i], 0x19, 4);
1814 fillIndex [0x19] = 0x82;
1815 for (int i = 0x0BBE; i < 0x0BCD; i++)
1816 if (Char.GetUnicodeCategory ((char) i) ==
1817 UnicodeCategory.SpacingCombiningMark
1819 AddLetterMap ((char) i, 0x19, 2);
1822 fillIndex [0x1A] = 0x4;
1823 for (int i = 0x0C00; i < 0x0C62; i++) {
1824 if (i == 0x0C55 || i == 0x0C56)
1826 AddCharMap ((char) i, 0x1A, 3);
// U+0C60 / U+0C61 are placed right after U+0C0B / U+0C0C respectively.
1827 char supp = (i == 0x0C0B) ? '\u0C60':
1828 i == 0x0C0C ? '\u0C61' : char.MinValue;
1829 if (supp == char.MinValue)
1831 AddCharMap (supp, 0x1A, 3);
1835 fillIndex [0x1B] = 4;
1836 for (int i = 0x0C80; i < 0x0CE5; i++) {
1837 if (i == 0x0CD5 || i == 0x0CD6)
1839 AddCharMap ((char) i, 0x1B, 3);
1843 fillIndex [0x1C] = 2;
1844 for (int i = 0x0D02; i < 0x0D61; i++)
1845 // FIXME: I avoided MSCompatUnicodeTable usage
1846 // here (it results in recursion). So check if
1847 // using NonSpacingMark makes sense or not.
1848 if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark)
1849 // if (!MSCompatUnicodeTable.IsIgnorable ((char) i))
1850 AddCharMap ((char) i, 0x1C, 1);
1852 // Thai ... note that it breaks 0x1E wall after E2B!
1853 // Also, all Thai characters have level 2 value 3.
1854 fillIndex [0x1E] = 2;
1855 for (int i = 0xE44; i < 0xE48; i++)
1856 AddCharMap ((char) i, 0x1E, 1, 3);
1857 for (int i = 0xE01; i < 0xE2B; i++)
1858 AddCharMap ((char) i, 0x1E, 6, 0);
1859 fillIndex [0x1F] = 5;
1860 for (int i = 0xE2B; i < 0xE30; i++)
1861 AddCharMap ((char) i, 0x1F, 6, 0);
1862 for (int i = 0xE30; i < 0xE3B; i++)
1863 AddCharMap ((char) i, 0x1F, 1, 3);
1864 // some Thai characters remains.
1865 char [] specialThai = new char [] {'\u0E45', '\u0E46',
1866 '\u0E4E', '\u0E4F', '\u0E5A', '\u0E5B'};
1867 foreach (char c in specialThai)
1868 AddCharMap (c, 0x1F, 1);
1871 fillIndex [0x1F] = 2;
1872 for (int i = 0xE80; i < 0xEDF; i++)
1873 if (Char.IsLetter ((char) i))
1874 AddCharMap ((char) i, 0x1F, 1);
1876 // Georgian. orderedGeorgian is from UCA DUCET.
1877 fillIndex [0x21] = 5;
1878 for (int i = 0; i < orderedGeorgian.Length; i++)
1879 AddLetterMap (orderedGeorgian [i], 0x21, 5);
1882 fillIndex [0x22] = 2;
1883 int kanaOffset = 0x3041;
// kanaLines [gyo] is passed to AddKanaMap as the "voices" count and is
// also the code point stride between consecutive "dan" columns.
1884 byte [] kanaLines = new byte [] {2, 2, 2, 2, 1, 3, 1, 2, 1};
1886 for (int gyo = 0; gyo < 9; gyo++) {
1887 for (int dan = 0; dan < 5; dan++) {
1888 if (gyo == 7 && dan % 2 == 1) {
1891 kanaOffset -= 2; // There is no space for yi and ye.
1894 int cp = kanaOffset + dan * kanaLines [gyo];
1895 // small lines (a-gyo, ya-gyo)
1896 if (gyo == 0 || gyo == 7) {
1897 AddKanaMap (cp, 1); // small
1898 AddKanaMap (cp + 1, 1);
1901 AddKanaMap (cp, kanaLines [gyo]);
1905 // add small 'Tsu' (before normal one)
1906 AddKanaMap (0x3063, 1);
1910 fillIndex [0x22] += 3;
1911 kanaOffset += 5 * kanaLines [gyo];
1914 // Wa-gyo is almost special, so I just manually add.
// Each pair below adds a character and the one 0x60 code points above it.
1915 AddLetterMap ((char) 0x308E, 0x22, 0);
1916 AddLetterMap ((char) (0x308E + 0x60), 0x22, 0);
1917 AddLetterMap ((char) 0x308F, 0x22, 0);
1918 AddLetterMap ((char) (0x308F + 0x60), 0x22, 0);
1920 AddLetterMap ((char) 0x3090, 0x22, 0);
1921 AddLetterMap ((char) (0x3090 + 0x60), 0x22, 0);
1922 fillIndex [0x22] += 2;
1923 // no "Wu" in Japanese.
1924 AddLetterMap ((char) 0x3091, 0x22, 0);
1925 AddLetterMap ((char) (0x3091 + 0x60), 0x22, 0);
1927 AddLetterMap ((char) 0x3092, 0x22, 0);
1928 AddLetterMap ((char) (0x3092 + 0x60), 0x22, 0);
1930 fillIndex [0x22] = 0x80;
1931 AddLetterMap ((char) 0x3093, 0x22, 0);
1932 AddLetterMap ((char) (0x3093 + 0x60), 0x22, 0);
1934 // JIS Japanese square chars.
1935 fillIndex [0x22] = 0x97;
1936 jisJapanese.Sort (JISComparer.Instance);
1937 foreach (JISCharacter j in jisJapanese)
1938 AddCharMap ((char) j.CP, 0x22, 1);
1939 // non-JIS Japanese square chars.
1940 nonJisJapanese.Sort (NonJISComparer.Instance);
1941 foreach (NonJISCharacter j in nonJisJapanese)
1942 AddCharMap ((char) j.CP, 0x22, 1);
1945 fillIndex [0x23] = 0x02;
1946 for (int i = 0x3105; i <= 0x312C; i++)
1947 AddCharMap ((char) i, 0x23, 1);
1949 // Estrangela: ancient Syriac
1950 fillIndex [0x24] = 0x0B;
1951 // FIXME: is 0x71E really alternative form?
1952 ArrayList syriacAlternatives = new ArrayList (
1953 new int [] {0x714, 0x716, 0x71C, 0x71E, 0x724, 0x727});
1954 for (int i = 0x0710; i <= 0x072C; i++) {
1955 if (i == 0x0711) // NonSpacingMark
1957 if (syriacAlternatives.Contains (i))
1959 AddCharMap ((char) i, 0x24, 4);
// Alternative forms are written directly, with a level 1 value derived
// from the preceding code point's entry plus 2.
1964 foreach (int cp in syriacAlternatives)
1965 map [cp] = new CharMapEntry (0x24,
1966 (byte) (map [cp - 1].Level1 + 2),
1970 // FIXME: it turned out that it does not look like UCA
1971 fillIndex [0x24] = 0x6E;
// NOTE(review): IsIgnorableNonSpacing is called with the array index i,
// not with the code point orderedThaana [i] that is added below; this
// looks unintended - confirm.
1972 for (int i = 0; i < orderedThaana.Length; i++) {
1973 if (IsIgnorableNonSpacing (i))
1975 AddCharMap (orderedThaana [i], 0x24, 2);
1979 // FIXME: Add more culture-specific letters (that are
1980 // not supported in Windows collation) here.
1982 // Surrogate ... they are computed.
1987 // Unlike UCA Windows Hangul sequence mixes Jongseong
1988 // with Choseong sequence as well as Jungseong,
1989 // adjusted to have the same primary weight for the
1990 // same base character. So it is impossible to compute
1993 // Here I introduce an ordered sequence of mixed
1994 // 'commands' and 'characters' that is similar to
1996 // - ',' increases primary weight.
1997 // - [A B] means a range, increasing index
1998 // - {A B} means a range, without increasing index
1999 // - '=' is no operation (it means the characters
2000 // of both sides have the same weight).
2001 // - '>' inserts a Hangul Syllable block that
2002 // contains 0x251 characters.
2003 // - '<' decreases the index
2004 // - '0'-'9' means skip count
2005 // - whitespaces are ignored
2008 string hangulSequence =
2009 + "\u1100=\u11A8 > \u1101=\u11A9 >"
2010 + "\u11C3, \u11AA, \u11C4, \u1102=\u11AB >"
2011 + "<{\u1113 \u1116}, \u3165,"
2012 + "\u11C5, \u11C6=\u3166,, \u11C7, \u11C8,"
2013 + "\u11AC, \u11C9, \u11AD, \u1103=\u11AE >"
2014 + "<\u1117, \u11CA, \u1104, \u11CB > \u1105 >"
2015 + "<{\u1118 \u111B}, \u11B0, [\u11CC \u11D0], \u11B1,"
2016 + "[\u11D1 \u11D2], \u11B2,"
2017 + "[\u11D3 \u11D5], \u11B3,"
2018 + "[\u11D6 \u11D7], \u11B4, \u11B5,"
2019 + "\u11B6=\u11D8, \u3140,, \u11D9, \u1106=\u11B7 >"
2020 + "<{\u111C \u111D}, [\u11DA \u11E2], \u1107=\u11B8 >"
2021 + "<{\u111E \u1120}, \u3172,, \u3173, \u11E3, \u1108 >"
2022 + "<{\u1121 \u112C}, \u11B9,,,,,,,,, [\u11E4 \u11E6],,"
2023 + "\u1109=\u11BA,,, \u3214=\u3274 <>"
2024 + "<{\u112D \u1133}, \u11E7,, [\u11E8 \u11E9],,"
2025 + "\u11EA,, \u110A=\u11BB,,, >"
2026 + "<{\u1134 \u1140}, \u317E,,,,,,, \u11EB,"
2027 + "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >"
2028 + "<{\u1141 \u114C}, \u11EE, \u11EC, \u11ED,,,,, "
2029 + "\u11F1,, \u11F2,,,"
2030 + "\u11EF,,, \u11F0, \u110C=\u11BD,, >"
2031 + "<\u114D, \u110D,, >"
2032 + "<{\u114E \u1151},, \u110E=\u11BE,, >"
2033 + "<{\u1152 \u1155},,, \u110F=\u11BF >"
2034 + "\u1110=\u11C0 > \u1111=\u11C1 >"
2035 + "<\u1156=\u1157, \u11F3, \u11F4, \u1112=\u11C2 >"
2036 + "<\u1158=\u1159=\u115F, \u3185, \u11F9,"
2040 byte hangulCat = 0x52;
2041 fillIndex [hangulCat] = 0x2;
2043 int syllableBlock = 0;
// Interpret hangulSequence one character at a time. hangulCat is passed
// by reference to IncrementSequentialIndex, which may change it.
2044 for (int n = 0; n < hangulSequence.Length; n++) {
2045 char c = hangulSequence [n];
2047 if (Char.IsWhiteSpace (c))
2053 IncrementSequentialIndex (ref hangulCat);
2056 if (fillIndex [hangulCat] == 2)
2057 throw new Exception ("FIXME: handle it correctly (yes it is hacky, it is really unfortunate).");
2058 fillIndex [hangulCat]--;
2061 IncrementSequentialIndex (ref hangulCat);
// One syllable block is 0x15 * 0x1C code points starting at
// 0xAC00 + syllableBlock * 0x1C * 0x15.
2062 for (int l = 0; l < 0x15; l++)
2063 for (int v = 0; v < 0x1C; v++) {
2065 (char) (0xAC00 + syllableBlock * 0x1C * 0x15 + l * 0x1C + v), hangulCat, 0);
2066 IncrementSequentialIndex (ref hangulCat);
// Range operands are read from positions n + 1 and n + 3 of the sequence.
2071 start = hangulSequence [n + 1];
2072 end = hangulSequence [n + 3];
2073 for (int i = start; i <= end; i++) {
2074 AddCharMap ((char) i, hangulCat, 0);
2076 IncrementSequentialIndex (ref hangulCat);
2078 n += 4; // consumes 5 characters for this operation
2081 start = hangulSequence [n + 1];
2082 end = hangulSequence [n + 3];
2083 for (int i = start; i <= end; i++)
2084 AddCharMap ((char) i, hangulCat, 0);
2085 n += 4; // consumes 5 characters for this operation
2088 AddCharMap (c, hangulCat, 0);
2095 // Letterlike characters and CJK compatibility square
2096 sortableCharNames.Sort (StringDictionaryValueComparer.Instance);
2097 int [] counts = new int ['Z' - 'A' + 1];
2098 char [] namedChars = new char [sortableCharNames.Count];
// First pass: count the names per initial letter (indexed by letter - 'A')
// and collect the code points in sorted order.
2100 foreach (DictionaryEntry de in sortableCharNames) {
2101 counts [((string) de.Value) [0] - 'A']++;
2102 namedChars [nCharNames++] = (char) ((int) de.Key);
2104 nCharNames = 0; // reset
// Second pass: the counts [a] named characters for letter a are given the
// counts [a] weights that end just below alphaWeights [a + 1].
// NOTE(review): for a == 25 this reads alphaWeights [26]; confirm the
// array has an entry past 'Z'.
2105 for (int a = 0; a < counts.Length; a++) {
2106 fillIndex [0xE] = (byte) (alphaWeights [a + 1] - counts [a]);
2107 for (int i = 0; i < counts [a]; i++)
2108 //Console.Error.WriteLine ("---- {0:X04} : {1:x02} / {2} {3}", (int) namedChars [nCharNames], fillIndex [0xE], ((DictionaryEntry) sortableCharNames [nCharNames]).Value, Char.GetUnicodeCategory (namedChars [nCharNames]));
2109 AddCharMap (namedChars [nCharNames++], 0xE, 1);
2112 // CJK unified ideograph.
2114 fillIndex [cjkCat] = 0x2;
2115 for (int cp = 0x4E00; cp <= 0x9FBB; cp++)
2116 if (!IsIgnorable (cp))
2117 AddCharMapGroupCJK ((char) cp, ref cjkCat);
2118 // CJK Extensions goes here.
2119 // LAMESPEC: With this Windows style CJK layout, it is
2120 // impossible to add more CJK ideograph i.e. 0x9FA6-
2121 // 0x9FBB can never be added w/o breaking compat.
2122 for (int cp = 0xF900; cp <= 0xFA2D; cp++)
2123 if (!IsIgnorable (cp))
2124 AddCharMapGroupCJK ((char) cp, ref cjkCat);
2126 // PrivateUse ... computed.
2127 // remaining Surrogate ... computed.
2129 #region Special "biggest" area (FF FF)
2130 fillIndex [0xFF] = 0xFF;
2131 char [] specialBiggest = new char [] {
2132 '\u3005', '\u3031', '\u3032', '\u309D',
2133 '\u309E', '\u30FC', '\u30FD', '\u30FE',
2134 '\uFE7C', '\uFE7D', '\uFF70'};
2135 foreach (char c in specialBiggest)
2136 AddCharMap (c, 0xFF, 0);
2139 #region 07 - ASCII non-alphanumeric + 3001, 3002 // 07
2140 // non-alphanumeric ASCII except for: + - < = > '
2141 for (int i = 0x21; i < 0x7F; i++) {
2142 if (Char.IsLetterOrDigit ((char) i)
2143 || "+-<=>'".IndexOf ((char) i) >= 0)
2144 continue; // they are not added here.
2145 AddCharMapGroup2 ((char) i, 0x7, 1, 0);
2146 // Insert 3001 after ',' and 3002 after '.'
2148 AddCharMapGroup2 ('\u3001', 0x7, 1, 0);
2149 else if (i == 0x2E) {
2151 AddCharMapGroup2 ('\u3002', 0x7, 1, 0);
2154 AddCharMap ('\uFE30', 0x7, 1, 0);
2158 #region 07 - Punctuations and something else
2159 for (int i = 0xA0; i < char.MaxValue; i++) {
2160 if (IsIgnorable (i))
2172 switch (Char.GetUnicodeCategory ((char) i)) {
2173 case UnicodeCategory.OtherPunctuation:
2174 case UnicodeCategory.ClosePunctuation:
2175 case UnicodeCategory.OpenPunctuation:
2176 case UnicodeCategory.InitialQuotePunctuation:
2177 case UnicodeCategory.FinalQuotePunctuation:
2178 case UnicodeCategory.ModifierSymbol:
2179 // SPECIAL CASES: // 0xA
2180 if (0x2020 <= i && i <= 0x2042)
2182 AddCharMapGroup ((char) i, 0x7, 1, 0);
2185 if (i == 0xA6) // SPECIAL CASE. FIXME: why?
2186 goto case UnicodeCategory.OtherPunctuation;
2191 for (int i = 0x2400; i <= 0x2421; i++)
2192 AddCharMap ((char) i, 0x7, 1, 0);
2195 // FIXME: for 07 xx we need more love.
2197 // FIXME: 08 should be more complete.
2198 fillIndex [0x8] = 2;
2199 for (int cp = 0; cp < char.MaxValue; cp++)
2200 if (!map [cp].Defined &&
2201 Char.GetUnicodeCategory ((char) cp) ==
2202 UnicodeCategory.MathSymbol)
2203 AddCharMapGroup ((char) cp, 0x8, 1, 0);
2205 // Characters w/ diacritical marks (NFKD)
// Still-undefined characters that have a decomposition inherit category
// and level 1 from a "primary" decomposed character.
2206 for (int i = 0; i <= char.MaxValue; i++) {
2207 if (map [i].Defined || IsIgnorable (i))
2209 if (decompIndex [i] == 0)
2212 int start = decompIndex [i];
2213 int primaryChar = decompValues [start];
2216 int length = decompLength [i];
2217 // special processing for parenthesized ones.
2219 decompValues [start] == '(' &&
2220 decompValues [start + 2] == ')') {
2221 primaryChar = decompValues [start + 1];
2225 if (map [primaryChar].Level1 == 0)
2228 for (int l = 1; l < length; l++) {
2229 int c = decompValues [start + l];
2230 if (map [c].Level1 != 0)
2232 secondary += diacritical [c];
2236 map [i] = new CharMapEntry (
2237 map [primaryChar].Category,
2238 map [primaryChar].Level1,
2243 #region Level2 adjustment
2245 diacritical [0x624] = 0x5;
2246 diacritical [0x626] = 0x7;
2247 diacritical [0x622] = 0x9;
2248 diacritical [0x623] = 0xA;
2249 diacritical [0x625] = 0xB;
2250 diacritical [0x649] = 0x5; // 'alif maqs.uurah
2251 diacritical [0x64A] = 0x7; // Yaa'
2254 for (int i = 0; i < char.MaxValue; i++) {
2256 byte cat = map [i].Category;
2258 case 0xE: // Latin diacritics
2259 case 0x22: // Japanese: circled characters
2260 mod = diacritical [i];
2262 case 0x13: // Arabic
2263 if (diacritical [i] == 0)
2264 mod = 0x8; // default for arabic
2267 if (0x52 <= cat && cat <= 0x7F) // Hangul
2268 mod = diacritical [i];
2270 map [i] = new CharMapEntry (
2271 cat, map [i].Level1, mod);
2275 // FIXME: this is hack but those which are
2276 // NonSpacingMark characters and still undefined
2277 // are likely to be nonspacing.
2278 for (int i = 0; i < char.MaxValue; i++)
2279 if (!map [i].Defined &&
2281 Char.GetUnicodeCategory ((char) i) ==
2282 UnicodeCategory.NonSpacingMark)
2283 AddCharMap ((char) i, 1, 1);
// Advances fillIndex for the given category by one. When the byte wraps
// around to 0 the category itself is advanced and the index of the new
// category restarts at 2.
private void IncrementSequentialIndex (ref byte hangulCat)
{
	byte advanced = ++fillIndex [hangulCat];
	if (advanced != 0)
		return;

	// overflown: continue in the next category
	hangulCat++;
	fillIndex [hangulCat] = 0x2;
}
// Sets fillIndex [category] to alphaWeight, adds c through AddLetterMap()
// and then adds every code point listed for c in latinMap (if any) the
// same way, without advancing the index.
private void AddAlphaMap (char c, byte category, byte alphaWeight)
{
	fillIndex [category] = alphaWeight;
	AddLetterMap (c, category, 0);

	ArrayList related = latinMap [c] as ArrayList;
	if (related != null) {
		for (int n = 0; n < related.Count; n++) {
			int cp = (int) related [n];
			AddLetterMap ((char) cp, category, 0);
		}
	}
}
// Adds "voices" consecutive characters starting at code point i to
// category 0x22, each followed by the character 0x60 code points above it.
// The first one gets level 2 value 0, the n-th following one gets n + 2.
private void AddKanaMap (int i, byte voices)
{
	const byte kanaCategory = 0x22;

	for (int voice = 0; voice < voices; voice++) {
		byte level2 = 0;
		if (voice > 0)
			level2 = (byte) (voice + 2);

		char hiragana = (char) (i + voice);
		char katakana = (char) (hiragana + 0x60);

		AddLetterMapCore (hiragana, kanaCategory, 0, level2);
		AddLetterMapCore (katakana, kanaCategory, 0, level2);
	}
}
// Shorthand for AddLetterMapCore() with a level 2 value of 0.
private void AddLetterMap (char c, byte category, byte updateCount)
{
	byte level2 = 0;
	AddLetterMapCore (c, category, updateCount, level2);
}
// Adds a letter and its case/size variants:
//  1. the <small> form of c (if different from c), advancing the index,
//  2. the invariant-culture lower case form of c, when it differs from c
//     and is not defined yet (recursively, without advancing the index),
//  3. c itself, unless it is ignorable or already defined; only in that
//     case fillIndex [category] is advanced by updateCount.
private void AddLetterMapCore (char c, byte category, byte updateCount, byte level2)
{
	// <small> updates index
	char small = ToSmallForm (c);
	if (small != c)
		AddCharMapGroup (small, category, updateCount, level2);

	char lower = Char.ToLower (c, CultureInfo.InvariantCulture);
	if (lower != c && !map [(int) lower].Defined)
		AddLetterMapCore (lower, category, 0, level2);

	// nothing to add (and no index update) for ignorable or known chars
	if (IsIgnorable ((int) c) || map [(int) c].Defined)
		return;

	AddCharMapGroup (c, category, 0, level2);
	fillIndex [category] += updateCount;
}
// Shorthand for the four-argument AddCharMap() with "alt" set to 0.
// Returns whatever that overload returns.
private bool AddCharMap (char c, byte category, byte increment)
{
	byte alt = 0;
	bool added = AddCharMap (c, category, increment, alt);
	return added;
}
// Writes the map entry for c and advances fillIndex [category] by
// "increment". Returns false, changing nothing, when c is ignorable or
// already defined; returns true otherwise.
// For category 1 the roles are swapped: "alt" becomes the second
// CharMapEntry argument and fillIndex [category] the third; for every
// other category fillIndex [category] is second and "alt" is third.
private bool AddCharMap (char c, byte category, byte increment, byte alt)
{
	int cp = (int) c;
	if (IsIgnorable (cp) || map [cp].Defined)
		return false; // do nothing

	byte second;
	byte third;
	if (category == 1) {
		second = alt;
		third = fillIndex [category];
	} else {
		second = fillIndex [category];
		third = alt;
	}

	map [cp] = new CharMapEntry (category, second, third);
	fillIndex [category] += increment;
	return true;
}
// Adds, in this order: the "small form tail" variant of c (when it differs
// from c), c itself, and then - recursively - the "full width tail"
// variant of c (when it differs from c). Every addition uses updateCount
// as the increment and 0 as the alternative weight.
private void AddCharMapGroupTail (char c, byte category, byte updateCount)
{
	char smallTail = ToSmallFormTail (c);
	if (smallTail != c)
		AddCharMap (smallTail, category, updateCount, 0);

	// itself
	AddCharMap (c, category, updateCount, 0);

	// <full>
	char fullTail = ToFullWidthTail (c);
	if (fullTail == c)
		return;
	AddCharMapGroupTail (fullTail, category, updateCount);
}
2375 // Adds characters to table in the order below
2376 // (+ increases weight):
2380 // <full> | <super> | <sub>
2381 // <circle> | <wide> (| <narrow>)
2385 // level2 is fixed (does not increase).
// Decomposition kinds whose variants AddCharMapGroup adds right after the
// base character with increment 0 and the same level2 value.
2386 int [] sameWeightItems = new int [] {
2387 DecompositionFraction,
2391 DecompositionCircle,
2393 DecompositionNarrow,
// Adds c together with the variants recorded for it in nfkdMap. Does
// nothing when c is already defined. Order of additions:
//  1. the <small> variant (advances the index by updateCount),
//  2. c itself (no index change),
//  3. the variants for each kind in sameWeightItems (no index change),
//  4. fillIndex [category] is advanced by updateCount,
//  5. the <vertical> variant (advances the index by updateCount).
private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2)
{
	if (map [(int) c].Defined)
		return;

	Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
	object smallValue = null;
	object verticalValue = null;
	if (nfkd != null) {
		smallValue = nfkd [(byte) DecompositionSmall];
		verticalValue = nfkd [(byte) DecompositionVertical];
	}
	char small = smallValue != null ?
		(char) ((int) smallValue) : char.MinValue;
	char vertical = verticalValue != null ?
		(char) ((int) verticalValue) : char.MinValue;

	// <small> updates index
	if (small != char.MinValue)
		AddCharMap (small, category, updateCount);

	// itself
	AddCharMap (c, category, 0, level2);

	// variants that share the weight of c
	for (int n = 0; nfkd != null && n < sameWeightItems.Length; n++) {
		object variant = nfkd [(byte) sameWeightItems [n]];
		if (variant == null)
			continue;
		AddCharMap ((char) ((int) variant), category, 0, level2);
	}

	// update index here.
	fillIndex [category] += updateCount;

	if (vertical != char.MinValue)
		AddCharMap (vertical, category, updateCount, level2);
}
// Adds a single CJK character at the current position of the category and
// moves on to the next sequential position. The position 9E F9 is never
// used: when it is reached, the index is advanced once more.
private void AddCharMapCJK (char c, ref byte category)
{
	AddCharMap (c, category, 0, 0);
	IncrementSequentialIndex (ref category);

	// Special. I wonder why but Windows skips 9E F9.
	bool atSkippedSlot = category == 0x9E && fillIndex [category] == 0xF9;
	if (atSkippedSlot)
		IncrementSequentialIndex (ref category);
}
// Adds c through AddCharMapCJK and then the characters stored in its
// nfkdMap entry for the decomposition kinds 0 .. 0x12, each at its own
// sequential position. U+3298, U+3238, U+32A2 and U+32A9 are added
// explicitly near the top and are listed again as a special case in the
// loop further down.
2444 private void AddCharMapGroupCJK (char c, ref byte category)
2446 AddCharMapCJK (c, ref category);
2448 // LAMESPEC: see below.
2449 if (c == '\u52DE') {
2450 AddCharMapCJK ('\u3298', ref category);
2451 AddCharMapCJK ('\u3238', ref category);
2454 AddCharMapCJK ('\u32A2', ref category);
2456 // Especially this mapping order totally does
2457 // not make sense to me.
2458 AddCharMapCJK ('\u32A9', ref category);
2460 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
// weight iterates over every decomposition kind value (0 .. 0x12) and is
// used directly as the key into nfkd.
2463 for (byte weight = 0; weight <= 0x12; weight++) {
2464 object wv = nfkd [weight];
2469 // Special: they are ignored in this area.
2470 // FIXME: check if it is sane
2471 if (0xF900 <= w && w <= 0xFAD9)
2473 // LAMESPEC: on Windows some of CJK characters
2474 // in 3200-32B0 are incorrectly mapped. They
2475 // mix Chinese and Japanese Kanji when
2476 // ordering those characters.
2478 case 0x32A2: case 0x3298: case 0x3238: case 0x32A9:
2482 AddCharMapCJK ((char) w, ref category);
2486 // For now it is only for 0x7 category.
// Variant of AddCharMapGroup: c and every related character are added with
// updateCount (not 0), and the related characters are found by scanning the
// decomposition tables instead of sameWeightItems.
// NOTE(review): this listing omits lines (braces, null checks, break
// statements), so some guards are not visible.
2487 private void AddCharMapGroup2 (char c, byte category, byte updateCount, byte level2)
2489 char small = char.MinValue;
2490 char vertical = char.MinValue;
2491 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
2493 object smv = nfkd [(byte) DecompositionSmall];
2495 small = (char) ((int) smv);
2496 object vv = nfkd [(byte) DecompositionVertical];
2498 vertical = (char) ((int) vv);
2501 // <small> updates index
2502 if (small != char.MinValue)
2503 // SPECIAL CASE excluded (FIXME: why?)
2504 if (small != '\u2024')
2505 AddCharMap (small, category, updateCount);
2508 AddCharMap (c, category, updateCount, level2);
2510 // Since nfkdMap is problematic to have two or more
2511 // NFKD to an identical character, here I iterate all.
// The loop bound is exclusive, so c2 == char.MaxValue (0xFFFF) is never
// examined.
2512 for (int c2 = 0; c2 < char.MaxValue; c2++) {
// Only characters whose decomposition is exactly the single character c
// are considered.
2513 if (decompLength [c2] == 1 &&
2514 (int) (decompValues [decompIndex [c2]]) == (int) c) {
2515 switch (decompType [c2]) {
2516 case DecompositionCompat:
2517 AddCharMap ((char) c2, category, updateCount, level2);
2523 if (vertical != char.MinValue)
2524 // SPECIAL CASE excluded (FIXME: why?)
2525 if (vertical != '\uFE33' && vertical != '\uFE34')
2526 AddCharMap (vertical, category, updateCount, level2);
// Maps the Arabic character c and every character whose decomposition ends
// with c, then advances fillIndex [category] by updateCount (1).
// NOTE(review): the declarations of category and level2, the statement
// guarded by the decompLength test and the tail of the inner AddCharMap call
// are on lines omitted from this listing.
2529 private void AddArabicCharMap (char c)
2532 byte updateCount = 1;
2536 AddCharMap (c, category, 0, level2);
2538 // Since nfkdMap is problematic to have two or more
2539 // NFKD to an identical character, here I iterate all.
// The loop bound is exclusive, so c2 == char.MaxValue (0xFFFF) is never
// examined.
2540 for (int c2 = 0; c2 < char.MaxValue; c2++) {
2541 if (decompLength [c2] == 0)
// idx addresses the last value of c2's decomposition sequence.
2543 int idx = decompIndex [c2] + decompLength [c2] - 1;
2544 if ((int) (decompValues [idx]) == (int) c)
2545 AddCharMap ((char) c2, category,
2548 fillIndex [category] += updateCount;
// Shorthand for ToDecomposed with DecompositionFull and tail == false.
2551 char ToFullWidth (char c)
2553 return ToDecomposed (c, DecompositionFull, false);
// Shorthand for ToDecomposed with DecompositionFull and tail == true.
2556 char ToFullWidthTail (char c)
2558 return ToDecomposed (c, DecompositionFull, true);
// Shorthand for ToDecomposed with DecompositionSmall and tail == false.
2561 char ToSmallForm (char c)
2563 return ToDecomposed (c, DecompositionSmall, false);
// Shorthand for ToDecomposed with DecompositionSmall and tail == true.
2566 char ToSmallFormTail (char c)
2568 return ToDecomposed (c, DecompositionSmall, true);
// Returns a character taken from c's decomposition sequence in decompValues.
// d is the decomposition type that c is compared against.
// NOTE(review): the statement guarded by the type test and the condition in
// front of the "idx +=" line are omitted from this listing. Presumably c is
// returned unchanged when its type differs from d, and tail selects the last
// value of the sequence instead of the first -- confirm.
2571 char ToDecomposed (char c, byte d, bool tail)
2573 if (decompType [(int) c] != d)
2575 int idx = decompIndex [(int) c];
// Moves idx from the first to the last value of the decomposition sequence.
2577 idx += decompLength [(int) c] - 1;
2578 return (char) decompValues [idx];
// Scans the jisJapanese collection of JISCharacter entries for cp.
// NOTE(review): the loop body and the return statements are omitted from
// this listing; presumably it reports whether an entry with CP == cp
// exists -- confirm.
2581 bool ExistsJIS (int cp)
2583 foreach (JISCharacter j in jisJapanese)
2591 #region Level 3 properties (Case/Width)
// Returns the level 3 weight of c as stored in a sort key: the raw weight
// plus 2, except that a raw weight of 0 is returned as 0.
2593 private byte ComputeLevel3Weight (char c)
2595 byte b = ComputeLevel3WeightRaw (c);
2596 return b > 0 ? (byte) (b + 2) : b;
// Computes the raw level 3 weight of c (before ComputeLevel3Weight adds 2).
// A series of code point range tests comes first, followed by a value built
// up in ret from the decomposition type and the case flags.
// NOTE(review): the value returned for each range, the declaration of ret
// and the statements guarded by the last two tests are on lines omitted from
// this listing.
2599 private byte ComputeLevel3WeightRaw (char c) // add 2 for sortkey value
2602 if ('\u11A8' <= c && c <= '\u11F9')
2604 if ('\uFFA0' <= c && c <= '\uFFDC')
2606 if ('\u3130' <= c && c <= '\u3164')
2609 if ('\u2776' <= c && c <= '\u277F')
2611 if ('\u2780' <= c && c <= '\u2789')
// This range contains the two ranges tested just above; if those tests
// return (not visible here) it only matters for the remaining code points.
2613 if ('\u2776' <= c && c <= '\u2793')
2615 if ('\u2160' <= c && c <= '\u216F')
2617 if ('\u2181' <= c && c <= '\u2182')
2620 if ('\u2135' <= c && c <= '\u2138')
// Note the exclusive upper bound: U+FE8E itself is not covered.
2622 if ('\uFE80' <= c && c < '\uFE8E') {
2623 // 2(Isolated)/8(Final)/0x18(Medial)
2624 switch (decompType [(int) c]) {
2625 case DecompositionIsolated:
2627 case DecompositionFinal:
2629 case DecompositionMedial:
2634 // actually I dunno the reason why they have weights.
2657 switch (decompType [(int) c]) {
// For these three types the numeric value of the constant is ORed into the
// weight; their declarations are marked "fixed".
2658 case DecompositionWide: // <wide>
2659 case DecompositionSub: // <sub>
2660 case DecompositionSuper: // <super>
2661 ret |= decompType [(int) c];
2664 if (isSmallCapital [(int) c]) // grep "SMALL CAPITAL"
2666 if (isUppercase [(int) c]) // DerivedCoreProperties
// Variant of IsIgnorable based on unicodeAge and the Unicode general
// category of i.
// NOTE(review): another IsIgnorable (int) with the same signature follows in
// this file; presumably one of the two is disabled by a preprocessor
// conditional that is not visible in this listing. The return statements are
// omitted as well.
2676 static bool IsIgnorable (int i)
2678 if (unicodeAge [i] >= 3.1)
2680 switch (char.GetUnicodeCategory ((char) i)) {
2681 case UnicodeCategory.OtherNotAssigned:
2682 case UnicodeCategory.Format:
2689 // FIXME: In the future use DerivedAge.txt to examine character
2690 // versions and set those ones that have higher version than
2691 // 1.0 as ignorable.
// Table-driven variant of IsIgnorable. It consists of two lists of
// individual code points, a long list of code point ranges, and finally a
// test on the Unicode general category of i.
// NOTE(review): the switch headers, the if around the range list and all
// return statements are on lines omitted from this listing, so the result of
// each group is not visible here.
2692 static bool IsIgnorable (int i)
2696 // I guess, those characters are added between
2697 // Unicode 1.0 (LCMapString) and Unicode 3.1
2698 // (UnicodeCategory), so they used to be
2699 // something like OtherNotAssigned as of Unicode 1.1.
2700 case 0x2df: case 0x387:
2701 case 0x3d7: case 0x3d8: case 0x3d9:
2702 case 0x3f3: case 0x3f4: case 0x3f5: case 0x3f6:
2703 case 0x400: case 0x40d: case 0x450: case 0x45d:
2704 case 0x587: case 0x58a: case 0x5c4: case 0x640:
2705 case 0x653: case 0x654: case 0x655: case 0x66d:
2707 case 0x1e9b: case 0x202f: case 0x20ad:
2708 case 0x20ae: case 0x20af:
2709 case 0x20e2: case 0x20e3:
2710 case 0x2139: case 0x213a: case 0x2183:
2711 case 0x2425: case 0x2426: case 0x2619:
2712 case 0x2670: case 0x2671: case 0x3007:
2713 case 0x3190: case 0x3191:
2714 case 0xfffc: case 0xfffd:
2716 // exceptional characters filtered by the
2717 // following conditions. Originally those exceptional
2718 // ranges are incorrect (they should not be ignored)
2719 // and most of those characters are unfortunately in
2721 case 0x4d8: case 0x4d9:
2722 case 0x4e8: case 0x4e9:
2723 case 0x3036: case 0x303f:
2724 case 0x337b: case 0xfb1e:
2729 // The whole Sinhala characters.
2730 0x0D82 <= i && i <= 0x0DF4
2731 // The whole Tibetan characters.
2732 || 0x0F00 <= i && i <= 0x0FD1
2733 // The whole Myanmar characters.
2734 || 0x1000 <= i && i <= 0x1059
2735 // The whole Ethiopic, Cherokee,
2736 // Canadian Syllabic, Ogham, Runic,
2737 // Tagalog, Hanunoo, Philippine,
2738 // Buhid, Tagbanwa, Khmer and Mongolian
2740 || 0x1200 <= i && i <= 0x1DFF
2741 // Greek extension characters.
2742 || 0x1F00 <= i && i <= 0x1FFF
2743 // The whole Braille characters.
2744 || 0x2800 <= i && i <= 0x28FF
2745 // CJK radical characters.
2746 || 0x2E80 <= i && i <= 0x2EF3
2747 // Kangxi radical characters.
2748 || 0x2F00 <= i && i <= 0x2FD5
2749 // Ideographic description characters.
2750 || 0x2FF0 <= i && i <= 0x2FFB
2751 // Bopomofo letter and final
2752 || 0x31A0 <= i && i <= 0x31B7
2753 // White square with quadrant characters.
2754 || 0x25F0 <= i && i <= 0x25F7
2755 // Ideographic telegraph symbols.
2756 || 0x32C0 <= i && i <= 0x32CB
2757 || 0x3358 <= i && i <= 0x3370
2758 || 0x33E0 <= i && i <= 0x33FF
2759 // The whole YI characters.
2760 || 0xA000 <= i && i <= 0xA48C
2761 || 0xA490 <= i && i <= 0xA4C6
2762 // Armenian small ligatures
2763 || 0xFB13 <= i && i <= 0xFB17
2764 // hebrew, arabic, variation selector.
2765 || 0xFB1D <= i && i <= 0xFE2F
2766 // Arabic ligatures.
2767 || 0xFEF5 <= i && i <= 0xFEFC
2768 // FIXME: why are they excluded?
2769 || 0x01F6 <= i && i <= 0x01F9
2770 || 0x0218 <= i && i <= 0x0233
2771 || 0x02A9 <= i && i <= 0x02AD
2772 || 0x02EA <= i && i <= 0x02EE
2773 || 0x0349 <= i && i <= 0x036F
2774 || 0x0488 <= i && i <= 0x048F
2775 || 0x04D0 <= i && i <= 0x04FF
2776 || 0x0500 <= i && i <= 0x050F // actually it matters only for 2.0
2777 || 0x06D6 <= i && i <= 0x06ED
2778 || 0x06FA <= i && i <= 0x06FE
2779 || 0x2048 <= i && i <= 0x204D
2780 || 0x20e4 <= i && i <= 0x20ea
2781 || 0x213C <= i && i <= 0x214B
2782 || 0x21EB <= i && i <= 0x21FF
2783 || 0x22F2 <= i && i <= 0x22FF
2784 || 0x237B <= i && i <= 0x239A
2785 || 0x239B <= i && i <= 0x23CF
2786 || 0x24EB <= i && i <= 0x24FF
2787 || 0x2596 <= i && i <= 0x259F
2788 || 0x25F8 <= i && i <= 0x25FF
2789 || 0x2672 <= i && i <= 0x2689
2790 || 0x2768 <= i && i <= 0x2775
2791 || 0x27d0 <= i && i <= 0x27ff
2792 || 0x2900 <= i && i <= 0x2aff
2793 || 0x3033 <= i && i <= 0x303F
2794 || 0x31F0 <= i && i <= 0x31FF
2795 || 0x3250 <= i && i <= 0x325F
2796 || 0x32B1 <= i && i <= 0x32BF
2797 || 0x3371 <= i && i <= 0x337B
2798 || 0xFA30 <= i && i <= 0xFA6A
// Last step: the decision falls back to the Unicode general category.
2802 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
2804 case UnicodeCategory.PrivateUse:
2805 case UnicodeCategory.Surrogate:
2807 // ignored by nature
2808 case UnicodeCategory.Format:
2809 case UnicodeCategory.OtherNotAssigned:
2816 // To check IsIgnorable sanity, try the driver below under MS.NET.
// Driver entry point: calls Dump with the IsIgnorable result for every code
// point from 0 to char.MaxValue inclusive.
// NOTE(review): the class already has a Main (string []) entry point, so
// this driver is presumably commented out or conditionally compiled; the
// enclosing markers are not visible in this listing.
2819 public static void Main ()
2821 for (int i = 0; i <= char.MaxValue; i++)
2822 Dump (i, IsIgnorable (i));
// Driver helper: compares the expected flag "ignore" for code point i with
// what the invariant culture CompareInfo reports, and prints one line with
// the code point (hex) and its Unicode category.
// Private-use and surrogate code points are not checked.
// NOTE(review): the declaration of s1 and the statement guarded by the final
// if are omitted from this listing; presumably the line is printed only when
// the two results disagree -- confirm.
2825 static void Dump (int i, bool ignore)
2827 switch (Char.GetUnicodeCategory ((char) i)) {
2828 case UnicodeCategory.PrivateUse:
2829 case UnicodeCategory.Surrogate:
2830 return; // check nothing
// s2 is the character under test repeated 10 times.
2834 string s2 = new string ((char) i, 10);
2835 int ret = CultureInfo.InvariantCulture.CompareInfo.Compare (s1, s2, CompareOptions.IgnoreCase);
// (ret == 0) means the comparer saw no difference between s1 and s2.
2836 if ((ret == 0) == ignore)
2838 Console.WriteLine ("{0} : {1:x} {2}", ignore ? "o" : "x", i, Char.GetUnicodeCategory ((char) i));
2841 #endregion // IsIgnorable
2843 #region IsIgnorableSymbol
// Decides whether code point i is an ignorable symbol.
// Steps, in order: IsIgnorable (i), several lists of individual code points,
// a few code point ranges, the Unicode general category, a list of ranges
// combined with Char.IsLetterOrDigit, and finally the Char.Is* classification
// tests.
// NOTE(review): the switch headers, the declaration of "use" and all return
// statements are on lines omitted from this listing, so the result of each
// group is not visible here.
2844 static bool IsIgnorableSymbol (int i)
2846 if (IsIgnorable (i))
2851 case 0x00b5: case 0x01C0: case 0x01C1:
2852 case 0x01C2: case 0x01C3: case 0x01F6:
2853 case 0x01F7: case 0x01F8: case 0x01F9:
2854 case 0x02D0: case 0x02EE: case 0x037A:
2855 case 0x03D7: case 0x03F3:
2856 case 0x0400: case 0x040d:
2857 case 0x0450: case 0x045d:
2858 case 0x048C: case 0x048D:
2859 case 0x048E: case 0x048F:
2860 case 0x0587: case 0x0640: case 0x06E5:
2861 case 0x06E6: case 0x06FA: case 0x06FB:
2862 case 0x06FC: case 0x093D: case 0x0950:
2863 case 0x1E9B: case 0x2139: case 0x3006:
2864 case 0x3033: case 0x3034: case 0x3035:
2865 case 0xFE7E: case 0xFE7F:
2867 case 0x16EE: case 0x16EF: case 0x16F0:
2869 case 0x2183: // ROMAN NUMERAL REVERSED ONE HUNDRED
2870 case 0x3007: // IDEOGRAPHIC NUMBER ZERO
2871 case 0x3038: // HANGZHOU NUMERAL TEN
2872 case 0x3039: // HANGZHOU NUMERAL TWENTY
2873 case 0x303a: // HANGZHOU NUMERAL THIRTY
2879 case 0x02B9: case 0x02BA: case 0x02C2:
2880 case 0x02C3: case 0x02C4: case 0x02C5:
2881 case 0x02C8: case 0x02CC: case 0x02CD:
2882 case 0x02CE: case 0x02CF: case 0x02D2:
2883 case 0x02D3: case 0x02D4: case 0x02D5:
2884 case 0x02D6: case 0x02D7: case 0x02DE:
2885 case 0x02E5: case 0x02E6: case 0x02E7:
2886 case 0x02E8: case 0x02E9:
2887 case 0x309B: case 0x309C:
2889 case 0x055A: // Armenian Apos
2890 case 0x05C0: // Hebrew Punct
2891 case 0x0E4F: // Thai FONGMAN
2892 case 0x0E5A: // Thai ANGKHANKHU
2893 case 0x0E5B: // Thai KHOMUT
2895 case 0x09F2: // Bengali Rupee Mark
2896 case 0x09F3: // Bengali Rupee Sign
2898 case 0x221e: // INF.
// Note the exclusive upper bounds on the first and the last range below.
2907 if (0xFE70 <= i && i < 0xFE7C // ARABIC LIGATURES B
2909 || 0x0501 <= i && i <= 0x0510 // CYRILLIC KOMI
2910 || 0xFA30 <= i && i < 0xFA70 // CJK COMPAT
2915 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
2917 case UnicodeCategory.Surrogate:
2918 return false; // inconsistent
2920 case UnicodeCategory.SpacingCombiningMark:
2921 case UnicodeCategory.EnclosingMark:
2922 case UnicodeCategory.NonSpacingMark:
2923 case UnicodeCategory.PrivateUse:
2925 if (0x064B <= i && i <= 0x0652) // Arabic
2929 case UnicodeCategory.Format:
2930 case UnicodeCategory.OtherNotAssigned:
2937 // latin in a circle
2938 0x249A <= i && i <= 0x24E9
2939 || 0x2100 <= i && i <= 0x2132
2941 || 0x3196 <= i && i <= 0x31A0
2943 || 0x3200 <= i && i <= 0x321C
2945 || 0x322A <= i && i <= 0x3243
2947 || 0x3260 <= i && i <= 0x32B0
2948 || 0x32D0 <= i && i <= 0x3357
2949 || 0x337B <= i && i <= 0x33DD
// "use" is set to true only for characters that are neither letters nor
// digits.
2951 use = !Char.IsLetterOrDigit ((char) i);
2955 // This "Digit" rule is mystery.
2956 // It filters some symbols out.
2957 if (Char.IsLetterOrDigit ((char) i))
2959 if (Char.IsNumber ((char) i))
2961 if (Char.IsControl ((char) i)
2962 || Char.IsSeparator ((char) i)
2963 || Char.IsPunctuation ((char) i))
2965 if (Char.IsSymbol ((char) i))
2968 // FIXME: should check more
2973 // To check IsIgnorableSymbol sanity, try the driver below under MS.NET.
// Driver entry point: for every code point from 0 to char.MaxValue it
// compares IsIgnorableSymbol (i) with the result of an IgnoreSymbols
// comparison by the invariant culture CompareInfo, and prints a line when
// the two disagree.
// NOTE(review): the class already has a Main (string []) entry point, so
// this driver is presumably commented out or conditionally compiled. The
// statement guarded by the surrogate test and the tail of the WriteLine call
// are omitted from this listing.
2975 public static void Main ()
2977 CompareInfo ci = CultureInfo.InvariantCulture.CompareInfo;
2978 for (int i = 0; i <= char.MaxValue; i++) {
2979 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
2980 if (uc == UnicodeCategory.Surrogate)
2983 bool ret = IsIgnorableSymbol (i);
// s2 is s1 with the character under test appended; a comparison result of 0
// means the comparer ignored that character.
2985 string s1 = "TEST ";
2986 string s2 = "TEST " + (char) i;
2988 int result = ci.Compare (s1, s2, CompareOptions.IgnoreSymbols);
2990 if (ret != (result == 0))
2991 Console.WriteLine ("{0} : {1:x}[{2}]({3})",
2992 ret ? "should not ignore" :
// Decides whether code point i is an ignorable non-spacing character.
// Steps, in order: IsIgnorable (i), two lists of individual code points, two
// groups of code point ranges, and finally whether the Unicode general
// category of i is NonSpacingMark.
// NOTE(review): the switch header and the return statements of every step
// but the last are on lines omitted from this listing, so the result of each
// group is not visible here.
3001 static bool IsIgnorableNonSpacing (int i)
3003 if (IsIgnorable (i))
3007 case 0x02C8: case 0x02DE: case 0x0559: case 0x055A:
3008 case 0x05C0: case 0x0ABD: case 0x0CD5: case 0x0CD6:
3009 case 0x309B: case 0x309C: case 0xFF9E: case 0xFF9F:
3011 case 0x02D0: case 0x0670: case 0x0901: case 0x0902:
3012 case 0x094D: case 0x0962: case 0x0963: case 0x0A41:
3013 case 0x0A42: case 0x0A47: case 0x0A48: case 0x0A4B:
3014 case 0x0A4C: case 0x0A81: case 0x0A82: case 0x0B82:
3015 case 0x0BC0: case 0x0CBF: case 0x0CC6: case 0x0CCC:
3016 case 0x0CCD: case 0x0E4E:
3020 if (0x02b9 <= i && i <= 0x02c5
3021 || 0x02cc <= i && i <= 0x02d7
3022 || 0x02e4 <= i && i <= 0x02ef
3023 || 0x20DD <= i && i <= 0x20E0
// 0x00652 below is the same value as 0x0652 (extra leading zero only).
3027 if (0x064B <= i && i <= 0x00652
3028 || 0x0941 <= i && i <= 0x0948
3029 || 0x0AC1 <= i && i <= 0x0ACD
3030 || 0x0C3E <= i && i <= 0x0C4F
3031 || 0x0E31 <= i && i <= 0x0E3F
// Fallback for everything not decided above.
3035 return Char.GetUnicodeCategory ((char) i) ==
3036 UnicodeCategory.NonSpacingMark;
3039 // We can reuse IsIgnorableSymbol testcode
3040 // for IsIgnorableNonSpacing.
// Members of CharMapEntry, the element type of the generator's map array
// (one entry per char value).
// NOTE(review): the type header, one field line and most of the constructor
// body are omitted from this listing.
3046 public byte Category;
3048 public byte Level2; // It is always single byte.
// Tested by AddCharMapGroup to see whether a character already has an entry.
3049 public bool Defined;
3051 public CharMapEntry (byte category, byte level1, byte level2)
3053 Category = category;
// Members of JISCharacter: a pair of two integer codes, CP and JIS.
// NOTE(review): the type header and the constructor body are omitted from
// this listing; presumably cp is stored in CP and cpJIS in JIS -- confirm.
3062 public readonly int CP;
3063 public readonly int JIS;
3065 public JISCharacter (int cp, int cpJIS)
// IComparer for JISCharacter values that compares their JIS fields.
3072 class JISComparer : IComparer
3074 public static readonly JISComparer Instance =
// Both arguments must be JISCharacter values; the casts throw otherwise.
3077 public int Compare (object o1, object o2)
3079 JISCharacter j1 = (JISCharacter) o1;
3080 JISCharacter j2 = (JISCharacter) o2;
// The result is positive when j2.JIS is larger, so sorting with this
// comparer puts larger JIS values first.
// NOTE(review): confirm that this descending order is intended.
3081 return j2.JIS - j1.JIS;
// Pairs an integer code CP with a Name string.
// NOTE(review): the constructor body is omitted from this listing;
// presumably it stores cp in CP and name in Name -- confirm.
3085 class NonJISCharacter
3087 public readonly int CP;
3088 public readonly string Name;
3090 public NonJISCharacter (int cp, string name)
// IComparer for NonJISCharacter values that orders them by Name using an
// ordinal string comparison.
3097 class NonJISComparer : IComparer
3099 public static readonly NonJISComparer Instance =
3100 new NonJISComparer ();
// Both arguments must be NonJISCharacter values; the casts throw otherwise.
3102 public int Compare (object o1, object o2)
3104 NonJISCharacter j1 = (NonJISCharacter) o1;
3105 NonJISCharacter j2 = (NonJISCharacter) o2;
3106 return string.CompareOrdinal (j1.Name, j2.Name);
// IComparer for DictionaryEntry values whose Value is a boxed decimal and
// whose Key is a boxed int. Entries are compared by Value first.
// NOTE(review): the lines that return ret and that combine i1 and i2 are
// omitted from this listing; presumably the keys act as the tie-break when
// the values are equal -- confirm.
3110 class DecimalDictionaryValueComparer : IComparer
3112 public static readonly DecimalDictionaryValueComparer Instance
3113 = new DecimalDictionaryValueComparer ();
// Private constructor: Instance is the only way to obtain the comparer from
// outside this class.
3115 private DecimalDictionaryValueComparer ()
3119 public int Compare (object o1, object o2)
3121 DictionaryEntry e1 = (DictionaryEntry) o1;
3122 DictionaryEntry e2 = (DictionaryEntry) o2;
3123 // FIXME: in case of 0, compare decomposition categories
3124 int ret = Decimal.Compare ((decimal) e1.Value, (decimal) e2.Value);
3127 int i1 = (int) e1.Key;
3128 int i2 = (int) e2.Key;
// IComparer for DictionaryEntry values whose Value is a string and whose Key
// is a boxed int. Entries are compared by Value first.
// NOTE(review): the lines that return ret and that combine i1 and i2 are
// omitted from this listing; presumably the keys act as the tie-break when
// the values are equal -- confirm.
3133 class StringDictionaryValueComparer : IComparer
3135 public static readonly StringDictionaryValueComparer Instance
3136 = new StringDictionaryValueComparer ();
// Private constructor: Instance is the only way to obtain the comparer from
// outside this class.
3138 private StringDictionaryValueComparer ()
3142 public int Compare (object o1, object o2)
3144 DictionaryEntry e1 = (DictionaryEntry) o1;
3145 DictionaryEntry e2 = (DictionaryEntry) o2;
// String.Compare (string, string) is culture-sensitive, so the resulting
// order depends on the current culture.
// NOTE(review): confirm this is intended for generated tables.
3146 int ret = String.Compare ((string) e1.Value, (string) e2.Value);
3149 int i1 = (int) e1.Key;
3150 int i2 = (int) e2.Key;
// IComparer for boxed char values that compares the sort keys which
// CollationElementTable holds for the two characters.
// NOTE(review): the checks after each "v = ..." line and the final return
// are omitted from this listing; presumably the first non-zero difference is
// returned -- confirm.
3155 class UCAComparer : IComparer
3157 public static readonly UCAComparer Instance
3158 = new UCAComparer ();
// Private constructor: Instance is the only way to obtain the comparer from
// outside this class.
3160 private UCAComparer ()
3164 public int Compare (object o1, object o2)
3166 char i1 = (char) o1;
3167 char i2 = (char) o2;
3169 int l1 = CollationElementTable.GetSortKeyCount (i1);
3170 int l2 = CollationElementTable.GetSortKeyCount (i2);
// Only the sort keys that both characters have (the smaller of the two
// counts) are compared pairwise.
3171 int l = l1 > l2 ? l2 : l1;
3173 for (int i = 0; i < l; i++) {
3174 SortKeyValue k1 = CollationElementTable.GetSortKey (i1, i);
3175 SortKeyValue k2 = CollationElementTable.GetSortKey (i2, i);
// Levels are compared in this order: Primary, Secondary, Thirtiary,
// Quarternary.
3176 int v = k1.Primary - k2.Primary;
3179 v = k1.Secondary - k2.Secondary;
3182 v = k1.Thirtiary - k2.Thirtiary;
3185 v = k1.Quarternary - k2.Quarternary;
// Members of Tailoring: an ordered list of tailoring map items together with
// lcid, alias and a FrenchSort flag.
// NOTE(review): the type header, the field declarations for lcid, alias and
// frenchSort, the constructor bodies and the headers of the two read-only
// properties are omitted from this listing.
3198 ArrayList items = new ArrayList ();
3200 public Tailoring (int lcid)
3205 public Tailoring (int lcid, int alias)
3212 get { return lcid; }
3216 get { return alias; }
3219 public bool FrenchSort {
3220 get { return frenchSort; }
3221 set { frenchSort = value; }
// The three Add* methods append a new map object to items, so the items keep
// the order in which they were added.
3224 public void AddDiacriticalMap (byte target, byte replace)
3226 items.Add (new DiacriticalMap (target, replace));
3229 public void AddSortKeyMap (string source, byte [] sortkey)
3231 items.Add (new SortKeyMap (source, sortkey));
3234 public void AddReplacementMap (string source, string replace)
3236 items.Add (new ReplacementMap (source, replace));
// Concatenates the ToCharArray () output of every item, in list order, into
// a single char array.
3239 public char [] ItemToCharArray ()
3241 ArrayList al = new ArrayList ();
3242 foreach (ITailoringMap m in items)
3243 al.AddRange (m.ToCharArray ());
3244 return al.ToArray (typeof (char)) as char [];
// Implemented by every tailoring item (DiacriticalMap, SortKeyMap,
// ReplacementMap).
3247 interface ITailoringMap
// Returns the item encoded as a char sequence; in the implementations the
// first char is a kind marker (1, 2 or 3).
3249 char [] ToCharArray ();
// Tailoring item that holds two byte values, Target and Replace.
// NOTE(review): the constructor body and the return statement of
// ToCharArray are omitted from this listing.
3252 class DiacriticalMap : ITailoringMap
3254 public readonly byte Target;
3255 public readonly byte Replace;
3257 public DiacriticalMap (byte target, byte replace)
// Encodes the item as three chars: the kind marker 2, Target, Replace.
3263 public char [] ToCharArray ()
3265 char [] ret = new char [3];
3266 ret [0] = (char) 02; // kind:DiacriticalMap
3267 ret [1] = (char) Target;
3268 ret [2] = (char) Replace;
// Tailoring item that holds a Source string and a SortKey byte array.
// NOTE(review): the constructor body, one line between the two loops and the
// return statement of ToCharArray are omitted from this listing.
3273 class SortKeyMap : ITailoringMap
3275 public readonly string Source;
3276 public readonly byte [] SortKey;
3278 public SortKeyMap (string source, byte [] sortkey)
// Encodes the item into Source.Length + 7 chars: the kind marker 1 at index
// 0, the Source chars from index 1, and five SortKey bytes from index
// Source.Length + 2. The visible lines do not write index Source.Length + 1.
// SortKey must hold at least 5 elements, otherwise the second loop throws.
3284 public char [] ToCharArray ()
3286 char [] ret = new char [Source.Length + 7];
3287 ret [0] = (char) 01; // kind:SortKeyMap
3288 for (int i = 0; i < Source.Length; i++)
3289 ret [i + 1] = Source [i];
3291 for (int i = 0; i < 5; i++)
3292 ret [i + Source.Length + 2] = (char) SortKey [i];
3297 class ReplacementMap : ITailoringMap
3299 public readonly string Source;
3300 public readonly string Replace;
3302 public ReplacementMap (string source, string replace)
3308 public char [] ToCharArray ()
3310 char [] ret = new char [Source.Length + Replace.Length + 3];
3311 ret [0] = (char) 03; // kind:ReplaceMap
3313 for (int i = 0; i < Source.Length; i++)
3314 ret [pos++] = Source [i];
3317 for (int i = 0; i < Replace.Length; i++)
3318 ret [pos++] = Replace [i];