3 // There are two kinds of sort keys: those which are computed and those which are laid out
4 // as an indexed array. Computed sort keys are:
9 // Also, for composite characters it should prepare a different index table.
11 // Though it is possible to "compute" level 3 weights, they are still dumped
12 // to an array to avoid execution cost.
16 // * sortkey getter signature
18 // int GetSortKey (string s, int index, SortKeyBuffer buf)
19 // Stores sort key for corresponding character element into buf and
20 // returns the length of the consumed _source_ character element in s.
22 // * character length to consume
24 // If there are characters whose primary weight is 0, they are consumed
25 // and considered as a part of the character element.
30 using System.Collections;
31 using System.Globalization;
35 namespace Mono.Globalization.Unicode
37 internal class MSCompatSortKeyTableGenerator
// Program entry point: creates a generator instance and passes the
// command-line arguments on to its Run method.
39 public static void Main (string [] args)
41 new MSCompatSortKeyTableGenerator ().Run (args);
// Numeric identifiers for the decomposition kinds that ProcessUnidataLine
// stores into decompType. The serialization code switches on Narrow, Wide,
// Super and Sub to build the width-compatibility table.
// NOTE(review): the meaning of the "fixed" markers is not stated in this
// file; presumably those values are relied upon elsewhere - confirm before
// renumbering. Full (0xE) and Narrow (0xD) are declared out of numeric order.
44 const int DecompositionWide = 1; // fixed
45 const int DecompositionSub = 2; // fixed
46 const int DecompositionSmall = 3;
47 const int DecompositionIsolated = 4;
48 const int DecompositionInitial = 5;
49 const int DecompositionFinal = 6;
50 const int DecompositionMedial = 7;
51 const int DecompositionNoBreak = 8;
52 const int DecompositionVertical = 9;
53 const int DecompositionFraction = 0xA;
54 const int DecompositionFont = 0xB;
55 const int DecompositionSuper = 0xC; // fixed
56 const int DecompositionFull = 0xE;
57 const int DecompositionNarrow = 0xD;
58 const int DecompositionCircle = 0xF;
59 const int DecompositionSquare = 0x10;
60 const int DecompositionCompat = 0x11;
61 const int DecompositionCanonical = 0x12;
// Destination of the generated table source text (standard output).
63 TextWriter Result = Console.Out;
// Next weight value to hand out, indexed by sort key category.
65 byte [] fillIndex = new byte [256]; // by category
// One sort key entry per UTF-16 code unit (0x0000 - 0xFFFF).
66 CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1];
// Characters that GenerateCore maps explicitly to CharMapEntry (0, 0, 0).
68 char [] specialIgnore = new char [] {
69 '\u3099', '\u309A', '\u309B', '\u309C', '\u0BCD',
70 '\u0E47', '\u0E4C', '\uFF9E', '\uFF9F'
// Latin base letters (A-Z plus three extra letters) and a weight table of
// the same length (29 entries each).
// NOTE(review): presumably alphaWeights [i] is the weight assigned to
// alphabets [i]; the consuming code is not visible here - confirm.
73 // FIXME: need more love (as always)
74 char [] alphabets = new char [] {'A', 'B', 'C', 'D', 'E', 'F',
75 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
76 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
77 '\u0292', '\u01BE', '\u0298'};
78 byte [] alphaWeights = new byte [] {
79 2, 9, 0xA, 0x1A, 0x21,
80 0x23, 0x25, 0x2C, 0x32, 0x35,
81 0x36, 0x48, 0x51, 0x70, 0x7C,
82 0x7E, 0x89, 0x8A, 0x91, 0x99,
83 0x9F, 0xA2, 0xA4, 0xA6, 0xA7,
84 0xA9, 0xAA, 0xB3, 0xB4};
// Per-code-point properties, each indexed by UTF-16 code unit.
// Set by ProcessUnidataLine when the data line contains "SMALL CAPITAL".
86 bool [] isSmallCapital = new bool [char.MaxValue + 1];
// Set for code point ranges by ProcessDerivedCorePropLine.
87 bool [] isUppercase = new bool [char.MaxValue + 1];
// Decomposition kind (one of the Decomposition* constants), plus the start
// index and element count of the code point's mapping in decompValues.
89 byte [] decompType = new byte [char.MaxValue + 1];
90 int [] decompIndex = new int [char.MaxValue + 1];
91 int [] decompLength = new int [char.MaxValue + 1];
// Numeric value parsed from the numeric fields of UnicodeData.txt.
93 decimal [] decimalValue = new decimal [char.MaxValue + 1];
// Diacritical weight accumulated from the diacriticWeights table.
95 byte [] diacritical = new byte [char.MaxValue + 1];
// Character-name fragments that ProcessUnidataLine searches for in each
// UnicodeData.txt line. When diacritics [d] is found, diacriticWeights [d]
// is OR-ed into diacritical [cp], so the two arrays are index-aligned.
// Fragments ending in ';' only match at the end of a ';'-terminated field.
// NOTE(review): some lines of these initializers are not visible in this
// excerpt, so the element counts of the two arrays cannot be cross-checked
// here - verify they stay equal when editing either one.
97 string [] diacritics = new string [] {
99 "WITH ACUTE;", "WITH GRAVE;", " DOT ABOVE;", " MIDDLE DOT;",
100 "WITH CIRCUMFLEX;", "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;",
101 " DIALYTIKA AND TONOS;", "WITH MACRON;", "WITH TILDE;", " RING ABOVE;",
102 " OGONEK;", " CEDILLA;",
103 " DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;",
104 " STROKE;", " CIRCUMFLEX AND ACUTE;",
105 " DIAERESIS AND ACUTE;", "WITH CIRCUMFLEX AND GRAVE;", " L SLASH;",
106 " DIAERESIS AND GRAVE;",
108 " CARON AND DOT ABOVE;", " BREVE AND GRAVE;",
109 " MACRON AND ACUTE;",
110 " MACRON AND GRAVE;",
111 " DIAERESIS AND CARON", " DOT ABOVE AND MACRON", " TILDE AND ACUTE",
112 " RING ABOVE AND ACUTE",
113 " DIAERESIS AND MACRON", " CEDILLA AND ACUTE", " MACRON AND DIAERESIS",
114 " CIRCUMFLEX AND TILDE",
115 " TILDE AND DIAERESIS",
118 " CEDILLA AND BREVE",
119 " OGONEK AND MACRON",
120 " HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;",
123 " PRECEDED BY APOSTROPHE",
125 " LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE",
128 " RETROFLEX;", "DIAERESIS BELOW",
130 " CIRCUMFLEX BELOW", "HORN AND ACUTE",
131 " BREVE BELOW;", " HORN AND GRAVE",
133 " DOT BELOW AND DOT ABOVE",
134 " RIGHT HALF RING", " HORN AND TILDE",
135 " CIRCUMFLEX AND DOT BELOW",
136 " BREVE AND DOT BELOW",
137 " DOT BELOW AND MACRON",
138 " HORN AND HOOK ABOVE",
140 // CIRCLED, PARENTHESIZED and so on
141 "CIRCLED DIGIT", "CIRCLED NUMBER", "CIRCLED LATIN", "CIRCLED KATAKANA",
142 "PARENTHESIZED DIGIT", "PARENTHESIZED NUMBER", "PARENTHESIZED LATIN",
// Weights matching the diacritics entries by index.
144 byte [] diacriticWeights = new byte [] {
146 0xE, 0xF, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
147 0x17, 0x19, 0x1A, 0x1B, 0x1C,
148 0x1D, 0x1D, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F,
149 0x20, 0x21, 0x22, 0x22, 0x23, 0x24,
150 0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28,
151 0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30,
152 0x43, 0x43, 0x43, 0x44, 0x46, 0x48,
153 0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x5A,
154 0x60, 0x60, 0x61, 0x61, 0x63, 0x68,
155 0x69, 0x69, 0x6A, 0x6D, 0x6E,
157 // CIRCLED, PARENTHESIZED and so on.
158 0xEE, 0xEE, 0xEE, 0xEE, 0xF3, 0xF3, 0xF3, 0xF3
// Code point ranges consumed in pairs by ModifyParsedValues: element 2k is
// the inclusive start and element 2k+1 the exclusive end of a range. Every
// number character inside a range receives that range's secondary weight,
// which increases by one from each pair to the next. 28 entries = 14 ranges.
161 int [] numberSecondaryWeightBounds = new int [] {
162 0x660, 0x680, 0x6F0, 0x700, 0x960, 0x970,
163 0x9E0, 0x9F0, 0x9F4, 0xA00, 0xA60, 0xA70,
164 0xAE0, 0xAF0, 0xB60, 0xB70, 0xBE0, 0xC00,
165 0xC60, 0xC70, 0xCE0, 0xCF0, 0xD60, 0xD70,
166 0xE50, 0xE60, 0xED0, 0xEE0
// Per-script character lists. ParseScripts fills these with the
// non-ignorable characters of each script, sorted with UCAComparer.
169 char [] orderedCyrillic;
170 char [] orderedGurmukhi;
171 char [] orderedGujarati;
172 char [] orderedGeorgian;
173 char [] orderedThaana;
// Hand-maintained ordering of Tamil consonants (see the notes below).
175 static readonly char [] orderedTamilConsonants = new char [] {
176 // based on traditional Tamil consonants, except for
177 // Grantha (where Microsoft breaks traditionalism).
178 // http://www.angelfire.com/empire/thamizh/padanGaL
179 '\u0B99', '\u0B9A', '\u0B9E', '\u0B9F', '\u0BA3',
180 '\u0BA4', '\u0BA8', '\u0BAA', '\u0BAE', '\u0BAF',
181 '\u0BB0', '\u0BB2', '\u0BB5', '\u0BB4', '\u0BB3',
182 '\u0BB1', '\u0BA9', '\u0B9C', '\u0BB8', '\u0BB7',
// Lookup structures filled while parsing the source data files.
// The three ArrayLists below hold DictionaryEntry items keyed by code point.
185 // cp -> character name (only for some characters)
186 ArrayList sortableCharNames = new ArrayList ();
188 // cp -> arrow value (int)
189 ArrayList arrowValues = new ArrayList ();
191 // cp -> box value (int)
192 ArrayList boxValues = new ArrayList ();
194 // cp -> level1 value
195 Hashtable arabicLetterPrimaryValues = new Hashtable ();
// primary Arabic letter name -> first cp seen with that name
198 Hashtable arabicNameMap = new Hashtable ();
200 // cp -> Hashtable [decompType] -> cp
201 Hashtable nfkdMap = new Hashtable ();
203 // Latin letter -> ArrayList [int]
204 Hashtable latinMap = new Hashtable ();
// Entries built by ParseJISOrder (JISCharacter) and by ProcessUnidataLine
// for the 0x3300 - 0x3357 range (NonJISCharacter).
206 ArrayList jisJapanese = new ArrayList ();
207 ArrayList nonJisJapanese = new ArrayList ();
// CJK weight tables. SerializeCJK computes the base code point as
// char.MaxValue - Length, so index 0 corresponds to 0x4E00 (0x3100 for CHS).
209 ushort [] cjkJA = new ushort [char.MaxValue - 0x4E00];
210 ushort [] cjkCHS = new ushort [char.MaxValue - 0x3100];
211 ushort [] cjkCHT = new ushort [char.MaxValue - 0x4E00];
212 ushort [] cjkKO = new ushort [char.MaxValue - 0x4E00];
213 byte [] cjkKOlv2 = new byte [char.MaxValue - 0x4E00];
// Bit flags set by FillIgnorables: 1 = ignorable, 2 = ignorable symbol,
// 4 = ignorable non-spacing.
215 byte [] ignorableFlags = new byte [char.MaxValue + 1];
// Unicode version value per code point, filled by ParseDerivedAge.
217 static double [] unicodeAge = new double [char.MaxValue + 1];
// Tailoring objects; enumerated by SerializeTailorings.
219 ArrayList tailorings = new ArrayList ();
// Drives the whole generation: parses the source data from the directory
// given as args [0] (default "downloaded"), adjusts the parsed values, and
// reports progress on standard error. Afterwards it writes a diagnostic
// dump to "agelog.txt" with one line per code point: Unicode age,
// ignorable flags, code point, Unicode category, and a '!' marker when
// IsIgnorable disagrees with the category-based expectation.
// NOTE(review): several statements of this method are not visible in this
// excerpt (including whatever runs between the progress messages and how
// the StreamWriter is closed) - confirm it is disposed.
// NOTE(review): the dump loop uses i < char.MaxValue, so U+FFFF is not
// logged; confirm whether that is intended.
221 void Run (string [] args)
223 string dirname = args.Length == 0 ? "downloaded" : args [0];
224 ParseSources (dirname);
225 Console.Error.WriteLine ("parse done.");
227 ModifyParsedValues ();
229 Console.Error.WriteLine ("generation done.");
231 Console.Error.WriteLine ("serialization done.");
233 StreamWriter sw = new StreamWriter ("agelog.txt");
234 for (int i = 0; i < char.MaxValue; i++) {
235 bool shouldBe = false;
236 switch (Char.GetUnicodeCategory ((char) i)) {
237 case UnicodeCategory.Format: case UnicodeCategory.OtherNotAssigned:
238 shouldBe = true; break;
240 if (unicodeAge [i] >= 3.1)
242 //if (IsIgnorable (i) != shouldBe)
243 sw.WriteLine ("{1} {2} {3} {0:X04} {4} {5}", i, unicodeAge [i], IsIgnorable (i), IsIgnorableSymbol (i), char.GetUnicodeCategory ((char) i), IsIgnorable (i) != shouldBe ? '!' : ' ');
// Convenience wrapper: compresses a byte array through
// CodePointIndexer.CompressArray using the given indexer and casts the
// untyped result back to byte [].
249 byte [] CompressArray (byte [] source, CodePointIndexer i)
251 return (byte []) CodePointIndexer.CompressArray (
252 source, typeof (byte), i);
// Table serialization body. It emits the tailoring tables, builds the
// per-code-point category / level1 / level2 / level3 / width-compat arrays
// from map and decompType, compresses each with its MSCompatUnicodeTableUtil
// indexer, and writes every array to Result as C# array-initializer text,
// followed by the CJK tables.
// NOTE(review): the enclosing method's signature line is not visible in
// this excerpt. Each output loop contains two Write calls (decimal and
// hexadecimal form); the condition that selects between them is also not
// visible here.
258 SerializeTailorings ();
260 byte [] categories = new byte [map.Length];
261 byte [] level1 = new byte [map.Length];
262 byte [] level2 = new byte [map.Length];
263 byte [] level3 = new byte [map.Length];
264 int [] widthCompat = new int [map.Length];
265 for (int i = 0; i < map.Length; i++) {
266 categories [i] = map [i].Category;
267 level1 [i] = map [i].Level1;
268 level2 [i] = map [i].Level2;
269 level3 [i] = ComputeLevel3Weight ((char) i);
// Only the width/super/subscript variants contribute a width-compat value:
// the first element of their decomposition mapping.
270 switch (decompType [i]) {
271 case DecompositionNarrow:
272 case DecompositionWide:
273 case DecompositionSuper:
274 case DecompositionSub:
275 // they are always 1 char
276 widthCompat [i] = decompValues [decompIndex [i]];
// Compress every table with its dedicated indexer before writing it out.
282 ignorableFlags = CompressArray (ignorableFlags,
283 MSCompatUnicodeTableUtil.Ignorable);
284 categories = CompressArray (categories,
285 MSCompatUnicodeTableUtil.Category);
286 level1 = CompressArray (level1,
287 MSCompatUnicodeTableUtil.Level1);
288 level2 = CompressArray (level2,
289 MSCompatUnicodeTableUtil.Level2);
290 level3 = CompressArray (level3,
291 MSCompatUnicodeTableUtil.Level3);
292 widthCompat = (int []) CodePointIndexer.CompressArray (
293 widthCompat, typeof (int),
294 MSCompatUnicodeTableUtil.WidthCompat);
// Each table below is written 16 values per line; every full line ends
// with a comment holding the index of its first element.
297 Result.WriteLine ("static byte [] ignorableFlags = new byte [] {");
298 for (int i = 0; i < ignorableFlags.Length; i++) {
299 byte value = ignorableFlags [i];
301 Result.Write ("{0},", value);
303 Result.Write ("0x{0:X02},", value);
304 if ((i & 0xF) == 0xF)
305 Result.WriteLine ("// {0:X04}", i - 0xF);
307 Result.WriteLine ("};");
311 Result.WriteLine ("static byte [] categories = new byte [] {");
312 for (int i = 0; i < categories.Length; i++) {
313 byte value = categories [i];
315 Result.Write ("{0},", value);
317 Result.Write ("0x{0:X02},", value);
318 if ((i & 0xF) == 0xF)
319 Result.WriteLine ("// {0:X04}", i - 0xF);
321 Result.WriteLine ("};");
324 // Primary weight value
325 Result.WriteLine ("static byte [] level1 = new byte [] {");
326 for (int i = 0; i < level1.Length; i++) {
327 byte value = level1 [i];
329 Result.Write ("{0},", value);
331 Result.Write ("0x{0:X02},", value);
332 if ((i & 0xF) == 0xF)
333 Result.WriteLine ("// {0:X04}", i - 0xF);
335 Result.WriteLine ("};");
339 Result.WriteLine ("static byte [] level2 = new byte [] {");
340 for (int i = 0; i < level2.Length; i++) {
341 int value = level2 [i];
343 Result.Write ("{0},", value);
345 Result.Write ("0x{0:X02},", value);
346 if ((i & 0xF) == 0xF)
347 Result.WriteLine ("// {0:X04}", i - 0xF);
349 Result.WriteLine ("};");
353 Result.WriteLine ("static byte [] level3 = new byte [] {");
354 for (int i = 0; i < level3.Length; i++) {
355 byte value = level3 [i];
357 Result.Write ("{0},", value);
359 Result.Write ("0x{0:X02},", value);
360 if ((i & 0xF) == 0xF)
361 Result.WriteLine ("// {0:X04}", i - 0xF);
363 Result.WriteLine ("};");
366 // Width insensitivity mappings
367 // (for now it is more lightweight than dumping the
368 // entire NFKD table).
369 Result.WriteLine ("static int [] widthCompat = new int [] {");
370 for (int i = 0; i < widthCompat.Length; i++) {
371 int value = widthCompat [i];
373 Result.Write ("{0},", value);
375 Result.Write ("0x{0:X02},", value);
376 if ((i & 0xF) == 0xF)
377 Result.WriteLine ("// {0:X04}", i - 0xF);
379 Result.WriteLine ("};");
// CJK tables; the last argument is the code point compared against
// i + offset inside SerializeCJK.
383 SerializeCJK ("cjkCHS", cjkCHS, char.MaxValue);
384 SerializeCJK ("cjkCHT", cjkCHT, 0x9FB0);
385 SerializeCJK ("cjkJA", cjkJA, 0x9FB0);
386 SerializeCJK ("cjkKO", cjkKO, 0x9FB0);
387 SerializeCJK ("cjkKOlv2", cjkKOlv2, 0x9FB0);
// Writes a ushort CJK weight table to Result as a C# array initializer
// named `name`.
//   name - identifier used for the emitted array.
//   cjk  - table to write; element i belongs to code point i + offset,
//          where offset = char.MaxValue - cjk.Length.
//   max  - code point that is compared against i + offset.
// NOTE(review): the statement guarded by `i + offset == max` is not visible
// in this excerpt (presumably it stops the output there) - confirm. The
// condition choosing between the decimal and hexadecimal Write is also
// not visible.
390 void SerializeCJK (string name, ushort [] cjk, int max)
392 int offset = char.MaxValue - cjk.Length;
393 Result.WriteLine ("static ushort [] {0} = new ushort [] {{", name);
394 for (int i = 0; i < cjk.Length; i++) {
395 if (i + offset == max)
397 ushort value = cjk [i];
399 Result.Write ("{0},", value);
401 Result.Write ("0x{0:X04},", value);
// After every 16th value, end the line with the code point of its first entry.
402 if ((i & 0xF) == 0xF)
403 Result.WriteLine ("// {0:X04}", i - 0xF + offset);
405 Result.WriteLine ("};");
// Byte-table overload: writes a byte CJK table to Result as a C# array
// initializer named `name`, using two hexadecimal digits per value.
//   name - identifier used for the emitted array.
//   cjk  - table to write; element i belongs to code point i + offset,
//          where offset = char.MaxValue - cjk.Length.
//   max  - code point that is compared against i + offset.
// NOTE(review): the statement guarded by `i + offset == max` and the
// condition choosing between the two Write calls are not visible in this
// excerpt - confirm against the full file.
409 void SerializeCJK (string name, byte [] cjk, int max)
411 int offset = char.MaxValue - cjk.Length;
412 Result.WriteLine ("static byte [] {0} = new byte [] {{", name);
413 for (int i = 0; i < cjk.Length; i++) {
414 if (i + offset == max)
416 byte value = cjk [i];
418 Result.Write ("{0},", value);
420 Result.Write ("0x{0:X02},", value);
// After every 16th value, end the line with the code point of its first entry.
421 if ((i & 0xF) == 0xF)
422 Result.WriteLine ("// {0:X04}", i - 0xF + offset);
424 Result.WriteLine ("};");
// Writes the tailoring data to Result in two parts:
//  1. "tailorings": one flat char array holding every tailoring's items,
//     while recording each LCID's start index and item count.
//  2. "tailoringInfos": one TailoringInfo per tailoring (LCID, start index,
//     count, French-sort flag). The index and count are looked up via the
//     alias LCID when Alias is non-zero, otherwise via the tailoring's own.
// NOTE(review): the declaration of `count` and some statements are not
// visible in this excerpt (e.g. what follows the missing-alias warning).
428 void SerializeTailorings ()
430 Hashtable indexes = new Hashtable ();
431 Hashtable counts = new Hashtable ();
432 Result.WriteLine ("static char [] tailorings = new char [] {");
434 foreach (Tailoring t in tailorings) {
437 Result.Write ("/*{0}*/", t.LCID);
438 indexes.Add (t.LCID, count);
439 char [] values = t.ItemToCharArray ();
440 counts.Add (t.LCID, values.Length);
441 foreach (char c in values) {
442 Result.Write ("'\\x{0:X}', ", (int) c);
// Break the line after every 16 emitted chars, annotating the start index.
443 if (++count % 16 == 0)
444 Result.WriteLine (" // {0:X04}", count - 16);
447 Result.WriteLine ("};");
449 Result.WriteLine ("static TailoringInfo [] tailoringInfos = new TailoringInfo [] {");
450 foreach (Tailoring t in tailorings) {
451 int target = t.Alias != 0 ? t.Alias : t.LCID;
452 if (!indexes.ContainsKey (target)) {
453 Console.Error.WriteLine ("WARNING: no corresponding definition for tailoring alias. From {0} to {1}", t.LCID, t.Alias);
456 int idx = (int) indexes [target];
457 int cnt = (int) counts [target];
458 bool french = t.FrenchSort;
// The French-sort flag is re-read from the tailoring(s) whose LCID equals
// this tailoring's own LCID (not the alias target).
460 foreach (Tailoring t2 in tailorings)
461 if (t2.LCID == t.LCID)
462 french = t2.FrenchSort;
463 Result.WriteLine ("new TailoringInfo ({0}, 0x{1:X}, {2}, {3}), ", t.LCID, idx, cnt, french ? "true" : "false");
465 Result.WriteLine ("};");
// Builds the paths of all input data files under `dirname` and runs the
// individual parsers in a fixed order. The tailoring source is read from
// "mono-tailoring-source.txt" relative to the current directory, not from
// `dirname`.
// NOTE(review): the declarations of several path variables (unidata,
// scripts, cp932, derivedAge) are only partly visible in this excerpt.
470 void ParseSources (string dirname)
473 dirname + "/UnicodeData.txt";
474 string derivedCoreProps =
475 dirname + "/DerivedCoreProperties.txt";
477 dirname + "/Scripts.txt";
479 dirname + "/CP932.TXT";
481 dirname + "/DerivedAge.txt";
482 string chXML = dirname + "/common/collation/zh.xml";
483 string jaXML = dirname + "/common/collation/ja.xml";
484 string koXML = dirname + "/common/collation/ko.xml";
486 ParseDerivedAge (derivedAge);
// The JIS order must be parsed before ParseUnidata () runs.
490 ParseJISOrder (cp932); // in prior to ParseUnidata()
491 ParseUnidata (unidata);
492 ParseDerivedCoreProperties (derivedCoreProps);
493 ParseScripts (scripts);
494 ParseCJK (chXML, jaXML, koXML);
496 ParseTailorings ("mono-tailoring-source.txt");
// Reads the tailoring source file line by line, passing each trimmed line
// to ProcessTailoringLine together with the tailoring currently being
// built. On an exception the offending line number is reported on
// standard error.
// NOTE(review): the declarations of `t` and `line` and the code after the
// error message (e.g. a rethrow) are not visible in this excerpt.
499 void ParseTailorings (string filename)
503 using (StreamReader sr = new StreamReader (filename)) {
505 while (sr.Peek () >= 0) {
507 ProcessTailoringLine (ref t,
508 sr.ReadLine ().Trim ());
510 } catch (Exception) {
511 Console.Error.WriteLine ("ERROR at line {0}", line);
// Converts a tailoring source value into a string, turning "\uXXXX"
// escapes into the character with that hexadecimal code, and returns the
// accumulated text.
// NOTE(review): the visible loop tests s.StartsWith ("\\u") and reads
// s.Substring (2, 4), neither of which depends on the loop index i. Unless
// the lines not visible in this excerpt shorten `s` as they go, only an
// escape at the very start of the string would be decoded correctly -
// confirm against the full file.
517 // For now this is enough.
518 string ParseTailoringSourceValue (string s)
520 StringBuilder sb = new StringBuilder ();
521 for (int i = 0; i < s.Length; i++) {
522 if (s.StartsWith ("\\u")) {
523 sb.Append ((char) int.Parse (
524 s.Substring (2, 4), NumberStyles.HexNumber),
531 return sb.ToString ();
// Interprets one line of the tailoring source file and updates `t`.
//   t - tailoring currently being built; replaced when a line starts a new
//       tailoring definition.
//   s - the line text; anything from '#' onwards is treated as a comment.
// Line forms handled by the visible code:
//   * a tailoring header (an LCID, optionally followed by "=<alias LCID>"),
//   * "*FrenchSort",
//   * "*Diacritical <hex> -> <hex>"  -> AddDiacriticalMap,
//   * "<source> : <hex bytes>"       -> AddSortKeyMap (5 key bytes),
//   * "<source> = <replacement>"     -> AddReplacementMap.
// NOTE(review): the conditions that select between these forms, and the
// early returns, are mostly not visible in this excerpt.
534 void ProcessTailoringLine (ref Tailoring t, string s)
// Strip a trailing comment, then skip empty lines.
536 int idx = s.IndexOf ('#');
538 s = s.Substring (0, idx).Trim ();
539 if (s.Length == 0 || s [0] == '#')
542 idx = s.IndexOf ('=');
545 int.Parse (s.Substring (1, idx - 1)),
546 int.Parse (s.Substring (idx + 1)));
548 t = new Tailoring (int.Parse (s.Substring (1)));
552 if (s.StartsWith ("*FrenchSort")) {
556 string d = "*Diacritical";
557 if (s.StartsWith (d)) {
// Both sides of "->" are parsed as hexadecimal byte values.
558 idx = s.IndexOf ("->");
559 t.AddDiacriticalMap (
560 byte.Parse (s.Substring (d.Length, idx - d.Length).Trim (),
561 NumberStyles.HexNumber),
562 byte.Parse (s.Substring (idx + 2).Trim (),
563 NumberStyles.HexNumber));
566 idx = s.IndexOf (':');
568 string source = s.Substring (0, idx).Trim ();
569 string [] l = s.Substring (idx + 1).Trim ().Split (' ');
570 byte [] b = new byte [5];
571 for (int i = 0; i < 5; i++) {
575 b [i] = byte.Parse (l [i],
576 NumberStyles.HexNumber);
578 t.AddSortKeyMap (ParseTailoringSourceValue (source),
581 idx = s.IndexOf ('=');
583 t.AddReplacementMap (
584 ParseTailoringSourceValue (
585 s.Substring (0, idx).Trim ()),
586 ParseTailoringSourceValue (
587 s.Substring (idx + 1).Trim ()));
// Reads the DerivedAge data file and records the version value of every
// code point (or "XXXX..YYYY" range) in unicodeAge. Text after '#' is
// dropped and the code point spec is the part before ';'. Finally
// unicodeAge [0] is forced to double.MaxValue.
// NOTE(review): double.Parse (value) uses the current culture; with a
// culture whose decimal separator is not '.', values such as "3.1" may be
// misparsed - confirm the tool is only run under a suitable culture.
// NOTE(review): only `cp` is visibly checked against char.MaxValue; the
// guarded statement is not visible in this excerpt, and a range starting
// in the BMP but ending beyond it would index past the array.
590 void ParseDerivedAge (string filename)
592 using (StreamReader file =
593 new StreamReader (filename)) {
594 while (file.Peek () >= 0) {
595 string s = file.ReadLine ();
596 int idx = s.IndexOf ('#');
598 s = s.Substring (0, idx);
599 idx = s.IndexOf (';');
603 string cpspec = s.Substring (0, idx);
604 idx = cpspec.IndexOf ("..");
605 NumberStyles nf = NumberStyles.HexNumber |
606 NumberStyles.AllowTrailingWhite;
// A spec without ".." is a single code point, so start and end coincide.
607 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
608 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
609 string value = s.Substring (cpspec.Length + 1).Trim ();
612 if (cp > char.MaxValue)
615 for (int i = cp; i <= cpEnd; i++)
616 unicodeAge [i] = double.Parse (value);
619 unicodeAge [0] = double.MaxValue; // never be supported
// Reads the UnicodeData file line by line through ProcessUnidataLine,
// collecting decomposition code points in a temporary list that is finally
// stored as the int [] field decompValues. If a line throws, its 1-based
// line number is written to standard error.
// NOTE(review): what follows the error message (e.g. a rethrow) is not
// visible in this excerpt.
622 void ParseUnidata (string filename)
624 ArrayList decompValues = new ArrayList ();
625 using (StreamReader unidata =
626 new StreamReader (filename)) {
627 for (int line = 1; unidata.Peek () >= 0; line++) {
629 ProcessUnidataLine (unidata.ReadLine (), decompValues);
630 } catch (Exception) {
631 Console.Error.WriteLine ("**** At line " + line);
636 this.decompValues = (int [])
637 decompValues.ToArray (typeof (int));
// Processes one line of UnicodeData.txt and fills the per-code-point
// tables: small-capital flags, Latin letter map, arrow and box-drawing
// values, sortable names, diacritical weights, Arabic letter primary
// values, Japanese square letters, decomposition type/mapping (plus the
// reverse nfkdMap), and decimal values.
//   s            - the raw line; the first ';'-separated field is the
//                  hexadecimal code point, the rest are split into `values`.
//   decompValues - shared list receiving the decomposition code points.
// Name matching is done with IndexOf (...) > 0 on the whole line; since the
// line starts with the code point field, a match at index 0 cannot occur
// for these names.
// NOTE(review): many statements of this method are not visible in this
// excerpt (early returns, case labels, variable declarations), so the
// comments below describe only what the visible lines show.
640 void ProcessUnidataLine (string s, ArrayList decompValues)
642 int idx = s.IndexOf ('#');
644 s = s.Substring (0, idx);
645 idx = s.IndexOf (';');
648 int cp = int.Parse (s.Substring (0, idx), NumberStyles.HexNumber);
649 string [] values = s.Substring (idx + 1).Split (';');
652 if (cp > char.MaxValue)
654 if (IsIgnorable (cp))
657 string name = values [0];
660 if (s.IndexOf ("SMALL CAPITAL") > 0)
661 isSmallCapital [cp] = true;
663 // latin mapping by character name
// `offset` is the index of the letter that follows the matched
// "LETTER ..." prefix; 15 is the length of "LETTER DOTLESS ".
664 if (s.IndexOf ("LATIN") > 0) {
665 int lidx = s.IndexOf ("LETTER DOTLESS ");
666 int offset = lidx + 15;
668 lidx = s.IndexOf ("LETTER TURNED ");
672 lidx = s.IndexOf ("LETTER ");
675 char c = lidx > 0 ? s [offset] : char.MinValue;
// Only a single A-Z letter that ends the line or is followed by a space
// is accepted as a base letter.
676 if ('A' <= c && c <= 'Z' &&
677 (s.Length == offset + 1 || s [offset + 1] == ' ')) {
678 ArrayList entry = (ArrayList) latinMap [c];
680 entry = new ArrayList ();
681 latinMap [c] = entry;
// Arrow values for the 0x2000 - 0x2FFF range.
688 if (0x2000 <= cp && cp < 0x3000) {
690 // SPECIAL CASES. FIXME: why?
692 case 0x21C5: value = -1; break; // E2
693 case 0x261D: value = 1; break;
694 case 0x27A6: value = 3; break;
695 case 0x21B0: value = 7; break;
696 case 0x21B1: value = 3; break;
697 case 0x21B2: value = 7; break;
698 case 0x21B4: value = 5; break;
699 case 0x21B5: value = 7; break;
700 case 0x21B9: value = -1; break; // E1
701 case 0x21CF: value = 7; break;
702 case 0x21D0: value = 3; break;
704 string [] arrowTargets = new string [] {
// Search the name for an arrow target while no value has been chosen yet,
// skipping names containing "BARB <target>" or " OVER".
716 for (int i = 1; value == 0 && i < arrowTargets.Length; i++)
717 if (s.IndexOf (arrowTargets [i]) > 0 &&
718 s.IndexOf ("BARB " + arrowTargets [i]) < 0 &&
719 s.IndexOf (" OVER") < 0
723 arrowValues.Add (new DictionaryEntry (
// Box drawing and block elements.
728 if (0x2500 <= cp && cp < 0x25B0) {
731 // up:1 down:2 right:4 left:8 vert:16 horiz:32
734 // [dr] [dl] [ur] [ul]
738 ArrayList flags = new ArrayList (new int [] {
741 4 + 2, 8 + 2, 4 + 1, 8 + 1,
742 16 + 4, 1 + 2 + 4, 16 + 8, 1 + 2 + 8,
743 32 + 2, 4 + 8 + 2, 32 + 1, 4 + 8 + 1,
744 16 + 32, 1 + 2 + 4 + 8, 4 + 8 + 16, 1 + 2 + 32
746 byte [] offsets = new byte [] {
753 if (s.IndexOf ("BOX DRAWINGS ") > 0) {
755 if (s.IndexOf (" UP") > 0)
757 if (s.IndexOf (" DOWN") > 0)
759 if (s.IndexOf (" RIGHT") > 0)
761 if (s.IndexOf (" LEFT") > 0)
763 if (s.IndexOf (" VERTICAL") > 0)
765 if (s.IndexOf (" HORIZONTAL") > 0)
// The combined direction flag is looked up in `flags`; a miss keeps the
// negative index as the value, a hit uses the matching entry of `offsets`.
768 int fidx = flags.IndexOf (flag);
769 value = fidx < 0 ? fidx : offsets [fidx];
770 } else if (s.IndexOf ("BLOCK") > 0) {
771 if (s.IndexOf ("ONE EIGHTH") > 0)
773 else if (s.IndexOf ("ONE QUARTER") > 0)
775 else if (s.IndexOf ("THREE EIGHTHS") > 0)
777 else if (s.IndexOf ("HALF") > 0)
779 else if (s.IndexOf ("FIVE EIGHTHS") > 0)
781 else if (s.IndexOf ("THREE QUARTERS") > 0)
783 else if (s.IndexOf ("SEVEN EIGHTHS") > 0)
789 boxValues.Add (new DictionaryEntry (
793 // For some characters store the name and sort later
794 // to determine sorting.
795 if (0x2100 <= cp && cp <= 0x213F &&
796 Char.IsSymbol ((char) cp))
797 sortableCharNames.Add (
798 new DictionaryEntry (cp, values [0]));
// For 0x3380 - 0x33DD the first 7 characters of the name are dropped.
799 else if (0x3380 <= cp && cp <= 0x33DD)
800 sortableCharNames.Add (new DictionaryEntry (
801 cp, values [0].Substring (7)));
803 // diacritical weights by character name
// Weights of all matching fragments are combined with bitwise OR.
804 for (int d = 0; d < diacritics.Length; d++)
805 if (s.IndexOf (diacritics [d]) > 0)
806 diacritical [cp] |= diacriticWeights [d];
807 // Two-step grep required for it.
808 if (s.IndexOf ("FULL STOP") > 0 &&
809 (s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0))
810 diacritical [cp] |= 0xF4;
812 // Arabic letter name
813 if (0x0621 <= cp && cp <= 0x064A &&
814 Char.GetUnicodeCategory ((char) cp)
815 == UnicodeCategory.OtherLetter) {
// Default primary value: derived from the number of distinct letter names
// recorded so far.
816 byte value = (byte) (arabicNameMap.Count * 4 + 0x0B);
821 // hamza, waw, yeh ... special cases.
826 value = 0x77; // special cases.
829 // Get primary letter name i.e.
830 // XXX part of ARABIC LETTER XXX yyy
831 // e.g. that of "TEH MARBUTA" is "TEH".
834 // 0x0640 is special: it does
835 // not start with ARABIC LETTER
837 values [0].Substring (14);
838 int tmpIdx = letterName.IndexOf (' ');
839 letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
840 //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
// A letter name seen before reuses the primary value of its first code
// point; otherwise this code point is recorded for the name.
841 if (arabicNameMap.ContainsKey (letterName))
842 value = (byte) arabicLetterPrimaryValues [arabicNameMap [letterName]];
844 arabicNameMap [letterName] = cp;
847 arabicLetterPrimaryValues [cp] = value;
850 // Japanese square letter
851 if (0x3300 <= cp && cp <= 0x3357)
853 nonJisJapanese.Add (new NonJISCharacter (cp, values [0]));
// Decomposition field: an optional "<tag>" prefix selects the type, and
// the remaining hexadecimal code points form the mapping.
856 string decomp = values [4];
857 idx = decomp.IndexOf ('<');
859 switch (decomp.Substring (idx + 1, decomp.IndexOf ('>') - 1)) {
861 decompType [cp] = DecompositionFull;
864 decompType [cp] = DecompositionSub;
867 decompType [cp] = DecompositionSuper;
870 decompType [cp] = DecompositionSmall;
873 decompType [cp] = DecompositionIsolated;
876 decompType [cp] = DecompositionInitial;
879 decompType [cp] = DecompositionFinal;
882 decompType [cp] = DecompositionMedial;
885 decompType [cp] = DecompositionNoBreak;
888 decompType [cp] = DecompositionCompat;
891 decompType [cp] = DecompositionFraction;
894 decompType [cp] = DecompositionFont;
897 decompType [cp] = DecompositionCircle;
900 decompType [cp] = DecompositionSquare;
903 decompType [cp] = DecompositionWide;
906 decompType [cp] = DecompositionNarrow;
909 decompType [cp] = DecompositionVertical;
912 throw new Exception ("Support NFKD type : " + decomp);
916 decompType [cp] = DecompositionCanonical;
// Skip past "<tag> " (the '>' and the following space) when a tag exists.
917 decomp = idx < 0 ? decomp : decomp.Substring (decomp.IndexOf ('>') + 2);
918 if (decomp.Length > 0) {
920 string [] velems = decomp.Split (' ');
921 int didx = decompValues.Count;
922 decompIndex [cp] = didx;
923 foreach (string v in velems)
924 decompValues.Add (int.Parse (v, NumberStyles.HexNumber));
925 decompLength [cp] = velems.Length;
927 // [decmpType] -> this_cp
928 int targetCP = (int) decompValues [didx];
929 // for "(x)" it specially maps to 'x' .
930 // FIXME: check if it is sane
931 if (velems.Length == 3 &&
932 (int) decompValues [didx] == '(' &&
933 (int) decompValues [didx + 2] == ')')
934 targetCP = (int) decompValues [didx + 1];
935 // special: 0x215F "1/"
936 else if (cp == 0x215F)
938 else if (velems.Length > 1 &&
939 (targetCP < 0x4C00 || 0x9FBB < targetCP))
940 // skip them, except for CJK ideograph compat
944 Hashtable entry = (Hashtable) nfkdMap [targetCP];
946 entry = new Hashtable ();
947 nfkdMap [targetCP] = entry;
949 entry [(byte) decompType [cp]] = cp;
// Numeric value: the first non-empty one of values [5], [6], [7]. The last
// may be a fraction "a/b", a parenthesized number, or a number with a
// trailing '.'.
// NOTE(review): decimal.Parse is used without an explicit culture.
953 if (values [5].Length > 0)
954 decimalValue [cp] = decimal.Parse (values [5]);
955 else if (values [6].Length > 0)
956 decimalValue [cp] = decimal.Parse (values [6]);
957 else if (values [7].Length > 0) {
958 string decstr = values [7];
959 idx = decstr.IndexOf ('/');
960 if (cp == 0x215F) // special. "1/"
961 decimalValue [cp] = 0x1;
965 decimal.Parse (decstr.Substring (0, idx))
966 / decimal.Parse (decstr.Substring (idx + 1));
967 else if (decstr [0] == '(' &&
968 decstr [decstr.Length - 1] == ')')
971 decimal.Parse (decstr.Substring (1, decstr.Length - 2));
972 else if (decstr [decstr.Length - 1] == '.')
975 decimal.Parse (decstr.Substring (0, decstr.Length - 1));
977 decimalValue [cp] = decimal.Parse (decstr);
// Reads the DerivedCoreProperties file line by line through
// ProcessDerivedCorePropLine. If a line throws, its 1-based line number is
// written to standard error.
// NOTE(review): what follows the error message (e.g. a rethrow) is not
// visible in this excerpt.
981 void ParseDerivedCoreProperties (string filename)
984 using (StreamReader file =
985 new StreamReader (filename)) {
986 for (int line = 1; file.Peek () >= 0; line++) {
988 ProcessDerivedCorePropLine (file.ReadLine ());
989 } catch (Exception) {
990 Console.Error.WriteLine ("**** At line " + line);
// Processes one line of DerivedCoreProperties.txt. Text after '#' is
// dropped, the code point or "XXXX..YYYY" range before ';' is parsed as
// hexadecimal, and the property name after it is read into `value`. The
// visible code marks every code point in the range as uppercase.
// NOTE(review): the test on `value` that selects the uppercase property is
// not visible in this excerpt; presumably only that property reaches the
// final loop - confirm.
997 void ProcessDerivedCorePropLine (string s)
999 int idx = s.IndexOf ('#');
1001 s = s.Substring (0, idx);
1002 idx = s.IndexOf (';');
1005 string cpspec = s.Substring (0, idx);
1006 idx = cpspec.IndexOf ("..");
1007 NumberStyles nf = NumberStyles.HexNumber |
1008 NumberStyles.AllowTrailingWhite;
// A spec without ".." is a single code point, so start and end coincide.
1009 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1010 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
1011 string value = s.Substring (cpspec.Length + 1).Trim ();
1014 if (cp > char.MaxValue)
1019 for (int x = cp; x <= cpEnd; x++)
1020 isUppercase [x] = true;
// Reads the Scripts data file and collects the non-ignorable characters of
// five scripts (Cyrillic, Gurmukhi, Gujarati, Georgian, Thaana). Each list
// is then sorted with UCAComparer.Instance and stored in the matching
// ordered* field as a char array.
// NOTE(review): the switch/case labels that route `value` (the script
// name) to each of the five loops are not visible in this excerpt.
1025 void ParseScripts (string filename)
1027 ArrayList cyrillic = new ArrayList ();
1028 ArrayList gurmukhi = new ArrayList ();
1029 ArrayList gujarati = new ArrayList ();
1030 ArrayList georgian = new ArrayList ();
1031 ArrayList thaana = new ArrayList ();
1033 using (StreamReader file =
1034 new StreamReader (filename)) {
1035 while (file.Peek () >= 0) {
1036 string s = file.ReadLine ();
1037 int idx = s.IndexOf ('#');
1039 s = s.Substring (0, idx);
1040 idx = s.IndexOf (';');
1044 string cpspec = s.Substring (0, idx);
1045 idx = cpspec.IndexOf ("..");
1046 NumberStyles nf = NumberStyles.HexNumber |
1047 NumberStyles.AllowTrailingWhite;
// A spec without ".." is a single code point, so start and end coincide.
1048 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1049 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
1050 string value = s.Substring (cpspec.Length + 1).Trim ();
1053 if (cp > char.MaxValue)
1058 for (int x = cp; x <= cpEnd; x++)
1059 if (!IsIgnorable (x))
1060 cyrillic.Add ((char) x);
1063 for (int x = cp; x <= cpEnd; x++)
1064 if (!IsIgnorable (x))
1065 gurmukhi.Add ((char) x);
1068 for (int x = cp; x <= cpEnd; x++)
1069 if (!IsIgnorable (x))
1070 gujarati.Add ((char) x);
1073 for (int x = cp; x <= cpEnd; x++)
1074 if (!IsIgnorable (x))
1075 georgian.Add ((char) x);
1078 for (int x = cp; x <= cpEnd; x++)
1079 if (!IsIgnorable (x))
1080 thaana.Add ((char) x);
// Sort each script's characters, then freeze them into arrays.
1085 cyrillic.Sort (UCAComparer.Instance);
1086 gurmukhi.Sort (UCAComparer.Instance);
1087 gujarati.Sort (UCAComparer.Instance);
1088 georgian.Sort (UCAComparer.Instance);
1089 thaana.Sort (UCAComparer.Instance);
1090 orderedCyrillic = (char []) cyrillic.ToArray (typeof (char));
1091 orderedGurmukhi = (char []) gurmukhi.ToArray (typeof (char));
1092 orderedGujarati = (char []) gujarati.ToArray (typeof (char));
1093 orderedGeorgian = (char []) georgian.ToArray (typeof (char));
1094 orderedThaana = (char []) thaana.ToArray (typeof (char));
// Reads the CP932 mapping file and appends one JISCharacter (Unicode code
// point, JIS code) per data line to jisJapanese. Text after '#' is dropped;
// both numbers are written with a "0x" prefix, which is skipped before
// hexadecimal parsing.
// NOTE(review): s.Substring (2, idx) takes idx characters starting at
// index 2, i.e. it runs two characters past the separator found at idx.
// Statements that are not visible in this excerpt may adjust `s` or `idx`
// first - confirm against the full file and the actual column separator.
1097 void ParseJISOrder (string filename)
1099 using (StreamReader file =
1100 new StreamReader (filename)) {
1101 while (file.Peek () >= 0) {
1102 string s = file.ReadLine ();
1103 int idx = s.IndexOf ('#');
1105 s = s.Substring (0, idx).Trim ();
1108 idx = s.IndexOf (' ');
1111 // They start with "0x" so cut them out.
1112 int jis = int.Parse (s.Substring (2, idx), NumberStyles.HexNumber);
1113 int cp = int.Parse (s.Substring (idx + 3).Trim (), NumberStyles.HexNumber);
1114 jisJapanese.Add (new JISCharacter (cp, jis));
// Fills the CJK weight tables from the LDML collation documents.
//   zhXML - collation XML for Chinese ('pinyin' and 'stroke' rules).
//   jaXML - collation XML for Japanese.
//   koXML - collation XML for Korean.
// For Chinese and Japanese, the characters of the <pc> rule text receive
// consecutive weights (v++) in document order, stored at index
// (code point - offset) where offset = char.MaxValue - table length.
// For Korean, each <reset> element provides a target Hangul syllable from
// which a combined value is computed; the characters that follow are
// stored with that value in the ushort table and a second-level value in
// cjkKOlv2.
// NOTE(review): the document loading calls, the selection of `arr`,
// `category` and `v`, and the conditions guarding the warnings are not
// visible in this excerpt.
1119 void ParseCJK (string zhXML, string jaXML, string koXML)
1121 XmlDocument doc = new XmlDocument ();
// No external entity/DTD resolution while loading the XML.
1122 doc.XmlResolver = null;
1129 // Chinese Simplified
1132 offset = char.MaxValue - arr.Length;
1134 s = doc.SelectSingleNode ("/ldml/collations/collation[@type='pinyin']/rules/pc").InnerText;
1136 foreach (char c in s) {
1138 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1140 arr [(int) c - offset] = (ushort) v++;
1146 // Chinese Traditional
1149 offset = char.MaxValue - arr.Length;
1150 s = doc.SelectSingleNode ("/ldml/collations/collation[@type='stroke']/rules/pc").InnerText;
1152 foreach (char c in s) {
1154 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1156 arr [(int) c - offset] = (ushort) v++;
// NOTE(review): presumably the Japanese table, given the jaXML parameter
// and the untyped collation path - the selecting lines are not visible.
1165 offset = char.MaxValue - arr.Length;
1167 s = doc.SelectSingleNode ("/ldml/collations/collation/rules/pc").InnerText;
1169 foreach (char c in s) {
1171 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1173 arr [(int) c - offset] = (ushort) v++;
1180 // Korean weight is somewhat complex. It first shifts
1181 // Hangul category from 52-x to 80-x (they are anyways
1182 // computed). CJK ideographs are placed at secondary
1183 // weight, like XX YY 01 zz 01, where XX and YY are
1184 // corresponding "reset" value and zz is 41,43,45...
1186 // Unlike chs,cht and ja, Korean value is a combined
1187 // ushort which is computed as category
1191 offset = char.MaxValue - arr.Length;
1193 foreach (XmlElement reset in doc.SelectNodes ("/ldml/collations/collation/rules/reset")) {
1194 XmlElement sc = (XmlElement) reset.NextSibling;
1195 // compute "category" and "level 1" for the
1196 // target "reset" Hangul syllable
1197 char rc = reset.InnerText [0];
// 1-based position of the syllable counted from 0xAC00.
1198 int ri = ((int) rc - 0xAC00) + 1;
1200 ((ri / 254) * 256 + (ri % 254) + 2);
1201 // Place the characters after the target.
1204 foreach (char c in s) {
1205 arr [(int) c - offset] = p;
1206 cjkKOlv2 [(int) c - offset] = (byte) v;
// Computes the ignorableFlags bit set for every UTF-16 code unit:
//   bit 1 - IsIgnorable, bit 2 - IsIgnorableSymbol,
//   bit 4 - IsIgnorableNonSpacing.
// NOTE(review): the statement guarded by the OtherNotAssigned test is not
// visible in this excerpt (presumably unassigned code points are skipped)
// - confirm.
1216 void FillIgnorables ()
1218 for (int i = 0; i <= char.MaxValue; i++) {
1219 if (Char.GetUnicodeCategory ((char) i) ==
1220 UnicodeCategory.OtherNotAssigned)
1222 if (IsIgnorable (i))
1223 ignorableFlags [i] |= 1;
1224 if (IsIgnorableSymbol (i))
1225 ignorableFlags [i] |= 2;
1226 if (IsIgnorableNonSpacing (i))
1227 ignorableFlags [i] |= 4;
// Post-processes the parsed tables before generation:
//  * assigns secondary (diacritical) weights to number characters inside
//    the numberSecondaryWeightBounds ranges, one weight per range;
//  * assigns fixed weights to the Korean parenthesized (0x3200 - 0x321C)
//    and circled (0x3260 - 0x327B) ranges;
//  * renames, or removes, selected entries of sortableCharNames.
// NOTE(review): the declaration of `weight`, the switch header and the
// case labels that lead to RemoveAt are not visible in this excerpt.
// Removing an element inside an index-based loop shifts later entries, so
// the loop index needs adjusting after RemoveAt - confirm that the lines
// not shown here do so.
1231 void ModifyParsedValues ()
1233 // number, secondary weights
1235 int [] numarr = numberSecondaryWeightBounds;
// Pairs are [start, end): the weight increases by one per pair.
1236 for (int i = 0; i < numarr.Length; i += 2, weight++)
1237 for (int cp = numarr [i]; cp < numarr [i + 1]; cp++)
1238 if (Char.IsNumber ((char) cp))
1239 diacritical [cp] = weight;
1241 // Korean parens numbers
1242 for (int i = 0x3200; i <= 0x321C; i++)
1243 diacritical [i] = 0xA;
1244 for (int i = 0x3260; i <= 0x327B; i++)
1245 diacritical [i] = 0xC;
1247 // Update name part of named characters
1248 for (int i = 0; i < sortableCharNames.Count; i++) {
1249 DictionaryEntry de =
1250 (DictionaryEntry) sortableCharNames [i];
1251 int cp = (int) de.Key;
1252 string renamed = null;
1254 case 0x2101: renamed = "A_1"; break;
1255 case 0x33C3: renamed = "A_2"; break;
1256 case 0x2105: renamed = "C_1"; break;
1257 case 0x2106: renamed = "C_2"; break;
1258 case 0x211E: renamed = "R1"; break;
1259 case 0x211F: renamed = "R2"; break;
1260 // Remove some of them!
1271 sortableCharNames.RemoveAt (i);
// Replace the stored entry only when a new name was chosen above.
1275 if (renamed != null)
1276 sortableCharNames [i] =
1277 new DictionaryEntry (cp, renamed);
1281 void GenerateCore ()
1285 #region Specially ignored // 01
1286 // This will raise "Defined" flag up.
1287 foreach (char c in specialIgnore)
1288 map [(int) c] = new CharMapEntry (0, 0, 0);
1292 #region Variable weights
1293 // Controls : 06 03 - 06 3D
1295 for (int i = 0; i < 65536; i++) {
1296 if (IsIgnorable (i))
1299 uc = Char.GetUnicodeCategory (c);
1300 // NEL is whitespace but not ignored here.
1301 if (uc == UnicodeCategory.Control &&
1302 !Char.IsWhiteSpace (c) || c == '\u0085')
1303 AddCharMap (c, 6, 1);
1307 fillIndex [6] = 0x80;
1308 AddCharMapGroup ('\'', 6, 1, 0);
1309 AddCharMap ('\uFE63', 6, 1);
1311 // Hyphen/Dash : 06 81 - 06 90
1312 for (int i = 0; i < char.MaxValue; i++) {
1313 if (Char.GetUnicodeCategory ((char) i)
1314 == UnicodeCategory.DashPunctuation)
1315 AddCharMapGroupTail ((char) i, 6, 1);
1318 // Arabic variable weight chars 06 A0 -
1319 fillIndex [6] = 0xA0;
1321 for (int i = 0x64B; i <= 0x650; i++)
1322 AddCharMapGroupTail ((char) i, 6, 1);
1324 AddCharMapGroup ('\u0652', 6, 1, 0);
1326 AddCharMapGroup ('\u0651', 6, 1, 0);
1330 #region Nonspacing marks // 01
1331 // FIXME: 01 03 - 01 B6 ... annoyance :(
1333 // Combining diacritical marks: 01 DC -
1335 fillIndex [0x1] = 0x41;
1336 for (int i = 0x030E; i <= 0x0326; i++)
1337 if (!IsIgnorable (i))
1338 AddCharMap ((char) i, 0x1, 1);
1339 for (int i = 0x0329; i <= 0x0334; i++)
1340 if (!IsIgnorable (i))
1341 AddCharMap ((char) i, 0x1, 1);
1342 for (int i = 0x0339; i <= 0x0341; i++)
1343 if (!IsIgnorable (i))
1344 AddCharMap ((char) i, 0x1, 1);
1345 fillIndex [0x1] = 0x72;
1346 for (int i = 0x0346; i <= 0x0348; i++)
1347 if (!IsIgnorable (i))
1348 AddCharMap ((char) i, 0x1, 1);
1349 for (int i = 0x02BE; i <= 0x02BF; i++)
1350 if (!IsIgnorable (i))
1351 AddCharMap ((char) i, 0x1, 1);
1352 for (int i = 0x02C1; i <= 0x02C5; i++)
1353 if (!IsIgnorable (i))
1354 AddCharMap ((char) i, 0x1, 1);
1355 for (int i = 0x02CE; i <= 0x02CF; i++)
1356 if (!IsIgnorable (i))
1357 AddCharMap ((char) i, 0x1, 1);
1358 for (int i = 0x02D1; i <= 0x02D3; i++)
1359 if (!IsIgnorable (i))
1360 AddCharMap ((char) i, 0x1, 1);
1361 AddCharMap ('\u02DE', 0x1, 1);
1362 for (int i = 0x02E4; i <= 0x02E9; i++)
1363 if (!IsIgnorable (i))
1364 AddCharMap ((char) i, 0x1, 1);
1366 // LAMESPEC: It should not stop at '\u20E1'. There are
1367 // a few more characters (that however results in
1368 // overflow of level 2 unless we start before 0xDD).
1369 fillIndex [0x1] = 0xDC;
1370 for (int i = 0x20d0; i <= 0x20e1; i++)
1371 AddCharMap ((char) i, 0x1, 1);
1375 #region Whitespaces // 07 03 -
1376 fillIndex [0x7] = 0x2;
1377 AddCharMap (' ', 0x7, 2);
1378 AddCharMap ('\u00A0', 0x7, 1);
1379 for (int i = 9; i <= 0xD; i++)
1380 AddCharMap ((char) i, 0x7, 1);
1381 for (int i = 0x2000; i <= 0x200B; i++)
1382 AddCharMap ((char) i, 0x7, 1);
1384 fillIndex [0x7] = 0x17;
1385 AddCharMapGroup ('\u2028', 0x7, 1, 0);
1386 AddCharMapGroup ('\u2029', 0x7, 1, 0);
1388 // Characters which used to represent layout control.
1389 // LAMESPEC: Windows developers seem to have thought
1390 // that those characters are kind of whitespaces,
1391 // while they aren't.
1392 AddCharMap ('\u2422', 0x7, 1, 0); // blank symbol
1393 AddCharMap ('\u2423', 0x7, 1, 0); // open box
1396 // FIXME: 09 should be more complete.
1397 fillIndex [0x9] = 2;
1399 for (int cp = 0x2300; cp <= 0x237A; cp++)
1400 AddCharMap ((char) cp, 0x9, 1, 0);
1403 byte [] arrowLv2 = new byte [] {0, 3, 3, 3, 3, 3, 3, 3, 3};
1404 foreach (DictionaryEntry de in arrowValues) {
1405 int idx = (int) de.Value;
1406 int cp = (int) de.Key;
1407 if (map [cp].Defined)
1409 fillIndex [0x9] = (byte) (0xD8 + idx);
1410 AddCharMapGroup ((char) cp, 0x9, 0, arrowLv2 [idx]);
1414 byte [] boxLv2 = new byte [128];
1415 for (int i = 0; i < boxLv2.Length; i++)
1417 foreach (DictionaryEntry de in boxValues) {
1418 int cp = (int) de.Key;
1419 int idx = (int) de.Value;
1420 if (map [cp].Defined)
1422 fillIndex [0x9] = (byte) (0xE5 + idx);
1423 AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [idx]);
1426 // Some special characters (slanted)
1427 fillIndex [0x9] = 0xF4;
1428 AddCharMap ('\u2571', 0x9, 3);
1429 AddCharMap ('\u2572', 0x9, 3);
1430 AddCharMap ('\u2573', 0x9, 3);
1432 // FIXME: implement 0A
1434 fillIndex [0xA] = 2;
1435 // byte currency symbols
1436 for (int cp = 0; cp < 0x100; cp++) {
1437 uc = Char.GetUnicodeCategory ((char) cp);
1438 if (!IsIgnorable (cp) &&
1439 uc == UnicodeCategory.CurrencySymbol &&
1441 AddCharMapGroup ((char) cp, 0xA, 1, 0);
1443 // byte other symbols
1444 for (int cp = 0; cp < 0x100; cp++) {
1446 continue; // SPECIAL: skip FIXME: why?
1447 uc = Char.GetUnicodeCategory ((char) cp);
1448 if (!IsIgnorable (cp) &&
1449 uc == UnicodeCategory.OtherSymbol)
1450 AddCharMapGroup ((char) cp, 0xA, 1, 0);
1453 fillIndex [0xA] = 0x2F; // FIXME: it won't be needed
1454 for (int cp = 0x2600; cp <= 0x2613; cp++)
1455 AddCharMap ((char) cp, 0xA, 1, 0);
1457 for (int cp = 0x2620; cp <= 0x2770; cp++)
1458 if (Char.IsSymbol ((char) cp))
1459 AddCharMap ((char) cp, 0xA, 1, 0);
1461 for (int i = 0x2440; i < 0x2460; i++)
1462 AddCharMap ((char) i, 0xA, 1, 0);
1466 #region Numbers // 0C 02 - 0C E1
1467 fillIndex [0xC] = 2;
1469 // 9F8 : Bengali "one less than the denominator"
1470 AddCharMap ('\u09F8', 0xC, 1);
1472 ArrayList numbers = new ArrayList ();
1473 for (int i = 0; i < 65536; i++)
1474 if (!IsIgnorable (i) &&
1475 Char.IsNumber ((char) i) &&
1476 (i < 0x3190 || 0x32C0 < i)) // they are CJK characters
1479 ArrayList numberValues = new ArrayList ();
1480 foreach (int i in numbers)
1481 numberValues.Add (new DictionaryEntry (i, decimalValue [(char) i]));
1482 numberValues.Sort (DecimalDictionaryValueComparer.Instance);
1484 //foreach (DictionaryEntry de in numberValues)
1485 //Console.Error.WriteLine ("****** number {0:X04} : {1} {2}", de.Key, de.Value, decompType [(int) de.Key]);
1487 decimal prevValue = -1;
1488 foreach (DictionaryEntry de in numberValues) {
1489 int cp = (int) de.Key;
1490 decimal currValue = (decimal) de.Value;
1491 bool addnew = false;
1492 if (prevValue < currValue &&
1493 prevValue - (int) prevValue == 0 &&
1497 // Process Hangzhou and Roman numbers
1499 // There are some SPECIAL cases.
1500 if (currValue != 4) // no increment for 4
1504 xcp = (int) prevValue + 0x2170 - 1;
1505 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1506 xcp = (int) prevValue + 0x2160 - 1;
1507 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1508 fillIndex [0xC] += 2;
1509 xcp = (int) prevValue + 0x3021 - 1;
1510 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1513 if (prevValue < currValue)
1514 prevValue = currValue;
1515 if (map [cp].Defined)
1517 // Hangzhou and Roman numerals are added later
1519 else if (0x3021 <= cp && cp < 0x302A
1520 || 0x2160 <= cp && cp < 0x216A
1521 || 0x2170 <= cp && cp < 0x217A)
1524 if (cp == 0x215B) // FIXME: why?
1525 fillIndex [0xC] += 2;
1526 else if (cp == 0x3021) // FIXME: why?
1528 AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp]);
1530 if (addnew || cp <= '9') {
1532 if (1 <= currValue && currValue <= 10) {
1533 xcp = cp - 0x31 + 0x2776;
1534 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1535 xcp = cp - 0x31 + 0x2780;
1536 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1537 xcp = cp - 0x31 + 0x278A;
1538 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1540 if (1 <= currValue && currValue <= 20) {
1541 xcp = cp - 0x31 + 0x2460;
1542 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1543 xcp = cp - 0x31 + 0x2474;
1544 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1545 xcp = cp - 0x31 + 0x2488;
1546 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1550 if (cp != 0x09E7 && cp != 0x09EA)
1553 // Add special cases that are not regarded as
1554 // numbers in UnicodeCategory speak.
1557 AddCharMapGroup ('\u01BD', 0xC, 0, 0);
1558 AddCharMapGroup ('\u01BC', 0xC, 1, 0);
1560 else if (cp == '6') // FIXME: why?
1565 fillIndex [0xC] = 0xFF;
1566 AddCharMap ('\u221E', 0xC, 1);
1569 #region Letters and NonSpacing Marks (general)
1571 // ASCII Latin alphabets
1572 for (int i = 0; i < alphabets.Length; i++)
1573 AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]);
1576 // non-ASCII Latin alphabets
1577 // FIXME: there is no such characters that are placed
1578 // *after* "alphabets" array items. This is nothing
1579 // more than a hack that creates dummy weight for
1580 // primary characters.
1581 for (int i = 0x0080; i < 0x0300; i++) {
1582 if (!Char.IsLetter ((char) i))
1584 // For those Latin Letters which has NFKD are
1585 // not added as independent primary character.
1586 if (decompIndex [i] != 0)
1589 // 1.some alphabets have primarily
1590 // equivalent ASCII alphabets.
1591 // 2.some have independent primary weights,
1592 // but inside a-to-z range.
1593 // 3.there are some expanded characters that
1594 // are not part of Unicode Standard NFKD.
1596 // 1. skipping them does not make sense
1597 // case 0xD0: case 0xF0: case 0x131: case 0x138:
1598 // case 0x184: case 0x185: case 0x186: case 0x189:
1599 // case 0x18D: case 0x18E: case 0x18F: case 0x190:
1600 // case 0x194: case 0x195: case 0x196: case 0x19A:
1601 // case 0x19B: case 0x19C:
1602 // 2. skipping them does not make sense
1603 // case 0x14A: // Ng
1604 // case 0x14B: // ng
1608 case 0xDE: // Icelandic Thorn (capital)
1609 case 0xFE: // Icelandic Thorn (small)
1610 case 0xDF: // German ss
1611 case 0xFF: // small y with diaeresis (U+00FF is not German ss)
1612 // not classified yet
1613 // case 0x1A6: case 0x1A7: case 0x1A8: case 0x1A9:
1614 // case 0x1AA: case 0x1B1: case 0x1B7: case 0x1B8:
1615 // case 0x1B9: case 0x1BA: case 0x1BB: case 0x1BF:
1616 // case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3:
1620 AddCharMapGroup ((char) i, 0xE, 1, 0);
1624 fillIndex [0xF] = 02;
1625 for (int i = 0x0380; i < 0x0390; i++)
1626 if (Char.IsLetter ((char) i))
1627 AddLetterMap ((char) i, 0xF, 1);
1628 fillIndex [0xF] = 02;
1629 for (int i = 0x0391; i < 0x03CF; i++)
1630 if (Char.IsLetter ((char) i))
1631 AddLetterMap ((char) i, 0xF, 1);
1632 fillIndex [0xF] = 0x40;
1633 for (int i = 0x03D0; i < 0x0400; i++)
1634 if (Char.IsLetter ((char) i))
1635 AddLetterMap ((char) i, 0xF, 1);
1637 // Cyrillic - UCA order w/ some modification
1638 fillIndex [0x10] = 0x3;
1639 // table which is mostly from UCA DUCET.
1640 for (int i = 0; i < orderedCyrillic.Length; i++) {
1641 char c = orderedCyrillic [i];
1642 if (Char.IsLetter (c))
1643 AddLetterMap (c, 0x10, 3);
1645 for (int i = 0x0460; i < 0x0481; i++) {
1646 if (Char.IsLetter ((char) i))
1647 AddLetterMap ((char) i, 0x10, 3);
1651 fillIndex [0x11] = 0x3;
1652 for (int i = 0x0531; i < 0x0586; i++)
1653 if (Char.IsLetter ((char) i))
1654 AddLetterMap ((char) i, 0x11, 1);
1658 fillIndex [0x12] = 0x3;
1659 for (int i = 0x05D0; i < 0x05FF; i++)
1660 if (Char.IsLetter ((char) i))
1661 AddLetterMap ((char) i, 0x12, 1);
1663 fillIndex [0x1] = 0x3;
1664 for (int i = 0x0591; i <= 0x05C2; i++)
1666 AddCharMap ((char) i, 0x1, 1);
1669 fillIndex [0x1] = 0x8E;
1670 fillIndex [0x13] = 0x3;
1671 for (int i = 0x0621; i <= 0x064A; i++) {
1673 if (Char.GetUnicodeCategory ((char) i)
1674 != UnicodeCategory.OtherLetter) {
1675 // FIXME: arabic nonspacing marks are
1676 // in different order.
1677 AddCharMap ((char) i, 0x1, 1);
1680 // map [i] = new CharMapEntry (0x13,
1681 // (byte) arabicLetterPrimaryValues [i], 1);
1683 (byte) arabicLetterPrimaryValues [i];
1684 AddLetterMap ((char) i, 0x13, 0);
1686 fillIndex [0x13] = 0x84;
1687 for (int i = 0x0674; i < 0x06D6; i++)
1688 if (Char.IsLetter ((char) i))
1689 AddLetterMap ((char) i, 0x13, 1);
1692 // FIXME: it does seem straight codepoint mapping.
1693 fillIndex [0x14] = 04;
1694 for (int i = 0x0901; i < 0x0905; i++)
1695 if (!IsIgnorable (i))
1696 AddLetterMap ((char) i, 0x14, 2);
1697 fillIndex [0x14] = 0xB;
1698 for (int i = 0x0905; i < 0x093A; i++)
1699 if (Char.IsLetter ((char) i))
1700 AddLetterMap ((char) i, 0x14, 4);
1701 for (int i = 0x093E; i < 0x094F; i++)
1702 if (!IsIgnorable (i))
1703 AddLetterMap ((char) i, 0x14, 2);
1707 fillIndex [0x15] = 02;
1708 for (int i = 0x0980; i < 0x9FF; i++) {
1709 if (IsIgnorable (i))
1712 fillIndex [0x15] = 0x3B;
1713 switch (Char.GetUnicodeCategory ((char) i)) {
1714 case UnicodeCategory.NonSpacingMark:
1715 case UnicodeCategory.DecimalDigitNumber:
1716 case UnicodeCategory.OtherNumber:
1719 AddLetterMap ((char) i, 0x15, 1);
1722 fillIndex [0x1] = 0x3;
1723 for (int i = 0x0981; i < 0x0A00; i++)
1724 if (Char.GetUnicodeCategory ((char) i) ==
1725 UnicodeCategory.NonSpacingMark)
1726 AddCharMap ((char) i, 0x1, 1);
1728 // Gurmukhi. orderedGurmukhi is from UCA
1729 // FIXME: it does not look equivalent to UCA.
1730 fillIndex [0x1] = 03;
1731 fillIndex [0x16] = 02;
1732 for (int i = 0; i < orderedGurmukhi.Length; i++) {
1733 char c = orderedGurmukhi [i];
1734 if (IsIgnorable ((int) c))
1736 if (!Char.IsLetter (c)) {
1737 AddLetterMap (c, 0x1, 1);
1740 if (c == '\u0A3C' || c == '\u0A4D' ||
1741 '\u0A66' <= c && c <= '\u0A71')
1743 AddLetterMap (c, 0x16, 4);
1746 // Gujarati. orderedGujarati is from UCA
1747 fillIndex [0x17] = 02;
1748 for (int i = 0; i < orderedGujarati.Length; i++)
1749 AddLetterMap (orderedGujarati [i], 0x17, 4);
1752 fillIndex [0x18] = 02;
1753 for (int i = 0x0B00; i < 0x0B7F; i++) {
1754 switch (Char.GetUnicodeCategory ((char) i)) {
1755 case UnicodeCategory.NonSpacingMark:
1756 case UnicodeCategory.DecimalDigitNumber:
1759 AddLetterMap ((char) i, 0x18, 1);
1763 fillIndex [0x19] = 2;
1764 AddCharMap ('\u0BD7', 0x19, 0);
1765 fillIndex [0x19] = 0xA;
// Tamil vowels, two index steps apart.
// The loop used to start at 0x0BD7 (the code point added just above),
// which is already greater than the upper bound 0x0B94, so the body
// never ran and no vowel received a weight here. Start at the first
// assigned code point of the Tamil block instead; Char.IsLetter
// filters out the non-letters in the range.
// U+0B94 stays excluded: it is added separately, at a fixed index,
// right after this loop.
// NOTE(review): the start value is inferred from intent - confirm the
// resulting weights against the reference sort keys.
for (int i = 0x0B82; i < 0x0B94; i++)
	if (Char.IsLetter ((char) i))
		AddCharMap ((char) i, 0x19, 2);
1771 fillIndex [0x19] = 0x24;
1772 AddCharMap ('\u0B94', 0x19, 0);
1773 fillIndex [0x19] = 0x26;
1774 // The array for Tamil consonants is a constant.
1775 // Windows uses a sequence almost identical to TAM from
1776 // tamilnet, but slightly different in Grantha.
1777 for (int i = 0; i < orderedTamilConsonants.Length; i++)
1778 AddLetterMap (orderedTamilConsonants [i], 0x19, 4);
1780 fillIndex [0x19] = 0x82;
1781 for (int i = 0x0BBE; i < 0x0BCD; i++)
1782 if (Char.GetUnicodeCategory ((char) i) ==
1783 UnicodeCategory.SpacingCombiningMark
1785 AddLetterMap ((char) i, 0x19, 2);
1788 fillIndex [0x1A] = 0x4;
1789 for (int i = 0x0C00; i < 0x0C62; i++) {
1790 if (i == 0x0C55 || i == 0x0C56)
1792 AddCharMap ((char) i, 0x1A, 3);
1793 char supp = (i == 0x0C0B) ? '\u0C60':
1794 i == 0x0C0C ? '\u0C61' : char.MinValue;
1795 if (supp == char.MinValue)
1797 AddCharMap (supp, 0x1A, 3);
1801 fillIndex [0x1B] = 4;
1802 for (int i = 0x0C80; i < 0x0CE5; i++) {
1803 if (i == 0x0CD5 || i == 0x0CD6)
1805 AddCharMap ((char) i, 0x1B, 3);
1809 fillIndex [0x1C] = 2;
1810 for (int i = 0x0D02; i < 0x0D61; i++)
1811 // FIXME: I avoided MSCompatUnicodeTable usage
1812 // here (it results in recursion). So check if
1813 // using NonSpacingMark makes sense or not.
1814 if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark)
1815 // if (!MSCompatUnicodeTable.IsIgnorable ((char) i))
1816 AddCharMap ((char) i, 0x1C, 1);
1818 // Thai ... note that it breaks 0x1E wall after E2B!
1819 // Also, all Thai characters have level 2 value 3.
1820 fillIndex [0x1E] = 2;
1821 for (int i = 0xE44; i < 0xE48; i++)
1822 AddCharMap ((char) i, 0x1E, 1, 3);
1823 for (int i = 0xE01; i < 0xE2B; i++)
1824 AddCharMap ((char) i, 0x1E, 6, 0);
1825 fillIndex [0x1F] = 5;
1826 for (int i = 0xE2B; i < 0xE30; i++)
1827 AddCharMap ((char) i, 0x1F, 6, 0);
1828 for (int i = 0xE30; i < 0xE3B; i++)
1829 AddCharMap ((char) i, 0x1F, 1, 3);
1830 // some Thai characters remains.
1831 char [] specialThai = new char [] {'\u0E45', '\u0E46',
1832 '\u0E4E', '\u0E4F', '\u0E5A', '\u0E5B'};
1833 foreach (char c in specialThai)
1834 AddCharMap (c, 0x1F, 1);
1837 fillIndex [0x1F] = 2;
1838 for (int i = 0xE80; i < 0xEDF; i++)
1839 if (Char.IsLetter ((char) i))
1840 AddCharMap ((char) i, 0x1F, 1);
1842 // Georgian. orderedGeorgian is from UCA DUCET.
1843 fillIndex [0x21] = 5;
1844 for (int i = 0; i < orderedGeorgian.Length; i++)
1845 AddLetterMap (orderedGeorgian [i], 0x21, 5);
1848 fillIndex [0x22] = 2;
1849 int kanaOffset = 0x3041;
1850 byte [] kanaLines = new byte [] {2, 2, 2, 2, 1, 3, 1, 2, 1};
1852 for (int gyo = 0; gyo < 9; gyo++) {
1853 for (int dan = 0; dan < 5; dan++) {
1854 if (gyo == 7 && dan % 2 == 1) {
1857 kanaOffset -= 2; // There is no space for yi and ye.
1860 int cp = kanaOffset + dan * kanaLines [gyo];
1861 // small lines (a-gyo, ya-gyo)
1862 if (gyo == 0 || gyo == 7) {
1863 AddKanaMap (cp, 1); // small
1864 AddKanaMap (cp + 1, 1);
1867 AddKanaMap (cp, kanaLines [gyo]);
1871 // add small 'Tsu' (before normal one)
1872 AddKanaMap (0x3063, 1);
1876 fillIndex [0x22] += 3;
1877 kanaOffset += 5 * kanaLines [gyo];
1880 // Wa-gyo is almost special, so I just manually add.
1881 AddLetterMap ((char) 0x308E, 0x22, 0);
1882 AddLetterMap ((char) (0x308E + 0x60), 0x22, 0);
1883 AddLetterMap ((char) 0x308F, 0x22, 0);
1884 AddLetterMap ((char) (0x308F + 0x60), 0x22, 0);
1886 AddLetterMap ((char) 0x3090, 0x22, 0);
1887 AddLetterMap ((char) (0x3090 + 0x60), 0x22, 0);
1888 fillIndex [0x22] += 2;
1889 // no "Wu" in Japanese.
1890 AddLetterMap ((char) 0x3091, 0x22, 0);
1891 AddLetterMap ((char) (0x3091 + 0x60), 0x22, 0);
1893 AddLetterMap ((char) 0x3092, 0x22, 0);
1894 AddLetterMap ((char) (0x3092 + 0x60), 0x22, 0);
1896 fillIndex [0x22] = 0x80;
1897 AddLetterMap ((char) 0x3093, 0x22, 0);
1898 AddLetterMap ((char) (0x3093 + 0x60), 0x22, 0);
1900 // JIS Japanese square chars.
1901 fillIndex [0x22] = 0x97;
1902 jisJapanese.Sort (JISComparer.Instance);
1903 foreach (JISCharacter j in jisJapanese)
1904 AddCharMap ((char) j.CP, 0x22, 1);
1905 // non-JIS Japanese square chars.
1906 nonJisJapanese.Sort (NonJISComparer.Instance);
1907 foreach (NonJISCharacter j in nonJisJapanese)
1908 AddCharMap ((char) j.CP, 0x22, 1);
1911 fillIndex [0x23] = 0x02;
1912 for (int i = 0x3105; i <= 0x312C; i++)
1913 AddCharMap ((char) i, 0x23, 1);
1915 // Estrangela: ancient Syriac
1916 fillIndex [0x24] = 0x0B;
1917 // FIXME: is 0x71E really alternative form?
1918 ArrayList syriacAlternatives = new ArrayList (
1919 new int [] {0x714, 0x716, 0x71C, 0x71E, 0x724, 0x727});
1920 for (int i = 0x0710; i <= 0x072C; i++) {
1921 if (i == 0x0711) // NonSpacingMark
1923 if (syriacAlternatives.Contains (i))
1925 AddCharMap ((char) i, 0x24, 4);
1930 foreach (int cp in syriacAlternatives)
1931 map [cp] = new CharMapEntry (0x24,
1932 (byte) (map [cp - 1].Level1 + 2),
1936 // FIXME: it turned out that it does not look like UCA
1937 fillIndex [0x24] = 0x6E;
for (int i = 0; i < orderedThaana.Length; i++) {
	// Test the character stored in the table, not its position:
	// the previous code passed the array index i (0, 1, 2, ...)
	// to IsIgnorableNonSpacing, i.e. it tested unrelated control
	// characters instead of the Thaana character being added.
	if (IsIgnorableNonSpacing ((int) orderedThaana [i]))
		continue;
	AddCharMap (orderedThaana [i], 0x24, 2);
}
1945 // FIXME: Add more culture-specific letters (that are
1946 // not supported in Windows collation) here.
1948 // Surrogate ... they are computed.
1953 // Unlike UCA Windows Hangul sequence mixes Jongseong
1954 // with Choseong sequence as well as Jungseong,
1955 // adjusted to have the same primary weight for the
1956 // same base character. So it is impossible to compute
1959 // Here I introduce an ordered sequence of mixed
1960 // 'commands' and 'characters' that is similar to
1962 // - ',' increases primary weight.
1963 // - [A B] means a range, increasing index
1964 // - {A B} means a range, without increasing index
1965 // - '=' is no operation (it means the characters
1966 // of both sides have the same weight).
1967 // - '>' inserts a Hangul Syllable block that
1968 // contains 0x251 characters.
1969 // - '<' decreases the index
1970 // - '0'-'9' means skip count
1971 // - whitespaces are ignored
1974 string hangulSequence =
1975 + "\u1100=\u11A8 > \u1101=\u11A9 >"
1976 + "\u11C3, \u11AA, \u11C4, \u1102=\u11AB >"
1977 + "<{\u1113 \u1116}, \u3165,"
1978 + "\u11C5, \u11C6=\u3166,, \u11C7, \u11C8,"
1979 + "\u11AC, \u11C9, \u11AD, \u1103=\u11AE >"
1980 + "<\u1117, \u11CA, \u1104, \u11CB > \u1105 >"
1981 + "<{\u1118 \u111B}, \u11B0, [\u11CC \u11D0], \u11B1,"
1982 + "[\u11D1 \u11D2], \u11B2,"
1983 + "[\u11D3 \u11D5], \u11B3,"
1984 + "[\u11D6 \u11D7], \u11B4, \u11B5,"
1985 + "\u11B6=\u11D8, \u3140,, \u11D9, \u1106=\u11B7 >"
1986 + "<{\u111C \u111D}, [\u11DA \u11E2], \u1107=\u11B8 >"
1987 + "<{\u111E \u1120}, \u3172,, \u3173, \u11E3, \u1108 >"
1988 + "<{\u1121 \u112C}, \u11B9,,,,,,,,, [\u11E4 \u11E6],,"
1989 + "\u1109=\u11BA,,, \u3214=\u3274 <>"
1990 + "<{\u112D \u1133}, \u11E7,, [\u11E8 \u11E9],,"
1991 + "\u11EA,, \u110A=\u11BB,,, >"
1992 + "<{\u1134 \u1140}, \u317E,,,,,,, \u11EB,"
1993 + "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >"
1994 + "<{\u1141 \u114C}, \u11EE, \u11EC, \u11ED,,,,, "
1995 + "\u11F1,, \u11F2,,,"
1996 + "\u11EF,,, \u11F0, \u110C=\u11BD,, >"
1997 + "<\u114D, \u110D,, >"
1998 + "<{\u114E \u1151},, \u110E=\u11BE,, >"
1999 + "<{\u1152 \u1155},,, \u110F=\u11BF >"
2000 + "\u1110=\u11C0 > \u1111=\u11C1 >"
2001 + "<\u1156=\u1157, \u11F3, \u11F4, \u1112=\u11C2 >"
2002 + "<\u1158=\u1159=\u115F, \u3185, \u11F9,"
2006 byte hangulCat = 0x52;
2007 fillIndex [hangulCat] = 0x2;
2009 int syllableBlock = 0;
2010 for (int n = 0; n < hangulSequence.Length; n++) {
2011 char c = hangulSequence [n];
2013 if (Char.IsWhiteSpace (c))
2019 IncrementSequentialIndex (ref hangulCat);
2022 if (fillIndex [hangulCat] == 2)
2023 throw new Exception ("FIXME: handle it correctly (yes it is hacky, it is really unfortunate).");
2024 fillIndex [hangulCat]--;
2027 IncrementSequentialIndex (ref hangulCat);
2028 for (int l = 0; l < 0x15; l++)
2029 for (int v = 0; v < 0x1C; v++) {
2031 (char) (0xAC00 + syllableBlock * 0x1C * 0x15 + l * 0x1C + v), hangulCat, 0);
2032 IncrementSequentialIndex (ref hangulCat);
2037 start = hangulSequence [n + 1];
2038 end = hangulSequence [n + 3];
2039 for (int i = start; i <= end; i++) {
2040 AddCharMap ((char) i, hangulCat, 0);
2042 IncrementSequentialIndex (ref hangulCat);
2044 n += 4; // consumes 5 characters for this operation
2047 start = hangulSequence [n + 1];
2048 end = hangulSequence [n + 3];
2049 for (int i = start; i <= end; i++)
2050 AddCharMap ((char) i, hangulCat, 0);
2051 n += 4; // consumes 5 characters for this operation
2054 AddCharMap (c, hangulCat, 0);
2061 // Letterlike characters and CJK compatibility square
2062 sortableCharNames.Sort (StringDictionaryValueComparer.Instance);
2063 int [] counts = new int ['Z' - 'A' + 1];
2064 char [] namedChars = new char [sortableCharNames.Count];
2066 foreach (DictionaryEntry de in sortableCharNames) {
2067 counts [((string) de.Value) [0] - 'A']++;
2068 namedChars [nCharNames++] = (char) ((int) de.Key);
2070 nCharNames = 0; // reset
2071 for (int a = 0; a < counts.Length; a++) {
2072 fillIndex [0xE] = (byte) (alphaWeights [a + 1] - counts [a]);
2073 for (int i = 0; i < counts [a]; i++)
2074 //Console.Error.WriteLine ("---- {0:X04} : {1:x02} / {2} {3}", (int) namedChars [nCharNames], fillIndex [0xE], ((DictionaryEntry) sortableCharNames [nCharNames]).Value, Char.GetUnicodeCategory (namedChars [nCharNames]));
2075 AddCharMap (namedChars [nCharNames++], 0xE, 1);
2078 // CJK unified ideograph.
2080 fillIndex [cjkCat] = 0x2;
2081 for (int cp = 0x4E00; cp <= 0x9FBB; cp++)
2082 if (!IsIgnorable (cp))
2083 AddCharMapGroupCJK ((char) cp, ref cjkCat);
2084 // CJK Extensions goes here.
2085 // LAMESPEC: With this Windows style CJK layout, it is
2086 // impossible to add more CJK ideograph i.e. 0x9FA6-
2087 // 0x9FBB can never be added w/o breaking compat.
2088 for (int cp = 0xF900; cp <= 0xFA2D; cp++)
2089 if (!IsIgnorable (cp))
2090 AddCharMapGroupCJK ((char) cp, ref cjkCat);
2092 // PrivateUse ... computed.
2093 // remaining Surrogate ... computed.
2095 #region Special "biggest" area (FF FF)
2096 fillIndex [0xFF] = 0xFF;
2097 char [] specialBiggest = new char [] {
2098 '\u3005', '\u3031', '\u3032', '\u309D',
2099 '\u309E', '\u30FC', '\u30FD', '\u30FE',
2100 '\uFE7C', '\uFE7D', '\uFF70'};
2101 foreach (char c in specialBiggest)
2102 AddCharMap (c, 0xFF, 0);
2105 #region 07 - ASCII non-alphanumeric + 3001, 3002 // 07
2106 // non-alphanumeric ASCII except for: + - < = > '
2107 for (int i = 0x21; i < 0x7F; i++) {
2108 if (Char.IsLetterOrDigit ((char) i)
2109 || "+-<=>'".IndexOf ((char) i) >= 0)
2110 continue; // they are not added here.
2111 AddCharMapGroup2 ((char) i, 0x7, 1, 0);
2112 // Insert 3001 after ',' and 3002 after '.'
2114 AddCharMapGroup2 ('\u3001', 0x7, 1, 0);
2115 else if (i == 0x2E) {
2117 AddCharMapGroup2 ('\u3002', 0x7, 1, 0);
2120 AddCharMap ('\uFE30', 0x7, 1, 0);
2124 #region 07 - Punctuations and something else
2125 for (int i = 0xA0; i < char.MaxValue; i++) {
2126 if (IsIgnorable (i))
2138 switch (Char.GetUnicodeCategory ((char) i)) {
2139 case UnicodeCategory.OtherPunctuation:
2140 case UnicodeCategory.ClosePunctuation:
2141 case UnicodeCategory.OpenPunctuation:
2142 case UnicodeCategory.InitialQuotePunctuation:
2143 case UnicodeCategory.FinalQuotePunctuation:
2144 case UnicodeCategory.ModifierSymbol:
2145 // SPECIAL CASES: // 0xA
2146 if (0x2020 <= i && i <= 0x2042)
2148 AddCharMapGroup ((char) i, 0x7, 1, 0);
2151 if (i == 0xA6) // SPECIAL CASE. FIXME: why?
2152 goto case UnicodeCategory.OtherPunctuation;
2157 for (int i = 0x2400; i <= 0x2421; i++)
2158 AddCharMap ((char) i, 0x7, 1, 0);
2161 // FIXME: for 07 xx we need more love.
2163 // FIXME: 08 should be more complete.
2164 fillIndex [0x8] = 2;
2165 for (int cp = 0; cp < char.MaxValue; cp++)
2166 if (!map [cp].Defined &&
2167 Char.GetUnicodeCategory ((char) cp) ==
2168 UnicodeCategory.MathSymbol)
2169 AddCharMapGroup ((char) cp, 0x8, 1, 0);
2171 // Characters w/ diacritical marks (NFKD)
2172 for (int i = 0; i <= char.MaxValue; i++) {
2173 if (map [i].Defined || IsIgnorable (i))
2175 if (decompIndex [i] == 0)
2178 int start = decompIndex [i];
2179 int primaryChar = decompValues [start];
2182 int length = decompLength [i];
2183 // special processing for parenthesized ones.
2185 decompValues [start] == '(' &&
2186 decompValues [start + 2] == ')') {
2187 primaryChar = decompValues [start + 1];
2191 if (map [primaryChar].Level1 == 0)
2194 for (int l = 1; l < length; l++) {
2195 int c = decompValues [start + l];
2196 if (map [c].Level1 != 0)
2198 secondary += diacritical [c];
2202 map [i] = new CharMapEntry (
2203 map [primaryChar].Category,
2204 map [primaryChar].Level1,
2209 #region Level2 adjustment
2211 diacritical [0x624] = 0x5;
2212 diacritical [0x626] = 0x7;
2213 diacritical [0x622] = 0x9;
2214 diacritical [0x623] = 0xA;
2215 diacritical [0x625] = 0xB;
2216 diacritical [0x649] = 0x5; // 'alif maqs.uurah
2217 diacritical [0x64A] = 0x7; // Yaa'
2220 for (int i = 0; i < char.MaxValue; i++) {
2222 byte cat = map [i].Category;
2224 case 0xE: // Latin diacritics
2225 case 0x22: // Japanese: circled characters
2226 mod = diacritical [i];
2228 case 0x13: // Arabic
2229 if (diacritical [i] == 0)
2230 mod = 0x8; // default for arabic
2233 if (0x52 <= cat && cat <= 0x7F) // Hangul
2234 mod = diacritical [i];
2236 map [i] = new CharMapEntry (
2237 cat, map [i].Level1, mod);
2241 // FIXME: this is hack but those which are
2242 // NonSpacingMark characters and still undefined
2243 // are likely to be nonspacing.
2244 for (int i = 0; i < char.MaxValue; i++)
2245 if (!map [i].Defined &&
2247 Char.GetUnicodeCategory ((char) i) ==
2248 UnicodeCategory.NonSpacingMark)
2249 AddCharMap ((char) i, 1, 1);
// Advances the running index of the category held in hangulCat by one.
// Despite the parameter name it is also used for the CJK categories
// (see AddCharMapCJK).
// hangulCat is passed by reference - presumably so that the category
// itself can be moved on when the index overflows; confirm.
2252 private void IncrementSequentialIndex (ref byte hangulCat)
2254 fillIndex [hangulCat]++;
// fillIndex holds bytes, so incrementing 0xFF wraps around to 0.
2255 if (fillIndex [hangulCat] == 0) { // overflown
// After an overflow the index restarts at 2.
2257 fillIndex [hangulCat] = 0x2;
2261 // Reset fillIndex to fixed value and call AddLetterMap().
// c is mapped first; afterwards every code point listed for c in
// latinMap is mapped with the same category. All calls pass an update
// count of 0, so fillIndex [category] is not advanced by them.
2262 private void AddAlphaMap (char c, byte category, byte alphaWeight)
2264 fillIndex [category] = alphaWeight;
2265 AddLetterMap (c, category, 0);
// NOTE(review): "as" yields null when latinMap has no ArrayList for c;
// confirm that a null check precedes the foreach below.
2267 ArrayList al = latinMap [c] as ArrayList;
2271 foreach (int cp in al)
2272 AddLetterMap ((char) cp, category, 0);
// Adds "voices" consecutive characters starting at code point i to
// category 0x22, each together with the character 0x60 code points
// above it (presumably the Katakana counterpart of a Hiragana letter -
// confirm against the callers).
// The first character gets level2 0; the following ones get b + 2.
// The update count is always 0, so fillIndex [0x22] is left to the caller.
2275 private void AddKanaMap (int i, byte voices)
2277 for (byte b = 0; b < voices; b++) {
2278 char c = (char) (i + b);
2279 byte arg = (byte) (b > 0 ? b + 2 : 0);
2281 AddLetterMapCore (c, 0x22, 0, arg);
2283 AddLetterMapCore ((char) (c + 0x60), 0x22, 0, arg);
// Shorthand for AddLetterMapCore with a level2 argument of 0.
private void AddLetterMap (char c, byte category, byte updateCount)
{
	const byte defaultLevel2 = 0;
	AddLetterMapCore (c, category, updateCount, defaultLevel2);
}
// Adds a letter together with its related forms:
//  1. the result of ToSmallForm (c), which is the only call here that
//     passes updateCount on to AddCharMapGroup,
//  2. the invariant-culture lower case form, recursively, when it
//     differs from c and is not mapped yet,
//  3. c itself via AddCharMapGroup.
// fillIndex [category] is advanced by updateCount at the end; doUpdate
// is presumably cleared when c is ignorable or already defined - confirm.
2292 private void AddLetterMapCore (char c, byte category, byte updateCount, byte level2)
2295 // <small> updates index
2296 c2 = ToSmallForm (c);
2298 AddCharMapGroup (c2, category, updateCount, level2);
2299 c2 = Char.ToLower (c, CultureInfo.InvariantCulture);
2300 if (c2 != c && !map [(int) c2].Defined)
2301 AddLetterMapCore (c2, category, 0, level2);
2302 bool doUpdate = true;
2303 if (IsIgnorable ((int) c) || map [(int) c].Defined)
2306 AddCharMapGroup (c, category, 0, level2);
2308 fillIndex [category] += updateCount;
// Overload of AddCharMap that passes 0 as the "alt" value.
// Returns whatever the four-argument overload returns.
private bool AddCharMap (char c, byte category, byte increment)
{
	const byte noAlt = 0;
	bool added = AddCharMap (c, category, increment, noAlt);
	return added;
}
// Creates the map entry for c unless c is ignorable or already defined.
// For every category except 1 the running index fillIndex [category]
// becomes the second constructor argument (level 1) and alt the third;
// for category 1 the two values are swapped.
// On success the running index is advanced by increment.
// Returns true when an entry was written, false when c was skipped.
private bool AddCharMap (char c, byte category, byte increment, byte alt)
{
	int cp = (int) c;
	if (IsIgnorable (cp) || map [cp].Defined)
		return false; // do nothing

	byte current = fillIndex [category];
	byte second;
	byte third;
	if (category == 1) {
		second = alt;
		third = current;
	} else {
		second = current;
		third = alt;
	}
	map [cp] = new CharMapEntry (category, second, third);
	fillIndex [category] += increment;
	return true;
}
// Variant of the group mapping that works with the "tail" lookups
// (ToSmallFormTail / ToFullWidthTail): adds the small form of c, c
// itself, and then recurses for the full width form.
// Every AddCharMap call here passes updateCount, so each character
// that is actually added advances fillIndex [category].
// NOTE(review): the conditions guarding the small-form call and the
// recursive call are not visible here; the recursion presumably stops
// when ToFullWidthTail returns c unchanged - confirm.
2327 private void AddCharMapGroupTail (char c, byte category, byte updateCount)
2329 char c2 = ToSmallFormTail (c);
2331 AddCharMap (c2, category, updateCount, 0);
2333 AddCharMap (c, category, updateCount, 0);
2335 c2 = ToFullWidthTail (c);
2337 AddCharMapGroupTail (c2, category, updateCount);
2341 // Adds characters to table in the order below
2342 // (+ increases weight):
2346 // <full> | <super> | <sub>
2347 // <circle> | <wide> (| <narrow>)
2351 // level2 is fixed (does not increase).
// Decomposition kinds whose characters are added with the same index
// as the base character (they are added with an increment of 0).
2352 int [] sameWeightItems = new int [] {
2353 DecompositionFraction,
2357 DecompositionCircle,
2359 DecompositionNarrow,
// c:           base character of the group.
// category:    category whose running index (fillIndex) is used.
// updateCount: amount added to fillIndex [category] after the base
//              character and its same-weight variants were added.
// level2:      value passed as "alt" for the base, same-weight and
//              vertical characters.
2361 private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2)
2363 if (map [(int) c].Defined)
// char.MinValue is used as the "no such variant" marker.
2366 char small = char.MinValue;
2367 char vertical = char.MinValue;
// nfkdMap [c] is a Hashtable keyed by decomposition kind (as byte)
// whose values are code points (boxed int).
2368 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
2370 object smv = nfkd [(byte) DecompositionSmall];
2372 small = (char) ((int) smv);
2373 object vv = nfkd [(byte) DecompositionVertical];
2375 vertical = (char) ((int) vv);
2378 // <small> updates index
2379 if (small != char.MinValue)
2380 AddCharMap (small, category, updateCount);
// The base character and its same-weight variants do not advance the index.
2383 AddCharMap (c, category, 0, level2);
2386 foreach (int weight in sameWeightItems) {
2387 object wv = nfkd [(byte) weight];
2389 AddCharMap ((char) ((int) wv), category, 0, level2);
2393 // update index here.
2394 fillIndex [category] += updateCount;
// The vertical form is added after the index update.
2396 if (vertical != char.MinValue)
2397 AddCharMap (vertical, category, updateCount, level2);
// Maps c at the current index of the given category (increment 0,
// level2 0) and then moves on to the next sequential index. category is
// passed by reference on to IncrementSequentialIndex.
private void AddCharMapCJK (char c, ref byte category)
{
	AddCharMap (c, category, 0, 0);
	IncrementSequentialIndex (ref category);

	// Special. I wonder why but Windows skips 9E F9.
	bool onSkippedSlot = category == 0x9E && fillIndex [category] == 0xF9;
	if (onSkippedSlot)
		IncrementSequentialIndex (ref category);
}
// Adds a CJK ideograph followed by characters that are ordered right
// after it: a few hard-coded enclosed CJK characters (for example
// U+3298 and U+3238 after U+52DE) and then the characters found in
// nfkdMap [c] for each decomposition kind 0 .. 0x12.
// category is passed by reference on to AddCharMapCJK.
2410 private void AddCharMapGroupCJK (char c, ref byte category)
2412 AddCharMapCJK (c, ref category);
2414 // LAMESPEC: see below.
2415 if (c == '\u52DE') {
2416 AddCharMapCJK ('\u3298', ref category);
2417 AddCharMapCJK ('\u3238', ref category);
2420 AddCharMapCJK ('\u32A2', ref category);
2422 // Especially this mapping order totally does
2423 // not make sense to me.
2424 AddCharMapCJK ('\u32A9', ref category);
2426 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
// 0x12 is DecompositionCanonical, the highest decomposition kind.
2429 for (byte weight = 0; weight <= 0x12; weight++) {
2430 object wv = nfkd [weight];
2435 // Special: they are ignored in this area.
2436 // FIXME: check if it is sane
2437 if (0xF900 <= w && w <= 0xFAD9)
2439 // LAMESPEC: on Windows some of CJK characters
2440 // in 3200-32B0 are incorrectly mapped. They
2441 // mix Chinese and Japanese Kanji when
2442 // ordering those characters.
// These are the four code points handled by the hard-coded calls
// above (presumably skipped here for that reason - confirm).
2444 case 0x32A2: case 0x3298: case 0x3238: case 0x32A9:
2448 AddCharMapCJK ((char) w, ref category);
2452 // For now it is only for 0x7 category.
// Group mapping in which every added character advances the index:
// unlike AddCharMapGroup, the base character and the compatibility
// variants are added with updateCount instead of 0.
// Order: small form, c itself, every character whose single-character
// decomposition is c (only the DecompositionCompat case is visible
// here), then the vertical form.
2453 private void AddCharMapGroup2 (char c, byte category, byte updateCount, byte level2)
// char.MinValue is used as the "no such variant" marker.
2455 char small = char.MinValue;
2456 char vertical = char.MinValue;
2457 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
2459 object smv = nfkd [(byte) DecompositionSmall];
2461 small = (char) ((int) smv);
2462 object vv = nfkd [(byte) DecompositionVertical];
2464 vertical = (char) ((int) vv);
2467 // <small> updates index
2468 if (small != char.MinValue)
2469 // SPECIAL CASE excluded (FIXME: why?)
2470 if (small != '\u2024')
2471 AddCharMap (small, category, updateCount);
2474 AddCharMap (c, category, updateCount, level2);
2476 // Since nfkdMap is problematic to have two or more
2477 // NFKD to an identical character, here I iterate all.
// NOTE(review): the bound "< char.MaxValue" leaves U+FFFF out of the scan.
2478 for (int c2 = 0; c2 < char.MaxValue; c2++) {
2479 if (decompLength [c2] == 1 &&
2480 (int) (decompValues [decompIndex [c2]]) == (int) c) {
2481 switch (decompType [c2]) {
2482 case DecompositionCompat:
2483 AddCharMap ((char) c2, category, updateCount, level2);
2489 if (vertical != char.MinValue)
2490 // SPECIAL CASE excluded (FIXME: why?)
2491 if (vertical != '\uFE33' && vertical != '\uFE34')
2492 AddCharMap (vertical, category, updateCount, level2);
// Shorthand for ToDecomposed (c, DecompositionFull, false).
2495 char ToFullWidth (char c)
2497 return ToDecomposed (c, DecompositionFull, false);
// Shorthand for ToDecomposed (c, DecompositionFull, true).
2500 char ToFullWidthTail (char c)
2502 return ToDecomposed (c, DecompositionFull, true);
// Shorthand for ToDecomposed (c, DecompositionSmall, false).
2505 char ToSmallForm (char c)
2507 return ToDecomposed (c, DecompositionSmall, false);
// Shorthand for ToDecomposed (c, DecompositionSmall, true).
2510 char ToSmallFormTail (char c)
2512 return ToDecomposed (c, DecompositionSmall, true);
// Looks c up in the decomposition tables for decomposition type d and
// returns one element of its decomposition, read from decompValues.
// The index starts at decompIndex [c] and can be moved to the last element
// of the decomposition (decompLength [c] - 1).
// NOTE(review): the statement taken when decompType [c] != d and the guard
// in front of the "idx +=" line are not visible in this listing. Presumably
// c is returned unchanged on a type mismatch and the offset is applied only
// when tail is true -- confirm against the full file.
2515 char ToDecomposed (char c, byte d, bool tail)
2517 if (decompType [(int) c] != d)
2519 int idx = decompIndex [(int) c];
2521 idx += decompLength [(int) c] - 1;
2522 return (char) decompValues [idx];
// Iterates the jisJapanese collection of JISCharacter entries for code
// point cp.
// NOTE(review): the loop body and the return statements are not visible in
// this listing; presumably it reports whether some entry has CP == cp.
2525 bool ExistsJIS (int cp)
2527 foreach (JISCharacter j in jisJapanese)
2535 #region Level 3 properties (Case/Width)
// Returns the level 3 weight as stored in the sort key: the raw weight
// from ComputeLevel3WeightRaw plus 2 when the raw weight is non-zero, and
// 0 when it is 0.
2537 private byte ComputeLevel3Weight (char c)
2539 byte b = ComputeLevel3WeightRaw (c);
2540 return b > 0 ? (byte) (b + 2) : b;
// Computes the raw level 3 weight of c from hard-coded code point ranges,
// the decomposition type of c, and the isSmallCapital / isUppercase tables.
// NOTE(review): the value returned for each range test and case label, the
// declaration of ret, and original lines 2579-2600 are not visible in this
// listing, so the concrete weights cannot be documented from here.
2543 private byte ComputeLevel3WeightRaw (char c) // add 2 for sortkey value
2546 if ('\u11A8' <= c && c <= '\u11F9')
2548 if ('\uFFA0' <= c && c <= '\uFFDC')
2550 if ('\u3130' <= c && c <= '\u3164')
2553 if ('\u2776' <= c && c <= '\u277F')
2555 if ('\u2780' <= c && c <= '\u2789')
// This range overlaps the two tests above; assuming each of those returns,
// it only matters for the code points they did not cover.
2557 if ('\u2776' <= c && c <= '\u2793')
2559 if ('\u2160' <= c && c <= '\u216F')
2561 if ('\u2181' <= c && c <= '\u2182')
2564 if ('\u2135' <= c && c <= '\u2138')
// Upper bound is exclusive: U+FE8E is not part of this range.
2566 if ('\uFE80' <= c && c < '\uFE8E') {
2567 // 2(Isolated)/8(Final)/0x18(Medial)
2568 switch (decompType [(int) c]) {
2569 case DecompositionIsolated:
2571 case DecompositionFinal:
2573 case DecompositionMedial:
2578 // actually I dunno the reason why they have weights.
2601 switch (decompType [(int) c]) {
// The three constants below are marked "fixed" where they are declared;
// their numeric value is OR-ed directly into the weight.
2602 case DecompositionWide: // <wide>
2603 case DecompositionSub: // <sub>
2604 case DecompositionSuper: // <super>
2605 ret |= decompType [(int) c];
2608 if (isSmallCapital [(int) c]) // grep "SMALL CAPITAL"
2610 if (isUppercase [(int) c]) // DerivedCoreProperties
// Variant of IsIgnorable that consults unicodeAge [i] (compared with 3.1)
// and the Unicode category of (char) i.
// NOTE(review): another IsIgnorable (int) with the same signature follows
// in this file; two live definitions would not compile, so this one is
// presumably commented out or conditionally compiled -- the delimiters and
// the return statements are not visible in this listing.
2620 static bool IsIgnorable (int i)
2622 if (unicodeAge [i] >= 3.1)
2624 switch (char.GetUnicodeCategory ((char) i)) {
2625 case UnicodeCategory.OtherNotAssigned:
2626 case UnicodeCategory.Format:
// Decides whether code point i is completely ignorable, using three steps
// visible below: (1) explicit case lists of single code points, (2) one
// long chain of hard-coded code point ranges, (3) the Unicode category of
// (char) i.
// NOTE(review): the switch headers, the "if (" that opens the range chain
// and every return statement are not visible in this listing. Which result
// each step yields must be confirmed against the full file; the comments
// kept below are the original author's description.
2633 // FIXME: In the future use DerivedAge.txt to examine character
2634 // versions and set those ones that have higher version than
2635 // 1.0 as ignorable.
2636 static bool IsIgnorable (int i)
2640 // I guess, those characters are added between
2641 // Unicode 1.0 (LCMapString) and Unicode 3.1
2642 // (UnicodeCategory), so they used to be
2643 // something like OtherNotAssigned as of Unicode 1.1.
2644 case 0x2df: case 0x387:
2645 case 0x3d7: case 0x3d8: case 0x3d9:
2646 case 0x3f3: case 0x3f4: case 0x3f5: case 0x3f6:
2647 case 0x400: case 0x40d: case 0x450: case 0x45d:
2648 case 0x587: case 0x58a: case 0x5c4: case 0x640:
2649 case 0x653: case 0x654: case 0x655: case 0x66d:
2651 case 0x1e9b: case 0x202f: case 0x20ad:
2652 case 0x20ae: case 0x20af:
2653 case 0x20e2: case 0x20e3:
2654 case 0x2139: case 0x213a: case 0x2183:
2655 case 0x2425: case 0x2426: case 0x2619:
2656 case 0x2670: case 0x2671: case 0x3007:
2657 case 0x3190: case 0x3191:
2658 case 0xfffc: case 0xfffd:
2660 // exceptional characters filtered by the
2661 // following conditions. Originally those exceptional
2662 // ranges are incorrect (they should not be ignored)
2663 // and most of those characters are unfortunately in
2665 case 0x4d8: case 0x4d9:
2666 case 0x4e8: case 0x4e9:
2667 case 0x3036: case 0x303f:
2668 case 0x337b: case 0xfb1e:
// Range chain: each line is one inclusive range; "&&" binds tighter than
// "||", so the chain is a plain union of ranges.
2673 // The whole Sinhala characters.
2674 0x0D82 <= i && i <= 0x0DF4
2675 // The whole Tibetan characters.
2676 || 0x0F00 <= i && i <= 0x0FD1
2677 // The whole Myanmar characters.
2678 || 0x1000 <= i && i <= 0x1059
2679 // The whole Ethiopic, Cherokee,
2680 // Canadian Syllabic, Ogham, Runic,
2681 // Tagalog, Hanunoo, Philippine,
2682 // Buhid, Tagbanwa, Khmer and Mongolian
2684 || 0x1200 <= i && i <= 0x1DFF
2685 // Greek extension characters.
2686 || 0x1F00 <= i && i <= 0x1FFF
2687 // The whole Braille characters.
2688 || 0x2800 <= i && i <= 0x28FF
2689 // CJK radical characters.
2690 || 0x2E80 <= i && i <= 0x2EF3
2691 // Kangxi radical characters.
2692 || 0x2F00 <= i && i <= 0x2FD5
2693 // Ideographic description characters.
2694 || 0x2FF0 <= i && i <= 0x2FFB
2695 // Bopomofo letter and final
2696 || 0x31A0 <= i && i <= 0x31B7
2697 // White square with quadrant characters.
2698 || 0x25F0 <= i && i <= 0x25F7
2699 // Ideographic telegraph symbols.
2700 || 0x32C0 <= i && i <= 0x32CB
2701 || 0x3358 <= i && i <= 0x3370
2702 || 0x33E0 <= i && i <= 0x33FF
2703 // The whole YI characters.
2704 || 0xA000 <= i && i <= 0xA48C
2705 || 0xA490 <= i && i <= 0xA4C6
2706 // Armenian small ligatures
2707 || 0xFB13 <= i && i <= 0xFB17
2708 // Hebrew, Arabic, variation selector.
2709 || 0xFB1D <= i && i <= 0xFE2F
2710 // Arabic ligatures.
2711 || 0xFEF5 <= i && i <= 0xFEFC
2712 // FIXME: why are they excluded?
2713 || 0x01F6 <= i && i <= 0x01F9
2714 || 0x0218 <= i && i <= 0x0233
2715 || 0x02A9 <= i && i <= 0x02AD
2716 || 0x02EA <= i && i <= 0x02EE
2717 || 0x0349 <= i && i <= 0x036F
2718 || 0x0488 <= i && i <= 0x048F
2719 || 0x04D0 <= i && i <= 0x04FF
2720 || 0x0500 <= i && i <= 0x050F // actually it matters only for 2.0
2721 || 0x06D6 <= i && i <= 0x06ED
2722 || 0x06FA <= i && i <= 0x06FE
2723 || 0x2048 <= i && i <= 0x204D
2724 || 0x20e4 <= i && i <= 0x20ea
2725 || 0x213C <= i && i <= 0x214B
2726 || 0x21EB <= i && i <= 0x21FF
2727 || 0x22F2 <= i && i <= 0x22FF
2728 || 0x237B <= i && i <= 0x239A
2729 || 0x239B <= i && i <= 0x23CF
2730 || 0x24EB <= i && i <= 0x24FF
2731 || 0x2596 <= i && i <= 0x259F
2732 || 0x25F8 <= i && i <= 0x25FF
2733 || 0x2672 <= i && i <= 0x2689
2734 || 0x2768 <= i && i <= 0x2775
2735 || 0x27d0 <= i && i <= 0x27ff
2736 || 0x2900 <= i && i <= 0x2aff
2737 || 0x3033 <= i && i <= 0x303F
2738 || 0x31F0 <= i && i <= 0x31FF
2739 || 0x3250 <= i && i <= 0x325F
2740 || 0x32B1 <= i && i <= 0x32BF
2741 || 0x3371 <= i && i <= 0x337B
2742 || 0xFA30 <= i && i <= 0xFA6A
// Final step: classify by Unicode category. Only the low 16 bits of i are
// used because of the cast to char.
2746 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
2748 case UnicodeCategory.PrivateUse:
2749 case UnicodeCategory.Surrogate:
2751 // ignored by nature
2752 case UnicodeCategory.Format:
2753 case UnicodeCategory.OtherNotAssigned:
2760 // To check IsIgnorable sanity, try the driver below under MS.NET.
// Test driver: calls Dump (i, IsIgnorable (i)) for every UTF-16 code unit,
// 0 through char.MaxValue inclusive.
// NOTE(review): the generator class already declares Main (string []), so
// this driver is presumably kept inside a comment or a separate build --
// the delimiters are not visible in this listing.
2763 public static void Main ()
2765 for (int i = 0; i <= char.MaxValue; i++)
2766 Dump (i, IsIgnorable (i));
// Helper of the IsIgnorable test driver. Skips private-use and surrogate
// code units, then compares s1 with a string made of ten copies of
// (char) i using the invariant culture and CompareOptions.IgnoreCase, and
// prints a line with an "o"/"x" flag (from ignore), the code point in hex
// and its Unicode category.
// NOTE(review): the declaration of s1 and the statement guarded by
// "(ret == 0) == ignore" are not visible in this listing; presumably that
// statement returns, so only disagreements between the predicate and the
// runtime are printed.
2769 static void Dump (int i, bool ignore)
2771 switch (Char.GetUnicodeCategory ((char) i)) {
2772 case UnicodeCategory.PrivateUse:
2773 case UnicodeCategory.Surrogate:
2774 return; // check nothing
2778 string s2 = new string ((char) i, 10);
2779 int ret = CultureInfo.InvariantCulture.CompareInfo.Compare (s1, s2, CompareOptions.IgnoreCase);
2780 if ((ret == 0) == ignore)
2782 Console.WriteLine ("{0} : {1:x} {2}", ignore ? "o" : "x", i, Char.GetUnicodeCategory ((char) i));
2785 #endregion // IsIgnorable
2787 #region IsIgnorableSymbol
// Decides whether code point i counts as a symbol to be ignored (the test
// driver further down checks this predicate against
// CompareOptions.IgnoreSymbols). Steps visible below: the IsIgnorable
// check, several explicit case lists, a few hard-coded ranges, the Unicode
// category of (char) i, a second set of ranges combined with
// Char.IsLetterOrDigit, and finally the Char.Is* classification helpers.
// NOTE(review): switch headers, the declaration of "use" and every return
// statement are not visible in this listing, so the result of each branch
// must be confirmed against the full file.
2788 static bool IsIgnorableSymbol (int i)
2790 if (IsIgnorable (i))
2795 case 0x00b5: case 0x01C0: case 0x01C1:
2796 case 0x01C2: case 0x01C3: case 0x01F6:
2797 case 0x01F7: case 0x01F8: case 0x01F9:
2798 case 0x02D0: case 0x02EE: case 0x037A:
2799 case 0x03D7: case 0x03F3:
2800 case 0x0400: case 0x040d:
2801 case 0x0450: case 0x045d:
2802 case 0x048C: case 0x048D:
2803 case 0x048E: case 0x048F:
2804 case 0x0587: case 0x0640: case 0x06E5:
2805 case 0x06E6: case 0x06FA: case 0x06FB:
2806 case 0x06FC: case 0x093D: case 0x0950:
2807 case 0x1E9B: case 0x2139: case 0x3006:
2808 case 0x3033: case 0x3034: case 0x3035:
2809 case 0xFE7E: case 0xFE7F:
2811 case 0x16EE: case 0x16EF: case 0x16F0:
2813 case 0x2183: // ROMAN NUMERAL REVERSED ONE HUNDRED
2814 case 0x3007: // IDEOGRAPHIC NUMBER ZERO
2815 case 0x3038: // HANGZHOU NUMERAL TEN
2816 case 0x3039: // HANGZHOU NUMERAL TWENTY
2817 case 0x303a: // HANGZHOU NUMERAL THIRTY
2823 case 0x02B9: case 0x02BA: case 0x02C2:
2824 case 0x02C3: case 0x02C4: case 0x02C5:
2825 case 0x02C8: case 0x02CC: case 0x02CD:
2826 case 0x02CE: case 0x02CF: case 0x02D2:
2827 case 0x02D3: case 0x02D4: case 0x02D5:
2828 case 0x02D6: case 0x02D7: case 0x02DE:
2829 case 0x02E5: case 0x02E6: case 0x02E7:
2830 case 0x02E8: case 0x02E9:
2831 case 0x309B: case 0x309C:
2833 case 0x055A: // Armenian Apostrophe
2834 case 0x05C0: // Hebrew Punct
2835 case 0x0E4F: // Thai FONGMAN
2836 case 0x0E5A: // Thai ANGKHANKHU
2837 case 0x0E5B: // Thai KHOMUT
2839 case 0x09F2: // Bengali Rupee Mark
2840 case 0x09F3: // Bengali Rupee Sign
2842 case 0x221e: // INF.
// Note the mixed bounds: the first and third ranges exclude their upper
// limit ("<"), the second includes it ("<=").
2851 if (0xFE70 <= i && i < 0xFE7C // ARABIC LIGATURES B
2853 || 0x0501 <= i && i <= 0x0510 // CYRILLIC KOMI
2854 || 0xFA30 <= i && i < 0xFA70 // CJK COMPAT
2859 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
2861 case UnicodeCategory.Surrogate:
2862 return false; // inconsistent
2864 case UnicodeCategory.SpacingCombiningMark:
2865 case UnicodeCategory.EnclosingMark:
2866 case UnicodeCategory.NonSpacingMark:
2867 case UnicodeCategory.PrivateUse:
2869 if (0x064B <= i && i <= 0x0652) // Arabic
2873 case UnicodeCategory.Format:
2874 case UnicodeCategory.OtherNotAssigned:
2881 // latin in a circle
2882 0x249A <= i && i <= 0x24E9
2883 || 0x2100 <= i && i <= 0x2132
2885 || 0x3196 <= i && i <= 0x31A0
2887 || 0x3200 <= i && i <= 0x321C
2889 || 0x322A <= i && i <= 0x3243
2891 || 0x3260 <= i && i <= 0x32B0
2892 || 0x32D0 <= i && i <= 0x3357
2893 || 0x337B <= i && i <= 0x33DD
2895 use = !Char.IsLetterOrDigit ((char) i);
2899 // This "Digit" rule is a mystery.
2900 // It filters some symbols out.
2901 if (Char.IsLetterOrDigit ((char) i))
2903 if (Char.IsNumber ((char) i))
2905 if (Char.IsControl ((char) i)
2906 || Char.IsSeparator ((char) i)
2907 || Char.IsPunctuation ((char) i))
2909 if (Char.IsSymbol ((char) i))
2912 // FIXME: should check more
2917 // To check IsIgnorableSymbol sanity, try the driver below under MS.NET.
// Test driver: for every UTF-16 code unit it evaluates IsIgnorableSymbol,
// compares "TEST " with "TEST " + (char) i using the invariant culture's
// CompareInfo and CompareOptions.IgnoreSymbols, and prints a line whenever
// the predicate disagrees with "the two strings compare equal".
// NOTE(review): the statement taken for surrogates and the remaining
// arguments of the WriteLine call are not visible in this listing.
2919 public static void Main ()
2921 CompareInfo ci = CultureInfo.InvariantCulture.CompareInfo;
2922 for (int i = 0; i <= char.MaxValue; i++) {
2923 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
2924 if (uc == UnicodeCategory.Surrogate)
2927 bool ret = IsIgnorableSymbol (i);
2929 string s1 = "TEST ";
2930 string s2 = "TEST " + (char) i;
2932 int result = ci.Compare (s1, s2, CompareOptions.IgnoreSymbols);
2934 if (ret != (result == 0))
2935 Console.WriteLine ("{0} : {1:x}[{2}]({3})",
2936 ret ? "should not ignore" :
// Decides whether code point i is treated as an ignorable non-spacing
// character. Steps visible below: the IsIgnorable check, two explicit case
// lists, two groups of hard-coded inclusive ranges, and as the final
// fallback "Unicode category of (char) i is NonSpacingMark".
// NOTE(review): the switch header and the return statements of the case
// lists and range tests are not visible in this listing; confirm which
// result each of them yields against the full file.
2945 static bool IsIgnorableNonSpacing (int i)
2947 if (IsIgnorable (i))
2951 case 0x02C8: case 0x02DE: case 0x0559: case 0x055A:
2952 case 0x05C0: case 0x0ABD: case 0x0CD5: case 0x0CD6:
2953 case 0x309B: case 0x309C: case 0xFF9E: case 0xFF9F:
2955 case 0x02D0: case 0x0670: case 0x0901: case 0x0902:
2956 case 0x094D: case 0x0962: case 0x0963: case 0x0A41:
2957 case 0x0A42: case 0x0A47: case 0x0A48: case 0x0A4B:
2958 case 0x0A4C: case 0x0A81: case 0x0A82: case 0x0B82:
2959 case 0x0BC0: case 0x0CBF: case 0x0CC6: case 0x0CCC:
2960 case 0x0CCD: case 0x0E4E:
2964 if (0x02b9 <= i && i <= 0x02c5
2965 || 0x02cc <= i && i <= 0x02d7
2966 || 0x02e4 <= i && i <= 0x02ef
2967 || 0x20DD <= i && i <= 0x20E0
// 0x00652 is simply 0x0652 written with an extra leading zero.
2971 if (0x064B <= i && i <= 0x00652
2972 || 0x0941 <= i && i <= 0x0948
2973 || 0x0AC1 <= i && i <= 0x0ACD
2974 || 0x0C3E <= i && i <= 0x0C4F
2975 || 0x0E31 <= i && i <= 0x0E3F
2979 return Char.GetUnicodeCategory ((char) i) ==
2980 UnicodeCategory.NonSpacingMark;
2983 // We can reuse IsIgnorableSymbol testcode
2984 // for IsIgnorableNonSpacing.
// Members of CharMapEntry (type name taken from the constructor below; the
// type header is not visible in this listing). One entry of the "map"
// array declared at the top of the file.
2990 public byte Category;
2992 public byte Level2; // It is always single byte.
2993 public bool Defined;
// Stores the given category; the remaining assignments are not visible in
// this listing.
2995 public CharMapEntry (byte category, byte level1, byte level2)
2997 Category = category;
// Members of JISCharacter (type name taken from the constructor below; the
// type header is not visible in this listing). Pairs a code point (CP)
// with a JIS value (JIS); both are read-only after construction.
3006 public readonly int CP;
3007 public readonly int JIS;
// The constructor body is not visible in this listing; presumably it
// assigns cp to CP and cpJIS to JIS.
3009 public JISCharacter (int cp, int cpJIS)
// IComparer over JISCharacter objects that orders them by their JIS field.
3016 class JISComparer : IComparer
// Shared instance (the initializer expression is on a line not visible in
// this listing).
3018 public static readonly JISComparer Instance =
// Both arguments must be JISCharacter; anything else fails the cast.
3021 public int Compare (object o1, object o2)
3023 JISCharacter j1 = (JISCharacter) o1;
3024 JISCharacter j2 = (JISCharacter) o2;
// NOTE(review): the operands are in "j2 - j1" order, so the result is
// positive when o1 has the smaller JIS value, i.e. a sort with this
// comparer is descending by JIS -- confirm this is intended.
3025 return j2.JIS - j1.JIS;
// Pairs a code point (CP) with a name string (Name); both fields are
// read-only after construction.
3029 class NonJISCharacter
3031 public readonly int CP;
3032 public readonly string Name;
// The constructor body is not visible in this listing; presumably it
// assigns cp to CP and name to Name.
3034 public NonJISCharacter (int cp, string name)
// IComparer over NonJISCharacter objects that orders them by Name using an
// ordinal (code-unit) string comparison.
3041 class NonJISComparer : IComparer
// Shared instance.
3043 public static readonly NonJISComparer Instance =
3044 new NonJISComparer ();
// Both arguments must be NonJISCharacter; anything else fails the cast.
3046 public int Compare (object o1, object o2)
3048 NonJISCharacter j1 = (NonJISCharacter) o1;
3049 NonJISCharacter j2 = (NonJISCharacter) o2;
3050 return string.CompareOrdinal (j1.Name, j2.Name);
// IComparer over DictionaryEntry values whose Value is a boxed decimal and
// whose Key is a boxed int. Entries are compared by Value first.
// NOTE(review): the statements that use ret, i1 and i2 are not visible in
// this listing; presumably the int keys break ties when the values are
// equal -- confirm against the full file.
3054 class DecimalDictionaryValueComparer : IComparer
// Shared instance; the constructor is private, so this is the only one
// created through the code visible here.
3056 public static readonly DecimalDictionaryValueComparer Instance
3057 = new DecimalDictionaryValueComparer ();
3059 private DecimalDictionaryValueComparer ()
3063 public int Compare (object o1, object o2)
3065 DictionaryEntry e1 = (DictionaryEntry) o1;
3066 DictionaryEntry e2 = (DictionaryEntry) o2;
3067 // FIXME: in case of 0, compare decomposition categories
3068 int ret = Decimal.Compare ((decimal) e1.Value, (decimal) e2.Value);
3071 int i1 = (int) e1.Key;
3072 int i2 = (int) e2.Key;
// IComparer over DictionaryEntry values whose Value is a string and whose
// Key is a boxed int. Entries are compared by Value first, using
// String.Compare (string, string).
// NOTE(review): the statements that use ret, i1 and i2 are not visible in
// this listing; presumably the int keys break ties when the values are
// equal -- confirm against the full file.
3077 class StringDictionaryValueComparer : IComparer
// Shared instance; the constructor is private, so this is the only one
// created through the code visible here.
3079 public static readonly StringDictionaryValueComparer Instance
3080 = new StringDictionaryValueComparer ();
3082 private StringDictionaryValueComparer ()
3086 public int Compare (object o1, object o2)
3088 DictionaryEntry e1 = (DictionaryEntry) o1;
3089 DictionaryEntry e2 = (DictionaryEntry) o2;
3090 int ret = String.Compare ((string) e1.Value, (string) e2.Value);
3093 int i1 = (int) e1.Key;
3094 int i2 = (int) e2.Key;
// IComparer over boxed chars that compares them through the sort keys
// supplied by CollationElementTable, looking at the Primary, Secondary,
// Thirtiary and Quarternary members of each key in that order.
// NOTE(review): the checks on v after each subtraction and the statement
// after the loop are not visible in this listing; presumably the first
// non-zero difference is returned and the key counts decide otherwise.
3099 class UCAComparer : IComparer
// Shared instance; the constructor is private.
3101 public static readonly UCAComparer Instance
3102 = new UCAComparer ();
3104 private UCAComparer ()
// Both arguments must be boxed char values; anything else fails the cast.
3108 public int Compare (object o1, object o2)
3110 char i1 = (char) o1;
3111 char i2 = (char) o2;
3113 int l1 = CollationElementTable.GetSortKeyCount (i1);
3114 int l2 = CollationElementTable.GetSortKeyCount (i2);
// l is the smaller of the two counts, so only positions present for both
// characters are compared in the loop.
3115 int l = l1 > l2 ? l2 : l1;
3117 for (int i = 0; i < l; i++) {
3118 SortKeyValue k1 = CollationElementTable.GetSortKey (i1, i);
3119 SortKeyValue k2 = CollationElementTable.GetSortKey (i2, i);
3120 int v = k1.Primary - k2.Primary;
3123 v = k1.Secondary - k2.Secondary;
3126 v = k1.Thirtiary - k2.Thirtiary;
3129 v = k1.Quarternary - k2.Quarternary;
// Members of Tailoring (type name taken from the constructors below; the
// type header, the lcid / alias / frenchSort fields and the headers of the
// two read-only properties are not visible in this listing).
// Collects ITailoringMap entries in insertion order and serializes them
// into one char array.
3142 ArrayList items = new ArrayList ();
// Constructor bodies are not visible in this listing.
3144 public Tailoring (int lcid)
3149 public Tailoring (int lcid, int alias)
// Getter returning the lcid field.
3156 get { return lcid; }
// Getter returning the alias field.
3160 get { return alias; }
// Read/write flag backed by the frenchSort field.
3163 public bool FrenchSort {
3164 get { return frenchSort; }
3165 set { frenchSort = value; }
// Appends a DiacriticalMap (target, replace) entry.
3168 public void AddDiacriticalMap (byte target, byte replace)
3170 items.Add (new DiacriticalMap (target, replace));
// Appends a SortKeyMap (source, sortkey) entry.
3173 public void AddSortKeyMap (string source, byte [] sortkey)
3175 items.Add (new SortKeyMap (source, sortkey));
// Appends a ReplacementMap (source, replace) entry.
3178 public void AddReplacementMap (string source, string replace)
3180 items.Add (new ReplacementMap (source, replace));
// Concatenates ToCharArray () of every entry, in insertion order, into a
// single char array.
3183 public char [] ItemToCharArray ()
3185 ArrayList al = new ArrayList ();
3186 foreach (ITailoringMap m in items)
3187 al.AddRange (m.ToCharArray ());
3188 return al.ToArray (typeof (char)) as char [];
// Common contract of the tailoring entries stored by Tailoring: each entry
// can serialize itself to a char array (see the implementations, which put
// a kind marker in element 0).
3191 interface ITailoringMap
3193 char [] ToCharArray ();
// Tailoring entry holding two byte values, Target and Replace.
3196 class DiacriticalMap : ITailoringMap
3198 public readonly byte Target;
3199 public readonly byte Replace;
// The constructor body is not visible in this listing; presumably it
// assigns the two fields from its parameters.
3201 public DiacriticalMap (byte target, byte replace)
// Serializes the entry as three chars: kind marker 2, Target, Replace.
// (The return statement is not visible in this listing.)
3207 public char [] ToCharArray ()
3209 char [] ret = new char [3];
3210 ret [0] = (char) 02; // kind:DiacriticalMap
3211 ret [1] = (char) Target;
3212 ret [2] = (char) Replace;
// Tailoring entry that associates a source string with sort key bytes.
3217 class SortKeyMap : ITailoringMap
3219 public readonly string Source;
3220 public readonly byte [] SortKey;
// The constructor body is not visible in this listing; presumably it
// assigns the two fields from its parameters.
3222 public SortKeyMap (string source, byte [] sortkey)
// Serializes the entry into Source.Length + 7 chars:
//   [0]                       kind marker 1
//   [1 .. Source.Length]      the characters of Source
//   [Source.Length + 1]       not written by the visible lines, so it keeps
//                             the array default '\0' unless a line missing
//                             from this listing sets it
//   [Source.Length + 2 .. +6] SortKey [0] .. SortKey [4]
// Assumes SortKey has at least five elements. (The return statement is not
// visible in this listing.)
3228 public char [] ToCharArray ()
3230 char [] ret = new char [Source.Length + 7];
3231 ret [0] = (char) 01; // kind:SortKeyMap
3232 for (int i = 0; i < Source.Length; i++)
3233 ret [i + 1] = Source [i];
3235 for (int i = 0; i < 5; i++)
3236 ret [i + Source.Length + 2] = (char) SortKey [i];
3241 class ReplacementMap : ITailoringMap
3243 public readonly string Source;
3244 public readonly string Replace;
3246 public ReplacementMap (string source, string replace)
3252 public char [] ToCharArray ()
3254 char [] ret = new char [Source.Length + Replace.Length + 3];
3255 ret [0] = (char) 03; // kind:ReplaceMap
3257 for (int i = 0; i < Source.Length; i++)
3258 ret [pos++] = Source [i];
3261 for (int i = 0; i < Replace.Length; i++)
3262 ret [pos++] = Replace [i];