3 // There are two kinds of sort keys: those which are computed and those which are laid out
4 // as an indexed array. Computed sort keys are:
9 // Also, for composite characters it should prepare a different index table.
11 // Though it is possible to "compute" level 3 weights, they are still dumped
12 // to an array to avoid execution cost.
16 // * sortkey getter signature
18 // int GetSortKey (string s, int index, SortKeyBuffer buf)
19 // Stores sort key for corresponding character element into buf and
20 // returns the length of the consumed _source_ character element in s.
22 // * character length to consume
24 // If there are characters whose primary weight is 0, they are consumed
25 // and considered as a part of the character element.
31 using System.Collections;
32 using System.Globalization;
36 namespace Mono.Globalization.Unicode
38 internal class MSCompatSortKeyTableGenerator
// Program entry point: creates a generator instance and forwards the
// command-line arguments to its Run method.
public static void Main (string [] args)
{
	MSCompatSortKeyTableGenerator generator = new MSCompatSortKeyTableGenerator ();
	generator.Run (args);
}
// Tags identifying the kind of decomposition a character has.
// ProcessUnidataLine stores one of these values per code point in
// decompType []. The Narrow/Wide/Super/Sub tags are later switched on when
// the widthCompat table is filled from decompValues.
45 const int DecompositionWide = 1; // fixed
46 const int DecompositionSub = 2; // fixed
47 const int DecompositionSmall = 3;
48 const int DecompositionIsolated = 4;
49 const int DecompositionInitial = 5;
50 const int DecompositionFinal = 6;
51 const int DecompositionMedial = 7;
52 const int DecompositionNoBreak = 8;
53 const int DecompositionVertical = 9;
54 const int DecompositionFraction = 0xA;
55 const int DecompositionFont = 0xB;
56 const int DecompositionSuper = 0xC; // fixed
// NOTE: Full (0xE) is declared before Narrow (0xD), so the declarations are
// not in ascending numeric order at this point.
57 const int DecompositionFull = 0xE;
58 const int DecompositionNarrow = 0xD;
59 const int DecompositionCircle = 0xF;
60 const int DecompositionSquare = 0x10;
61 const int DecompositionCompat = 0x11;
62 const int DecompositionCanonical = 0x12;
64 TextWriter Result = Console.Out;
66 byte [] fillIndex = new byte [256]; // by category
67 CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1];
69 char [] specialIgnore = new char [] {
70 '\u3099', '\u309A', '\u309B', '\u309C', '\u0BCD',
71 '\u0E47', '\u0E4C', '\uFF9E', '\uFF9F'
74 // FIXME: need more love (as always)
75 char [] alphabets = new char [] {'A', 'B', 'C', 'D', 'E', 'F',
76 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
77 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
78 '\u0292', '\u01BE', '\u0298'};
79 byte [] alphaWeights = new byte [] {
80 2, 9, 0xA, 0x1A, 0x21,
81 0x23, 0x25, 0x2C, 0x32, 0x35,
82 0x36, 0x48, 0x51, 0x70, 0x7C,
83 0x7E, 0x89, 0x8A, 0x91, 0x99,
84 0x9F, 0xA2, 0xA4, 0xA6, 0xA7,
85 0xA9, 0xAA, 0xB3, 0xB4};
87 bool [] isSmallCapital = new bool [char.MaxValue + 1];
88 bool [] isUppercase = new bool [char.MaxValue + 1];
90 byte [] decompType = new byte [char.MaxValue + 1];
91 int [] decompIndex = new int [char.MaxValue + 1];
92 int [] decompLength = new int [char.MaxValue + 1];
94 decimal [] decimalValue = new decimal [char.MaxValue + 1];
96 byte [] diacritical = new byte [char.MaxValue + 1];
98 string [] diacritics = new string [] {
99 // LATIN, CYRILLIC etc.
100 "UPTURN", "DOUBLE-STRUCK",
101 "MIDDLE HOOK", "WITH VERTICAL LINE ABOVE;",
102 "WITH GRAVE ACCENT;", "WITH ACUTE ACCENT;", "WITH CIRCUMFLEX ACCENT;",
103 "WITH ACUTE;", "WITH GRAVE;", "WITH DOT ABOVE;", " MIDDLE DOT;",
104 "WITH CIRCUMFLEX;", "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;",
105 " DIALYTIKA AND TONOS;", "WITH MACRON;", "WITH TILDE;", "WITH RING ABOVE;",
106 "WITH OGONEK;", "WITH CEDILLA;",
108 " DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;",
109 "WITH STROKE;", " CIRCUMFLEX AND ACUTE;",
111 " DIAERESIS AND ACUTE;", "WITH CIRCUMFLEX AND GRAVE;", " L SLASH;",
112 " DIAERESIS AND GRAVE;",
114 " CARON AND DOT ABOVE;", " BREVE AND GRAVE;",
115 " MACRON AND ACUTE;",
116 " MACRON AND GRAVE;",
118 " DIAERESIS AND CARON", " DOT ABOVE AND MACRON", " TILDE AND ACUTE",
119 " RING ABOVE AND ACUTE",
120 " DIAERESIS AND MACRON", " CEDILLA AND ACUTE", " MACRON AND DIAERESIS",
121 " CIRCUMFLEX AND TILDE",
122 " TILDE AND DIAERESIS",
125 " CEDILLA AND BREVE",
126 " OGONEK AND MACRON",
129 "WITH HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;",
133 " PRECEDED BY APOSTROPHE",
135 " LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE",
138 " RETROFLEX;", "DIAERESIS BELOW",
141 " CIRCUMFLEX BELOW", "HORN AND ACUTE",
142 " BREVE BELOW;", " HORN AND GRAVE",
145 " DOT BELOW AND DOT ABOVE",
146 " RIGHT HALF RING", " HORN AND TILDE",
147 " CIRCUMFLEX AND DOT BELOW",
148 " BREVE AND DOT BELOW",
149 " DOT BELOW AND MACRON",
151 " HORN AND HOOK ABOVE",
153 // CIRCLED, PARENTHESIZED and so on
154 "CIRCLED DIGIT", "CIRCLED NUMBER", "CIRCLED LATIN",
155 "CIRCLED KATAKANA", "CIRCLED SANS-SERIF",
156 "PARENTHESIZED DIGIT", "PARENTHESIZED NUMBER", "PARENTHESIZED LATIN",
158 byte [] diacriticWeights = new byte [] {
162 0xE, 0xF, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
163 0x17, 0x19, 0x1A, 0x1B, 0x1C,
165 0x1D, 0x1D, 0x1E, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F,
166 0x20, 0x21, 0x22, 0x22, 0x23, 0x24,
168 0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28,
169 0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30,
171 0x40, 0x43, 0x43, 0x43, 0x44, 0x46, 0x47, 0x48,
172 0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x5A,
174 0x60, 0x60, 0x61, 0x61, 0x63, 0x68, 0x68,
175 0x69, 0x69, 0x6A, 0x6D, 0x6E,
177 // CIRCLED, PARENTHESIZED and so on.
178 0xEE, 0xEE, 0xEE, 0xEE, 0xEE,
182 int [] numberSecondaryWeightBounds = new int [] {
183 0x660, 0x680, 0x6F0, 0x700, 0x960, 0x970,
184 0x9E0, 0x9F0, 0x9F4, 0xA00, 0xA60, 0xA70,
185 0xAE0, 0xAF0, 0xB60, 0xB70, 0xBE0, 0xC00,
186 0xC60, 0xC70, 0xCE0, 0xCF0, 0xD60, 0xD70,
187 0xE50, 0xE60, 0xED0, 0xEE0
190 char [] orderedGurmukhi;
191 char [] orderedGujarati;
192 char [] orderedGeorgian;
193 char [] orderedThaana;
195 static readonly char [] orderedTamilConsonants = new char [] {
196 // based on traditional Tamil consonants, except for
197 // Grantha (where Microsoft breaks traditionalism).
198 // http://www.angelfire.com/empire/thamizh/padanGaL
199 '\u0B95', '\u0B99', '\u0B9A', '\u0B9E', '\u0B9F',
200 '\u0BA3', '\u0BA4', '\u0BA8', '\u0BAA', '\u0BAE',
201 '\u0BAF', '\u0BB0', '\u0BB2', '\u0BB5', '\u0BB4',
202 '\u0BB3', '\u0BB1', '\u0BA9', '\u0B9C', '\u0BB8',
205 // cp -> character name (only for some characters)
206 ArrayList sortableCharNames = new ArrayList ();
208 // cp -> arrow value (int)
209 ArrayList arrowValues = new ArrayList ();
211 // cp -> box value (int)
212 ArrayList boxValues = new ArrayList ();
214 // cp -> level1 value
215 Hashtable arabicLetterPrimaryValues = new Hashtable ();
218 Hashtable arabicNameMap = new Hashtable ();
220 // cp -> Hashtable [decompType] -> cp
221 Hashtable nfkdMap = new Hashtable ();
223 // Latin letter -> ArrayList [int]
224 Hashtable latinMap = new Hashtable ();
226 ArrayList jisJapanese = new ArrayList ();
227 ArrayList nonJisJapanese = new ArrayList ();
229 ushort [] cjkJA = new ushort [char.MaxValue +1];// - 0x4E00];
230 ushort [] cjkCHS = new ushort [char.MaxValue +1];// - 0x3100];
231 ushort [] cjkCHT = new ushort [char.MaxValue +1];// - 0x4E00];
232 ushort [] cjkKO = new ushort [char.MaxValue +1];// - 0x4E00];
233 byte [] cjkKOlv2 = new byte [char.MaxValue +1];// - 0x4E00];
235 byte [] ignorableFlags = new byte [char.MaxValue + 1];
237 static double [] unicodeAge = new double [char.MaxValue + 1];
239 ArrayList tailorings = new ArrayList ();
// Drives one generator run.
//   args [0] (optional): directory holding the downloaded source data;
//   defaults to "downloaded" when no argument is given.
// Parses the sources, post-processes the parsed values, reports progress on
// stderr, and finally writes a per-code-point diagnostic log to "agelog.txt".
// NOTE(review): the calls that sit between the progress messages are not
// visible in this excerpt.
241 void Run (string [] args)
243 string dirname = args.Length == 0 ? "downloaded" : args [0];
244 ParseSources (dirname);
245 Console.Error.WriteLine ("parse done.");
247 ModifyParsedValues ();
249 Console.Error.WriteLine ("generation done.");
251 Console.Error.WriteLine ("serialization done.");
// Diagnostic dump: one line per code point with its Unicode age, the
// ignorable flags, its Unicode category, and a '!' marker when IsIgnorable
// disagrees with the locally computed expectation (shouldBe).
// NOTE(review): no Close/Dispose of this writer is visible in this excerpt -
// confirm that the log is flushed.
253 StreamWriter sw = new StreamWriter ("agelog.txt");
// The bound is exclusive, so U+FFFF itself is not logged.
254 for (int i = 0; i < char.MaxValue; i++) {
255 bool shouldBe = false;
// Format and unassigned code points are expected to be ignorable.
256 switch (Char.GetUnicodeCategory ((char) i)) {
257 case UnicodeCategory.Format: case UnicodeCategory.OtherNotAssigned:
258 shouldBe = true; break;
// The statement controlled by this age check is not visible in this excerpt.
260 if (unicodeAge [i] >= 3.1)
262 //if (IsIgnorable (i) != shouldBe)
263 sw.WriteLine ("{1} {2} {3} {0:X04} {4} {5}", i, unicodeAge [i], IsIgnorable (i), IsIgnorableSymbol (i), char.GetUnicodeCategory ((char) i), IsIgnorable (i) != shouldBe ? '!' : ' ');
// Byte-table overload: passes the table to CodePointIndexer.CompressArray
// together with the given indexer and returns the result as byte [].
byte [] CompressArray (byte [] source, CodePointIndexer i)
{
	Type elementType = typeof (byte);
	object compressed = CodePointIndexer.CompressArray (source, elementType, i);
	return (byte []) compressed;
}
// ushort-table overload: passes the table to CodePointIndexer.CompressArray
// together with the given indexer and returns the result as ushort [].
ushort [] CompressArray (ushort [] source, CodePointIndexer i)
{
	Type elementType = typeof (ushort);
	object compressed = CodePointIndexer.CompressArray (source, elementType, i);
	return (ushort []) compressed;
}
284 SerializeTailorings ();
286 byte [] categories = new byte [map.Length];
287 byte [] level1 = new byte [map.Length];
288 byte [] level2 = new byte [map.Length];
289 byte [] level3 = new byte [map.Length];
290 ushort [] widthCompat = new ushort [map.Length];
291 for (int i = 0; i < map.Length; i++) {
292 categories [i] = map [i].Category;
293 level1 [i] = map [i].Level1;
294 level2 [i] = map [i].Level2;
295 level3 [i] = ComputeLevel3Weight ((char) i);
296 switch (decompType [i]) {
297 case DecompositionNarrow:
298 case DecompositionWide:
299 case DecompositionSuper:
300 case DecompositionSub:
301 // they are always 1 char
302 widthCompat [i] = (ushort) decompValues [decompIndex [i]];
308 ignorableFlags = CompressArray (ignorableFlags,
309 MSCompatUnicodeTableUtil.Ignorable);
310 categories = CompressArray (categories,
311 MSCompatUnicodeTableUtil.Category);
312 level1 = CompressArray (level1,
313 MSCompatUnicodeTableUtil.Level1);
314 level2 = CompressArray (level2,
315 MSCompatUnicodeTableUtil.Level2);
316 level3 = CompressArray (level3,
317 MSCompatUnicodeTableUtil.Level3);
318 widthCompat = (ushort []) CodePointIndexer.CompressArray (
319 widthCompat, typeof (ushort),
320 MSCompatUnicodeTableUtil.WidthCompat);
321 cjkCHS = CompressArray (cjkCHS,
322 MSCompatUnicodeTableUtil.CjkCHS);
323 cjkCHT = CompressArray (cjkCHT,
324 MSCompatUnicodeTableUtil.Cjk);
325 cjkJA = CompressArray (cjkJA,
326 MSCompatUnicodeTableUtil.Cjk);
327 cjkKO = CompressArray (cjkKO,
328 MSCompatUnicodeTableUtil.Cjk);
329 cjkKOlv2 = CompressArray (cjkKOlv2,
330 MSCompatUnicodeTableUtil.Cjk);
333 Result.WriteLine ("internal static readonly byte [] ignorableFlags = new byte [] {");
335 MemoryStream ms = new MemoryStream ();
336 BinaryWriter binary = new BinaryWriter (ms);
337 binary.Write (ignorableFlags.Length);
339 for (int i = 0; i < ignorableFlags.Length; i++) {
340 byte value = ignorableFlags [i];
342 Result.Write ("{0},", value);
344 Result.Write ("0x{0:X02},", value);
346 binary.Write (value);
348 if ((i & 0xF) == 0xF)
349 Result.WriteLine ("// {0:X04}", i - 0xF);
351 Result.WriteLine ("};");
355 Result.WriteLine ("internal static readonly byte [] categories = new byte [] {");
357 binary.Write (categories.Length);
359 for (int i = 0; i < categories.Length; i++) {
360 byte value = categories [i];
362 Result.Write ("{0},", value);
364 Result.Write ("0x{0:X02},", value);
366 binary.Write (value);
368 if ((i & 0xF) == 0xF)
369 Result.WriteLine ("// {0:X04}", i - 0xF);
371 Result.WriteLine ("};");
374 // Primary weight value
375 Result.WriteLine ("internal static readonly byte [] level1 = new byte [] {");
377 binary.Write (level1.Length);
379 for (int i = 0; i < level1.Length; i++) {
380 byte value = level1 [i];
382 Result.Write ("{0},", value);
384 Result.Write ("0x{0:X02},", value);
386 binary.Write (value);
388 if ((i & 0xF) == 0xF)
389 Result.WriteLine ("// {0:X04}", i - 0xF);
391 Result.WriteLine ("};");
395 Result.WriteLine ("internal static readonly byte [] level2 = new byte [] {");
397 binary.Write (level2.Length);
399 for (int i = 0; i < level2.Length; i++) {
400 byte value = level2 [i];
402 Result.Write ("{0},", value);
404 Result.Write ("0x{0:X02},", value);
406 binary.Write (value);
408 if ((i & 0xF) == 0xF)
409 Result.WriteLine ("// {0:X04}", i - 0xF);
411 Result.WriteLine ("};");
415 Result.WriteLine ("internal static readonly byte [] level3 = new byte [] {");
417 binary.Write (level3.Length);
419 for (int i = 0; i < level3.Length; i++) {
420 byte value = level3 [i];
422 Result.Write ("{0},", value);
424 Result.Write ("0x{0:X02},", value);
426 binary.Write (value);
428 if ((i & 0xF) == 0xF)
429 Result.WriteLine ("// {0:X04}", i - 0xF);
431 Result.WriteLine ("};");
434 // Width insensitivity mappings
435 // (for now it is more lightweight than dumping the
436 // entire NFKD table).
437 Result.WriteLine ("internal static readonly ushort [] widthCompat = new ushort [] {");
439 binary.Write (widthCompat.Length);
441 for (int i = 0; i < widthCompat.Length; i++) {
442 ushort value = widthCompat [i];
444 Result.Write ("{0},", value);
446 Result.Write ("0x{0:X02},", value);
448 binary.Write (value);
450 if ((i & 0xF) == 0xF)
451 Result.WriteLine ("// {0:X04}", i - 0xF);
453 Result.WriteLine ("};");
456 using (FileStream fs = File.Create ("../collation.core.bin")) {
457 byte [] array = ms.ToArray ();
458 fs.Write (array, 0, array.Length);
463 SerializeCJK ("cjkCHS", cjkCHS, char.MaxValue);
464 SerializeCJK ("cjkCHT", cjkCHT, 0x9FB0);
465 SerializeCJK ("cjkJA", cjkJA, 0x9FB0);
466 SerializeCJK ("cjkKO", cjkKO, 0x9FB0);
467 SerializeCJK ("cjkKOlv2", cjkKOlv2, 0x9FB0);
// Emits one ushort CJK table twice: as a C# array initializer on Result and
// as a binary file "../collation.<name>.bin".
//   name: identifier used for the generated array and in the file name.
//   cjk:  table to write.
//   max:  index at which the loop takes special action (see below).
// The binary stream starts with the element count (Int32), followed by the
// elements.
470 void SerializeCJK (string name, ushort [] cjk, int max)
// offset is always 0 here; the original computation is commented out.
472 int offset = 0;//char.MaxValue - cjk.Length;
473 Result.WriteLine ("static ushort [] {0} = new ushort [] {{", name);
475 MemoryStream ms = new MemoryStream ();
476 BinaryWriter binary = new BinaryWriter (ms);
477 binary.Write (cjk.Length);
479 for (int i = 0; i < cjk.Length; i++) {
// NOTE(review): the statement controlled by this check is not visible in
// this excerpt; presumably it stops the loop at max - confirm.
480 if (i + offset == max)
482 ushort value = cjk [i];
// NOTE(review): both a decimal and a hex form are written below; whatever
// selects between them is not visible in this excerpt.
484 Result.Write ("{0},", value);
486 Result.Write ("0x{0:X04},", value);
488 binary.Write (value);
// After every 16th element, close the text row with a comment giving the
// index of the row's first element.
490 if ((i & 0xF) == 0xF)
491 Result.WriteLine ("// {0:X04}", i - 0xF + offset);
493 Result.WriteLine ("};");
496 using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) {
497 byte [] array = ms.ToArray ();
498 fs.Write (array, 0, array.Length);
// Emits one byte CJK table twice: as a C# array initializer on Result and as
// a binary file "../collation.<name>.bin".
//   name: identifier used for the generated array and in the file name.
//   cjk:  table to write.
//   max:  index at which the loop takes special action (see below).
// NOTE(review): unlike the ushort overload, no element count is written to
// the binary stream in the lines visible here - confirm that the reader of
// this file expects that.
503 void SerializeCJK (string name, byte [] cjk, int max)
// offset is always 0 here; the original computation is commented out.
505 int offset = 0;//char.MaxValue - cjk.Length;
506 Result.WriteLine ("static byte [] {0} = new byte [] {{", name);
508 MemoryStream ms = new MemoryStream ();
509 BinaryWriter binary = new BinaryWriter (ms);
511 for (int i = 0; i < cjk.Length; i++) {
// NOTE(review): the statement controlled by this check is not visible in
// this excerpt; presumably it stops the loop at max - confirm.
512 if (i + offset == max)
514 byte value = cjk [i];
// NOTE(review): both a decimal and a hex form are written below; whatever
// selects between them is not visible in this excerpt.
516 Result.Write ("{0},", value);
518 Result.Write ("0x{0:X02},", value);
520 binary.Write (value);
// After every 16th element, close the text row with a comment giving the
// index of the row's first element.
522 if ((i & 0xF) == 0xF)
523 Result.WriteLine ("// {0:X04}", i - 0xF + offset);
525 Result.WriteLine ("};");
528 using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) {
529 byte [] array = ms.ToArray ();
530 fs.Write (array, 0, array.Length);
// Writes all collected tailorings twice: as C# source on Result (a char
// array plus a TailoringInfo array) and as "../collation.tailoring.bin".
// Throws Exception when a tailoring's alias target has no entry of its own.
535 void SerializeTailorings ()
// LCID -> start index of that tailoring's data in the char array.
537 Hashtable indexes = new Hashtable ();
// LCID -> number of chars that tailoring contributes.
538 Hashtable counts = new Hashtable ();
539 Result.WriteLine ("static char [] tailorings = new char [] {");
// First pass: the raw char data, collected in a temporary stream.
542 MemoryStream ms = new MemoryStream ();
543 BinaryWriter binary = new BinaryWriter (ms);
545 foreach (Tailoring t in tailorings) {
548 Result.Write ("/*{0}*/", t.LCID);
549 indexes.Add (t.LCID, count);
550 char [] values = t.ItemToCharArray ();
551 counts.Add (t.LCID, values.Length);
552 foreach (char c in values) {
553 Result.Write ("'\\x{0:X}', ", (int) c);
554 if (++count % 16 == 0)
555 Result.WriteLine (" // {0:X04}", count - 16);
// Each char is stored as a 2-byte ushort.
557 binary.Write ((ushort) c);
561 Result.WriteLine ("};");
563 Result.WriteLine ("static TailoringInfo [] tailoringInfos = new TailoringInfo [] {");
// Second pass: the info table goes first into a fresh stream; the raw data
// captured above is appended after it.
565 byte [] rawdata = ms.ToArray ();
566 ms = new MemoryStream ();
567 binary = new BinaryWriter (ms);
568 binary.Write (tailorings.Count);
570 foreach (Tailoring t in tailorings) {
// An entry with a non-zero Alias reuses the index and count of the LCID it
// points to.
571 int target = t.Alias != 0 ? t.Alias : t.LCID;
572 if (!indexes.ContainsKey (target)) {
// Despite the "WARNING" wording this is thrown and aborts serialization.
573 throw new Exception (String.Format ("WARNING: no corresponding definition for tailoring alias. From {0} to {1}", t.LCID, t.Alias));
576 int idx = (int) indexes [target];
577 int cnt = (int) counts [target];
578 bool french = t.FrenchSort;
// NOTE(review): this search compares against t.LCID rather than the alias
// target, so in the visible code it can only find entries with t's own
// LCID - confirm whether t.Alias was intended.
580 foreach (Tailoring t2 in tailorings)
581 if (t2.LCID == t.LCID)
582 french = t2.FrenchSort;
583 Result.WriteLine ("new TailoringInfo ({0}, 0x{1:X}, {2}, {3}), ", t.LCID, idx, cnt, french ? "true" : "false");
// The binary writes between these two statements are not visible in this
// excerpt.
585 binary.Write (t.LCID);
588 binary.Write (french);
591 Result.WriteLine ("};");
// Trailer: two 0xFF bytes, then the raw data length in chars (bytes / 2,
// since each char was written as a ushort), then the raw data itself.
593 binary.Write ((byte) 0xFF);
594 binary.Write ((byte) 0xFF);
595 binary.Write (rawdata.Length / 2);
596 binary.Write (rawdata, 0, rawdata.Length);
599 using (FileStream fs = File.Create ("../collation.tailoring.bin")) {
600 byte [] array = ms.ToArray ();
601 fs.Write (array, 0, array.Length);
// Builds the input file paths under dirname and runs every parser in the
// required order.
//   dirname: directory containing the Unicode data files and the
//            common/collation XML files.
// NOTE(review): the declarations that receive several of the paths below are
// not visible in this excerpt; the variable names are taken from the parser
// calls further down.
608 void ParseSources (string dirname)
611 dirname + "/UnicodeData.txt";
612 string derivedCoreProps =
613 dirname + "/DerivedCoreProperties.txt";
615 dirname + "/Scripts.txt";
617 dirname + "/CP932.TXT";
619 dirname + "/DerivedAge.txt";
620 string chXML = dirname + "/common/collation/zh.xml";
621 string jaXML = dirname + "/common/collation/ja.xml";
622 string koXML = dirname + "/common/collation/ko.xml";
624 ParseDerivedAge (derivedAge);
628 ParseJISOrder (cp932); // must run before ParseUnidata ()
629 ParseUnidata (unidata);
631 ParseDerivedCoreProperties (derivedCoreProps);
632 ParseScripts (scripts);
633 ParseCJK (chXML, jaXML, koXML);
// The tailoring source is resolved against the current directory, not
// against dirname.
635 ParseTailorings ("mono-tailoring-source.txt");
// Reads the tailoring source file line by line and feeds each trimmed line
// to ProcessTailoringLine, which updates the current Tailoring through the
// ref argument.
//   filename: path of the tailoring source file.
// When processing a line throws, the line number is reported on stderr.
// NOTE(review): the declarations of t and line, and what follows the error
// message in the catch block, are not visible in this excerpt.
638 void ParseTailorings (string filename)
642 using (StreamReader sr = new StreamReader (filename)) {
644 while (sr.Peek () >= 0) {
646 ProcessTailoringLine (ref t,
647 sr.ReadLine ().Trim ());
649 } catch (Exception) {
650 Console.Error.WriteLine ("ERROR at line {0}", line);
// Decodes a tailoring source token into the string it denotes.
//
// Every "\uXXXX" escape (backslash, 'u', four hex digits) is replaced by the
// UTF-16 code unit with that value; every other character is copied as is.
// The escape is matched at the current scan position, so escapes may appear
// anywhere in the token and any number of times. (The previous code tested
// the start of the whole string on every iteration, so only a single
// leading escape was decoded correctly.)
//
//   s: the token to decode; must not be null.
// Returns the decoded string.
// A "\u" with fewer than four characters after it is copied literally.
// Four following characters that are not valid hex make int.Parse throw
// FormatException.
string ParseTailoringSourceValue (string s)
{
	StringBuilder sb = new StringBuilder (s.Length);
	for (int i = 0; i < s.Length; i++) {
		// An escape needs six characters starting at i.
		bool isEscape = i + 5 < s.Length &&
			s [i] == '\\' && s [i + 1] == 'u';
		if (!isEscape) {
			sb.Append (s [i]);
			continue;
		}
		sb.Append ((char) int.Parse (
			s.Substring (i + 2, 4), NumberStyles.HexNumber));
		// Skip the rest of the escape; the loop increment consumes
		// its sixth character.
		i += 5;
	}
	return sb.ToString ();
}
// Interprets one line of the tailoring source file.
//   t: the tailoring currently being filled; replaced when the line starts a
//      new tailoring.
//   s: the (already trimmed) line.
// Forms recognised in the visible code:
//   text after '#'           comment, removed before further processing
//   "*FrenchSort"            directive
//   "*Diacritical XX -> YY"  two hex bytes passed to AddDiacriticalMap
//   "source : b0 b1 b2 b3"   hex bytes passed to AddSortKeyMap
//   "source = replacement"   passed to AddReplacementMap
// NOTE(review): the conditions that choose between these forms, and the
// early returns between them, are not visible in this excerpt.
673 void ProcessTailoringLine (ref Tailoring t, string s)
675 int idx = s.IndexOf ('#');
677 s = s.Substring (0, idx).Trim ();
678 if (s.Length == 0 || s [0] == '#')
// Start of a new tailoring: the first character is skipped and the rest is
// parsed as one integer, or as two integers separated by '='.
681 idx = s.IndexOf ('=');
684 int.Parse (s.Substring (1, idx - 1)),
685 int.Parse (s.Substring (idx + 1)));
687 t = new Tailoring (int.Parse (s.Substring (1)));
691 if (s.StartsWith ("*FrenchSort")) {
695 string d = "*Diacritical";
696 if (s.StartsWith (d)) {
// Both sides of "->" are parsed as hexadecimal bytes.
697 idx = s.IndexOf ("->");
698 t.AddDiacriticalMap (
699 byte.Parse (s.Substring (d.Length, idx - d.Length).Trim (),
700 NumberStyles.HexNumber),
701 byte.Parse (s.Substring (idx + 2).Trim (),
702 NumberStyles.HexNumber));
// Sort key mapping: the text before ':' is the source, the space-separated
// hex values after it fill a 4-byte array.
705 idx = s.IndexOf (':');
707 string source = s.Substring (0, idx).Trim ();
708 string [] l = s.Substring (idx + 1).Trim ().Split (' ');
709 byte [] b = new byte [4];
710 for (int i = 0; i < 4; i++) {
714 b [i] = byte.Parse (l [i],
715 NumberStyles.HexNumber);
717 t.AddSortKeyMap (ParseTailoringSourceValue (source),
// Replacement mapping: both sides of '=' are decoded with
// ParseTailoringSourceValue.
720 idx = s.IndexOf ('=');
722 t.AddReplacementMap (
723 ParseTailoringSourceValue (
724 s.Substring (0, idx).Trim ()),
725 ParseTailoringSourceValue (
726 s.Substring (idx + 1).Trim ()));
// Reads a "code point(s) ; value" file and parses the value of every entry
// as a double; the loop at the end of each entry covers the entry's code
// point range.
//   filename: path of the file to read (ParseSources passes the path ending
//             in "/DerivedAge.txt").
// NOTE(review): the body of the per-code-point loop is not visible in this
// excerpt; presumably it stores the value in unicodeAge [] - confirm.
729 void ParseDerivedAge (string filename)
731 using (StreamReader file =
732 new StreamReader (filename)) {
733 while (file.Peek () >= 0) {
734 string s = file.ReadLine ();
// Text after '#' is a comment.
735 int idx = s.IndexOf ('#');
737 s = s.Substring (0, idx);
738 idx = s.IndexOf (';');
// The part before ';' is either a single hex code point or a "first..last"
// range.
742 string cpspec = s.Substring (0, idx);
743 idx = cpspec.IndexOf ("..");
744 NumberStyles nf = NumberStyles.HexNumber |
745 NumberStyles.AllowTrailingWhite;
746 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
747 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
748 string value = s.Substring (cpspec.Length + 1).Trim ();
// Only the start of a range is compared against char.MaxValue here.
751 if (cp > char.MaxValue)
// NOTE(review): double.Parse without a format provider uses the current
// culture; a culture with ',' as decimal separator would misread "3.1".
754 double v = double.Parse (value);
755 for (int i = cp; i <= cpEnd; i++)
759 unicodeAge [0] = double.MaxValue; // never be supported
// Reads the Unicode data file line by line, passing each line to
// ProcessUnidataLine, and afterwards stores the collected decomposition
// values in this.decompValues as an int [].
//   filename: path of the file to read.
// When processing a line throws, the 1-based line number is reported on
// stderr.
// NOTE(review): what follows the message in the catch block is not visible
// in this excerpt.
762 void ParseUnidata (string filename)
// Local accumulator; it shadows the decompValues field until the final
// assignment.
764 ArrayList decompValues = new ArrayList ();
765 using (StreamReader unidata =
766 new StreamReader (filename)) {
767 for (int line = 1; unidata.Peek () >= 0; line++) {
769 ProcessUnidataLine (unidata.ReadLine (), decompValues);
770 } catch (Exception) {
771 Console.Error.WriteLine ("**** At line " + line);
776 this.decompValues = (int [])
777 decompValues.ToArray (typeof (int));
780 char previousLatinTarget = char.MinValue;
781 byte [] diacriticalOffset = new byte ['Z' - 'A' + 1];
// Processes one line of the Unicode data file read by ParseUnidata.
//   s:            the raw line ("codepoint;name;...").
//   decompValues: shared list that receives the decomposition code points of
//                 every character; decompIndex [cp] / decompLength [cp]
//                 record where a character's values start and how many
//                 there are.
// From the line it derives, per code point: small-capital flag, Latin base
// letter grouping, arrow and box-drawing values, names to be sorted later,
// diacritical weights, Arabic letter primary values, non-JIS Japanese square
// letters, decomposition type and mapping, and the numeric value.
// NOTE(review): many short lines (branch bodies, early returns, closing
// braces) are not visible in this excerpt, so the comments below describe
// only what the visible statements establish.
783 void ProcessUnidataLine (string s, ArrayList decompValues)
// Text after '#' is a comment.
785 int idx = s.IndexOf ('#');
787 s = s.Substring (0, idx);
// The first field is the hex code point; the remaining ';'-separated fields
// go into values [], so values [0] is the character name.
788 idx = s.IndexOf (';');
791 int cp = int.Parse (s.Substring (0, idx), NumberStyles.HexNumber);
792 string [] values = s.Substring (idx + 1).Split (';');
795 if (cp > char.MaxValue)
797 if (IsIgnorable (cp))
800 string name = values [0];
802 // SPECIAL CASE: rename some characters for diacritical
803 // remapping. FIXME: why are they different?
804 // FIXME: it's still not working.
// NOTE(review): the diacritic matching further down searches s, not name,
// so in the visible code this rename does not reach it.
805 if (cp == 0x018B || cp == 0x018C)
806 name = name.Replace ("TOPBAR", "STROKE");
809 if (s.IndexOf ("SMALL CAPITAL") > 0)
810 isSmallCapital [cp] = true;
812 // latin mapping by character name
813 if (s.IndexOf ("LATIN") >= 0) {
// Find where the letter name starts by trying the known "LETTER ..."
// prefixes; offset is the index just after the matched prefix.
814 int lidx = s.IndexOf ("LETTER DOTLESS ");
815 int offset = lidx + 15;
817 lidx = s.IndexOf ("LETTER TURNED ");
821 lidx = s.IndexOf ("LETTER CAPITAL ");
825 lidx = s.IndexOf ("LETTER SCRIPT ");
829 lidx = s.IndexOf ("LETTER ");
// c is the first character of the letter name, n the one after it.
832 char c = lidx > 0 ? s [offset] : char.MinValue;
833 char n = s [offset + 1];
834 char target = char.MinValue;
// NOTE(review): && binds tighter than ||, so this condition is
// ('A' <= c && c <= 'Z' && n == ' ') || n == ';' - confirm that the range
// check is not meant to apply to the n == ';' case as well.
835 if ('A' <= c && c <= 'Z' &&
836 (n == ' ') || n == ';') {
838 // FIXME: After 'Z', I cannot reset this state.
839 previousLatinTarget = c == 'Z' ? char.MinValue : c;
// Letter names that are not a single A-Z letter are matched by name; the
// target chosen in each case is not visible in this excerpt.
842 if (s.Substring (offset).StartsWith ("ALPHA"))
844 else if (s.Substring (offset).StartsWith ("TONE SIX"))
846 else if (s.Substring (offset).StartsWith ("OPEN O"))
848 else if (s.Substring (offset).StartsWith ("SCHWA"))
850 else if (s.Substring (offset).StartsWith ("ENG"))
852 else if (s.Substring (offset).StartsWith ("OI;")) // 01A2,01A3
854 else if (s.Substring (offset).StartsWith ("YR;")) // 01A2,01A3
856 else if (s.Substring (offset).StartsWith ("TONE TWO"))
858 else if (s.Substring (offset).StartsWith ("ESH"))
// Fall back to the most recently seen plain Latin letter.
861 if (target == char.MinValue)
862 target = previousLatinTarget;
864 if (target != char.MinValue) {
865 ArrayList entry = (ArrayList) latinMap [target];
867 entry = new ArrayList ();
868 latinMap [target] = entry;
871 // FIXME: This secondary weight is hack.
872 // They are here because they must not
873 // be identical to the corresponding
// NOTE(review): diacriticalOffset has 'Z' - 'A' + 1 entries, so c - 'A' is
// only a valid index when c is in A-Z; no such guard is visible here.
875 if (c != target && diacritical [cp] == 0) {
876 diacriticalOffset [c - 'A']++;
877 diacritical [cp] = (byte) (diacriticalOffset [c - 'A'] + 0x7C);
// Arrow values for U+2000..U+2FFF.
883 if (0x2000 <= cp && cp < 0x3000) {
885 // SPECIAL CASES. FIXME: why?
887 case 0x21C5: value = -1; break; // E2
888 case 0x261D: value = 1; break;
889 case 0x27A6: value = 3; break;
890 case 0x21B0: value = 7; break;
891 case 0x21B1: value = 3; break;
892 case 0x21B2: value = 7; break;
893 case 0x21B4: value = 5; break;
894 case 0x21B5: value = 7; break;
895 case 0x21B9: value = -1; break; // E1
896 case 0x21CF: value = 7; break;
897 case 0x21D0: value = 3; break;
899 string [] arrowTargets = new string [] {
// Characters without a special case are matched against the arrowTargets
// names, skipping "BARB <target>" and " OVER" names.
911 for (int i = 1; value == 0 && i < arrowTargets.Length; i++)
912 if (s.IndexOf (arrowTargets [i]) > 0 &&
913 s.IndexOf ("BARB " + arrowTargets [i]) < 0 &&
914 s.IndexOf (" OVER") < 0
918 arrowValues.Add (new DictionaryEntry (
// Box drawing, block and geometric shape values for U+2500..U+25FF.
923 if (0x2500 <= cp && cp < 0x2600) {
926 // up:1 down:2 right:4 left:8 vert:16 horiz:32
929 // [dr] [dl] [ur] [ul]
933 ArrayList flags = new ArrayList (new int [] {
936 4 + 2, 8 + 2, 4 + 1, 8 + 1,
937 16 + 4, 1 + 2 + 4, 16 + 8, 1 + 2 + 8,
938 32 + 2, 4 + 8 + 2, 32 + 1, 4 + 8 + 1,
939 16 + 32, 1 + 2 + 4 + 8, 4 + 8 + 16, 1 + 2 + 32
941 byte [] offsets = new byte [] {
948 if (s.IndexOf ("BOX DRAWINGS ") >= 0) {
// The direction words found in the name are combined into a flag value (the
// individual additions are not visible in this excerpt); the flag's
// position in flags selects the value from offsets.
950 if (s.IndexOf (" UP") >= 0)
952 if (s.IndexOf (" DOWN") >= 0)
954 if (s.IndexOf (" RIGHT") >= 0)
956 if (s.IndexOf (" LEFT") >= 0)
958 if (s.IndexOf (" VERTICAL") >= 0)
960 if (s.IndexOf (" HORIZONTAL") >= 0)
963 int fidx = flags.IndexOf (flag);
// An unknown flag combination yields the negative IndexOf result.
964 value = fidx < 0 ? fidx : offsets [fidx];
965 } else if (s.IndexOf ("BLOCK") >= 0) {
966 if (s.IndexOf ("ONE EIGHTH") >= 0)
968 else if (s.IndexOf ("ONE QUARTER") >= 0)
970 else if (s.IndexOf ("THREE EIGHTHS") >= 0)
972 else if (s.IndexOf ("HALF") >= 0)
974 else if (s.IndexOf ("FIVE EIGHTHS") >= 0)
976 else if (s.IndexOf ("THREE QUARTERS") >= 0)
978 else if (s.IndexOf ("SEVEN EIGHTHS") >= 0)
983 else if (s.IndexOf ("SHADE") >= 0)
985 else if (s.IndexOf ("SQUARE") >= 0)
987 else if (s.IndexOf ("VERTICAL RECTANGLE") >= 0)
989 else if (s.IndexOf ("RECTANGLE") >= 0)
991 else if (s.IndexOf ("PARALLELOGRAM") >= 0)
993 else if (s.IndexOf ("TRIANGLE") >= 0) {
994 if (s.IndexOf ("UP-POINTING") >= 0)
996 else if (s.IndexOf ("RIGHT-POINTING") >= 0)
998 else if (s.IndexOf ("DOWN-POINTING") >= 0)
1000 else if (s.IndexOf ("LEFT-POINTING") >= 0)
1001 value = 0xC3 - 0xE5;
1003 else if (s.IndexOf ("POINTER") >= 0) {
1004 if (s.IndexOf ("RIGHT-POINTING") >= 0)
1005 value = 0xC4 - 0xE5;
1006 else if (s.IndexOf ("LEFT-POINTING") >= 0)
1007 value = 0xC5 - 0xE5;
1009 else if (s.IndexOf ("DIAMOND") >= 0)
1010 value = 0xC6 - 0xE5;
1011 else if (s.IndexOf ("FISHEYE") >= 0)
1012 value = 0xC7 - 0xE5;
1013 else if (s.IndexOf ("LOZENGE") >= 0)
1014 value = 0xC8 - 0xE5;
1015 else if (s.IndexOf ("BULLSEYE") >= 0)
1016 value = 0xC9 - 0xE5;
1017 else if (s.IndexOf ("CIRCLE") >= 0) {
1018 if (cp == 0x25D6) // it could be IndexOf ("LEFT HALF BLACK CIRCLE")
1019 value = 0xCA - 0xE5;
1020 else if (cp == 0x25D7) // it could be IndexOf ("RIGHT HALF BLACK CIRCLE")
1021 value = 0xCB - 0xE5;
1023 value = 0xC9 - 0xE5;
// U+25DA..U+25E5 get consecutive values starting at 0xCD - 0xE5.
1025 if (0x25DA <= cp && cp <= 0x25E5)
1026 value = 0xCD + cp - 0x25DA - 0xE5;
1028 // SPECIAL CASE: BOX DRAWING DIAGONAL patterns
1030 case 0x2571: value = 0xF; break;
1031 case 0x2572: value = 0x10; break;
1032 case 0x2573: value = 0x11; break;
1035 boxValues.Add (new DictionaryEntry (
1039 // For some characters store the name and sort later
1040 // to determine sorting.
1041 if (0x2100 <= cp && cp <= 0x213F &&
1042 Char.IsSymbol ((char) cp))
1043 sortableCharNames.Add (
1044 new DictionaryEntry (cp, name));
// For U+3380..U+33DD the first 7 characters of the name are dropped.
1045 else if (0x3380 <= cp && cp <= 0x33DD)
1046 sortableCharNames.Add (new DictionaryEntry (
1047 cp, name.Substring (7)));
1049 if (Char.GetUnicodeCategory ((char) cp) ==
1050 UnicodeCategory.MathSymbol) {
1051 if (name.StartsWith ("CIRCLED "))
1052 diacritical [cp] = 0xEE;
1053 if (name.StartsWith ("SQUARED "))
1054 diacritical [cp] = 0xEF;
1057 // diacritical weights by character name
// diacritics [] and diacriticWeights [] are parallel tables; a length
// mismatch is treated as a fatal error.
1058 if (diacritics.Length != diacriticWeights.Length)
1059 throw new Exception (String.Format ("Should not happen. weights are {0} while labels are {1}", diacriticWeights.Length, diacritics.Length));
1060 for (int d = 0; d < diacritics.Length; d++) {
// Weights accumulate when several labels match; names that also contain
// "COMBINING" get 2 subtracted.
1061 if (s.IndexOf (diacritics [d]) > 0) {
1062 diacritical [cp] += diacriticWeights [d];
1063 if (s.IndexOf ("COMBINING") >= 0)
1064 diacritical [cp] -= (byte) 2;
1067 // also process "COMBINING blah" here
1068 // For now it is limited to cp < 0x0370
1069 // if (cp < 0x0300 || cp >= 0x0370)
// Build the "COMBINING <label>" form of the label: drop a trailing ';' and
// a leading "WITH".
1071 string tmp = diacritics [d].TrimEnd (';');
1072 if (tmp.IndexOf ("WITH ") == 0)
1073 tmp = tmp.Substring (4);
1074 tmp = String.Concat ("COMBINING", (tmp [0] != ' ' ? " " : ""), tmp);
1076 diacritical [cp] = (byte) (diacriticWeights [d] - 2);
1078 //Console.Error.WriteLine ("======= {2:X04} : '{0}' / '{1}'", name, tmp, cp);
1080 // Two-step grep required for it.
1081 if (s.IndexOf ("FULL STOP") > 0 &&
1082 (s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0))
1083 diacritical [cp] |= 0xF4;
1085 // Arabic letter name
1086 if (0x0621 <= cp && cp <= 0x064A &&
1087 Char.GetUnicodeCategory ((char) cp)
1088 == UnicodeCategory.OtherLetter) {
// Default primary value: depends on how many distinct letter names have
// been registered so far.
1089 byte value = (byte) (arabicNameMap.Count * 4 + 0x0B);
1094 // hamza, waw, yeh ... special cases.
1099 value = 0x77; // special cases.
1102 // Get primary letter name i.e.
1103 // XXX part of ARABIC LETTER XXX yyy
1104 // e.g. that of "TEH MARBUTA" is "TEH".
1107 // 0x0640 is special: it does
1108 // not start with ARABIC LETTER
1110 name.Substring (14);
1111 int tmpIdx = letterName.IndexOf (' ');
1112 letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
1113 //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
// A letter name seen before reuses the primary value of the code point that
// first registered it.
1114 if (arabicNameMap.ContainsKey (letterName))
1115 value = (byte) arabicLetterPrimaryValues [arabicNameMap [letterName]];
1117 arabicNameMap [letterName] = cp;
1120 arabicLetterPrimaryValues [cp] = value;
1123 // Japanese square letter
1124 if (0x3300 <= cp && cp <= 0x3357)
1125 if (!ExistsJIS (cp))
1126 nonJisJapanese.Add (new NonJISCharacter (cp, name));
1128 // normalizationType
1129 string decomp = values [4];
1130 idx = decomp.IndexOf ('<');
// NOTE(review): the second Substring argument is a length, so this extracts
// exactly the text between '<' and '>' only when '<' is at index 0.
1132 switch (decomp.Substring (idx + 1, decomp.IndexOf ('>') - 1)) {
1134 decompType [cp] = DecompositionFull;
1137 decompType [cp] = DecompositionSub;
1140 decompType [cp] = DecompositionSuper;
1143 decompType [cp] = DecompositionSmall;
1146 decompType [cp] = DecompositionIsolated;
1149 decompType [cp] = DecompositionInitial;
1152 decompType [cp] = DecompositionFinal;
1155 decompType [cp] = DecompositionMedial;
1158 decompType [cp] = DecompositionNoBreak;
1161 decompType [cp] = DecompositionCompat;
1164 decompType [cp] = DecompositionFraction;
1167 decompType [cp] = DecompositionFont;
1170 decompType [cp] = DecompositionCircle;
1173 decompType [cp] = DecompositionSquare;
1176 decompType [cp] = DecompositionWide;
1179 decompType [cp] = DecompositionNarrow;
1182 decompType [cp] = DecompositionVertical;
1185 throw new Exception ("Support NFKD type : " + decomp);
1189 decompType [cp] = DecompositionCanonical;
// Strip the "<tag> " prefix (if any), leaving the space-separated hex code
// points.
1190 decomp = idx < 0 ? decomp : decomp.Substring (decomp.IndexOf ('>') + 2);
1191 if (decomp.Length > 0) {
1193 string [] velems = decomp.Split (' ');
// Append this character's decomposition to the shared list and remember its
// start index and length.
1194 int didx = decompValues.Count;
1195 decompIndex [cp] = didx;
1196 foreach (string v in velems)
1197 decompValues.Add (int.Parse (v, NumberStyles.HexNumber));
1198 decompLength [cp] = velems.Length;
1200 // [decmpType] -> this_cp
1201 int targetCP = (int) decompValues [didx];
1202 // for "(x)" it specially maps to 'x' .
1203 // FIXME: check if it is sane
1204 if (velems.Length == 3 &&
1205 (int) decompValues [didx] == '(' &&
1206 (int) decompValues [didx + 2] == ')')
1207 targetCP = (int) decompValues [didx + 1];
1208 // special: 0x215F "1/"
1209 else if (cp == 0x215F)
1211 else if (velems.Length > 1 &&
1212 (targetCP < 0x4C00 || 0x9FBB < targetCP))
1213 // skip them, except for CJK ideograph compat
// Register the reverse mapping: target code point -> (decomposition type ->
// this code point).
1216 if (targetCP != 0) {
1217 Hashtable entry = (Hashtable) nfkdMap [targetCP];
1218 if (entry == null) {
1219 entry = new Hashtable ();
1220 nfkdMap [targetCP] = entry;
1222 entry [(byte) decompType [cp]] = cp;
// Numeric value: the first non-empty field of values [5], [6], [7] wins.
1226 if (values [5].Length > 0)
1227 decimalValue [cp] = decimal.Parse (values [5]);
1228 else if (values [6].Length > 0)
1229 decimalValue [cp] = decimal.Parse (values [6]);
1230 else if (values [7].Length > 0) {
// values [7] may be a fraction "a/b", a parenthesised "(n)", a number with
// a trailing '.', or a plain number.
1231 string decstr = values [7];
1232 idx = decstr.IndexOf ('/');
1233 if (cp == 0x215F) // special. "1/"
1234 decimalValue [cp] = 0x1;
1238 decimal.Parse (decstr.Substring (0, idx))
1239 / decimal.Parse (decstr.Substring (idx + 1));
1240 else if (decstr [0] == '(' &&
1241 decstr [decstr.Length - 1] == ')')
1244 decimal.Parse (decstr.Substring (1, decstr.Length - 2));
1245 else if (decstr [decstr.Length - 1] == '.')
1248 decimal.Parse (decstr.Substring (0, decstr.Length - 1));
1250 decimalValue [cp] = decimal.Parse (decstr);
// Reads the given file line by line and passes each line to
// ProcessDerivedCorePropLine.
//   filename: path of the file to read (ParseSources passes the path ending
//             in "/DerivedCoreProperties.txt").
// When processing a line throws, the 1-based line number is reported on
// stderr.
// NOTE(review): what follows the message in the catch block is not visible
// in this excerpt.
1254 void ParseDerivedCoreProperties (string filename)
1257 using (StreamReader file =
1258 new StreamReader (filename)) {
1259 for (int line = 1; file.Peek () >= 0; line++) {
1261 ProcessDerivedCorePropLine (file.ReadLine ());
1262 } catch (Exception) {
1263 Console.Error.WriteLine ("**** At line " + line);
// Processes one "code point(s) ; property" line and sets isUppercase [] for
// every code point of a matching entry.
//   s: the raw line.
// NOTE(review): the test on value that guards the final loop is not visible
// in this excerpt; presumably it selects the uppercase property - confirm.
1270 void ProcessDerivedCorePropLine (string s)
// Text after '#' is a comment.
1272 int idx = s.IndexOf ('#');
1274 s = s.Substring (0, idx);
1275 idx = s.IndexOf (';');
// The part before ';' is either a single hex code point or a "first..last"
// range.
1278 string cpspec = s.Substring (0, idx);
1279 idx = cpspec.IndexOf ("..");
1280 NumberStyles nf = NumberStyles.HexNumber |
1281 NumberStyles.AllowTrailingWhite;
1282 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1283 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
1284 string value = s.Substring (cpspec.Length + 1).Trim ();
// Only the start of a range is compared against char.MaxValue here.
1287 if (cp > char.MaxValue)
1292 for (int x = cp; x <= cpEnd; x++)
1293 isUppercase [x] = true;
// Reads the scripts data file named by `filename` and builds the ordered
// character arrays orderedGurmukhi, orderedGujarati, orderedGeorgian and
// orderedThaana. Characters are collected per script, sorted with
// UCAComparer.Instance and then stored as char arrays.
// NOTE(review): the conditions that pick which list a line contributes to
// are missing from this excerpt; presumably they test `value`.
1298 void ParseScripts (string filename)
1300 ArrayList gurmukhi = new ArrayList ();
1301 ArrayList gujarati = new ArrayList ();
1302 ArrayList georgian = new ArrayList ();
1303 ArrayList thaana = new ArrayList ();
1305 using (StreamReader file =
1306 new StreamReader (filename)) {
1307 while (file.Peek () >= 0) {
1308 string s = file.ReadLine ();
// Keep only the text that precedes '#'.
1309 int idx = s.IndexOf ('#');
1311 s = s.Substring (0, idx);
// The text before ';' is the code point specification.
1312 idx = s.IndexOf (';');
1316 string cpspec = s.Substring (0, idx);
// ".." separates the two ends of a range; when it is absent the
// specification is a single code point and cpEnd equals cp.
1317 idx = cpspec.IndexOf ("..");
// Code points are written in hexadecimal, possibly followed by whitespace.
1318 NumberStyles nf = NumberStyles.HexNumber |
1319 NumberStyles.AllowTrailingWhite;
1320 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1321 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
// Everything after the ';' that ended cpspec, with surrounding whitespace removed.
1322 string value = s.Substring (cpspec.Length + 1).Trim ();
// Code points that do not fit in a char are special-cased (the guarded
// statement is not visible here; presumably the line is skipped).
1325 if (cp > char.MaxValue)
// Each loop below appends the code points of [cp, cpEnd] for which
// IsIgnorable returns false to one of the four script lists.
1330 for (int x = cp; x <= cpEnd; x++)
1331 if (!IsIgnorable (x))
1332 gurmukhi.Add ((char) x);
1335 for (int x = cp; x <= cpEnd; x++)
1336 if (!IsIgnorable (x))
1337 gujarati.Add ((char) x);
1340 for (int x = cp; x <= cpEnd; x++)
1341 if (!IsIgnorable (x))
1342 georgian.Add ((char) x);
1345 for (int x = cp; x <= cpEnd; x++)
1346 if (!IsIgnorable (x))
1347 thaana.Add ((char) x);
// Order each script's characters with the UCA comparer ...
1352 gurmukhi.Sort (UCAComparer.Instance);
1353 gujarati.Sort (UCAComparer.Instance);
1354 georgian.Sort (UCAComparer.Instance);
1355 thaana.Sort (UCAComparer.Instance);
// ... and publish them as char arrays in the corresponding fields.
1356 orderedGurmukhi = (char []) gurmukhi.ToArray (typeof (char));
1357 orderedGujarati = (char []) gujarati.ToArray (typeof (char));
1358 orderedGeorgian = (char []) georgian.ToArray (typeof (char));
1359 orderedThaana = (char []) thaana.ToArray (typeof (char));
// Reads the file named by `filename` one line at a time and hands each line
// to ProcessJISOrderLine.
// NOTE(review): the declaration of `line` and the opening of the try block
// are not visible in this excerpt.
1362 void ParseJISOrder (string filename)
1366 using (StreamReader file =
1367 new StreamReader (filename)) {
// `line` advances once per line read; it is only reported in the
// diagnostic below.
1368 for (;file.Peek () >= 0; line++)
1369 ProcessJISOrderLine (file.ReadLine ());
1371 } catch (Exception) {
// Tell the user (on standard error) how far the reader had got.
1372 Console.Error.WriteLine ("---- line {0}", line);
// Separators (tab and space) used by ProcessJISOrderLine to find the end of
// the first field.
1377 char [] ws = new char [] {'\t', ' '};
// Handles a single line of the JIS order file: parses two hexadecimal fields
// (the JIS code, then the code point) and appends the pair to jisJapanese as
// a JISCharacter.
// NOTE(review): the guards around the '#' strip and the separator lookup are
// missing from this excerpt, so the handling of comment-only or blank lines
// cannot be confirmed here.
1379 void ProcessJISOrderLine (string s)
// Keep only the text that precedes '#', trimmed.
1381 int idx = s.IndexOf ('#');
1383 s = s.Substring (0, idx).Trim ();
// idx becomes the position of the first tab or space, i.e. the end of the
// first field.
1386 idx = s.IndexOfAny (ws);
1389 // They start with "0x" so cut them out.
// First field: the characters between the "0x" prefix and the separator.
1390 int jis = int.Parse (s.Substring (2, idx - 2), NumberStyles.HexNumber);
// Second field: the rest of the line, trimmed, minus its two-character prefix.
1391 int cp = int.Parse (s.Substring (idx).Trim ().Substring (2), NumberStyles.HexNumber);
1392 jisJapanese.Add (new JISCharacter (cp, jis));
// Fills the per-culture CJK weight tables. Four sections follow: Chinese
// Simplified (LDML "pinyin" collation), Chinese Traditional (LDML "stroke"
// collation), Japanese (driven by jisJapanese) and Korean (LDML "reset"
// rules).
// NOTE(review): many statements are missing from this excerpt, among them
// the declarations of arr, v, s, offset, category and p, and whatever loads
// zhXML / jaXML / koXML into `doc`; presumably each section loads its own
// document and selects its own target array before the visible code runs.
1395 void ParseCJK (string zhXML, string jaXML, string koXML)
1397 XmlDocument doc = new XmlDocument ();
// No resolver: external resources referenced by the XML are not resolved.
1398 doc.XmlResolver = null;
1405 // Chinese Simplified
1408 offset = 0;//char.MaxValue - arr.Length;
// The text of the pinyin collation's <pc> element is an ordered run of
// characters.
1410 s = doc.SelectSingleNode ("/ldml/collations/collation[@type='pinyin']/rules/pc").InnerText;
// Walk the characters in order, giving each the next value of v. The
// condition that triggers the warning instead is not visible here.
1412 foreach (char c in s) {
1414 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1416 arr [(int) c - offset] = (ushort) v++;
1422 // Chinese Traditional
1425 offset = 0;//char.MaxValue - arr.Length;
// Same procedure as above, using the stroke collation.
1426 s = doc.SelectSingleNode ("/ldml/collations/collation[@type='stroke']/rules/pc").InnerText;
1428 foreach (char c in s) {
1430 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1432 arr [(int) c - offset] = (ushort) v++;
// Japanese section (its heading comment is not visible in this excerpt).
1441 offset = 0;//char.MaxValue - arr.Length;
// Fixed weights assigned before the JIS-ordered characters are processed.
1444 arr [0x4EDD] = 0x8002; // Chinese repetition mark?
1445 arr [0x337B] = 0x8004; // Those 4 characters are Gengou
1446 arr [0x337E] = 0x8005;
1447 arr [0x337D] = 0x8006;
1448 arr [0x337C] = 0x8007;
1451 foreach (JISCharacter jc in jisJapanese) {
// Entries with a JIS code below 0x8800 are special-cased (the guarded
// statement is not visible here; presumably they are skipped).
1452 if (jc.JIS < 0x8800)
1454 char c = (char) jc.CP;
1457 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1459 arr [(int) c - offset] = (ushort) v++;
// Special cases for individual ideographs; the trailing comments name a
// related code point. The guarded statements are not visible here.
1464 if (c == '\u662D') // U+337C
1466 if (c == '\u5927') // U+337D
1468 if (c == '\u5E73') // U+337B
1470 if (c == '\u660E') // U+337E
1472 if (c == '\u9686') // U+F9DC
1475 // FIXME: there are still remaining
1476 // characters after U+FA0C.
1477 // for (int k = 0; k < char.MaxValue; k++) {
// Scan the code points below U+FA0D for characters whose decomposition
// refers to c, and give each of them the next value of v as well.
1478 for (int k = 0; k < '\uFA0D'; k++) {
// Code points with no decomposition entry, or that are ignorable, are
// special-cased (guarded statement not visible; presumably skipped).
1479 if (decompIndex [k] == 0 || IsIgnorable (k))
// Match when the first decomposition value is c, or when the decomposition
// has length 3 and its second value is c.
1481 if (decompValues [decompIndex [k]] == c /*&&
1482 decompLength [k] == 1*/ ||
1483 decompLength [k] == 3 &&
1484 decompValues [decompIndex [k] + 1] == c) {
1485 arr [k - offset] = (ushort) v++;
1494 // Korean weight is somewhat complex. It first shifts
1495 // Hangul category from 52-x to 80-x (they are anyways
1496 // computed). CJK ideographs are placed at secondary
1497 // weight, like XX YY 01 zz 01, where XX and YY are
1498 // corresponding "reset" value and zz is 41,43,45...
1500 // Unlike chs,cht and ja, Korean value is a combined
1501 // ushort which is computed as category
1505 offset = 0;//char.MaxValue - arr.Length;
1507 foreach (XmlElement reset in doc.SelectNodes ("/ldml/collations/collation/rules/reset")) {
// The element that follows <reset>; NOTE(review): presumably the source of
// the string `s` iterated below, but that assignment is not visible here.
1508 XmlElement sc = (XmlElement) reset.NextSibling;
1509 // compute "category" and "level 1" for the
1510 // target "reset" Hangul syllable
1511 char rc = reset.InnerText [0];
// ri is the 1-based position of the reset syllable counted from U+AC00.
1512 int ri = ((int) rc - 0xAC00) + 1;
// Tail of an expression whose beginning is not visible (presumably the
// computation of p from ri).
1514 ((ri / 254) * 256 + (ri % 254) + 2);
1515 // Place the characters after the target.
// Every character of s shares the combined value p; cjkKOlv2 records v
// (as a byte) for the same slot.
1518 foreach (char c in s) {
1519 arr [(int) c - offset] = p;
1520 cjkKOlv2 [(int) c - offset] = (byte) v;
// Populates ignorableFlags for every char value from 0 to char.MaxValue.
// Each entry is a bit set: bit 1 is set when IsIgnorable is true, bit 2 when
// IsIgnorableSymbol is true and bit 4 when IsIgnorableNonSpacing is true.
1530 void FillIgnorables ()
1532 for (int i = 0; i <= char.MaxValue; i++) {
// Unassigned code points are special-cased (the guarded statement is not
// visible in this excerpt; presumably they are skipped).
1533 if (Char.GetUnicodeCategory ((char) i) ==
1534 UnicodeCategory.OtherNotAssigned)
1536 if (IsIgnorable (i))
1537 ignorableFlags [i] |= 1;
1538 if (IsIgnorableSymbol (i))
1539 ignorableFlags [i] |= 2;
1540 if (IsIgnorableNonSpacing (i))
1541 ignorableFlags [i] |= 4;
// Applies hand-written patches to the parsed Unicode data tables
// (decompType, decompIndex, decompLength, decompValues and diacritical).
// Several statements read a table entry that an earlier statement wrote, so
// the order of the statements matters.
1545 void ModifyUnidata ()
1547 // Modify some decomposition equivalence
// Clear the decomposition information of U+FE31 and U+FE32.
1548 decompType [0xFE31] = 0;
1549 decompIndex [0xFE31] = 0;
1550 decompLength [0xFE31] = 0;
1551 decompType [0xFE32] = 0;
1552 decompIndex [0xFE32] = 0;
1553 decompLength [0xFE32] = 0;
1555 // Korean parens numbers
1556 for (int i = 0x3200; i <= 0x321C; i++)
1557 diacritical [i] = 0xA;
1558 for (int i = 0x3260; i <= 0x327B; i++)
1559 diacritical [i] = 0xC;
1561 // LAMESPEC: these remappings should not be done.
1562 // Windows has incorrect CJK compat mappings.
// Overwrite the first decomposition value of each listed code point; for
// U+323B and U+3238 the decomposition length is also forced to 1.
1563 decompValues [decompIndex [0x32A9]] = 0x91AB;
1564 decompLength [0x323B] = 1;
1565 decompValues [decompIndex [0x323B]] = 0x5B78;
1566 decompValues [decompIndex [0x32AB]] = 0x5B78;
1567 decompValues [decompIndex [0x32A2]] = 0x5BEB;
1568 decompLength [0x3238] = 1;
1569 decompValues [decompIndex [0x3238]] = 0x52DE;
1570 decompValues [decompIndex [0x3298]] = 0x52DE;
1572 // LAMESPEC: custom remappings (not bugs, but not fine either: they are not standard compliant)
// U+FA0C takes over the decompValues slot of U+F929 and decomposes to the
// single value 0x5140; U+F929's own index and length are then cleared.
1573 decompIndex [0xFA0C] = decompIndex [0xF929]; // borrow U+F929 room (being empty)
1574 decompValues [decompIndex [0xFA0C]] = 0x5140;
1575 decompLength [0xFA0C] = 1;
1576 decompIndex [0xF929] = decompLength [0xF929] = 0;
1578 decompValues [decompIndex [0xF92C]] = 0x90DE;
// Post-processes values gathered while parsing: assigns secondary weights
// to number characters through the diacritical table, then renames or
// removes selected entries of sortableCharNames.
// NOTE(review): the declaration of `weight`, the switch header and the cases
// that lead to RemoveAt are missing from this excerpt.
1581 void ModifyParsedValues ()
1583 // number, secondary weights
// numarr is read in pairs: numarr [i] is the first code point of a range and
// numarr [i + 1] is the exclusive end. `weight` goes up by one per pair.
1585 int [] numarr = numberSecondaryWeightBounds;
1586 for (int i = 0; i < numarr.Length; i += 2, weight++)
1587 for (int cp = numarr [i]; cp < numarr [i + 1]; cp++)
// Only characters that Char.IsNumber accepts receive the weight.
1588 if (Char.IsNumber ((char) cp))
1589 diacritical [cp] = weight;
1591 // Update name part of named characters
// Each element of sortableCharNames is a DictionaryEntry whose Key is the
// code point.
1592 for (int i = 0; i < sortableCharNames.Count; i++) {
1593 DictionaryEntry de =
1594 (DictionaryEntry) sortableCharNames [i];
1595 int cp = (int) de.Key;
// Stays null unless one of the cases below supplies a replacement name.
1596 string renamed = null;
1598 case 0x2101: renamed = "A_1"; break;
1599 case 0x33C3: renamed = "A_2"; break;
1600 case 0x2105: renamed = "C_1"; break;
1601 case 0x2106: renamed = "C_2"; break;
1602 case 0x211E: renamed = "R1"; break;
1603 case 0x211F: renamed = "R2"; break;
1604 // Remove some of them!
// NOTE(review): how the loop index is adjusted after the removal is not
// visible in this excerpt.
1615 sortableCharNames.RemoveAt (i);
// Replace the entry with one that carries the new name.
1619 if (renamed != null)
1620 sortableCharNames [i] =
1621 new DictionaryEntry (cp, renamed);
1625 void GenerateCore ()
1629 #region Specially ignored // 01
1630 // This will raise "Defined" flag up.
1631 foreach (char c in specialIgnore)
1632 map [(int) c] = new CharMapEntry (0, 0, 0);
1636 #region Variable weights
1637 // Controls : 06 03 - 06 3D
1639 for (int i = 0; i < 65536; i++) {
1640 if (IsIgnorable (i))
1643 uc = Char.GetUnicodeCategory (c);
1644 // NEL is whitespace but not ignored here.
1645 if (uc == UnicodeCategory.Control &&
1646 !Char.IsWhiteSpace (c) || c == '\u0085')
1647 AddCharMap (c, 6, 1);
1651 fillIndex [6] = 0x80;
1652 AddCharMapGroup ('\'', 6, 1, 0);
1653 AddCharMap ('\uFE63', 6, 1);
1655 // Hyphen/Dash : 06 81 - 06 90
1656 for (int i = 0; i < char.MaxValue; i++) {
1657 if (!IsIgnorable (i) &&
1658 Char.GetUnicodeCategory ((char) i) ==
1659 UnicodeCategory.DashPunctuation) {
1660 AddCharMapGroup2 ((char) i, 6, 1, 0);
1662 // SPECIAL: add 2027 and 2043
1663 // Maybe they are regarded the
1664 // same hyphens in "central"
1666 AddCharMap ('\u2027', 6, 1);
1667 AddCharMap ('\u2043', 6, 1);
1672 // Arabic variable weight chars 06 A0 -
1673 fillIndex [6] = 0xA0;
1675 for (int i = 0x64B; i <= 0x650; i++)
1676 AddArabicCharMap ((char) i);
1678 AddCharMapGroup ('\u0652', 6, 1, 0);
1680 AddCharMapGroup ('\u0651', 6, 1, 0);
1684 #region Nonspacing marks // 01
1685 // FIXME: 01 03 - 01 B6 ... annoyance :(
1687 // Combining diacritical marks: 01 DC -
1689 fillIndex [0x1] = 0x41;
1690 for (int i = 0x030E; i <= 0x0326; i++)
1691 if (!IsIgnorable (i))
1692 AddCharMap ((char) i, 0x1, 1);
1693 for (int i = 0x0329; i <= 0x0334; i++)
1694 if (!IsIgnorable (i))
1695 AddCharMap ((char) i, 0x1, 1);
1696 for (int i = 0x0339; i <= 0x0341; i++)
1697 if (!IsIgnorable (i))
1698 AddCharMap ((char) i, 0x1, 1);
1699 fillIndex [0x1] = 0x72;
1700 for (int i = 0x0346; i <= 0x0348; i++)
1701 if (!IsIgnorable (i))
1702 AddCharMap ((char) i, 0x1, 1);
1703 for (int i = 0x02BE; i <= 0x02BF; i++)
1704 if (!IsIgnorable (i))
1705 AddCharMap ((char) i, 0x1, 1);
1706 for (int i = 0x02C1; i <= 0x02C5; i++)
1707 if (!IsIgnorable (i))
1708 AddCharMap ((char) i, 0x1, 1);
1709 for (int i = 0x02CE; i <= 0x02CF; i++)
1710 if (!IsIgnorable (i))
1711 AddCharMap ((char) i, 0x1, 1);
1712 for (int i = 0x02D1; i <= 0x02D3; i++)
1713 if (!IsIgnorable (i))
1714 AddCharMap ((char) i, 0x1, 1);
1715 AddCharMap ('\u02DE', 0x1, 1);
1716 for (int i = 0x02E4; i <= 0x02E9; i++)
1717 if (!IsIgnorable (i))
1718 AddCharMap ((char) i, 0x1, 1);
1720 // FIXME: needs more love here (it should eliminate
1721 // all the hacky code above).
1722 for (int i = 0x0300; i < 0x0370; i++)
1723 if (!IsIgnorable (i) && diacritical [i] != 0
1724 /* especiall here*/ && !map [i].Defined)
1725 map [i] = new CharMapEntry (
1726 0x1, 0x1, diacritical [i]);
1728 fillIndex [0x1] = 0x94;
1729 // syriac dotted nonspacing marks
1730 AddCharMap ('\u0732', 0x1, 1);
1731 AddCharMap ('\u0735', 0x1, 1);
1732 AddCharMap ('\u0738', 0x1, 1);
1733 AddCharMap ('\u0739', 0x1, 1);
1734 AddCharMap ('\u073C', 0x1, 1);
1735 fillIndex [0x1] = 0x9F;
1736 for (int i = 0x0730; i <= 0x07B0; i++)
1737 if (!IsIgnorable (i) && !map [i].Defined)
1738 AddCharMap ((char) i, 0x1, 1);
1740 fillIndex [0x1] = 0x0C;
1741 for (int i = 0x0EC8; i <= 0x0ECD; i++)
1742 if (!IsIgnorable (i))
1743 AddCharMap ((char) i, 0x1, 1);
1745 // LAMESPEC: It should not stop at '\u20E1'. There are
1746 // a few more characters (that however results in
1747 // overflow of level 2 unless we start before 0xDD).
1748 fillIndex [0x1] = 0xDD;
1749 for (int i = 0x20d0; i <= 0x20e1; i++)
1750 AddCharMap ((char) i, 0x1, 1);
1752 // They are not part of Nonspacing marks, but have
1753 // only diacritical weight.
1754 for (int i = 0x3099; i <= 0x309C; i++)
1755 map [i] = new CharMapEntry (1, 1, 1);
1756 map [0x309D] = new CharMapEntry (0xFF, 0xFF, 1);
1757 map [0x309E] = new CharMapEntry (0xFF, 0xFF, 1);
1758 for (int i = 0x30FC; i <= 0x30FE; i++)
1759 map [i] = new CharMapEntry (0xFF, 0xFF, 1);
1764 #region Whitespaces // 07 03 -
1765 fillIndex [0x7] = 0x2;
1766 AddCharMap (' ', 0x7, 2);
1767 AddCharMap ('\u00A0', 0x7, 1);
1768 for (int i = 9; i <= 0xD; i++)
1769 AddCharMap ((char) i, 0x7, 1);
1770 for (int i = 0x2000; i <= 0x200B; i++)
1771 AddCharMap ((char) i, 0x7, 1);
1773 fillIndex [0x7] = 0x17;
1774 AddCharMapGroup ('\u2028', 0x7, 1, 0);
1775 AddCharMapGroup ('\u2029', 0x7, 1, 0);
1777 // Characters which used to represent layout control.
1778 // LAMESPEC: Windows developers seem to have thought
1779 // that those characters are kind of whitespaces,
1780 // while they aren't.
1781 AddCharMap ('\u2422', 0x7, 1, 0); // blank symbol
1782 AddCharMap ('\u2423', 0x7, 1, 0); // open box
1785 // category 09 - continued symbols from 08
1786 fillIndex [0x9] = 2;
1788 for (int cp = 0x2300; cp <= 0x237A; cp++)
1789 AddCharMap ((char) cp, 0x9, 1, 0);
1792 byte [] arrowLv2 = new byte [] {0, 3, 3, 3, 3, 3, 3, 3, 3};
1793 foreach (DictionaryEntry de in arrowValues) {
1794 int idx = (int) de.Value;
1795 int cp = (int) de.Key;
1796 if (map [cp].Defined)
1798 fillIndex [0x9] = (byte) (0xD8 + idx);
1799 AddCharMapGroup ((char) cp, 0x9, 0, arrowLv2 [idx]);
1803 byte [] boxLv2 = new byte [128];
1804 for (int i = 0; i < boxLv2.Length; i++)
1806 foreach (DictionaryEntry de in boxValues) {
1807 int cp = (int) de.Key;
1808 int off = (int) de.Value;
1809 if (map [cp].Defined)
1812 fillIndex [0x9] = (byte) (0xE5 + off);
1813 AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [-off]++);
1816 fillIndex [0x9] = (byte) (0xE5 + off);
1817 AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [off]++);
1820 // Some special characters (slanted)
1821 fillIndex [0x9] = 0xF4;
1822 AddCharMap ('\u2571', 0x9, 3);
1823 AddCharMap ('\u2572', 0x9, 3);
1824 AddCharMap ('\u2573', 0x9, 3);
1826 // FIXME: implement 0A
1828 fillIndex [0xA] = 2;
1829 // byte currency symbols
1830 for (int cp = 0; cp < 0x100; cp++) {
1831 uc = Char.GetUnicodeCategory ((char) cp);
1832 if (!IsIgnorable (cp) &&
1833 uc == UnicodeCategory.CurrencySymbol &&
1836 AddCharMapGroup ((char) cp, 0xA, 1, 0);
1838 // byte other symbols
1839 for (int cp = 0; cp < 0x100; cp++) {
1841 continue; // SPECIAL: skip FIXME: why?
1842 uc = Char.GetUnicodeCategory ((char) cp);
1843 if (!IsIgnorable (cp) &&
1844 uc == UnicodeCategory.OtherSymbol ||
1845 cp == '\u00B5' || cp == '\u00B7')
1846 AddCharMapGroup ((char) cp, 0xA, 1, 0);
1849 fillIndex [0xA] = 0x0F; // FIXME: it won't be needed
1850 for (int cp = 0x2020; cp <= 0x2031; cp++)
1851 if (Char.IsPunctuation ((char) cp))
1852 AddCharMap ((char) cp, 0xA, 1, 0);
1853 // SPECIAL CASES: why?
1854 AddCharMap ('\u203B', 0xA, 1, 0);
1855 AddCharMap ('\u2040', 0xA, 1, 0);
1856 AddCharMap ('\u2041', 0xA, 1, 0);
1857 AddCharMap ('\u2042', 0xA, 1, 0);
1859 for (int cp = 0x20A0; cp <= 0x20AB; cp++)
1860 AddCharMap ((char) cp, 0xA, 1, 0);
1861 fillIndex [0xA] = 0x2F; // FIXME: it won't be needed
1862 for (int cp = 0x2600; cp <= 0x2613; cp++)
1863 AddCharMap ((char) cp, 0xA, 1, 0);
1865 for (int cp = 0x2620; cp <= 0x2770; cp++)
1866 if (Char.IsSymbol ((char) cp))
1867 AddCharMap ((char) cp, 0xA, 1, 0);
1869 for (int i = 0x2440; i < 0x2460; i++)
1870 AddCharMap ((char) i, 0xA, 1, 0);
1874 #region Numbers // 0C 02 - 0C E1
1875 fillIndex [0xC] = 2;
1877 // 9F8 : Bengali "one less than the denominator"
1878 AddCharMap ('\u09F8', 0xC, 1);
1880 ArrayList numbers = new ArrayList ();
1881 for (int i = 0; i < 65536; i++)
1882 if (!IsIgnorable (i) &&
1883 Char.IsNumber ((char) i) &&
1884 (i < 0x3190 || 0x32C0 < i)) // they are CJK characters
1887 ArrayList numberValues = new ArrayList ();
1888 foreach (int i in numbers)
1889 numberValues.Add (new DictionaryEntry (i, decimalValue [(char) i]));
1890 numberValues.Sort (DecimalDictionaryValueComparer.Instance);
1892 //foreach (DictionaryEntry de in numberValues)
1893 //Console.Error.WriteLine ("****** number {0:X04} : {1} {2}", de.Key, de.Value, decompType [(int) de.Key]);
1895 decimal prevValue = -1;
1896 foreach (DictionaryEntry de in numberValues) {
1897 int cp = (int) de.Key;
1898 decimal currValue = (decimal) de.Value;
1899 bool addnew = false;
1900 if (prevValue < currValue &&
1901 prevValue - (int) prevValue == 0 &&
1905 // Process Hangzhou and Roman numbers
1907 // There are some SPECIAL cases.
1908 if (currValue != 4) // no increment for 4
1912 if (currValue <= 10) {
1913 xcp = (int) prevValue + 0x2170 - 1;
1914 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1915 xcp = (int) prevValue + 0x2160 - 1;
1916 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1917 fillIndex [0xC] += 2;
1918 xcp = (int) prevValue + 0x3021 - 1;
1919 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1922 else if (currValue == 11)
1925 if (prevValue < currValue)
1926 prevValue = currValue;
1927 if (map [cp].Defined)
1929 // HangZhou and Roman are added later
1931 else if (0x3021 <= cp && cp < 0x302A
1932 || 0x2160 <= cp && cp < 0x216A
1933 || 0x2170 <= cp && cp < 0x217A)
1936 if (cp == 0x215B) // FIXME: why?
1937 fillIndex [0xC] += 2;
1938 else if (cp == 0x3021) // FIXME: why?
1940 AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp]);
1941 if (addnew || cp <= '9') {
1942 int mod = (int) currValue - 1;
1944 if (1 <= currValue && currValue <= 10) {
1946 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1948 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1950 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1952 if (1 <= currValue && currValue <= 20) {
1954 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1956 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1958 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1962 if (cp != 0x09E7 && cp != 0x09EA)
1965 // Add special cases that are not regarded as
1966 // numbers in UnicodeCategory speak.
1969 AddCharMapGroup ('\u01BD', 0xC, 0, 0);
1970 AddCharMapGroup ('\u01BC', 0xC, 1, 0);
1972 else if (cp == '6') // FIXME: why?
1977 fillIndex [0xC] = 0xFF;
1978 AddCharMap ('\u221E', 0xC, 1);
1981 #region Letters and NonSpacing Marks (general)
1983 // ASCII Latin alphabets
1984 for (int i = 0; i < alphabets.Length; i++)
1985 AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]);
1988 // non-ASCII Latin alphabets
1989 // FIXME: there is no such characters that are placed
1990 // *after* "alphabets" array items. This is nothing
1991 // more than a hack that creates dummy weight for
1992 // primary characters.
1993 for (int i = 0x0080; i < 0x0300; i++) {
1994 if (!Char.IsLetter ((char) i))
1996 // Latin letters that have an NFKD decomposition are
1997 // not added as independent primary characters.
1998 if (decompIndex [i] != 0)
2001 // 1.some alphabets have primarily
2002 // equivalent ASCII alphabets.
2003 // 2.some have independent primary weights,
2004 // but inside a-to-z range.
2005 // 3.there are some expanded characters that
2006 // are not part of Unicode Standard NFKD.
2007 // 4. some characters are letter in IsLetter
2008 // but not in sortkeys (maybe unicode version
2009 // difference caused it).
2011 // 1. skipping them does not make sense
2012 // case 0xD0: case 0xF0: case 0x131: case 0x138:
2013 // case 0x184: case 0x185: case 0x186: case 0x189:
2014 // case 0x18D: case 0x18E: case 0x18F: case 0x190:
2015 // case 0x194: case 0x195: case 0x196: case 0x19A:
2016 // case 0x19B: case 0x19C:
2017 // 2. skipping them does not make sense
2018 // case 0x14A: // Ng
2019 // case 0x14B: // ng
2023 case 0xDE: // Icelandic Thorn
2024 case 0xFE: // Icelandic Thorn
2025 case 0xDF: // German ss
2026 case 0xFF: // German ss
2028 case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3:
2029 // not classified yet
2030 // case 0x1A6: case 0x1A7: case 0x1A8: case 0x1A9:
2031 // case 0x1AA: case 0x1B1: case 0x1B7: case 0x1B8:
2032 // case 0x1B9: case 0x1BA: case 0x1BB: case 0x1BF:
2036 AddCharMapGroup ((char) i, 0xE, 1, 0);
2040 fillIndex [0xF] = 02;
2041 for (int i = 0x0380; i < 0x0390; i++)
2042 if (Char.IsLetter ((char) i))
2043 AddLetterMap ((char) i, 0xF, 1);
2044 fillIndex [0xF] = 02;
2045 for (int i = 0x0391; i < 0x03CF; i++)
2046 if (Char.IsLetter ((char) i))
2047 AddLetterMap ((char) i, 0xF, 1);
2048 fillIndex [0xF] = 0x40;
2049 for (int i = 0x03D0; i < 0x0400; i++)
2050 if (Char.IsLetter ((char) i))
2051 AddLetterMap ((char) i, 0xF, 1);
2054 // Cyrillic letters are sorted like Latin letters i.e.
2055 // containing culture-specific letters between the
2056 // standard Cyrillic sequence.
2058 // We can't use UCA here; it has different sorting.
2059 char [] orderedCyrillic = new char [] {
2060 '\u0430', '\u0431', '\u0432', '\u0433', '\u0434',
2061 '\u0452', // DJE for Serbocroatian
2063 '\u0454', // IE for Ukrainian
2067 '\u0456', // Byelorussian-Ukrainian I
2077 '\u043F', '\u0440', '\u0441', '\u0442',
2078 '\u045B', // TSHE for Serbocroatian
2080 '\u045E', // Short U for Byelorussian
2081 '\u04B1', // Straight U w/ stroke (diacritical!)
2082 '\u0444', '\u0445', '\u0446', '\u0447',
2084 '\u0448', '\u0449', '\u044A', '\u044B', '\u044C',
2085 '\u044D', '\u044E', '\u044F'};
2087 // For some characters here is a map to basic cyrillic
2088 // letters. See UnicodeData.txt character names for
2089 // the sources. Here I simply declare an equiv. array.
2090 // The content characters are map from U+490(,491),
2091 // skipping small letters.
2092 char [] cymap_src = new char [] {
2093 '\u0433', '\u0433', '\u0433', '\u0436',
2094 '\u0437', '\u043A', '\u043A', '\u043A',
2095 '\u043A', '\u043D', '\u043D', '\u043F',
2096 '\u0445', '\u0441', '\u0442', '\u0443',
2097 '\u0443', '\u0445', '\u0446', '\u0447',
2098 '\u0447', '\u0432', '\u0435', '\u0435',
2099 '\u0406', '\u0436', '\u043A', '\u043D',
2100 '\u0447', '\u0435'};
2102 fillIndex [0x10] = 0x8D;
2103 for (int i = 0x0460; i < 0x0481; i++) {
2104 if (Char.IsLetter ((char) i)) {
2106 // U+476/477 have the same
2107 // primary weight as U+474/475.
2108 fillIndex [0x10] -= 3;
2109 AddLetterMap ((char) i, 0x10, 3);
2113 fillIndex [0x10] = 0x6;
2114 for (int i = 0; i < orderedCyrillic.Length; i++) {
2115 char c = Char.ToUpper (orderedCyrillic [i], CultureInfo.InvariantCulture);
2116 if (!IsIgnorable ((int) c) &&
2117 Char.IsLetter (c) &&
2119 AddLetterMap (c, 0x10, 0);
2120 fillIndex [0x10] += 3;
2124 for (int i = 0; i < cymap_src.Length; i++) {
2125 char c = cymap_src [i];
2126 fillIndex [0x10] = map [c].Level1;
2127 AddLetterMap ((char) (0x0490 + i * 2),
2132 fillIndex [0x11] = 0x3;
2133 for (int i = 0x0531; i < 0x0586; i++)
2134 if (Char.IsLetter ((char) i))
2135 AddLetterMap ((char) i, 0x11, 1);
2139 fillIndex [0x12] = 0x2;
2140 for (int i = 0x05D0; i < 0x05FF; i++)
2141 if (Char.IsLetter ((char) i))
2142 AddLetterMap ((char) i, 0x12, 1);
2144 fillIndex [0x1] = 0x3;
2145 for (int i = 0x0591; i <= 0x05C2; i++) {
2146 if (i == 0x05A3 || i == 0x05BB)
2149 AddCharMap ((char) i, 0x1, 1);
2153 fillIndex [0x1] = 0x8E;
2154 fillIndex [0x13] = 0x3;
2155 for (int i = 0x0621; i <= 0x064A; i++) {
2157 if (Char.GetUnicodeCategory ((char) i)
2158 != UnicodeCategory.OtherLetter) {
2159 // FIXME: arabic nonspacing marks are
2160 // in different order.
2161 AddCharMap ((char) i, 0x1, 1);
2164 // map [i] = new CharMapEntry (0x13,
2165 // (byte) arabicLetterPrimaryValues [i], 1);
2167 (byte) arabicLetterPrimaryValues [i];
2168 AddLetterMap ((char) i, 0x13, 0);
2170 fillIndex [0x13] = 0x84;
2171 for (int i = 0x0674; i < 0x06D6; i++)
2172 if (Char.IsLetter ((char) i))
2173 AddLetterMap ((char) i, 0x13, 1);
2176 // FIXME: it does seem straight codepoint mapping.
2177 fillIndex [0x14] = 04;
2178 for (int i = 0x0901; i < 0x0905; i++)
2179 if (!IsIgnorable (i))
2180 AddLetterMap ((char) i, 0x14, 2);
2181 fillIndex [0x14] = 0xB;
2182 for (int i = 0x0905; i < 0x093A; i++) {
2184 AddCharMap ('\u0929', 0x14, 0, 8);
2186 AddCharMap ('\u0931', 0x14, 0, 8);
2188 AddCharMap ('\u0934', 0x14, 0, 8);
2189 if (Char.IsLetter ((char) i))
2190 AddLetterMap ((char) i, 0x14, 4);
2192 AddCharMap ('\u0960', 0x14, 4);
2194 AddCharMap ('\u0961', 0x14, 4);
2196 fillIndex [0x14] = 0xDA;
2197 for (int i = 0x093E; i < 0x0945; i++)
2198 if (!IsIgnorable (i))
2199 AddLetterMap ((char) i, 0x14, 2);
2200 fillIndex [0x14] = 0xEC;
2201 for (int i = 0x0945; i < 0x094F; i++)
2202 if (!IsIgnorable (i))
2203 AddLetterMap ((char) i, 0x14, 2);
2207 fillIndex [0x15] = 02;
2208 for (int i = 0x0980; i < 0x9FF; i++) {
2209 if (IsIgnorable (i))
2212 fillIndex [0x15] = 0x3B;
2213 switch (Char.GetUnicodeCategory ((char) i)) {
2214 case UnicodeCategory.NonSpacingMark:
2215 case UnicodeCategory.DecimalDigitNumber:
2216 case UnicodeCategory.OtherNumber:
2219 AddLetterMap ((char) i, 0x15, 1);
2222 fillIndex [0x1] = 0x3;
2223 for (int i = 0x0981; i < 0x0A00; i++)
2224 if (Char.GetUnicodeCategory ((char) i) ==
2225 UnicodeCategory.NonSpacingMark)
2226 AddCharMap ((char) i, 0x1, 1);
2228 // Gurmukhi. orderedGurmukhi is from UCA
2229 // FIXME: it does not look equivalent to UCA.
2230 fillIndex [0x16] = 04;
2231 fillIndex [0x1] = 3;
2232 for (int i = 0; i < orderedGurmukhi.Length; i++) {
2233 char c = orderedGurmukhi [i];
2234 if (IsIgnorable ((int) c))
2236 if (IsIgnorableNonSpacing (c)) {
2237 AddLetterMap (c, 0x1, 1);
2240 if (c == '\u0A3C' || c == '\u0A4D' ||
2241 '\u0A66' <= c && c <= '\u0A71')
2243 // SPECIAL CASE: U+A38 = U+A36 at primary level (why?)
2245 if (c == '\u0A36' || c == '\u0A16' || c == '\u0A17' || c == '\u0A5B' || c == '\u0A5E')
2247 AddLetterMap (c, 0x16, shift);
2250 // Gujarati. orderedGujarati is from UCA
2251 fillIndex [0x17] = 0x4;
2253 map [0x0A4D] = new CharMapEntry (1, 0, 0x3);
2254 map [0x0ABD] = new CharMapEntry (1, 0, 0x3);
2255 map [0x0A3C] = new CharMapEntry (1, 0, 0x4);
2256 map [0x0A71] = new CharMapEntry (1, 0, 0x6);
2257 map [0x0ABC] = new CharMapEntry (1, 0, 0xB);
2258 map [0x0A70] = new CharMapEntry (1, 0, 0xE);
2259 // letters go first.
2260 for (int i = 0; i < orderedGujarati.Length; i++) {
2262 char c = orderedGujarati [i];
2263 if (Char.IsLetter (c)) {
2265 if (c == '\u0AB3' || c == '\u0A32')
2267 if (c == '\u0A33') {
2268 AddCharMap ('\u0A32', 0x17, 0);
2269 AddCharMap ('\u0A33', 0x17, 4, 4);
2273 AddCharMap ('\u0AE0', 0x17, 0, 5);
2274 AddCharMap (c, 0x17, 4);
2277 AddCharMap ('\u0AB3', 0x17, 6);
2281 byte gujaratiShift = 4;
2282 fillIndex [0x17] = 0xC0;
2283 for (int i = 0; i < orderedGujarati.Length; i++) {
2284 char c = orderedGujarati [i];
2285 if (fillIndex [0x17] == 0xCC)
2287 if (!Char.IsLetter (c)) {
2290 AddCharMap ('\u0A81', 0x17, 2);
2293 AddLetterMap (c, 0x17, gujaratiShift);
2298 fillIndex [0x1] = 03;
2299 fillIndex [0x18] = 02;
2300 for (int i = 0x0B00; i < 0x0B7F; i++) {
2301 switch (Char.GetUnicodeCategory ((char) i)) {
2302 case UnicodeCategory.NonSpacingMark:
2303 case UnicodeCategory.DecimalDigitNumber:
2304 AddLetterMap ((char) i, 0x1, 1);
2307 AddLetterMap ((char) i, 0x18, 1);
2311 fillIndex [0x19] = 2;
2312 AddCharMap ('\u0BD7', 0x19, 0);
2313 fillIndex [0x19] = 0xA;
2315 for (int i = 0x0B82; i <= 0x0B94; i++)
2316 if (!IsIgnorable ((char) i))
2317 AddCharMap ((char) i, 0x19, 2);
2319 fillIndex [0x19] = 0x28;
2320 // The array for Tamil consonants is a constant.
2321 // Windows have almost similar sequence to TAM from
2322 // tamilnet but a bit different in Grantha.
2323 for (int i = 0; i < orderedTamilConsonants.Length; i++)
2324 AddLetterMap (orderedTamilConsonants [i], 0x19, 4);
2326 fillIndex [0x19] = 0x82;
2327 for (int i = 0x0BBE; i < 0x0BCD; i++)
2328 if (Char.GetUnicodeCategory ((char) i) ==
2329 UnicodeCategory.SpacingCombiningMark
2331 AddLetterMap ((char) i, 0x19, 2);
2334 fillIndex [0x1A] = 0x4;
2335 for (int i = 0x0C00; i < 0x0C62; i++) {
2336 if (i == 0x0C55 || i == 0x0C56)
2338 AddCharMap ((char) i, 0x1A, 3);
2339 char supp = (i == 0x0C0B) ? '\u0C60':
2340 i == 0x0C0C ? '\u0C61' : char.MinValue;
2341 if (supp == char.MinValue)
2343 AddCharMap (supp, 0x1A, 3);
2347 fillIndex [0x1B] = 4;
2348 for (int i = 0x0C80; i < 0x0CE5; i++) {
2349 if (i == 0x0CD5 || i == 0x0CD6)
2351 if (i == 0x0CB1 || i == 0x0CB3 || i == 0x0CDE)
2352 continue; // shift after 0xCB9
2353 AddCharMap ((char) i, 0x1B, 3);
2355 // SPECIAL CASES: but why?
2356 AddCharMap ('\u0CB1', 0x1B, 3); // RRA
2357 AddCharMap ('\u0CB3', 0x1B, 3); // LLA
2358 AddCharMap ('\u0CDE', 0x1B, 3); // FA
2361 AddCharMap ('\u0CE1', 0x1B, 3); // vocalic LL
2365 fillIndex [0x1C] = 2;
2366 for (int i = 0x0D02; i < 0x0D61; i++)
2367 // FIXME: I avoided MSCompatUnicodeTable usage
2368 // here (it results in recursion). So check if
2369 // using NonSpacingMark makes sense or not.
2370 if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark)
2371 // if (!MSCompatUnicodeTable.IsIgnorable ((char) i))
2372 AddCharMap ((char) i, 0x1C, 1);
2374 // Thai ... note that it breaks 0x1E wall after E2B!
2375 // Also, all Thai characters have level 2 value 3.
2376 fillIndex [0x1E] = 2;
2377 for (int i = 0xE40; i <= 0xE44; i++)
2378 AddCharMap ((char) i, 0x1E, 1, 3);
2379 for (int i = 0xE01; i < 0xE2B; i++)
2380 AddCharMap ((char) i, 0x1E, 6, 3);
2381 fillIndex [0x1F] = 5;
2382 for (int i = 0xE2B; i < 0xE30; i++)
2383 AddCharMap ((char) i, 0x1F, 6, 3);
2384 fillIndex [0x1F] = 0x1E;
2385 for (int i = 0xE30; i < 0xE3B; i++)
2386 AddCharMap ((char) i, 0x1F, 1, 3);
2387 // some Thai characters remains.
2388 char [] specialThai = new char [] {'\u0E45', '\u0E46',
2389 '\u0E4E', '\u0E4F', '\u0E5A', '\u0E5B'};
2390 foreach (char c in specialThai)
2391 AddCharMap (c, 0x1F, 1);
2394 fillIndex [0x1F] = 2;
2395 for (int i = 0xE80; i < 0xEDF; i++)
2396 if (Char.IsLetter ((char) i))
2397 AddCharMap ((char) i, 0x1F, 1);
2399 // Georgian. orderedGeorgian is from UCA DUCET.
2400 fillIndex [0x21] = 5;
2401 for (int i = 0; i < orderedGeorgian.Length; i++) {
2402 char c = orderedGeorgian [i];
2403 if (map [(int) c].Defined)
2405 AddCharMap (c, 0x21, 0);
2407 AddCharMap ((char) (c - 0x30), 0x21, 0);
2408 fillIndex [0x21] += 5;
2412 fillIndex [0x22] = 2;
2413 int kanaOffset = 0x3041;
2414 byte [] kanaLines = new byte [] {2, 2, 2, 2, 1, 3, 1, 2, 1};
2416 for (int gyo = 0; gyo < 9; gyo++) {
2417 for (int dan = 0; dan < 5; dan++) {
2418 if (gyo == 7 && dan % 2 == 1) {
2421 kanaOffset -= 2; // There is no space for yi and ye.
2424 int cp = kanaOffset + dan * kanaLines [gyo];
2425 // small lines (a-gyo, ya-gyo)
2426 if (gyo == 0 || gyo == 7) {
2427 AddKanaMap (cp, 1); // small
2428 AddKanaMap (cp + 1, 1);
2431 AddKanaMap (cp, kanaLines [gyo]);
2435 // add small 'ka' (before normal one)
2436 AddKanaMap (0x30F5, 1);
2440 // add small 'ke' (before normal one)
2441 AddKanaMap (0x30F6, 1);
2445 // add small 'Tsu' (before normal one)
2446 AddKanaMap (0x3063, 1);
2450 fillIndex [0x22] += 3;
2451 kanaOffset += 5 * kanaLines [gyo];
2454 // Wa-gyo is almost special, so I just manually add.
2455 AddLetterMap ((char) 0x308E, 0x22, 0);
2456 AddLetterMap ((char) (0x308E + 0x60), 0x22, 0);
2457 AddLetterMap ((char) 0x308F, 0x22, 0);
2458 AddLetterMap ((char) (0x308F + 0x60), 0x22, 0);
2460 AddLetterMap ((char) 0x3090, 0x22, 0);
2461 AddLetterMap ((char) (0x3090 + 0x60), 0x22, 0);
2462 fillIndex [0x22] += 2;
2463 // no "Wu" in Japanese.
2464 AddLetterMap ((char) 0x3091, 0x22, 0);
2465 AddLetterMap ((char) (0x3091 + 0x60), 0x22, 0);
2467 AddLetterMap ((char) 0x3092, 0x22, 0);
2468 AddLetterMap ((char) (0x3092 + 0x60), 0x22, 0);
2470 fillIndex [0x22] = 0x80;
2471 AddLetterMap ((char) 0x3093, 0x22, 0);
2472 AddLetterMap ((char) (0x3093 + 0x60), 0x22, 0);
2474 // JIS Japanese square chars.
2475 fillIndex [0x22] = 0x97;
2476 jisJapanese.Sort (JISComparer.Instance);
2477 foreach (JISCharacter j in jisJapanese)
2478 if (0x3300 <= j.CP && j.CP <= 0x3357)
2479 AddCharMap ((char) j.CP, 0x22, 1);
2480 // non-JIS Japanese square chars.
2481 nonJisJapanese.Sort (NonJISComparer.Instance);
2482 foreach (NonJISCharacter j in nonJisJapanese)
2483 AddCharMap ((char) j.CP, 0x22, 1);
2486 fillIndex [0x23] = 0x02;
2487 for (int i = 0x3105; i <= 0x312C; i++)
2488 AddCharMap ((char) i, 0x23, 1);
2490 // Estrangela: ancient Syriac
2491 fillIndex [0x24] = 0x0B;
2492 // FIXME: is 0x71E really alternative form?
2493 ArrayList syriacAlternatives = new ArrayList (
2494 new int [] {0x714, 0x716, 0x71C, 0x71E, 0x724, 0x727});
2495 for (int i = 0x0710; i <= 0x072C; i++) {
2496 if (i == 0x0711) // NonSpacingMark
2498 if (syriacAlternatives.Contains (i))
2500 AddCharMap ((char) i, 0x24, 4);
2505 foreach (int cp in syriacAlternatives)
2506 map [cp] = new CharMapEntry (0x24,
2507 (byte) (map [cp - 1].Level1 + 2),
2509 // FIXME: Syriac NonSpacingMark should go here.
2512 // FIXME: it turned out that it does not look like UCA
2513 fillIndex [0x24] = 0x6E;
2514 for (int i = 0; i < orderedThaana.Length; i++) {
2515 char c = orderedThaana [i];
2516 if (IsIgnorableNonSpacing ((int) c))
2518 AddCharMap (c, 0x24, 2);
2519 if (c == '\u0782') // SPECIAL CASE: why?
2520 fillIndex [0x24] += 2;
2524 // FIXME: Add more culture-specific letters (that are
2525 // not supported in Windows collation) here.
2527 // Surrogate ... they are computed.
2532 // Unlike UCA Windows Hangul sequence mixes Jongseong
2533 // with Choseong sequence as well as Jungseong,
2534 // adjusted to have the same primary weight for the
2535 // same base character. So it is impossible to compute
2538 // Here I introduce an ordered sequence of mixed
2539 // 'commands' and 'characters' that is similar to
2541 // - ',' increases primary weight.
2542 // - [A B] means a range, increasing index
2543 // - {A B} means a range, without increasing index
2544 // - '=' is no operation (it means the characters
2545 // of both sides have the same weight).
2546 // - '>' inserts a Hangul Syllable block that
2547 // contains 0x251 characters.
2548 // - '<' decreases the index
2549 // - '0'-'9' means skip count
2550 // - whitespaces are ignored
2553 string hangulSequence =
2554 + "\u1100=\u11A8 > \u1101=\u11A9 >"
2555 + "\u11C3, \u11AA, \u11C4, \u1102=\u11AB >"
2556 + "<{\u1113 \u1116}, \u3165,"
2557 + "\u11C5, \u11C6=\u3166,, \u11C7, \u11C8,"
2558 + "\u11AC, \u11C9, \u11AD, \u1103=\u11AE >"
2559 + "<\u1117, \u11CA, \u1104, \u11CB > \u1105=\u11AF >"
2560 + "<{\u1118 \u111B}, \u11B0, [\u11CC \u11D0], \u11B1,"
2561 + "[\u11D1 \u11D2], \u11B2,"
2562 + "[\u11D3 \u11D5], \u11B3,"
2563 + "[\u11D6 \u11D7], \u11B4, \u11B5,"
2564 + "\u11B6=\u11D8, \u3140,, \u11D9, \u1106=\u11B7 >"
2565 + "<{\u111C \u111D}, [\u11DA \u11E2], \u1107=\u11B8 >"
2566 + "<{\u111E \u1120}, \u3172,, \u3173, \u11E3, \u1108 >"
2567 + "<{\u1121 \u112C}, \u3144 \u11B9, \u3174, \u3175,,,, "
2568 + "\u3176,, \u3177, [\u11E4 \u11E6] \u3178,"
2569 + "\u3179, \u1109=\u11BA,,, \u3214=\u3274 <>"
2570 + "<{\u112D \u1133}, \u11E7 \u317A, \u317B, \u317C "
2571 + "[\u11E8 \u11E9],, \u11EA \u317D,, \u110A=\u11BB,,, >"
2572 + "<{\u1134 \u1140}, \u317E,,,,,,, \u11EB,"
2573 + "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >"
2574 + "<{\u1141 \u114C}, \u11EE, \u11EC, \u11ED,,,,, "
2575 + "\u11F1,, \u11F2,,,"
2576 + "\u11EF,,, \u11F0, \u110C=\u11BD,, >"
2577 + "<\u114D, \u110D,, >"
2578 + "<{\u114E \u1151},, \u110E=\u11BE,, >"
2579 + "<{\u1152 \u1155},,, \u110F=\u11BF >"
2580 + "\u1110=\u11C0 > \u1111=\u11C1 >"
2581 + "<\u1156=\u1157, \u11F3, \u11F4, \u1112=\u11C2 >"
2582 + "<\u1158=\u1159=\u115F, \u3185, \u11F9,"
2586 byte hangulCat = 0x52;
2587 fillIndex [hangulCat] = 0x2;
2589 int syllableBlock = 0;
2590 for (int n = 0; n < hangulSequence.Length; n++) {
2591 char c = hangulSequence [n];
2593 if (Char.IsWhiteSpace (c))
2599 IncrementSequentialIndex (ref hangulCat);
2602 if (fillIndex [hangulCat] == 2)
2603 throw new Exception ("FIXME: handle it correctly (yes it is hacky, it is really unfortunate).");
2604 fillIndex [hangulCat]--;
2607 IncrementSequentialIndex (ref hangulCat);
2608 for (int l = 0; l < 0x15; l++)
2609 for (int v = 0; v < 0x1C; v++) {
2611 (char) (0xAC00 + syllableBlock * 0x1C * 0x15 + l * 0x1C + v), hangulCat, 0);
2612 IncrementSequentialIndex (ref hangulCat);
2617 start = hangulSequence [n + 1];
2618 end = hangulSequence [n + 3];
2619 for (int i = start; i <= end; i++) {
2620 AddCharMap ((char) i, hangulCat, 0);
2622 IncrementSequentialIndex (ref hangulCat);
2624 n += 4; // consumes 5 characters for this operation
2627 start = hangulSequence [n + 1];
2628 end = hangulSequence [n + 3];
2629 for (int i = start; i <= end; i++)
2630 AddCharMap ((char) i, hangulCat, 0);
2631 n += 4; // consumes 5 characters for this operation
2634 AddCharMap (c, hangulCat, 0);
2640 for (int i = 0x3200; i < 0x3300; i++) {
2641 if (IsIgnorable (i) || map [i].Defined)
2645 if (decompLength [i] == 4 &&
2646 decompValues [decompIndex [i]] == '(')
2647 ch = decompIndex [i] + 1;
2649 else if (decompLength [i] == 2 &&
2650 decompValues [decompIndex [i] + 1] == '\u1161')
2651 ch = decompIndex [i];
2652 else if (decompLength [i] == 1)
2653 ch = decompIndex [i];
2656 ch = decompValues [ch];
2657 if (ch < 0x1100 || 0x1200 < ch &&
2658 ch < 0xAC00 || 0xD800 < ch)
2662 int offset = i < 0x3260 ? 1 : 0;
2663 if (0x326E <= i && i <= 0x3273)
2666 map [i] = new CharMapEntry (map [ch].Category,
2667 (byte) (map [ch].Level1 + offset),
2669 // Console.Error.WriteLine ("Jamo {0:X04} -> {1:X04}", i, decompValues [decompIndex [i] + 1]);
2675 // Letterlike characters and CJK compatibility square
2676 sortableCharNames.Sort (StringDictionaryValueComparer.Instance);
2677 int [] counts = new int ['Z' - 'A' + 1];
2678 char [] namedChars = new char [sortableCharNames.Count];
2680 foreach (DictionaryEntry de in sortableCharNames) {
2681 counts [((string) de.Value) [0] - 'A']++;
2682 namedChars [nCharNames++] = (char) ((int) de.Key);
2684 nCharNames = 0; // reset
2685 for (int a = 0; a < counts.Length; a++) {
2686 fillIndex [0xE] = (byte) (alphaWeights [a + 1] - counts [a]);
2687 for (int i = 0; i < counts [a]; i++)
2688 //Console.Error.WriteLine ("---- {0:X04} : {1:x02} / {2} {3}", (int) namedChars [nCharNames], fillIndex [0xE], ((DictionaryEntry) sortableCharNames [nCharNames]).Value, Char.GetUnicodeCategory (namedChars [nCharNames]));
2689 AddCharMap (namedChars [nCharNames++], 0xE, 1);
2692 // CJK unified ideograph.
2694 fillIndex [cjkCat] = 0x2;
2695 for (int cp = 0x4E00; cp <= 0x9FBB; cp++)
2696 if (!IsIgnorable (cp))
2697 AddCharMapGroupCJK ((char) cp, ref cjkCat);
2698 // CJK Extensions go here.
2699 // LAMESPEC: With this Windows style CJK layout, it is
2700 // impossible to add more CJK ideograph i.e. 0x9FA6-
2701 // 0x9FBB can never be added w/o breaking compat.
2702 for (int cp = 0xF900; cp <= 0xFA2D; cp++)
2703 if (!IsIgnorable (cp))
2704 AddCharMapGroupCJK ((char) cp, ref cjkCat);
2706 // PrivateUse ... computed.
2707 // remaining Surrogate ... computed.
2709 #region Special "biggest" area (FF FF)
2710 fillIndex [0xFF] = 0xFF;
2711 char [] specialBiggest = new char [] {
2712 '\u3005', '\u3031', '\u3032', '\u309D',
2713 '\u309E', '\u30FC', '\u30FD', '\u30FE',
2714 '\uFE7C', '\uFE7D', '\uFF70'};
2715 foreach (char c in specialBiggest)
2716 AddCharMap (c, 0xFF, 0);
2719 #region 07 - ASCII non-alphanumeric + 3001, 3002 // 07
2720 // non-alphanumeric ASCII except for: + - < = > '
2721 for (int i = 0x21; i < 0x7F; i++) {
2722 if (Char.IsLetterOrDigit ((char) i)
2723 || "+-<=>'".IndexOf ((char) i) >= 0)
2724 continue; // they are not added here.
2725 AddCharMapGroup2 ((char) i, 0x7, 1, 0);
2726 // Insert 3001 after ',' and 3002 after '.'
2728 AddCharMapGroup2 ('\u3001', 0x7, 1, 0);
2730 AddCharMapGroup2 ('\u3002', 0x7, 1, 0);
2732 AddCharMap ('\uFE30', 0x7, 1, 0);
2736 #region 07 - Punctuations and something else
2737 for (int i = 0xA0; i < char.MaxValue; i++) {
2738 if (IsIgnorable (i))
2741 // FIXME: actually those reset should not be
2742 // done but here I put for easy goal.
2744 fillIndex [0x7] = 0xE2;
2746 fillIndex [0x7] = 0x77;
2758 switch (Char.GetUnicodeCategory ((char) i)) {
2759 case UnicodeCategory.OtherPunctuation:
2760 case UnicodeCategory.ClosePunctuation:
2761 case UnicodeCategory.OpenPunctuation:
2762 case UnicodeCategory.InitialQuotePunctuation:
2763 case UnicodeCategory.FinalQuotePunctuation:
2764 case UnicodeCategory.ModifierSymbol:
2765 // SPECIAL CASES: // 0xA
2766 if (0x2020 <= i && i <= 0x2031)
2768 AddCharMapGroup ((char) i, 0x7, 1, 0);
2771 if (i == 0xA6) // SPECIAL CASE. FIXME: why?
2772 goto case UnicodeCategory.OtherPunctuation;
2777 // FIXME: it should not need to reset level 1, but
2778 // it's for easy goal.
2779 fillIndex [0x7] = 0xB6;
2780 for (int i = 0x2400; i <= 0x2421; i++)
2781 AddCharMap ((char) i, 0x7, 1, 0);
2784 // FIXME: for 07 xx we need more love.
2786 // Characters w/ diacritical marks (NFKD)
2787 for (int i = 0; i <= char.MaxValue; i++) {
2788 if (map [i].Defined || IsIgnorable (i))
2790 if (decompIndex [i] == 0)
2793 int start = decompIndex [i];
2794 int primaryChar = decompValues [start];
2797 int length = decompLength [i];
2798 // special processing for parenthesized ones.
2800 decompValues [start] == '(' &&
2801 decompValues [start + 2] == ')') {
2802 primaryChar = decompValues [start + 1];
2806 if (map [primaryChar].Level1 == 0)
2809 for (int l = 1; l < length; l++) {
2810 int c = decompValues [start + l];
2811 if (map [c].Level1 != 0)
2813 secondary += diacritical [c];
2817 map [i] = new CharMapEntry (
2818 map [primaryChar].Category,
2819 map [primaryChar].Level1,
2824 // category 08 - symbols
2825 fillIndex [0x8] = 2;
2826 // Here Windows mapping is not straightforward. It is
2827 // not based on computation but seems manual sorting.
2828 AddCharMapGroup ('+', 0x8, 1, 0); // plus
2829 AddCharMapGroup ('\u2212', 0x8, 1, 0); // minus
2830 AddCharMapGroup ('\u229D', 0x8, 1, 0); // minus
2831 AddCharMapGroup ('\u2297', 0x8, 1, 0); // mul
2832 AddCharMapGroup ('\u2044', 0x8, 1, 0); // div
2833 AddCharMapGroup ('\u2215', 0x8, 1, 0); // div
2834 AddCharMapGroup ('\u2217', 0x8, 1, 0); // mul
2835 AddCharMapGroup ('\u2218', 0x8, 1, 0); // ring
2836 AddCharMapGroup ('\u2219', 0x8, 1, 0); // bullet
2837 AddCharMapGroup ('\u2213', 0x8, 1, 0); // minus-or-plus
2838 AddCharMapGroup ('\u003C', 0x8, 1, 0); // <
2839 AddCharMapGroup ('\u227A', 0x8, 1, 0); // precedes relation
2840 AddCharMapGroup ('\u22B0', 0x8, 1, 0); // precedes under relation
2842 for (int cp = 0; cp < 0x2300; cp++) {
2843 if (cp == 0xAC) // SPECIAL CASE: skip
2846 cp = 0x2200; // skip to 2200
2847 fillIndex [0x8] = 0x21;
2850 fillIndex [0x8] = 0x3;
2852 fillIndex [0x8] = 0xB9;
2853 if (!map [cp].Defined &&
2854 // Char.GetUnicodeCategory ((char) cp) ==
2855 // UnicodeCategory.MathSymbol)
2856 Char.IsSymbol ((char) cp))
2857 AddCharMapGroup ((char) cp, 0x8, 1, diacritical [cp]);
2858 // SPECIAL CASES: no idea why Windows sorts as such
2861 AddCharMap ('\u227B', 0x8, 1, 0);
2862 AddCharMap ('\u22B1', 0x8, 1, 0);
2865 AddCharMapGroup ('\u00AB', 0x8, 1, 0);
2866 AddCharMapGroup ('\u226A', 0x8, 1, 0);
2867 AddCharMapGroup ('\u00BB', 0x8, 1, 0);
2868 AddCharMapGroup ('\u226B', 0x8, 1, 0);
2871 AddCharMap ('\u01C0', 0x8, 1, 0);
2872 AddCharMap ('\u01C1', 0x8, 1, 0);
2873 AddCharMap ('\u01C2', 0x8, 1, 0);
2878 #region Level2 adjustment
2880 diacritical [0x624] = 0x5;
2881 diacritical [0x626] = 0x7;
2882 diacritical [0x622] = 0x9;
2883 diacritical [0x623] = 0xA;
2884 diacritical [0x625] = 0xB;
2885 diacritical [0x649] = 0x5; // 'alif maqs.uurah
2886 diacritical [0x64A] = 0x7; // Yaa'
2888 for (int i = 0; i < char.MaxValue; i++) {
2890 byte cat = map [i].Category;
2892 case 0xE: // Latin diacritics
2893 case 0x22: // Japanese: circled characters
2894 mod = diacritical [i];
2896 case 0x13: // Arabic
2897 if (diacritical [i] == 0 && i >= 0xFE8D)
2898 mod = 0x8; // default for arabic
2901 if (0x52 <= cat && cat <= 0x7F) // Hangul
2902 mod = diacritical [i];
2904 map [i] = new CharMapEntry (
2905 cat, map [i].Level1, mod);
2909 // FIXME: this is hack but those NonSpacingMark
2910 // characters and still undefined are likely to
2912 for (int i = 0; i < char.MaxValue; i++)
2913 if (!map [i].Defined &&
2915 Char.GetUnicodeCategory ((char) i) ==
2916 UnicodeCategory.NonSpacingMark)
2917 AddCharMap ((char) i, 1, 1);
2919 // FIXME: this is hack but those Symbol characters
2920 // are likely to fall into 0xA category.
2921 for (int i = 0; i < char.MaxValue; i++)
2922 if (!map [i].Defined &&
2924 Char.IsSymbol ((char) i))
2925 AddCharMap ((char) i, 0xA, 1);
2928 private void IncrementSequentialIndex (ref byte hangulCat)
2930 fillIndex [hangulCat]++;
2931 if (fillIndex [hangulCat] == 0) { // overflown
2933 fillIndex [hangulCat] = 0x2;
2937 // Reset fillIndex to fixed value and call AddLetterMap().
2938 private void AddAlphaMap (char c, byte category, byte alphaWeight)
2940 fillIndex [category] = alphaWeight;
2941 AddLetterMap (c, category, 0);
2943 ArrayList al = latinMap [c] as ArrayList;
2947 foreach (int cp in al)
2948 AddLetterMap ((char) cp, category, 0);
// Adds "voices" consecutive code points starting at i to category
// 0x22, each followed by the code point 0x60 above it (presumably
// the Katakana counterpart of a Hiragana letter - confirm against
// the callers). The first one gets level 2 weight 0, the n-th
// following one gets n + 2.
private void AddKanaMap (int i, byte voices)
{
	for (int voice = 0; voice < voices; voice++) {
		int baseCp = i + voice;
		byte level2 = 0;
		if (voice > 0)
			level2 = (byte) (voice + 2);

		AddLetterMapCore ((char) baseCp, 0x22, 0, level2);
		AddLetterMapCore ((char) (baseCp + 0x60), 0x22, 0, level2);
	}
}
// Forwards to AddLetterMapCore() with a level2 argument of 0.
private void AddLetterMap (char c, byte category, byte updateCount)
{
	const byte noLevel2 = 0;
	AddLetterMapCore (c, category, updateCount, noLevel2);
}
2968 private void AddLetterMapCore (char c, byte category, byte updateCount, byte level2)
2971 // <small> updates index
2972 c2 = ToSmallForm (c);
2974 AddCharMapGroup (c2, category, updateCount, level2);
2975 c2 = Char.ToLower (c, CultureInfo.InvariantCulture);
2976 if (c2 != c && !map [(int) c2].Defined)
2977 AddLetterMapCore (c2, category, 0, level2);
2978 bool doUpdate = true;
2979 if (IsIgnorable ((int) c) || map [(int) c].Defined)
2982 AddCharMapGroup (c, category, 0, level2);
2984 fillIndex [category] += updateCount;
// Convenience overload: same as the four-argument AddCharMap()
// with 0 passed as the last ("alt") argument.
private bool AddCharMap (char c, byte category, byte increment)
{
	const byte noAlt = 0;
	return AddCharMap (c, category, increment, noAlt);
}
// Assigns a map entry to c unless c is ignorable or already has one.
// For category 1 the current fill index of the category is stored as
// the entry's third value and alt as its second; for every other
// category it is the other way around. Afterwards the fill index of
// the category is advanced by increment.
// Returns false when nothing was done, true when an entry was stored.
private bool AddCharMap (char c, byte category, byte increment, byte alt)
{
	int cp = (int) c;
	if (IsIgnorable (cp))
		return false; // do nothing
	if (map [cp].Defined)
		return false; // do nothing

	byte current = fillIndex [category];
	if (category == 1)
		map [cp] = new CharMapEntry (category, alt, current);
	else
		map [cp] = new CharMapEntry (category, current, alt);

	fillIndex [category] += increment;
	return true;
}
3003 private void AddCharMapGroupTail (char c, byte category, byte updateCount)
3005 char c2 = ToSmallFormTail (c);
3007 AddCharMap (c2, category, updateCount, 0);
3009 AddCharMap (c, category, updateCount, 0);
3011 c2 = ToFullWidthTail (c);
3013 AddCharMapGroupTail (c2, category, updateCount);
3017 // Adds characters to table in the order below
3018 // (+ increases weight):
3022 // <full> | <super> | <sub>
3023 // <circle> | <wide> (| <narrow>)
3027 // level2 is fixed (does not increase).
3028 int [] sameWeightItems = new int [] {
3029 DecompositionFraction,
3033 DecompositionCircle,
3035 DecompositionNarrow,
3037 private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2)
3039 if (map [(int) c].Defined)
3042 char small = char.MinValue;
3043 char vertical = char.MinValue;
3044 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
3046 object smv = nfkd [(byte) DecompositionSmall];
3048 small = (char) ((int) smv);
3049 object vv = nfkd [(byte) DecompositionVertical];
3051 vertical = (char) ((int) vv);
3054 // <small> updates index
3055 if (small != char.MinValue)
3056 AddCharMap (small, category, updateCount);
3059 AddCharMap (c, category, 0, level2);
3062 foreach (int weight in sameWeightItems) {
3063 object wv = nfkd [(byte) weight];
3065 AddCharMap ((char) ((int) wv), category, 0, level2);
3069 // update index here.
3070 fillIndex [category] += updateCount;
3072 if (vertical != char.MinValue)
3073 AddCharMap (vertical, category, updateCount, level2);
// Maps c at the current position of the given category and then
// moves to the next sequential position (which may change category,
// hence the ref parameter).
private void AddCharMapCJK (char c, ref byte category)
{
	AddCharMap (c, category, 0, 0);
	IncrementSequentialIndex (ref category);

	// Special: Windows skips 9E F9 (reason unknown), so step over it.
	bool atSkippedSlot =
		category == 0x9E && fillIndex [category] == 0xF9;
	if (atSkippedSlot)
		IncrementSequentialIndex (ref category);
}
3086 private void AddCharMapGroupCJK (char c, ref byte category)
3088 AddCharMapCJK (c, ref category);
3090 // LAMESPEC: see below.
3091 if (c == '\u5B78') {
3092 AddCharMapCJK ('\u32AB', ref category);
3093 AddCharMapCJK ('\u323B', ref category);
3095 if (c == '\u52DE') {
3096 AddCharMapCJK ('\u3298', ref category);
3097 AddCharMapCJK ('\u3238', ref category);
3100 AddCharMapCJK ('\u32A2', ref category);
3102 // Especially this mapping order totally does
3103 // not make sense to me.
3104 AddCharMapCJK ('\u32A9', ref category);
3106 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
3109 for (byte weight = 0; weight <= 0x12; weight++) {
3110 object wv = nfkd [weight];
3115 // Special: they are ignored in this area.
3116 // FIXME: check if it is sane
3117 if (0xF900 <= w && w <= 0xFAD9)
3119 // LAMESPEC: on Windows some of CJK characters
3120 // in 3200-32B0 are incorrectly mapped. They
3121 // mix Chinese and Japanese Kanji when
3122 // ordering those characters.
3124 case 0x32A2: case 0x3298: case 0x3238:
3125 case 0x32A9: case 0x323B: case 0x32AB:
3129 AddCharMapCJK ((char) w, ref category);
3133 // For now it is only for 0x7 category.
3134 private void AddCharMapGroup2 (char c, byte category, byte updateCount, byte level2)
3136 char small = char.MinValue;
3137 char vertical = char.MinValue;
3138 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
3140 object smv = nfkd [(byte) DecompositionSmall];
3142 small = (char) ((int) smv);
3143 object vv = nfkd [(byte) DecompositionVertical];
3145 vertical = (char) ((int) vv);
3148 // <small> updates index
3149 if (small != char.MinValue)
3150 // SPECIAL CASE excluded (FIXME: why?)
3151 if (small != '\u2024')
3152 AddCharMap (small, category, updateCount);
3155 AddCharMap (c, category, updateCount, level2);
3157 // Since nfkdMap is problematic to have two or more
3158 // NFKD to an identical character, here I iterate all.
3159 for (int c2 = 0; c2 < char.MaxValue; c2++) {
3160 if (decompLength [c2] == 1 &&
3161 (int) (decompValues [decompIndex [c2]]) == (int) c) {
3162 switch (decompType [c2]) {
3163 case DecompositionCompat:
3164 AddCharMap ((char) c2, category, updateCount, level2);
3170 if (vertical != char.MinValue)
3171 // SPECIAL CASE excluded (FIXME: why?)
3172 if (vertical != '\uFE33' && vertical != '\uFE34')
3173 AddCharMap (vertical, category, updateCount, level2);
3176 private void AddArabicCharMap (char c)
3179 byte updateCount = 1;
3183 AddCharMap (c, category, 0, level2);
3185 // Since nfkdMap is problematic to have two or more
3186 // NFKD to an identical character, here I iterate all.
3187 for (int c2 = 0; c2 < char.MaxValue; c2++) {
3188 if (decompLength [c2] == 0)
3190 int idx = decompIndex [c2] + decompLength [c2] - 1;
3191 if ((int) (decompValues [idx]) == (int) c)
3192 AddCharMap ((char) c2, category,
3195 fillIndex [category] += updateCount;
// Calls ToDecomposed() for the <full> decomposition type with
// tail = false.
char ToFullWidth (char c)
{
	const bool tail = false;
	return ToDecomposed (c, DecompositionFull, tail);
}
// Calls ToDecomposed() for the <full> decomposition type with
// tail = true.
char ToFullWidthTail (char c)
{
	const bool tail = true;
	return ToDecomposed (c, DecompositionFull, tail);
}
// Calls ToDecomposed() for the <small> decomposition type with
// tail = false.
char ToSmallForm (char c)
{
	const bool tail = false;
	return ToDecomposed (c, DecompositionSmall, tail);
}
// Calls ToDecomposed() for the <small> decomposition type with
// tail = true.
char ToSmallFormTail (char c)
{
	const bool tail = true;
	return ToDecomposed (c, DecompositionSmall, tail);
}
3218 char ToDecomposed (char c, byte d, bool tail)
3220 if (decompType [(int) c] != d)
3222 int idx = decompIndex [(int) c];
3224 idx += decompLength [(int) c] - 1;
3225 return (char) decompValues [idx];
3228 bool ExistsJIS (int cp)
3230 foreach (JISCharacter j in jisJapanese)
3238 #region Level 3 properties (Case/Width)
// Returns the level 3 weight as stored in a sort key: the raw weight
// from ComputeLevel3WeightRaw() plus 2, except that a raw weight of 0
// stays 0.
private byte ComputeLevel3Weight (char c)
{
	byte raw = ComputeLevel3WeightRaw (c);
	if (raw == 0)
		return raw;
	return (byte) (raw + 2);
}
3246 private byte ComputeLevel3WeightRaw (char c) // add 2 for sortkey value
3249 if ('\u3192' <= c && c <= '\u319F')
3251 // Japanese reading marks
3252 if (c == '\u3001' || c == '\u3002')
3255 if ('\u11A8' <= c && c <= '\u11F9')
3257 if ('\uFFA0' <= c && c <= '\uFFDC')
3259 if ('\u3130' <= c && c <= '\u3164')
3261 if ('\u3165' <= c && c <= '\u318E')
3263 // Georgian Capital letters
3264 if ('\u10A0' <= c && c <= '\u10C5')
3267 if ('\u2776' <= c && c <= '\u277F')
3269 if ('\u2780' <= c && c <= '\u2789')
3271 if ('\u2776' <= c && c <= '\u2793')
3273 if ('\u2160' <= c && c <= '\u216F')
3275 if ('\u2181' <= c && c <= '\u2182')
3278 if ('\u2135' <= c && c <= '\u2138')
3280 if ('\uFE80' <= c && c < '\uFF00') {
3281 // 2(Isolated)/8(Final)/0x18(Medial)
3282 switch (decompType [(int) c]) {
3283 case DecompositionIsolated:
3285 case DecompositionFinal:
3287 case DecompositionMedial:
3292 // actually I dunno the reason why they have weights.
3315 switch (decompType [(int) c]) {
3316 case DecompositionWide: // <wide>
3317 case DecompositionSub: // <sub>
3318 case DecompositionSuper: // <super>
3319 ret |= decompType [(int) c];
3322 if (isSmallCapital [(int) c]) // grep "SMALL CAPITAL"
3324 if (isUppercase [(int) c]) // DerivedCoreProperties
3334 static bool IsIgnorable (int i)
3336 if (unicodeAge [i] >= 3.1)
3338 switch (char.GetUnicodeCategory ((char) i)) {
3339 case UnicodeCategory.OtherNotAssigned:
3340 case UnicodeCategory.Format:
3347 // FIXME: In the future use DerivedAge.txt to examine character
3348 // versions and set those ones that have higher version than
3349 // 1.0 as ignorable.
3350 static bool IsIgnorable (int i)
3354 // I guess, those characters are added between
3355 // Unicode 1.0 (LCMapString) and Unicode 3.1
3356 // (UnicodeCategory), so they used to be
3357 // something like OtherNotAssigned as of Unicode 1.1.
3358 case 0x2df: case 0x387:
3359 case 0x3d7: case 0x3d8: case 0x3d9:
3360 case 0x3f3: case 0x3f4: case 0x3f5: case 0x3f6:
3361 case 0x400: case 0x40d: case 0x450: case 0x45d:
3362 case 0x587: case 0x58a: case 0x5c4: case 0x640:
3363 case 0x653: case 0x654: case 0x655: case 0x66d:
3365 case 0x1e9b: case 0x202f: case 0x20ad:
3366 case 0x20ae: case 0x20af:
3367 case 0x20e2: case 0x20e3:
3368 case 0x2139: case 0x213a: case 0x2183:
3369 case 0x2425: case 0x2426: case 0x2619:
3370 case 0x2670: case 0x2671: case 0x3007:
3371 case 0x3190: case 0x3191:
3372 case 0xfffc: case 0xfffd:
3374 // exceptional characters filtered by the
3375 // following conditions. Originally those exceptional
3376 // ranges are incorrect (they should not be ignored)
3377 // and most of those characters are unfortunately in
3379 case 0x4d8: case 0x4d9:
3380 case 0x4e8: case 0x4e9:
3382 case 0x3036: case 0x303f:
3383 case 0x337b: case 0xfb1e:
3388 // The whole Sinhala characters.
3389 0x0D82 <= i && i <= 0x0DF4
3390 // The whole Tibetan characters.
3391 || 0x0F00 <= i && i <= 0x0FD1
3392 // The whole Myanmar characters.
3393 || 0x1000 <= i && i <= 0x1059
3394 // The whole Ethiopic, Cherokee,
3395 // Canadian Syllabics, Ogham, Runic,
3396 // Tagalog, Hanunoo, Philippine,
3397 // Buhid, Tagbanwa, Khmer and Mongolian
3399 || 0x1200 <= i && i <= 0x1DFF
3400 // Greek extension characters.
3401 || 0x1F00 <= i && i <= 0x1FFF
3402 // The whole Braille characters.
3403 || 0x2800 <= i && i <= 0x28FF
3404 // CJK radical characters.
3405 || 0x2E80 <= i && i <= 0x2EF3
3406 // Kangxi radical characters.
3407 || 0x2F00 <= i && i <= 0x2FD5
3408 // Ideographic description characters.
3409 || 0x2FF0 <= i && i <= 0x2FFB
3410 // Bopomofo letter and final
3411 || 0x31A0 <= i && i <= 0x31B7
3412 // White square with quadrant characters.
3413 || 0x25F0 <= i && i <= 0x25F7
3414 // Ideographic telegraph symbols.
3415 || 0x32C0 <= i && i <= 0x32CB
3416 || 0x3358 <= i && i <= 0x3370
3417 || 0x33E0 <= i && i <= 0x33FF
3418 // The whole YI characters.
3419 || 0xA000 <= i && i <= 0xA48C
3420 || 0xA490 <= i && i <= 0xA4C6
3421 // Armenian small ligatures
3422 || 0xFB13 <= i && i <= 0xFB17
3423 // hebrew, arabic, variation selector.
3424 || 0xFB1D <= i && i <= 0xFE2F
3425 // Arabic ligatures.
3426 || 0xFEF5 <= i && i <= 0xFEFC
3427 // FIXME: why are they excluded?
3428 || 0x01F6 <= i && i <= 0x01F9
3429 || 0x0218 <= i && i <= 0x0233
3430 || 0x02A9 <= i && i <= 0x02AD
3431 || 0x02EA <= i && i <= 0x02EE
3432 || 0x0349 <= i && i <= 0x036F
3433 || 0x0488 <= i && i <= 0x048F
3434 || 0x04D0 <= i && i <= 0x04FF
3435 || 0x0500 <= i && i <= 0x050F // actually it matters only for 2.0
3436 || 0x06D6 <= i && i <= 0x06ED
3437 || 0x06FA <= i && i <= 0x06FE
3438 || 0x2048 <= i && i <= 0x204D
3439 || 0x20e4 <= i && i <= 0x20ea
3440 || 0x213C <= i && i <= 0x214B
3441 || 0x21EB <= i && i <= 0x21FF
3442 || 0x22F2 <= i && i <= 0x22FF
3443 || 0x237B <= i && i <= 0x239A
3444 || 0x239B <= i && i <= 0x23CF
3445 || 0x24EB <= i && i <= 0x24FF
3446 || 0x2596 <= i && i <= 0x259F
3447 || 0x25F8 <= i && i <= 0x25FF
3448 || 0x2672 <= i && i <= 0x2689
3449 || 0x2768 <= i && i <= 0x2775
3450 || 0x27d0 <= i && i <= 0x27ff
3451 || 0x2900 <= i && i <= 0x2aff
3452 || 0x3033 <= i && i <= 0x303F
3453 || 0x31F0 <= i && i <= 0x31FF
3454 || 0x3250 <= i && i <= 0x325F
3455 || 0x32B1 <= i && i <= 0x32BF
3456 || 0x3371 <= i && i <= 0x337B
3457 || 0xFA30 <= i && i <= 0xFA6A
3461 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3463 case UnicodeCategory.PrivateUse:
3464 case UnicodeCategory.Surrogate:
3466 // ignored by nature
3467 case UnicodeCategory.Format:
3468 case UnicodeCategory.OtherNotAssigned:
3475 // To check IsIgnorable sanity, try the driver below under MS.NET.
3478 public static void Main ()
3480 for (int i = 0; i <= char.MaxValue; i++)
3481 Dump (i, IsIgnorable (i));
3484 static void Dump (int i, bool ignore)
3486 switch (Char.GetUnicodeCategory ((char) i)) {
3487 case UnicodeCategory.PrivateUse:
3488 case UnicodeCategory.Surrogate:
3489 return; // check nothing
3493 string s2 = new string ((char) i, 10);
3494 int ret = CultureInfo.InvariantCulture.CompareInfo.Compare (s1, s2, CompareOptions.IgnoreCase);
3495 if ((ret == 0) == ignore)
3497 Console.WriteLine ("{0} : {1:x} {2}", ignore ? "o" : "x", i, Char.GetUnicodeCategory ((char) i));
3500 #endregion // IsIgnorable
3502 #region IsIgnorableSymbol
3503 static bool IsIgnorableSymbol (int i)
3505 if (IsIgnorable (i))
3510 case 0x00b5: case 0x01C0: case 0x01C1:
3511 case 0x01C2: case 0x01C3: case 0x01F6:
3512 case 0x01F7: case 0x01F8: case 0x01F9:
3513 case 0x02D0: case 0x02EE: case 0x037A:
3514 case 0x03D7: case 0x03F3:
3515 case 0x0400: case 0x040d:
3516 case 0x0450: case 0x045d:
3517 case 0x048C: case 0x048D:
3518 case 0x048E: case 0x048F:
3519 case 0x0587: case 0x0640: case 0x06E5:
3520 case 0x06E6: case 0x06FA: case 0x06FB:
3521 case 0x06FC: case 0x093D: case 0x0950:
3522 case 0x1E9B: case 0x2139: case 0x3006:
3523 case 0x3033: case 0x3034: case 0x3035:
3524 case 0xFE7E: case 0xFE7F:
3526 case 0x16EE: case 0x16EF: case 0x16F0:
3528 case 0x2183: // ROMAN NUMERAL REVERSED ONE HUNDRED
3529 case 0x3007: // IDEOGRAPHIC NUMBER ZERO
3530 case 0x3038: // HANGZHOU NUMERAL TEN
3531 case 0x3039: // HANGZHOU NUMERAL TWENTY
3532 case 0x303a: // HANGZHOU NUMERAL THIRTY
3538 case 0x02B9: case 0x02BA: case 0x02C2:
3539 case 0x02C3: case 0x02C4: case 0x02C5:
3540 case 0x02C8: case 0x02CC: case 0x02CD:
3541 case 0x02CE: case 0x02CF: case 0x02D2:
3542 case 0x02D3: case 0x02D4: case 0x02D5:
3543 case 0x02D6: case 0x02D7: case 0x02DE:
3544 case 0x02E5: case 0x02E6: case 0x02E7:
3545 case 0x02E8: case 0x02E9:
3546 case 0x309B: case 0x309C:
3548 case 0x055A: // Armenian Apos
3549 case 0x05C0: // Hebrew Punct
3550 case 0x0E4F: // Thai FONGMAN
3551 case 0x0E5A: // Thai ANGKHANKHU
3552 case 0x0E5B: // Thai KHOMUT
3554 case 0x09F2: // Bengali Rupee Mark
3555 case 0x09F3: // Bengali Rupee Sign
3557 case 0x221e: // INF.
3566 if (0xFE70 <= i && i < 0xFE7C // ARABIC LIGATURES B
3568 || 0x0501 <= i && i <= 0x0510 // CYRILLIC KOMI
3569 || 0xFA30 <= i && i < 0xFA70 // CJK COMPAT
3574 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3576 case UnicodeCategory.Surrogate:
3577 return false; // inconsistent
3579 case UnicodeCategory.SpacingCombiningMark:
3580 case UnicodeCategory.EnclosingMark:
3581 case UnicodeCategory.NonSpacingMark:
3582 case UnicodeCategory.PrivateUse:
3584 if (0x064B <= i && i <= 0x0652) // Arabic
3588 case UnicodeCategory.Format:
3589 case UnicodeCategory.OtherNotAssigned:
3596 // latin in a circle
3597 0x249A <= i && i <= 0x24E9
3598 || 0x2100 <= i && i <= 0x2132
3600 || 0x3196 <= i && i <= 0x31A0
3602 || 0x3200 <= i && i <= 0x321C
3604 || 0x322A <= i && i <= 0x3243
3606 || 0x3260 <= i && i <= 0x32B0
3607 || 0x32D0 <= i && i <= 0x3357
3608 || 0x337B <= i && i <= 0x33DD
3610 use = !Char.IsLetterOrDigit ((char) i);
3614 // This "Digit" rule is a mystery.
3615 // It filters some symbols out.
3616 if (Char.IsLetterOrDigit ((char) i))
3618 if (Char.IsNumber ((char) i))
3620 if (Char.IsControl ((char) i)
3621 || Char.IsSeparator ((char) i)
3622 || Char.IsPunctuation ((char) i))
3624 if (Char.IsSymbol ((char) i))
3627 // FIXME: should check more
3632 // To check IsIgnorableSymbol sanity, try the driver below under MS.NET.
3634 public static void Main ()
3636 CompareInfo ci = CultureInfo.InvariantCulture.CompareInfo;
3637 for (int i = 0; i <= char.MaxValue; i++) {
3638 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3639 if (uc == UnicodeCategory.Surrogate)
3642 bool ret = IsIgnorableSymbol (i);
3644 string s1 = "TEST ";
3645 string s2 = "TEST " + (char) i;
3647 int result = ci.Compare (s1, s2, CompareOptions.IgnoreSymbols);
3649 if (ret != (result == 0))
3650 Console.WriteLine ("{0} : {1:x}[{2}]({3})",
3651 ret ? "should not ignore" :
3660 static bool IsIgnorableNonSpacing (int i)
3662 if (IsIgnorable (i))
3666 case 0x02C8: case 0x02DE: case 0x0559: case 0x055A:
3667 case 0x05C0: case 0x0ABD: case 0x0CD5: case 0x0CD6:
3668 case 0x309B: case 0x309C: case 0xFF9E: case 0xFF9F:
3670 case 0x02D0: case 0x0670: case 0x0901: case 0x0902:
3671 case 0x094D: case 0x0962: case 0x0963: case 0x0A41:
3672 case 0x0A42: case 0x0A47: case 0x0A48: case 0x0A4B:
3673 case 0x0A4C: case 0x0A81: case 0x0A82: case 0x0B82:
3674 case 0x0BC0: case 0x0CBF: case 0x0CC6: case 0x0CCC:
3675 case 0x0CCD: case 0x0E4E:
3679 if (0x02b9 <= i && i <= 0x02c5
3680 || 0x02cc <= i && i <= 0x02d7
3681 || 0x02e4 <= i && i <= 0x02ef
3682 || 0x20DD <= i && i <= 0x20E0
3686 if (0x064B <= i && i <= 0x00652
3687 || 0x0941 <= i && i <= 0x0948
3688 || 0x0AC1 <= i && i <= 0x0ACD
3689 || 0x0C3E <= i && i <= 0x0C4F
3690 || 0x0E31 <= i && i <= 0x0E3F
3694 return Char.GetUnicodeCategory ((char) i) ==
3695 UnicodeCategory.NonSpacingMark;
3698 // We can reuse IsIgnorableSymbol testcode
3699 // for IsIgnorableNonSpacing.
3705 public byte Category;
3707 public byte Level2; // It is always single byte.
3708 public bool Defined;
3710 public CharMapEntry (byte category, byte level1, byte level2)
3712 Category = category;
3721 public readonly int CP;
3722 public readonly int JIS;
3724 public JISCharacter (int cp, int cpJIS)
// Orders JISCharacter instances by their JIS value.
class JISComparer : IComparer
{
	public static readonly JISComparer Instance =
		new JISComparer ();

	// Returns a negative, zero or positive value when o1's JIS value
	// is less than, equal to or greater than o2's.
	// Both arguments must be JISCharacter; otherwise the cast throws.
	public int Compare (object o1, object o2)
	{
		JISCharacter j1 = (JISCharacter) o1;
		JISCharacter j2 = (JISCharacter) o2;
		// CompareTo cannot overflow the way "j1.JIS - j2.JIS" can
		// for operands of large magnitude and opposite sign.
		return j1.JIS.CompareTo (j2.JIS);
	}
}
// Immutable pair of a code point and a name, used for the characters
// kept in the nonJisJapanese list.
class NonJISCharacter
{
	public readonly int CP;
	public readonly string Name;

	public NonJISCharacter (int cp, string name)
	{
		this.CP = cp;
		this.Name = name;
	}
}
// Orders NonJISCharacter instances by ordinal comparison of their
// names.
class NonJISComparer : IComparer
{
	public static readonly NonJISComparer Instance =
		new NonJISComparer ();

	// Both arguments must be NonJISCharacter; otherwise the cast throws.
	public int Compare (object o1, object o2)
	{
		NonJISCharacter left = (NonJISCharacter) o1;
		NonJISCharacter right = (NonJISCharacter) o2;

		string leftName = left.Name;
		string rightName = right.Name;
		return string.CompareOrdinal (leftName, rightName);
	}
}
3769 class DecimalDictionaryValueComparer : IComparer
3771 public static readonly DecimalDictionaryValueComparer Instance
3772 = new DecimalDictionaryValueComparer ();
3774 private DecimalDictionaryValueComparer ()
3778 public int Compare (object o1, object o2)
3780 DictionaryEntry e1 = (DictionaryEntry) o1;
3781 DictionaryEntry e2 = (DictionaryEntry) o2;
3782 // FIXME: in case of 0, compare decomposition categories
3783 int ret = Decimal.Compare ((decimal) e1.Value, (decimal) e2.Value);
3786 int i1 = (int) e1.Key;
3787 int i2 = (int) e2.Key;
// Non-generic IComparer over DictionaryEntry items whose Value is a string
// and whose Key is a boxed int.
3792 class StringDictionaryValueComparer : IComparer
// The only instance; the constructor below is private.
3794 public static readonly StringDictionaryValueComparer Instance
3795 = new StringDictionaryValueComparer ();
3797 private StringDictionaryValueComparer ()
// Compares two DictionaryEntry items by their string Value.
// Both arguments are unboxed to DictionaryEntry; Value must be a string and
// Key an int for the casts below to succeed.
3801 public int Compare (object o1, object o2)
3803 DictionaryEntry e1 = (DictionaryEntry) o1;
3804 DictionaryEntry e2 = (DictionaryEntry) o2;
// NOTE(review): String.Compare (string, string) is culture-sensitive (it
// uses the current culture), unlike the string.CompareOrdinal call in
// NonJISComparer, so the resulting order can depend on the machine's
// culture - confirm this is intended.
3805 int ret = String.Compare ((string) e1.Value, (string) e2.Value);
// NOTE(review): the int keys are presumably the tie-breaker when the values
// compare equal; the statements that use ret, i1 and i2 are not shown
// here - confirm.
3808 int i1 = (int) e1.Key;
3809 int i2 = (int) e2.Key;
// Non-generic IComparer over boxed char values that compares them through
// the sort keys CollationElementTable reports for each character.
3814 class UCAComparer : IComparer
// The only instance; the constructor below is private.
3816 public static readonly UCAComparer Instance
3817 = new UCAComparer ();
3819 private UCAComparer ()
// Compares two boxed chars by walking their sort keys pairwise.
// Both arguments must be boxed char values for the unboxing casts to succeed.
3823 public int Compare (object o1, object o2)
3825 char i1 = (char) o1;
3826 char i2 = (char) o2;
// Number of sort keys available for each character.
3828 int l1 = CollationElementTable.GetSortKeyCount (i1);
3829 int l2 = CollationElementTable.GetSortKeyCount (i2);
// Only the positions present for both characters can be compared pairwise,
// so iterate up to the smaller count.
3830 int l = l1 > l2 ? l2 : l1;
3832 for (int i = 0; i < l; i++) {
3833 SortKeyValue k1 = CollationElementTable.GetSortKey (i1, i);
3834 SortKeyValue k2 = CollationElementTable.GetSortKey (i2, i);
// Differences are computed in the order Primary, Secondary, Thirtiary,
// Quarternary.
// NOTE(review): presumably the first non-zero difference is returned; the
// tests on v and the final return are not shown here - confirm.
3835 int v = k1.Primary - k2.Primary;
3838 v = k1.Secondary - k2.Secondary;
3841 v = k1.Thirtiary - k2.Thirtiary;
3844 v = k1.Quarternary - k2.Quarternary;
// Tailoring: an ordered list of tailoring map entries plus lcid, alias and
// frenchSort values exposed through the accessors below.
// Entries appended by the Add*Map methods; every element implements
// ITailoringMap.
3857 ArrayList items = new ArrayList ();
// Creates a tailoring from an lcid alone.
// NOTE(review): the alias value used in this case is not shown here - confirm.
3859 public Tailoring (int lcid)
// Creates a tailoring from an lcid and an alias value.
3864 public Tailoring (int lcid, int alias)
// Returns the lcid field.
3871 get { return lcid; }
// Returns the alias field.
3875 get { return alias; }
// Read/write flag backed by the frenchSort field.
3878 public bool FrenchSort {
3879 get { return frenchSort; }
3880 set { frenchSort = value; }
// Appends a DiacriticalMap entry built from (target, replace).
3883 public void AddDiacriticalMap (byte target, byte replace)
3885 items.Add (new DiacriticalMap (target, replace));
// Appends a SortKeyMap entry built from (source, sortkey).
3888 public void AddSortKeyMap (string source, byte [] sortkey)
3890 items.Add (new SortKeyMap (source, sortkey));
// Appends a ReplacementMap entry built from (source, replace).
3893 public void AddReplacementMap (string source, string replace)
3895 items.Add (new ReplacementMap (source, replace));
// Serializes all entries: concatenates the ToCharArray () output of every
// item, in insertion order, into a single char array.
3898 public char [] ItemToCharArray ()
3900 ArrayList al = new ArrayList ();
3901 foreach (ITailoringMap m in items)
3902 al.AddRange (m.ToCharArray ());
// The chars are stored boxed in the ArrayList; ToArray (typeof (char))
// copies them into a new char [].
3903 return al.ToArray (typeof (char)) as char [];
// Contract for a tailoring entry that can serialize itself into chars.
// Tailoring.ItemToCharArray () concatenates these arrays.
3906 interface ITailoringMap
// Returns this entry's serialized form.
3908 char [] ToCharArray ();
// Tailoring entry holding a target byte and its replacement byte.
3911 class DiacriticalMap : ITailoringMap
3913 public readonly byte Target;
3914 public readonly byte Replace;
// NOTE(review): the parameter names suggest target -> Target and
// replace -> Replace; the assignments are not shown here - confirm.
3916 public DiacriticalMap (byte target, byte replace)
// Serializes the entry into three chars: kind marker 2, Target, Replace.
// NOTE(review): ret is presumably returned; the return statement is not
// shown here - confirm.
3922 public char [] ToCharArray ()
3924 char [] ret = new char [3];
3925 ret [0] = (char) 02; // kind:DiacriticalMap
3926 ret [1] = (char) Target;
3927 ret [2] = (char) Replace;
// Tailoring entry holding a source string and a sort key byte array.
3932 class SortKeyMap : ITailoringMap
3934 public readonly string Source;
3935 public readonly byte [] SortKey;
// NOTE(review): the parameter names suggest source -> Source and
// sortkey -> SortKey; the assignments are not shown here - confirm.
3937 public SortKeyMap (string source, byte [] sortkey)
// Serializes the entry into Source.Length + 7 chars:
//   [0]                                   kind marker 1
//   [1 .. Source.Length]                  the chars of Source
//   [Source.Length + 2 .. Source.Length + 5]  the first four SortKey bytes
// The remaining positions are not written by the lines shown here, so they
// keep the array's default value '\0'.
// SortKey must hold at least four bytes; the loop below indexes 0..3.
// NOTE(review): ret is presumably returned; the return statement is not
// shown here - confirm.
3943 public char [] ToCharArray ()
3945 char [] ret = new char [Source.Length + 7];
3946 ret [0] = (char) 01; // kind:SortKeyMap
3947 for (int i = 0; i < Source.Length; i++)
3948 ret [i + 1] = Source [i];
// Index Source.Length + 1 is skipped, leaving a '\0' between the source
// chars and the sort key bytes.
3950 for (int i = 0; i < 4; i++)
3951 ret [i + Source.Length + 2] = (char) SortKey [i];
// Tailoring entry holding a source string and its replacement string.
3956 class ReplacementMap : ITailoringMap
3958 public readonly string Source;
3959 public readonly string Replace;
// NOTE(review): the parameter names suggest source -> Source and
// replace -> Replace; the assignments are not shown here - confirm.
3961 public ReplacementMap (string source, string replace)
// Serializes the entry into Source.Length + Replace.Length + 3 chars:
// kind marker 3 at [0], then the chars of Source, then the chars of Replace,
// each written at the running index pos.
// NOTE(review): the three extra chars are presumably the kind marker plus a
// '\0' terminator after each string; the declaration and other updates of
// pos, and the return statement, are not shown here - confirm.
3967 public char [] ToCharArray ()
3969 char [] ret = new char [Source.Length + Replace.Length + 3];
3970 ret [0] = (char) 03; // kind:ReplaceMap
3972 for (int i = 0; i < Source.Length; i++)
3973 ret [pos++] = Source [i];
3976 for (int i = 0; i < Replace.Length; i++)
3977 ret [pos++] = Replace [i];