3 // There are two kinds of sort keys: those that are computed and those that are laid out
4 // as an indexed array. Computed sort keys are:
9 // Also, for composite characters it should prepare a different index table.
11 // Though it is possible to "compute" level 3 weights, they are still dumped
12 // to an array to avoid execution cost.
16 // * sortkey getter signature
18 // int GetSortKey (string s, int index, SortKeyBuffer buf)
19 // Stores sort key for corresponding character element into buf and
20 // returns the length of the consumed _source_ character element in s.
22 // * character length to consume
24 // If there are characters whose primary weight is 0, they are consumed
25 // and considered as a part of the character element.
31 using System.Collections;
32 using System.Globalization;
36 namespace Mono.Globalization.Unicode
38 internal class MSCompatSortKeyTableGenerator
// Program entry point: creates one generator instance and passes the
// command-line arguments straight through to Run ().
public static void Main (string [] args)
{
	MSCompatSortKeyTableGenerator generator =
		new MSCompatSortKeyTableGenerator ();
	generator.Run (args);
}
// Numeric tags for the decomposition kinds. ProcessUnidataLine stores one of
// these per code point in decompType [] (a byte array, so every value must
// fit in a byte), and the serializer checks Narrow/Wide/Super/Sub when it
// fills widthCompat [].
// NOTE(review): the "fixed" markers presumably mean those values are relied
// on outside this generator and must not be renumbered - confirm.
45 const int DecompositionWide = 1; // fixed
46 const int DecompositionSub = 2; // fixed
47 const int DecompositionSmall = 3;
48 const int DecompositionIsolated = 4;
49 const int DecompositionInitial = 5;
50 const int DecompositionFinal = 6;
51 const int DecompositionMedial = 7;
52 const int DecompositionNoBreak = 8;
53 const int DecompositionVertical = 9;
54 const int DecompositionFraction = 0xA;
55 const int DecompositionFont = 0xB;
56 const int DecompositionSuper = 0xC; // fixed
// Full (0xE) and Narrow (0xD) are declared out of numeric order.
57 const int DecompositionFull = 0xE;
58 const int DecompositionNarrow = 0xD;
59 const int DecompositionCircle = 0xF;
60 const int DecompositionSquare = 0x10;
61 const int DecompositionCompat = 0x11;
62 const int DecompositionCanonical = 0x12;
64 TextWriter Result = Console.Out;
66 byte [] fillIndex = new byte [256]; // by category
67 CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1];
69 char [] specialIgnore = new char [] {
70 '\u3099', '\u309A', '\u309B', '\u309C', '\u0BCD',
71 '\u0E47', '\u0E4C', '\uFF9E', '\uFF9F'
74 // FIXME: need more love (as always)
75 char [] alphabets = new char [] {'A', 'B', 'C', 'D', 'E', 'F',
76 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
77 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
78 '\u0292', '\u01BE', '\u0298'};
79 byte [] alphaWeights = new byte [] {
80 2, 9, 0xA, 0x1A, 0x21,
81 0x23, 0x25, 0x2C, 0x32, 0x35,
82 0x36, 0x48, 0x51, 0x70, 0x7C,
83 0x7E, 0x89, 0x8A, 0x91, 0x99,
84 0x9F, 0xA2, 0xA4, 0xA6, 0xA7,
85 0xA9, 0xAA, 0xB3, 0xB4};
87 bool [] isSmallCapital = new bool [char.MaxValue + 1];
88 bool [] isUppercase = new bool [char.MaxValue + 1];
90 byte [] decompType = new byte [char.MaxValue + 1];
91 int [] decompIndex = new int [char.MaxValue + 1];
92 int [] decompLength = new int [char.MaxValue + 1];
94 decimal [] decimalValue = new decimal [char.MaxValue + 1];
96 byte [] diacritical = new byte [char.MaxValue + 1];
98 string [] diacritics = new string [] {
99 // LATIN, CYRILLIC etc.
100 "UPTURN", "DOUBLE-STRUCK",
101 "MIDDLE HOOK", "WITH VERTICAL LINE ABOVE;",
102 "WITH GRAVE ACCENT;", "WITH ACUTE ACCENT;", "WITH CIRCUMFLEX ACCENT;",
103 "WITH ACUTE;", "WITH GRAVE;", "WITH DOT ABOVE;", " MIDDLE DOT;",
104 "WITH CIRCUMFLEX;", "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;",
105 " DIALYTIKA AND TONOS;", "WITH MACRON;", "WITH TILDE;", "WITH RING ABOVE;",
106 "WITH OGONEK;", "WITH CEDILLA;",
108 " DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;",
109 "WITH STROKE;", " CIRCUMFLEX AND ACUTE;",
111 " DIAERESIS AND ACUTE;", "WITH CIRCUMFLEX AND GRAVE;", " L SLASH;",
112 " DIAERESIS AND GRAVE;",
114 " CARON AND DOT ABOVE;", " BREVE AND GRAVE;",
115 " MACRON AND ACUTE;",
116 " MACRON AND GRAVE;",
118 " DIAERESIS AND CARON", " DOT ABOVE AND MACRON", " TILDE AND ACUTE",
119 " RING ABOVE AND ACUTE",
120 " DIAERESIS AND MACRON", " CEDILLA AND ACUTE", " MACRON AND DIAERESIS",
121 " CIRCUMFLEX AND TILDE",
122 " TILDE AND DIAERESIS",
125 " CEDILLA AND BREVE",
126 " OGONEK AND MACRON",
129 "WITH HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;",
133 " PRECEDED BY APOSTROPHE",
135 " LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE",
138 " RETROFLEX;", "DIAERESIS BELOW",
141 " CIRCUMFLEX BELOW", "HORN AND ACUTE",
142 " BREVE BELOW;", " HORN AND GRAVE",
145 " DOT BELOW AND DOT ABOVE",
146 " RIGHT HALF RING", " HORN AND TILDE",
147 " CIRCUMFLEX AND DOT BELOW",
148 " BREVE AND DOT BELOW",
149 " DOT BELOW AND MACRON",
151 " HORN AND HOOK ABOVE",
153 // CIRCLED, PARENTHESIZED and so on
154 "CIRCLED DIGIT", "CIRCLED NUMBER", "CIRCLED LATIN",
155 "CIRCLED KATAKANA", "CIRCLED SANS-SERIF",
156 "PARENTHESIZED DIGIT", "PARENTHESIZED NUMBER", "PARENTHESIZED LATIN",
158 byte [] diacriticWeights = new byte [] {
162 0xE, 0xF, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
163 0x17, 0x19, 0x1A, 0x1B, 0x1C,
165 0x1D, 0x1D, 0x1E, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F,
166 0x20, 0x21, 0x22, 0x22, 0x23, 0x24,
168 0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28,
169 0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30,
171 0x40, 0x43, 0x43, 0x43, 0x44, 0x46, 0x47, 0x48,
172 0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x5A,
174 0x60, 0x60, 0x61, 0x61, 0x63, 0x68, 0x68,
175 0x69, 0x69, 0x6A, 0x6D, 0x6E,
177 // CIRCLED, PARENTHESIZED and so on.
178 0xEE, 0xEE, 0xEE, 0xEE, 0xEE,
182 int [] numberSecondaryWeightBounds = new int [] {
183 0x660, 0x680, 0x6F0, 0x700, 0x960, 0x970,
184 0x9E0, 0x9F0, 0x9F4, 0xA00, 0xA60, 0xA70,
185 0xAE0, 0xAF0, 0xB60, 0xB70, 0xBE0, 0xC00,
186 0xC60, 0xC70, 0xCE0, 0xCF0, 0xD60, 0xD70,
187 0xE50, 0xE60, 0xED0, 0xEE0
190 char [] orderedGurmukhi;
191 char [] orderedGujarati;
192 char [] orderedGeorgian;
193 char [] orderedThaana;
195 static readonly char [] orderedTamilConsonants = new char [] {
196 // based on traditional Tamil consonants, except for
197 // Grantha (where Microsoft breaks traditionalism).
198 // http://www.angelfire.com/empire/thamizh/padanGaL
199 '\u0B95', '\u0B99', '\u0B9A', '\u0B9E', '\u0B9F',
200 '\u0BA3', '\u0BA4', '\u0BA8', '\u0BAA', '\u0BAE',
201 '\u0BAF', '\u0BB0', '\u0BB2', '\u0BB5', '\u0BB4',
202 '\u0BB3', '\u0BB1', '\u0BA9', '\u0B9C', '\u0BB8',
205 // cp -> character name (only for some characters)
206 ArrayList sortableCharNames = new ArrayList ();
208 // cp -> arrow value (int)
209 ArrayList arrowValues = new ArrayList ();
211 // cp -> box value (int)
212 ArrayList boxValues = new ArrayList ();
214 // cp -> level1 value
215 Hashtable arabicLetterPrimaryValues = new Hashtable ();
218 Hashtable arabicNameMap = new Hashtable ();
220 // cp -> Hashtable [decompType] -> cp
221 Hashtable nfkdMap = new Hashtable ();
223 // Latin letter -> ArrayList [int]
224 Hashtable latinMap = new Hashtable ();
226 ArrayList jisJapanese = new ArrayList ();
227 ArrayList nonJisJapanese = new ArrayList ();
229 ushort [] cjkJA = new ushort [char.MaxValue +1];// - 0x4E00];
230 ushort [] cjkCHS = new ushort [char.MaxValue +1];// - 0x3100];
231 ushort [] cjkCHT = new ushort [char.MaxValue +1];// - 0x4E00];
232 ushort [] cjkKO = new ushort [char.MaxValue +1];// - 0x4E00];
233 byte [] cjkKOlv2 = new byte [char.MaxValue +1];// - 0x4E00];
235 byte [] ignorableFlags = new byte [char.MaxValue + 1];
237 static double [] unicodeAge = new double [char.MaxValue + 1];
239 ArrayList tailorings = new ArrayList ();
// Drives the generator: parses the source data found under args [0]
// (default directory "downloaded"), post-processes the parsed values and
// reports progress on stderr.
// NOTE(review): this view shows the "generation done." and
// "serialization done." messages but not the calls that do that work;
// presumably they are on lines not visible here - confirm.
241 void Run (string [] args)
243 string dirname = args.Length == 0 ? "downloaded" : args [0];
244 ParseSources (dirname);
245 Console.Error.WriteLine ("parse done.");
247 ModifyParsedValues ();
249 Console.Error.WriteLine ("generation done.");
251 Console.Error.WriteLine ("serialization done.");
// Diagnostic dump to agelog.txt, one line per code point: age, IsIgnorable,
// IsIgnorableSymbol, hex code point, Unicode category, and a '!' marker when
// IsIgnorable disagrees with the locally computed shouldBe flag.
// NOTE(review): no Close/Dispose of sw is visible in this view, and the
// section may be disabled by comment delimiters on lines not shown - confirm.
253 StreamWriter sw = new StreamWriter ("agelog.txt");
// NOTE(review): the bound "i < char.MaxValue" leaves out U+FFFF itself.
254 for (int i = 0; i < char.MaxValue; i++) {
255 bool shouldBe = false;
256 switch (Char.GetUnicodeCategory ((char) i)) {
257 case UnicodeCategory.Format: case UnicodeCategory.OtherNotAssigned:
258 shouldBe = true; break;
// The statement guarded by this age test is not visible here.
260 if (unicodeAge [i] >= 3.1)
262 //if (IsIgnorable (i) != shouldBe)
263 sw.WriteLine ("{1} {2} {3} {0:X04} {4} {5}", i, unicodeAge [i], IsIgnorable (i), IsIgnorableSymbol (i), char.GetUnicodeCategory ((char) i), IsIgnorable (i) != shouldBe ? '!' : ' ');
// Typed convenience wrapper: compresses a byte table with the given indexer
// through CodePointIndexer.CompressArray and casts the result back to byte [].
byte [] CompressArray (byte [] source, CodePointIndexer i)
{
	object compressed = CodePointIndexer.CompressArray (
		source, typeof (byte), i);
	return (byte []) compressed;
}
// Typed convenience wrapper: compresses a ushort table with the given indexer
// through CodePointIndexer.CompressArray and casts the result back to
// ushort [].
ushort [] CompressArray (ushort [] source, CodePointIndexer i)
{
	object compressed = CodePointIndexer.CompressArray (
		source, typeof (ushort), i);
	return (ushort []) compressed;
}
284 SerializeTailorings ();
286 byte [] categories = new byte [map.Length];
287 byte [] level1 = new byte [map.Length];
288 byte [] level2 = new byte [map.Length];
289 byte [] level3 = new byte [map.Length];
290 ushort [] widthCompat = new ushort [map.Length];
291 for (int i = 0; i < map.Length; i++) {
292 categories [i] = map [i].Category;
293 level1 [i] = map [i].Level1;
294 level2 [i] = map [i].Level2;
295 level3 [i] = ComputeLevel3Weight ((char) i);
296 switch (decompType [i]) {
297 case DecompositionNarrow:
298 case DecompositionWide:
299 case DecompositionSuper:
300 case DecompositionSub:
301 // they are always 1 char
302 widthCompat [i] = (ushort) decompValues [decompIndex [i]];
308 ignorableFlags = CompressArray (ignorableFlags,
309 MSCompatUnicodeTableUtil.Ignorable);
310 categories = CompressArray (categories,
311 MSCompatUnicodeTableUtil.Category);
312 level1 = CompressArray (level1,
313 MSCompatUnicodeTableUtil.Level1);
314 level2 = CompressArray (level2,
315 MSCompatUnicodeTableUtil.Level2);
316 level3 = CompressArray (level3,
317 MSCompatUnicodeTableUtil.Level3);
318 widthCompat = (ushort []) CodePointIndexer.CompressArray (
319 widthCompat, typeof (ushort),
320 MSCompatUnicodeTableUtil.WidthCompat);
321 cjkCHS = CompressArray (cjkCHS,
322 MSCompatUnicodeTableUtil.CjkCHS);
323 cjkCHT = CompressArray (cjkCHT,
324 MSCompatUnicodeTableUtil.Cjk);
325 cjkJA = CompressArray (cjkJA,
326 MSCompatUnicodeTableUtil.Cjk);
327 cjkKO = CompressArray (cjkKO,
328 MSCompatUnicodeTableUtil.Cjk);
329 cjkKOlv2 = CompressArray (cjkKOlv2,
330 MSCompatUnicodeTableUtil.Cjk);
333 Result.WriteLine ("internal static readonly byte [] ignorableFlags = new byte [] {");
335 MemoryStream ms = new MemoryStream ();
336 BinaryWriter binary = new BinaryWriter (ms);
337 binary.Write (ignorableFlags.Length);
339 for (int i = 0; i < ignorableFlags.Length; i++) {
340 byte value = ignorableFlags [i];
342 Result.Write ("{0},", value);
344 Result.Write ("0x{0:X02},", value);
346 binary.Write (value);
348 if ((i & 0xF) == 0xF)
349 Result.WriteLine ("// {0:X04}", i - 0xF);
351 Result.WriteLine ("};");
355 Result.WriteLine ("internal static readonly byte [] categories = new byte [] {");
357 binary.Write (categories.Length);
359 for (int i = 0; i < categories.Length; i++) {
360 byte value = categories [i];
362 Result.Write ("{0},", value);
364 Result.Write ("0x{0:X02},", value);
366 binary.Write (value);
368 if ((i & 0xF) == 0xF)
369 Result.WriteLine ("// {0:X04}", i - 0xF);
371 Result.WriteLine ("};");
374 // Primary weight value
375 Result.WriteLine ("internal static readonly byte [] level1 = new byte [] {");
377 binary.Write (level1.Length);
379 for (int i = 0; i < level1.Length; i++) {
380 byte value = level1 [i];
382 Result.Write ("{0},", value);
384 Result.Write ("0x{0:X02},", value);
386 binary.Write (value);
388 if ((i & 0xF) == 0xF)
389 Result.WriteLine ("// {0:X04}", i - 0xF);
391 Result.WriteLine ("};");
395 Result.WriteLine ("internal static readonly byte [] level2 = new byte [] {");
397 binary.Write (level2.Length);
399 for (int i = 0; i < level2.Length; i++) {
400 byte value = level2 [i];
402 Result.Write ("{0},", value);
404 Result.Write ("0x{0:X02},", value);
406 binary.Write (value);
408 if ((i & 0xF) == 0xF)
409 Result.WriteLine ("// {0:X04}", i - 0xF);
411 Result.WriteLine ("};");
415 Result.WriteLine ("internal static readonly byte [] level3 = new byte [] {");
417 binary.Write (level3.Length);
419 for (int i = 0; i < level3.Length; i++) {
420 byte value = level3 [i];
422 Result.Write ("{0},", value);
424 Result.Write ("0x{0:X02},", value);
426 binary.Write (value);
428 if ((i & 0xF) == 0xF)
429 Result.WriteLine ("// {0:X04}", i - 0xF);
431 Result.WriteLine ("};");
434 // Width insensitivity mappings
435 // (for now it is more lightweight than dumping the
436 // entire NFKD table).
437 Result.WriteLine ("internal static readonly ushort [] widthCompat = new ushort [] {");
439 binary.Write (widthCompat.Length);
441 for (int i = 0; i < widthCompat.Length; i++) {
442 ushort value = widthCompat [i];
444 Result.Write ("{0},", value);
446 Result.Write ("0x{0:X02},", value);
448 binary.Write (value);
450 if ((i & 0xF) == 0xF)
451 Result.WriteLine ("// {0:X04}", i - 0xF);
453 Result.WriteLine ("};");
456 using (FileStream fs = File.Create ("../collation.core.bin")) {
457 byte [] array = ms.ToArray ();
458 fs.Write (array, 0, array.Length);
463 SerializeCJK ("cjkCHS", cjkCHS, char.MaxValue);
464 SerializeCJK ("cjkCHT", cjkCHT, 0x9FB0);
465 SerializeCJK ("cjkJA", cjkJA, 0x9FB0);
466 SerializeCJK ("cjkKO", cjkKO, 0x9FB0);
467 SerializeCJK ("cjkKOlv2", cjkKOlv2, 0x9FB0);
// Emits a ushort CJK table twice: as a C# array initializer named `name` on
// Result, and as binary data in ../collation.{name}.bin.
//   name - identifier used in the generated source and in the file name.
//   cjk  - table to dump.
//   max  - index compared against i + offset on every iteration; the action
//          taken on a match is on a line not visible here (presumably the
//          loop stops) - confirm.
470 void SerializeCJK (string name, ushort [] cjk, int max)
472 int offset = 0;//char.MaxValue - cjk.Length;
473 Result.WriteLine ("static ushort [] {0} = new ushort [] {{", name);
// The binary image starts with the element count as an Int32.
475 MemoryStream ms = new MemoryStream ();
476 BinaryWriter binary = new BinaryWriter (ms);
477 binary.Write (cjk.Length);
479 for (int i = 0; i < cjk.Length; i++) {
480 if (i + offset == max)
482 ushort value = cjk [i];
// Two textual forms (decimal and 4-digit hex) are visible; the condition
// that selects between them is not shown in this view.
484 Result.Write ("{0},", value);
486 Result.Write ("0x{0:X04},", value);
488 binary.Write (value);
// After every 16th element, end the row with a comment giving the index of
// the row's first element.
490 if ((i & 0xF) == 0xF)
491 Result.WriteLine ("// {0:X04}", i - 0xF + offset);
493 Result.WriteLine ("};");
// Flush the accumulated binary image to disk.
496 using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) {
497 byte [] array = ms.ToArray ();
498 fs.Write (array, 0, array.Length);
// Byte-table counterpart of the ushort overload: emits `cjk` as a C# array
// initializer named `name` on Result and as binary data in
// ../collation.{name}.bin.
//   max - index compared against i + offset on every iteration; the action
//         taken on a match is on a line not visible here (presumably the
//         loop stops) - confirm.
// NOTE(review): unlike the ushort overload, no binary.Write (cjk.Length)
// is visible here, so this file appears to have no length prefix. Confirm
// that the reader of collation.cjkKOlv2.bin expects that.
503 void SerializeCJK (string name, byte [] cjk, int max)
505 int offset = 0;//char.MaxValue - cjk.Length;
506 Result.WriteLine ("static byte [] {0} = new byte [] {{", name);
508 MemoryStream ms = new MemoryStream ();
509 BinaryWriter binary = new BinaryWriter (ms);
511 for (int i = 0; i < cjk.Length; i++) {
512 if (i + offset == max)
514 byte value = cjk [i];
// Two textual forms (decimal and 2-digit hex) are visible; the condition
// that selects between them is not shown in this view.
516 Result.Write ("{0},", value);
518 Result.Write ("0x{0:X02},", value);
520 binary.Write (value);
// After every 16th element, end the row with a comment giving the index of
// the row's first element.
522 if ((i & 0xF) == 0xF)
523 Result.WriteLine ("// {0:X04}", i - 0xF + offset);
525 Result.WriteLine ("};");
// Flush the accumulated binary image to disk.
528 using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) {
529 byte [] array = ms.ToArray ();
530 fs.Write (array, 0, array.Length);
// Writes all tailorings in two forms: generated C# source on Result (a flat
// char array plus a TailoringInfo table) and ../collation.tailoring.bin.
// Visible binary layout: Int32 tailoring count, per-tailoring records,
// two 0xFF bytes, Int32 character count, then the characters as ushorts.
// NOTE(review): only the LCID and the FrenchSort flag of each record are
// written on visible lines; other record fields may be written on lines not
// shown here - confirm against the full file.
535 void SerializeTailorings ()
// LCID -> start offset into the flat char array, and LCID -> item count.
537 Hashtable indexes = new Hashtable ();
538 Hashtable counts = new Hashtable ();
539 Result.WriteLine ("static char [] tailorings = new char [] {");
542 MemoryStream ms = new MemoryStream ();
543 BinaryWriter binary = new BinaryWriter (ms);
// Pass 1: concatenate every tailoring's characters, remembering where each
// LCID starts. `count` is declared on a line not visible here.
545 foreach (Tailoring t in tailorings) {
548 Result.Write ("/*{0}*/", t.LCID);
549 indexes.Add (t.LCID, count);
550 char [] values = t.ItemToCharArray ();
551 counts.Add (t.LCID, values.Length);
552 foreach (char c in values) {
553 Result.Write ("'\\x{0:X}', ", (int) c);
554 if (++count % 16 == 0)
555 Result.WriteLine (" // {0:X04}", count - 16);
557 binary.Write ((ushort) c);
561 Result.WriteLine ("};");
563 Result.WriteLine ("static TailoringInfo [] tailoringInfos = new TailoringInfo [] {");
// Keep the character data aside and start a fresh stream, so that the info
// table comes first in the output file.
565 byte [] rawdata = ms.ToArray ();
566 ms = new MemoryStream ();
567 binary = new BinaryWriter (ms);
568 binary.Write (tailorings.Count);
// Pass 2: one info record per tailoring; an alias reuses the offset and
// count of the tailoring it points to.
570 foreach (Tailoring t in tailorings) {
571 int target = t.Alias != 0 ? t.Alias : t.LCID;
572 if (!indexes.ContainsKey (target)) {
// Despite the "WARNING" wording this aborts generation.
573 throw new Exception (String.Format ("WARNING: no corresponding definition for tailoring alias. From {0} to {1}", t.LCID, t.Alias));
576 int idx = (int) indexes [target];
577 int cnt = (int) counts [target];
578 bool french = t.FrenchSort;
// NOTE(review): this scan matches on t.LCID, i.e. it finds t itself, so
// the assignment looks like a no-op; presumably `target` was intended so an
// alias inherits FrenchSort from its target - confirm before changing.
580 foreach (Tailoring t2 in tailorings)
581 if (t2.LCID == t.LCID)
582 french = t2.FrenchSort;
583 Result.WriteLine ("new TailoringInfo ({0}, 0x{1:X}, {2}, {3}), ", t.LCID, idx, cnt, french ? "true" : "false");
585 binary.Write (t.LCID);
588 binary.Write (french);
591 Result.WriteLine ("};");
// Two 0xFF bytes, then the number of chars (rawdata holds 2 bytes per
// char), then the character data itself.
593 binary.Write ((byte) 0xFF);
594 binary.Write ((byte) 0xFF);
595 binary.Write (rawdata.Length / 2);
596 binary.Write (rawdata, 0, rawdata.Length);
599 using (FileStream fs = File.Create ("../collation.tailoring.bin")) {
600 byte [] array = ms.ToArray ();
601 fs.Write (array, 0, array.Length);
// Builds the input file paths under `dirname` and runs every parser in the
// order shown below. The tailoring source is the exception: its path is the
// literal "mono-tailoring-source.txt", which does not include `dirname`.
// Several path declarations start on lines not visible in this view.
608 void ParseSources (string dirname)
611 dirname + "/UnicodeData.txt";
612 string derivedCoreProps =
613 dirname + "/DerivedCoreProperties.txt";
615 dirname + "/Scripts.txt";
617 dirname + "/CP932.TXT";
619 dirname + "/DerivedAge.txt";
620 string chXML = dirname + "/common/collation/zh.xml";
621 string jaXML = dirname + "/common/collation/ja.xml";
622 string koXML = dirname + "/common/collation/ko.xml";
// Age data is loaded first.
624 ParseDerivedAge (derivedAge);
628 ParseJISOrder (cp932); // must run before ParseUnidata ()
629 ParseUnidata (unidata);
631 ParseDerivedCoreProperties (derivedCoreProps);
632 ParseScripts (scripts);
633 ParseCJK (chXML, jaXML, koXML);
635 ParseTailorings ("mono-tailoring-source.txt");
// Reads the tailoring source file line by line, passing each trimmed line
// to ProcessTailoringLine together with the tailoring currently being
// built (`t`, passed by reference).
// `t` and `line` are declared on lines not visible in this view.
638 void ParseTailorings (string filename)
642 using (StreamReader sr = new StreamReader (filename)) {
644 while (sr.Peek () >= 0) {
646 ProcessTailoringLine (ref t,
647 sr.ReadLine ().Trim ());
// On any failure the offending line number is reported on stderr.
// NOTE(review): whether the exception is rethrown is not visible - confirm.
649 } catch (Exception) {
650 Console.Error.WriteLine ("ERROR at line {0}", line);
// Decodes a tailoring source value: every "\uXXXX" escape (backslash, 'u',
// four hex digits) is replaced by the UTF-16 code unit it names; all other
// characters are copied unchanged.
//
// The escape is matched at the current scan position. Testing the start of
// the string instead, as s.StartsWith would, misses escapes that follow
// plain text and decodes the first escape again on every later iteration.
//
// Throws ArgumentOutOfRangeException when fewer than four characters follow
// "\u", and FormatException when they are not hexadecimal digits, so that a
// malformed line is reported by the caller rather than silently accepted.
string ParseTailoringSourceValue (string s)
{
	StringBuilder sb = new StringBuilder ();
	for (int i = 0; i < s.Length; i++) {
		bool isEscape = String.CompareOrdinal (s, i, "\\u", 0, 2) == 0;
		if (isEscape) {
			sb.Append ((char) int.Parse (
				s.Substring (i + 2, 4),
				NumberStyles.HexNumber,
				CultureInfo.InvariantCulture));
			// Skip the rest of the 6-character escape; the loop
			// increment consumes the final character.
			i += 5;
		}
		else
			sb.Append (s [i]);
	}
	return sb.ToString ();
}
// Interprets one line of the tailoring source and updates `t`, the
// tailoring under construction (replaced when a new one is declared).
// Many branch and return lines are not visible in this view, so the notes
// below describe only what the visible statements do.
673 void ProcessTailoringLine (ref Tailoring t, string s)
// Cut the line at the first '#'; nothing more is done with an empty line.
675 int idx = s.IndexOf ('#');
677 s = s.Substring (0, idx).Trim ();
678 if (s.Length == 0 || s [0] == '#')
// Tailoring declaration. Both forms skip the first character of the line;
// one passes two integers split at '=', the other a single integer.
// NOTE(review): the two-argument form presumably declares an alias, and
// the condition selecting this section is not visible - confirm.
681 idx = s.IndexOf ('=');
684 int.Parse (s.Substring (1, idx - 1)),
685 int.Parse (s.Substring (idx + 1)));
687 t = new Tailoring (int.Parse (s.Substring (1)));
// "*FrenchSort" directive; its body is not visible here.
691 if (s.StartsWith ("*FrenchSort")) {
// "*Diacritical <hex> -> <hex>" maps one byte value to another.
695 string d = "*Diacritical";
696 if (s.StartsWith (d)) {
697 idx = s.IndexOf ("->");
698 t.AddDiacriticalMap (
699 byte.Parse (s.Substring (d.Length, idx - d.Length).Trim (),
700 NumberStyles.HexNumber),
701 byte.Parse (s.Substring (idx + 2).Trim (),
702 NumberStyles.HexNumber));
// "<source> : <hex> <hex> <hex> <hex>" assigns an explicit 4-byte sort key.
705 idx = s.IndexOf (':');
707 string source = s.Substring (0, idx).Trim ();
708 string [] l = s.Substring (idx + 1).Trim ().Split (' ');
709 byte [] b = new byte [4];
710 for (int i = 0; i < 4; i++) {
714 b [i] = byte.Parse (l [i],
715 NumberStyles.HexNumber);
717 t.AddSortKeyMap (ParseTailoringSourceValue (source),
// "<source> = <replacement>" registers a replacement mapping.
720 idx = s.IndexOf ('=');
722 t.AddReplacementMap (
723 ParseTailoringSourceValue (
724 s.Substring (0, idx).Trim ()),
725 ParseTailoringSourceValue (
726 s.Substring (idx + 1).Trim ()));
// Loads DerivedAge.txt into unicodeAge [], which holds one Age value (a
// Unicode version number such as 3.1) per BMP code point.
//
// filename: path of the file to read. Each data line has the form
//   "XXXX[..YYYY] ; <age> # comment"
// Text after '#' is ignored. Lines without ';' are skipped, and so are
// entries whose first code point lies above U+FFFF. A range that starts in
// the BMP but ends above it is clamped to U+FFFF, so the table is never
// indexed out of bounds.
//
// Numbers are parsed with the invariant culture. The age field uses '.' as
// its decimal separator, so parsing it with the current culture would
// misread or reject it on machines whose culture uses ','.
//
// Throws FormatException when a code point or age field is malformed.
void ParseDerivedAge (string filename)
{
	const NumberStyles nf = NumberStyles.HexNumber |
		NumberStyles.AllowTrailingWhite;

	using (StreamReader file = new StreamReader (filename)) {
		string s;
		while ((s = file.ReadLine ()) != null) {
			int idx = s.IndexOf ('#');
			if (idx >= 0)
				s = s.Substring (0, idx);
			idx = s.IndexOf (';');
			if (idx < 0)
				continue;

			string cpspec = s.Substring (0, idx);
			idx = cpspec.IndexOf ("..", StringComparison.Ordinal);
			int cp = int.Parse (
				idx < 0 ? cpspec : cpspec.Substring (0, idx),
				nf, CultureInfo.InvariantCulture);
			int cpEnd = idx < 0 ? cp : int.Parse (
				cpspec.Substring (idx + 2),
				nf, CultureInfo.InvariantCulture);
			string value = s.Substring (cpspec.Length + 1).Trim ();

			// Only the BMP is tabulated.
			if (cp > char.MaxValue)
				continue;
			int last = Math.Min (cpEnd, (int) char.MaxValue);

			double v = double.Parse (value, NumberStyles.Float,
				CultureInfo.InvariantCulture);
			for (int i = cp; i <= last; i++)
				unicodeAge [i] = v;
		}
	}
	unicodeAge [0] = double.MaxValue; // never be supported
}
// Reads UnicodeData.txt line by line through ProcessUnidataLine, collecting
// decomposition code points in a temporary list, then stores that list in
// the decompValues field as an int [].
762 void ParseUnidata (string filename)
764 ArrayList decompValues = new ArrayList ();
765 using (StreamReader unidata =
766 new StreamReader (filename)) {
// `line` is 1-based and used only for the diagnostic below.
767 for (int line = 1; unidata.Peek () >= 0; line++) {
769 ProcessUnidataLine (unidata.ReadLine (), decompValues);
// NOTE(review): whether the exception is rethrown after logging is on a
// line not visible here - confirm.
770 } catch (Exception) {
771 Console.Error.WriteLine ("**** At line " + line);
// The local list shadows the field; `this.` selects the field.
776 this.decompValues = (int [])
777 decompValues.ToArray (typeof (int));
780 char previousLatinTarget = char.MinValue;
781 byte [] diacriticalOffset = new byte ['Z' - 'A' + 1];
783 void ProcessUnidataLine (string s, ArrayList decompValues)
785 int idx = s.IndexOf ('#');
787 s = s.Substring (0, idx);
788 idx = s.IndexOf (';');
791 int cp = int.Parse (s.Substring (0, idx), NumberStyles.HexNumber);
792 string [] values = s.Substring (idx + 1).Split (';');
795 if (cp > char.MaxValue)
797 if (IsIgnorable (cp))
800 string name = values [0];
802 // SPECIAL CASE: rename some characters for diacritical
803 // remapping. FIXME: why are they different?
804 // FIXME: it's still not working.
805 if (cp == 0x018B || cp == 0x018C)
806 name = name.Replace ("TOPBAR", "STROKE");
809 if (s.IndexOf ("SMALL CAPITAL") > 0)
810 isSmallCapital [cp] = true;
812 // latin mapping by character name
813 if (s.IndexOf ("LATIN") >= 0) {
814 int lidx = s.IndexOf ("LETTER DOTLESS ");
815 int offset = lidx + 15;
817 lidx = s.IndexOf ("LETTER TURNED ");
821 lidx = s.IndexOf ("LETTER CAPITAL ");
825 lidx = s.IndexOf ("LETTER SCRIPT ");
829 lidx = s.IndexOf ("LETTER ");
832 char c = lidx > 0 ? s [offset] : char.MinValue;
833 char n = s [offset + 1];
834 char target = char.MinValue;
835 if ('A' <= c && c <= 'Z' &&
836 (n == ' ') || n == ';') {
838 // FIXME: After 'Z', I cannot reset this state.
839 previousLatinTarget = c == 'Z' ? char.MinValue : c;
842 if (s.Substring (offset).StartsWith ("ALPHA"))
844 else if (s.Substring (offset).StartsWith ("TONE SIX"))
846 else if (s.Substring (offset).StartsWith ("OPEN O"))
848 else if (s.Substring (offset).StartsWith ("SCHWA"))
850 else if (s.Substring (offset).StartsWith ("ENG"))
852 else if (s.Substring (offset).StartsWith ("OI;")) // 01A2,01A3
854 else if (s.Substring (offset).StartsWith ("YR;")) // 01A2,01A3
856 else if (s.Substring (offset).StartsWith ("TONE TWO"))
858 else if (s.Substring (offset).StartsWith ("ESH"))
861 if (target == char.MinValue)
862 target = previousLatinTarget;
864 if (target != char.MinValue) {
865 ArrayList entry = (ArrayList) latinMap [target];
867 entry = new ArrayList ();
868 latinMap [target] = entry;
871 // FIXME: This secondary weight is hack.
872 // They are here because they must not
873 // be identical to the corresponding
875 if (c != target && diacritical [cp] == 0) {
876 diacriticalOffset [c - 'A']++;
877 diacritical [cp] = (byte) (diacriticalOffset [c - 'A'] + 0x7C);
883 if (0x2000 <= cp && cp < 0x3000) {
885 // SPECIAL CASES. FIXME: why?
887 case 0x21C5: value = -1; break; // E2
888 case 0x261D: value = 1; break;
889 case 0x27A6: value = 3; break;
890 case 0x21B0: value = 7; break;
891 case 0x21B1: value = 3; break;
892 case 0x21B2: value = 7; break;
893 case 0x21B4: value = 5; break;
894 case 0x21B5: value = 7; break;
895 case 0x21B9: value = -1; break; // E1
896 case 0x21CF: value = 7; break;
897 case 0x21D0: value = 3; break;
899 string [] arrowTargets = new string [] {
911 for (int i = 1; value == 0 && i < arrowTargets.Length; i++)
912 if (s.IndexOf (arrowTargets [i]) > 0 &&
913 s.IndexOf ("BARB " + arrowTargets [i]) < 0 &&
914 s.IndexOf (" OVER") < 0
918 arrowValues.Add (new DictionaryEntry (
923 if (0x2500 <= cp && cp < 0x2600) {
926 // up:1 down:2 right:4 left:8 vert:16 horiz:32
929 // [dr] [dl] [ur] [ul]
933 ArrayList flags = new ArrayList (new int [] {
936 4 + 2, 8 + 2, 4 + 1, 8 + 1,
937 16 + 4, 1 + 2 + 4, 16 + 8, 1 + 2 + 8,
938 32 + 2, 4 + 8 + 2, 32 + 1, 4 + 8 + 1,
939 16 + 32, 1 + 2 + 4 + 8, 4 + 8 + 16, 1 + 2 + 32
941 byte [] offsets = new byte [] {
948 if (s.IndexOf ("BOX DRAWINGS ") >= 0) {
950 if (s.IndexOf (" UP") >= 0)
952 if (s.IndexOf (" DOWN") >= 0)
954 if (s.IndexOf (" RIGHT") >= 0)
956 if (s.IndexOf (" LEFT") >= 0)
958 if (s.IndexOf (" VERTICAL") >= 0)
960 if (s.IndexOf (" HORIZONTAL") >= 0)
963 int fidx = flags.IndexOf (flag);
964 value = fidx < 0 ? fidx : offsets [fidx];
965 } else if (s.IndexOf ("BLOCK") >= 0) {
966 if (s.IndexOf ("ONE EIGHTH") >= 0)
968 else if (s.IndexOf ("ONE QUARTER") >= 0)
970 else if (s.IndexOf ("THREE EIGHTHS") >= 0)
972 else if (s.IndexOf ("HALF") >= 0)
974 else if (s.IndexOf ("FIVE EIGHTHS") >= 0)
976 else if (s.IndexOf ("THREE QUARTERS") >= 0)
978 else if (s.IndexOf ("SEVEN EIGHTHS") >= 0)
983 else if (s.IndexOf ("SHADE") >= 0)
985 else if (s.IndexOf ("SQUARE") >= 0)
987 else if (s.IndexOf ("VERTICAL RECTANGLE") >= 0)
989 else if (s.IndexOf ("RECTANGLE") >= 0)
991 else if (s.IndexOf ("PARALLELOGRAM") >= 0)
993 else if (s.IndexOf ("TRIANGLE") >= 0) {
994 if (s.IndexOf ("UP-POINTING") >= 0)
996 else if (s.IndexOf ("RIGHT-POINTING") >= 0)
998 else if (s.IndexOf ("DOWN-POINTING") >= 0)
1000 else if (s.IndexOf ("LEFT-POINTING") >= 0)
1001 value = 0xC3 - 0xE5;
1003 else if (s.IndexOf ("POINTER") >= 0) {
1004 if (s.IndexOf ("RIGHT-POINTING") >= 0)
1005 value = 0xC4 - 0xE5;
1006 else if (s.IndexOf ("LEFT-POINTING") >= 0)
1007 value = 0xC5 - 0xE5;
1009 else if (s.IndexOf ("DIAMOND") >= 0)
1010 value = 0xC6 - 0xE5;
1011 else if (s.IndexOf ("FISHEYE") >= 0)
1012 value = 0xC7 - 0xE5;
1013 else if (s.IndexOf ("LOZENGE") >= 0)
1014 value = 0xC8 - 0xE5;
1015 else if (s.IndexOf ("BULLSEYE") >= 0)
1016 value = 0xC9 - 0xE5;
1017 else if (s.IndexOf ("CIRCLE") >= 0) {
1018 if (cp == 0x25D6) // it could be IndexOf ("LEFT HALF BLACK CIRCLE")
1019 value = 0xCA - 0xE5;
1020 else if (cp == 0x25D7) // it could be IndexOf ("RIGHT HALF BLACK CIRCLE")
1021 value = 0xCB - 0xE5;
1023 value = 0xC9 - 0xE5;
1025 if (0x25DA <= cp && cp <= 0x25E5)
1026 value = 0xCD + cp - 0x25DA - 0xE5;
1028 // SPECIAL CASE: BOX DRAWING DIAGONAL patterns
1030 case 0x2571: value = 0xF; break;
1031 case 0x2572: value = 0x10; break;
1032 case 0x2573: value = 0x11; break;
1035 boxValues.Add (new DictionaryEntry (
1039 // For some characters store the name and sort later
1040 // to determine sorting.
1041 if (0x2100 <= cp && cp <= 0x213F &&
1042 Char.IsSymbol ((char) cp))
1043 sortableCharNames.Add (
1044 new DictionaryEntry (cp, name));
1045 else if (0x3380 <= cp && cp <= 0x33DD)
1046 sortableCharNames.Add (new DictionaryEntry (
1047 cp, name.Substring (7)));
1049 if (Char.GetUnicodeCategory ((char) cp) ==
1050 UnicodeCategory.MathSymbol) {
1051 if (name.StartsWith ("CIRCLED "))
1052 diacritical [cp] = 0xEE;
1053 if (name.StartsWith ("SQUARED "))
1054 diacritical [cp] = 0xEF;
1057 // diacritical weights by character name
1058 if (diacritics.Length != diacriticWeights.Length)
1059 throw new Exception (String.Format ("Should not happen. weights are {0} while labels are {1}", diacriticWeights.Length, diacritics.Length));
1060 for (int d = 0; d < diacritics.Length; d++) {
1061 if (s.IndexOf (diacritics [d]) > 0) {
1062 diacritical [cp] += diacriticWeights [d];
1063 if (s.IndexOf ("COMBINING") >= 0)
1064 diacritical [cp] -= (byte) 2;
1067 // also process "COMBINING blah" here
1068 // For now it is limited to cp < 0x0370
1069 // if (cp < 0x0300 || cp >= 0x0370)
1071 string tmp = diacritics [d].TrimEnd (';');
1072 if (tmp.IndexOf ("WITH ") == 0)
1073 tmp = tmp.Substring (4);
1074 tmp = String.Concat ("COMBINING", (tmp [0] != ' ' ? " " : ""), tmp);
1076 diacritical [cp] = (byte) (diacriticWeights [d] - 2);
1078 //Console.Error.WriteLine ("======= {2:X04} : '{0}' / '{1}'", name, tmp, cp);
1080 // Two-step grep required for it.
1081 if (s.IndexOf ("FULL STOP") > 0 &&
1082 (s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0))
1083 diacritical [cp] |= 0xF4;
1085 // Arabic letter name
1086 if (0x0621 <= cp && cp <= 0x064A &&
1087 Char.GetUnicodeCategory ((char) cp)
1088 == UnicodeCategory.OtherLetter) {
1089 byte value = (byte) (arabicNameMap.Count * 4 + 0x0B);
1094 // hamza, waw, yeh ... special cases.
1099 value = 0x77; // special cases.
1102 // Get primary letter name i.e.
1103 // XXX part of ARABIC LETTER XXX yyy
1104 // e.g. that of "TEH MARBUTA" is "TEH".
1107 // 0x0640 is special: it does
1108 // not start with ARABIC LETTER
1110 name.Substring (14);
1111 int tmpIdx = letterName.IndexOf (' ');
1112 letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
1113 //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
1114 if (arabicNameMap.ContainsKey (letterName))
1115 value = (byte) arabicLetterPrimaryValues [arabicNameMap [letterName]];
1117 arabicNameMap [letterName] = cp;
1120 arabicLetterPrimaryValues [cp] = value;
1123 // Japanese square letter
1124 if (0x3300 <= cp && cp <= 0x3357)
1125 if (!ExistsJIS (cp))
1126 nonJisJapanese.Add (new NonJISCharacter (cp, name));
1128 // normalizationType
1129 string decomp = values [4];
1130 idx = decomp.IndexOf ('<');
1132 switch (decomp.Substring (idx + 1, decomp.IndexOf ('>') - 1)) {
1134 decompType [cp] = DecompositionFull;
1137 decompType [cp] = DecompositionSub;
1140 decompType [cp] = DecompositionSuper;
1143 decompType [cp] = DecompositionSmall;
1146 decompType [cp] = DecompositionIsolated;
1149 decompType [cp] = DecompositionInitial;
1152 decompType [cp] = DecompositionFinal;
1155 decompType [cp] = DecompositionMedial;
1158 decompType [cp] = DecompositionNoBreak;
1161 decompType [cp] = DecompositionCompat;
1164 decompType [cp] = DecompositionFraction;
1167 decompType [cp] = DecompositionFont;
1170 decompType [cp] = DecompositionCircle;
1173 decompType [cp] = DecompositionSquare;
1176 decompType [cp] = DecompositionWide;
1179 decompType [cp] = DecompositionNarrow;
1182 decompType [cp] = DecompositionVertical;
1185 throw new Exception ("Support NFKD type : " + decomp);
1189 decompType [cp] = DecompositionCanonical;
1190 decomp = idx < 0 ? decomp : decomp.Substring (decomp.IndexOf ('>') + 2);
1191 if (decomp.Length > 0) {
1193 string [] velems = decomp.Split (' ');
1194 int didx = decompValues.Count;
1195 decompIndex [cp] = didx;
1196 foreach (string v in velems)
1197 decompValues.Add (int.Parse (v, NumberStyles.HexNumber));
1198 decompLength [cp] = velems.Length;
1200 // [decmpType] -> this_cp
1201 int targetCP = (int) decompValues [didx];
1202 // for "(x)" it specially maps to 'x' .
1203 // FIXME: check if it is sane
1204 if (velems.Length == 3 &&
1205 (int) decompValues [didx] == '(' &&
1206 (int) decompValues [didx + 2] == ')')
1207 targetCP = (int) decompValues [didx + 1];
1208 // special: 0x215F "1/"
1209 else if (cp == 0x215F)
1211 else if (velems.Length > 1 &&
1212 (targetCP < 0x4C00 || 0x9FBB < targetCP))
1213 // skip them, except for CJK ideograph compat
1216 if (targetCP != 0) {
1217 Hashtable entry = (Hashtable) nfkdMap [targetCP];
1218 if (entry == null) {
1219 entry = new Hashtable ();
1220 nfkdMap [targetCP] = entry;
1222 entry [(byte) decompType [cp]] = cp;
1226 if (values [5].Length > 0)
1227 decimalValue [cp] = decimal.Parse (values [5]);
1228 else if (values [6].Length > 0)
1229 decimalValue [cp] = decimal.Parse (values [6]);
1230 else if (values [7].Length > 0) {
1231 string decstr = values [7];
1232 idx = decstr.IndexOf ('/');
1233 if (cp == 0x215F) // special. "1/"
1234 decimalValue [cp] = 0x1;
1238 decimal.Parse (decstr.Substring (0, idx))
1239 / decimal.Parse (decstr.Substring (idx + 1));
1240 else if (decstr [0] == '(' &&
1241 decstr [decstr.Length - 1] == ')')
1244 decimal.Parse (decstr.Substring (1, decstr.Length - 2));
1245 else if (decstr [decstr.Length - 1] == '.')
1248 decimal.Parse (decstr.Substring (0, decstr.Length - 1));
1250 decimalValue [cp] = decimal.Parse (decstr);
// Reads DerivedCoreProperties.txt line by line, delegating every line to
// ProcessDerivedCorePropLine.
1254 void ParseDerivedCoreProperties (string filename)
1257 using (StreamReader file =
1258 new StreamReader (filename)) {
// `line` is 1-based and used only for the diagnostic below.
1259 for (int line = 1; file.Peek () >= 0; line++) {
1261 ProcessDerivedCorePropLine (file.ReadLine ());
// NOTE(review): whether the exception is rethrown after logging is on a
// line not visible here - confirm.
1262 } catch (Exception) {
1263 Console.Error.WriteLine ("**** At line " + line);
// Handles one line of DerivedCoreProperties.txt, of the form
// "XXXX[..YYYY] ; <property> # comment", and marks the covered code points
// in isUppercase [].
// NOTE(review): the test on `value` that restricts this to a particular
// property is on lines not visible here - confirm.
1270 void ProcessDerivedCorePropLine (string s)
// Drop the trailing comment, then find the field separator.
1272 int idx = s.IndexOf ('#');
1274 s = s.Substring (0, idx);
1275 idx = s.IndexOf (';');
// The first field is a single code point or a "first..last" range, in hex.
1278 string cpspec = s.Substring (0, idx);
1279 idx = cpspec.IndexOf ("..");
1280 NumberStyles nf = NumberStyles.HexNumber |
1281 NumberStyles.AllowTrailingWhite;
1282 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1283 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
1284 string value = s.Substring (cpspec.Length + 1).Trim ();
// Only the range start is tested against the BMP limit.
// NOTE(review): a range that starts in the BMP and ends above U+FFFF would
// index past isUppercase [] - confirm that the data never contains one.
1287 if (cp > char.MaxValue)
1292 for (int x = cp; x <= cpEnd; x++)
1293 isUppercase [x] = true;
// Reads the file named by filename and collects code points into four
// lists (gurmukhi, gujarati, georgian, thaana), leaving out those for which
// IsIgnorable returns true. Each list is then sorted with
// UCAComparer.Instance and stored as a char array in orderedGurmukhi,
// orderedGujarati, orderedGeorgian and orderedThaana respectively.
// NOTE(review): the tests on value that choose the target list are not
// visible in this excerpt; presumably they match script names -- confirm.
1298 void ParseScripts (string filename)
1300 ArrayList gurmukhi = new ArrayList ();
1301 ArrayList gujarati = new ArrayList ();
1302 ArrayList georgian = new ArrayList ();
1303 ArrayList thaana = new ArrayList ();
1305 using (StreamReader file =
1306 new StreamReader (filename)) {
// file.Peek () yields a negative value once the reader is exhausted.
1307 while (file.Peek () >= 0) {
1308 string s = file.ReadLine ();
// Cut the line at the first '#'.
1309 int idx = s.IndexOf ('#');
1311 s = s.Substring (0, idx);
1312 idx = s.IndexOf (';');
// cpspec is the code point field that precedes ';'.
1316 string cpspec = s.Substring (0, idx);
// ".." separates the first and the last code point of a range.
1317 idx = cpspec.IndexOf ("..");
// Hexadecimal digits, optionally followed by trailing whitespace.
1318 NumberStyles nf = NumberStyles.HexNumber |
1319 NumberStyles.AllowTrailingWhite;
// Without ".." the spec is a single code point and cpEnd equals cp.
1320 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1321 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
// The value is whatever follows the ';' separator.
1322 string value = s.Substring (cpspec.Length + 1).Trim ();
// Code points above char.MaxValue are treated specially.
// NOTE(review): the statement controlled by this test is not shown.
1325 if (cp > char.MaxValue)
// Each loop below walks the inclusive range cp..cpEnd and appends the
// non-ignorable code points, as char, to one of the four lists.
1330 for (int x = cp; x <= cpEnd; x++)
1331 if (!IsIgnorable (x))
1332 gurmukhi.Add ((char) x);
1335 for (int x = cp; x <= cpEnd; x++)
1336 if (!IsIgnorable (x))
1337 gujarati.Add ((char) x);
1340 for (int x = cp; x <= cpEnd; x++)
1341 if (!IsIgnorable (x))
1342 georgian.Add ((char) x);
1345 for (int x = cp; x <= cpEnd; x++)
1346 if (!IsIgnorable (x))
1347 thaana.Add ((char) x);
// Order every list with the same comparer.
1352 gurmukhi.Sort (UCAComparer.Instance);
1353 gujarati.Sort (UCAComparer.Instance);
1354 georgian.Sort (UCAComparer.Instance);
1355 thaana.Sort (UCAComparer.Instance);
// Publish the sorted lists as char arrays.
1356 orderedGurmukhi = (char []) gurmukhi.ToArray (typeof (char));
1357 orderedGujarati = (char []) gujarati.ToArray (typeof (char));
1358 orderedGeorgian = (char []) georgian.ToArray (typeof (char));
1359 orderedThaana = (char []) thaana.ToArray (typeof (char));
// Reads the file named by filename and passes every line to
// ProcessJISOrderLine. line is advanced once per line read and is printed
// to stderr when an exception is caught.
// NOTE(review): the declaration and initial value of line, and whatever
// follows the error report, are not visible in this excerpt -- confirm.
1362 void ParseJISOrder (string filename)
1366 using (StreamReader file =
1367 new StreamReader (filename)) {
// file.Peek () yields a negative value once the reader is exhausted.
1368 for (;file.Peek () >= 0; line++)
1369 ProcessJISOrderLine (file.ReadLine ());
1371 } catch (Exception) {
1372 Console.Error.WriteLine ("---- line {0}", line);
// Separator characters (tab and space). ProcessJISOrderLine passes them to
// IndexOfAny to find where the first field of a line ends.
1377 char [] ws = new char [] {'\t', ' '};
// Handles one line passed in by ParseJISOrder. The line holds two
// hexadecimal fields separated by a tab or a space: the first is parsed
// into jis and the second into cp. The pair is appended to jisJapanese as
// a JISCharacter.
// NOTE(review): the checks guarding some of these statements are not
// visible in this excerpt -- confirm.
1379 void ProcessJISOrderLine (string s)
// Cut the line at the first '#' and trim surrounding whitespace.
1381 int idx = s.IndexOf ('#');
1383 s = s.Substring (0, idx).Trim ();
// idx now marks the first separator, i.e. the end of the first field.
1386 idx = s.IndexOfAny (ws);
1389 // Both fields start with "0x", so skip those two characters.
1390 int jis = int.Parse (s.Substring (2, idx - 2), NumberStyles.HexNumber);
1391 int cp = int.Parse (s.Substring (idx).Trim ().Substring (2), NumberStyles.HexNumber);
1392 jisJapanese.Add (new JISCharacter (cp, jis));
// Builds CJK weight tables from collation rules held in an XmlDocument.
// In each section the text of the selected rule node is walked character
// by character and a running value v is stored into arr at the index of
// that character (minus offset, which is always 0 here).
// NOTE(review): the calls that load zhXML / jaXML / koXML into doc, the
// assignments that pick arr, category and the starting v, and the tests
// guarding the warnings are not visible in this excerpt -- confirm.
1395 void ParseCJK (string zhXML, string jaXML, string koXML)
1397 XmlDocument doc = new XmlDocument ();
// A null resolver means external resources referenced by the document
// are not resolved.
1398 doc.XmlResolver = null;
1405 // Chinese Simplified
1408 offset = 0;//char.MaxValue - arr.Length;
// The character sequence comes from the collation of type 'pinyin'.
1410 s = doc.SelectSingleNode ("/ldml/collations/collation[@type='pinyin']/rules/pc").InnerText;
1412 foreach (char c in s) {
1414 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
// Store the current value for c, then advance v.
1416 arr [(int) c - offset] = (ushort) v++;
1422 // Chinese Traditional
1425 offset = 0;//char.MaxValue - arr.Length;
// The character sequence comes from the collation of type 'stroke'.
1426 s = doc.SelectSingleNode ("/ldml/collations/collation[@type='stroke']/rules/pc").InnerText;
1428 foreach (char c in s) {
1430 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1432 arr [(int) c - offset] = (ushort) v++;
// NOTE(review): presumably the Japanese section starts here -- confirm.
1441 offset = 0;//char.MaxValue - arr.Length;
// No type filter here: the first collation's rules are used.
1443 s = doc.SelectSingleNode ("/ldml/collations/collation/rules/pc").InnerText;
// Fixed values assigned before the rule text is walked.
1446 arr [0x4EDD] = 0x8002; // Chinese repetition mark?
1447 arr [0x337B] = 0x8004; // Those 4 characters are Gengou
1448 arr [0x337E] = 0x8005;
1449 arr [0x337D] = 0x8006;
1450 arr [0x337C] = 0x8007;
1453 foreach (char c in s) {
1455 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1457 arr [(int) c - offset] = (ushort) v++;
// Special cases for individual characters; the trailing comment on each
// test names a related code point.
// NOTE(review): the statements controlled by these tests are not shown.
1462 if (c == '\u662D') // U+337C
1464 if (c == '\u5927') // U+337D
1466 if (c == '\u5E73') // U+337B
1468 if (c == '\u660E') // U+337E
1470 if (c == '\u9686') // U+F9DC
1473 // FIXME: there are still remaining
1474 // characters after U+FA0C.
1475 // for (int k = 0; k < char.MaxValue; k++) {
// Scan code points below U+FA0C that have a decomposition entry.
1476 for (int k = 0; k < '\uFA0C'; k++) {
1477 if (decompIndex [k] == 0)
// k also receives a value when its decomposition starts with c, or when
// the decomposition has length 3 and its second element is c.
1479 if (decompValues [decompIndex [k]] == c /*&&
1480 decompLength [k] == 1*/ ||
1481 decompLength [k] == 3 &&
1482 decompValues [decompIndex [k] + 1] == c) {
1483 arr [k - offset] = (ushort) v++;
1492 // Korean weight is somewhat complex. It first shifts
1493 // Hangul category from 52-x to 80-x (they are anyways
1494 // computed). CJK ideographs are placed at secondary
1495 // weight, like XX YY 01 zz 01, where XX and YY are
1496 // corresponding "reset" value and zz is 41,43,45...
1498 // Unlike chs,cht and ja, Korean value is a combined
1499 // ushort which is computed as category
1503 offset = 0;//char.MaxValue - arr.Length;
// Each "reset" element is handled together with the node that follows it.
1505 foreach (XmlElement reset in doc.SelectNodes ("/ldml/collations/collation/rules/reset")) {
1506 XmlElement sc = (XmlElement) reset.NextSibling;
1507 // compute "category" and "level 1" for the
1508 // target "reset" Hangul syllable
1509 char rc = reset.InnerText [0];
// ri is the 1-based distance of the reset character from U+AC00.
1510 int ri = ((int) rc - 0xAC00) + 1;
// NOTE(review): the start of this expression is on a line not shown.
1512 ((ri / 254) * 256 + (ri % 254) + 2);
1513 // Place the characters after the target.
// Every character gets the same value p in arr; v goes to cjkKOlv2.
1516 foreach (char c in s) {
1517 arr [(int) c - offset] = p;
1518 cjkKOlv2 [(int) c - offset] = (byte) v;
// Fills ignorableFlags for every value from 0 through char.MaxValue.
// Flag bits: 1 when IsIgnorable (i) is true, 2 when IsIgnorableSymbol (i)
// is true, 4 when IsIgnorableNonSpacing (i) is true.
1528 void FillIgnorables ()
1530 for (int i = 0; i <= char.MaxValue; i++) {
// Unassigned code points are treated specially.
// NOTE(review): the statement controlled by this test is not visible in
// this excerpt (presumably it skips the entry) -- confirm.
1531 if (Char.GetUnicodeCategory ((char) i) ==
1532 UnicodeCategory.OtherNotAssigned)
1534 if (IsIgnorable (i))
1535 ignorableFlags [i] |= 1;
1536 if (IsIgnorableSymbol (i))
1537 ignorableFlags [i] |= 2;
1538 if (IsIgnorableNonSpacing (i))
1539 ignorableFlags [i] |= 4;
// Applies hard-coded corrections to the decomposition tables (decompType,
// decompIndex, decompLength, decompValues) and to diacritical. The
// statements are order dependent: several of them index decompValues
// through decompIndex entries that later statements reset.
1543 void ModifyUnidata ()
1545 // Modify some decomposition equivalence
// Clear the decomposition data of U+FE31 and U+FE32.
1546 decompType [0xFE31] = 0;
1547 decompIndex [0xFE31] = 0;
1548 decompLength [0xFE31] = 0;
1549 decompType [0xFE32] = 0;
1550 decompIndex [0xFE32] = 0;
1551 decompLength [0xFE32] = 0;
1553 // Korean parens numbers
// U+3200..U+321C get 0xA and U+3260..U+327B get 0xC in diacritical.
1554 for (int i = 0x3200; i <= 0x321C; i++)
1555 diacritical [i] = 0xA;
1556 for (int i = 0x3260; i <= 0x327B; i++)
1557 diacritical [i] = 0xC;
1559 // LAMESPEC: these remapping should not be done.
1560 // Windows have incorrect CJK compat mappings.
// Overwrite the first decomposition value of each listed code point;
// for U+323B and U+3238 the decomposition length is also forced to 1.
1561 decompValues [decompIndex [0x32A9]] = 0x91AB;
1562 decompLength [0x323B] = 1;
1563 decompValues [decompIndex [0x323B]] = 0x5B78;
1564 decompValues [decompIndex [0x32AB]] = 0x5B78;
1565 decompValues [decompIndex [0x32A2]] = 0x5BEB;
1566 decompLength [0x3238] = 1;
1567 decompValues [decompIndex [0x3238]] = 0x52DE;
1568 decompValues [decompIndex [0x3298]] = 0x52DE;
1570 // LAMESPEC: custom remapping (which is not bugs but not fine, non-standard compliant things)
// U+FA0C takes over the decompValues slot of U+F929 and is given the
// single value 0x5140; only afterwards is the U+F929 entry cleared.
1571 decompIndex [0xFA0C] = decompIndex [0xF929]; // borrow U+F929 room (being empty)
1572 decompValues [decompIndex [0xFA0C]] = 0x5140;
1573 decompLength [0xFA0C] = 1;
1574 decompIndex [0xF929] = decompLength [0xF929] = 0;
// Clear the decomposition entry of U+F92C as well.
1576 decompIndex [0xF92C] = decompLength [0xF92C] = 0;
// Post-processes parsed data in two steps:
// 1. assigns diacritical values to number characters, range by range;
// 2. renames or removes selected entries of sortableCharNames.
// NOTE(review): the declaration of weight, the switch header, the cases
// that lead to RemoveAt and any index adjustment after a removal are not
// visible in this excerpt -- confirm.
1579 void ModifyParsedValues ()
1581 // number, secondary weights
// numarr is read as pairs: numarr [i] is the first code point of a range
// and numarr [i + 1] is its exclusive end. weight advances once per pair.
1583 int [] numarr = numberSecondaryWeightBounds;
1584 for (int i = 0; i < numarr.Length; i += 2, weight++)
1585 for (int cp = numarr [i]; cp < numarr [i + 1]; cp++)
// Only characters that Char.IsNumber accepts are touched.
1586 if (Char.IsNumber ((char) cp))
1587 diacritical [cp] = weight;
1589 // Update name part of named characters
1590 for (int i = 0; i < sortableCharNames.Count; i++) {
// Each entry is a DictionaryEntry whose key is the code point.
1591 DictionaryEntry de =
1592 (DictionaryEntry) sortableCharNames [i];
1593 int cp = (int) de.Key;
// renamed stays null unless one of the cases below picks a new name.
1594 string renamed = null;
1596 case 0x2101: renamed = "A_1"; break;
1597 case 0x33C3: renamed = "A_2"; break;
1598 case 0x2105: renamed = "C_1"; break;
1599 case 0x2106: renamed = "C_2"; break;
1600 case 0x211E: renamed = "R1"; break;
1601 case 0x211F: renamed = "R2"; break;
1602 // Remove some of them!
// Removing at i shifts the following entries down by one position.
1613 sortableCharNames.RemoveAt (i);
// Replace the entry with one that keeps the code point as key and
// carries the new name as value.
1617 if (renamed != null)
1618 sortableCharNames [i] =
1619 new DictionaryEntry (cp, renamed);
1623 void GenerateCore ()
1627 #region Specially ignored // 01
1628 // This will raise "Defined" flag up.
1629 foreach (char c in specialIgnore)
1630 map [(int) c] = new CharMapEntry (0, 0, 0);
1634 #region Variable weights
1635 // Controls : 06 03 - 06 3D
1637 for (int i = 0; i < 65536; i++) {
1638 if (IsIgnorable (i))
1641 uc = Char.GetUnicodeCategory (c);
1642 // NEL is whitespace but not ignored here.
1643 if (uc == UnicodeCategory.Control &&
1644 !Char.IsWhiteSpace (c) || c == '\u0085')
1645 AddCharMap (c, 6, 1);
1649 fillIndex [6] = 0x80;
1650 AddCharMapGroup ('\'', 6, 1, 0);
1651 AddCharMap ('\uFE63', 6, 1);
1653 // Hyphen/Dash : 06 81 - 06 90
1654 for (int i = 0; i < char.MaxValue; i++) {
1655 if (!IsIgnorable (i) &&
1656 Char.GetUnicodeCategory ((char) i) ==
1657 UnicodeCategory.DashPunctuation) {
1658 AddCharMapGroup2 ((char) i, 6, 1, 0);
1660 // SPECIAL: add 2027 and 2043
1661 // Maybe they are regarded the
1662 // same hyphens in "central"
1664 AddCharMap ('\u2027', 6, 1);
1665 AddCharMap ('\u2043', 6, 1);
1670 // Arabic variable weight chars 06 A0 -
1671 fillIndex [6] = 0xA0;
1673 for (int i = 0x64B; i <= 0x650; i++)
1674 AddArabicCharMap ((char) i);
1676 AddCharMapGroup ('\u0652', 6, 1, 0);
1678 AddCharMapGroup ('\u0651', 6, 1, 0);
1682 #region Nonspacing marks // 01
1683 // FIXME: 01 03 - 01 B6 ... annoyance :(
1685 // Combining diacritical marks: 01 DC -
1687 fillIndex [0x1] = 0x41;
1688 for (int i = 0x030E; i <= 0x0326; i++)
1689 if (!IsIgnorable (i))
1690 AddCharMap ((char) i, 0x1, 1);
1691 for (int i = 0x0329; i <= 0x0334; i++)
1692 if (!IsIgnorable (i))
1693 AddCharMap ((char) i, 0x1, 1);
1694 for (int i = 0x0339; i <= 0x0341; i++)
1695 if (!IsIgnorable (i))
1696 AddCharMap ((char) i, 0x1, 1);
1697 fillIndex [0x1] = 0x72;
1698 for (int i = 0x0346; i <= 0x0348; i++)
1699 if (!IsIgnorable (i))
1700 AddCharMap ((char) i, 0x1, 1);
1701 for (int i = 0x02BE; i <= 0x02BF; i++)
1702 if (!IsIgnorable (i))
1703 AddCharMap ((char) i, 0x1, 1);
1704 for (int i = 0x02C1; i <= 0x02C5; i++)
1705 if (!IsIgnorable (i))
1706 AddCharMap ((char) i, 0x1, 1);
1707 for (int i = 0x02CE; i <= 0x02CF; i++)
1708 if (!IsIgnorable (i))
1709 AddCharMap ((char) i, 0x1, 1);
1710 for (int i = 0x02D1; i <= 0x02D3; i++)
1711 if (!IsIgnorable (i))
1712 AddCharMap ((char) i, 0x1, 1);
1713 AddCharMap ('\u02DE', 0x1, 1);
1714 for (int i = 0x02E4; i <= 0x02E9; i++)
1715 if (!IsIgnorable (i))
1716 AddCharMap ((char) i, 0x1, 1);
1718 // FIXME: needs more love here (it should eliminate
1719 // all the hacky code above).
1720 for (int i = 0x0300; i < 0x0370; i++)
1721 if (!IsIgnorable (i) && diacritical [i] != 0
1722 /* especiall here*/ && !map [i].Defined)
1723 map [i] = new CharMapEntry (
1724 0x1, 0x1, diacritical [i]);
1726 fillIndex [0x1] = 0x94;
1727 // syriac dotted nonspacing marks
1728 AddCharMap ('\u0732', 0x1, 1);
1729 AddCharMap ('\u0735', 0x1, 1);
1730 AddCharMap ('\u0738', 0x1, 1);
1731 AddCharMap ('\u0739', 0x1, 1);
1732 AddCharMap ('\u073C', 0x1, 1);
1733 fillIndex [0x1] = 0x9F;
1734 for (int i = 0x0730; i <= 0x07B0; i++)
1735 if (!IsIgnorable (i) && !map [i].Defined)
1736 AddCharMap ((char) i, 0x1, 1);
1738 fillIndex [0x1] = 0x0C;
1739 for (int i = 0x0EC8; i <= 0x0ECD; i++)
1740 if (!IsIgnorable (i))
1741 AddCharMap ((char) i, 0x1, 1);
1743 // LAMESPEC: It should not stop at '\u20E1'. There are
1744 // a few more characters (that however results in
1745 // overflow of level 2 unless we start before 0xDD).
1746 fillIndex [0x1] = 0xDC;
1747 for (int i = 0x20d0; i <= 0x20e1; i++)
1748 AddCharMap ((char) i, 0x1, 1);
1750 // They are not part of Nonspacing marks, but have
1751 // only diacritical weight.
1752 for (int i = 0x3099; i <= 0x309C; i++)
1753 map [i] = new CharMapEntry (1, 1, 1);
1754 map [0x309D] = new CharMapEntry (0xFF, 0xFF, 1);
1755 map [0x309E] = new CharMapEntry (0xFF, 0xFF, 1);
1756 for (int i = 0x30FC; i <= 0x30FE; i++)
1757 map [i] = new CharMapEntry (0xFF, 0xFF, 1);
1762 #region Whitespaces // 07 03 -
1763 fillIndex [0x7] = 0x2;
1764 AddCharMap (' ', 0x7, 2);
1765 AddCharMap ('\u00A0', 0x7, 1);
1766 for (int i = 9; i <= 0xD; i++)
1767 AddCharMap ((char) i, 0x7, 1);
1768 for (int i = 0x2000; i <= 0x200B; i++)
1769 AddCharMap ((char) i, 0x7, 1);
1771 fillIndex [0x7] = 0x17;
1772 AddCharMapGroup ('\u2028', 0x7, 1, 0);
1773 AddCharMapGroup ('\u2029', 0x7, 1, 0);
1775 // Characters which used to represent layout control.
1776 // LAMESPEC: Windows developers seem to have thought
1777 // that those characters are kind of whitespaces,
1778 // while they aren't.
1779 AddCharMap ('\u2422', 0x7, 1, 0); // blank symbol
1780 AddCharMap ('\u2423', 0x7, 1, 0); // open box
1783 // category 09 - continued symbols from 08
1784 fillIndex [0x9] = 2;
1786 for (int cp = 0x2300; cp <= 0x237A; cp++)
1787 AddCharMap ((char) cp, 0x9, 1, 0);
1790 byte [] arrowLv2 = new byte [] {0, 3, 3, 3, 3, 3, 3, 3, 3};
1791 foreach (DictionaryEntry de in arrowValues) {
1792 int idx = (int) de.Value;
1793 int cp = (int) de.Key;
1794 if (map [cp].Defined)
1796 fillIndex [0x9] = (byte) (0xD8 + idx);
1797 AddCharMapGroup ((char) cp, 0x9, 0, arrowLv2 [idx]);
1801 byte [] boxLv2 = new byte [128];
1802 for (int i = 0; i < boxLv2.Length; i++)
1804 foreach (DictionaryEntry de in boxValues) {
1805 int cp = (int) de.Key;
1806 int off = (int) de.Value;
1807 if (map [cp].Defined)
1810 fillIndex [0x9] = (byte) (0xE5 + off);
1811 AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [-off]++);
1814 fillIndex [0x9] = (byte) (0xE5 + off);
1815 AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [off]++);
1818 // Some special characters (slanted)
1819 fillIndex [0x9] = 0xF4;
1820 AddCharMap ('\u2571', 0x9, 3);
1821 AddCharMap ('\u2572', 0x9, 3);
1822 AddCharMap ('\u2573', 0x9, 3);
1824 // FIXME: implement 0A
1826 fillIndex [0xA] = 2;
1827 // byte currency symbols
1828 for (int cp = 0; cp < 0x100; cp++) {
1829 uc = Char.GetUnicodeCategory ((char) cp);
1830 if (!IsIgnorable (cp) &&
1831 uc == UnicodeCategory.CurrencySymbol &&
1834 AddCharMapGroup ((char) cp, 0xA, 1, 0);
1836 // byte other symbols
1837 for (int cp = 0; cp < 0x100; cp++) {
1839 continue; // SPECIAL: skip FIXME: why?
1840 uc = Char.GetUnicodeCategory ((char) cp);
1841 if (!IsIgnorable (cp) &&
1842 uc == UnicodeCategory.OtherSymbol ||
1843 cp == '\u00B5' || cp == '\u00B7')
1844 AddCharMapGroup ((char) cp, 0xA, 1, 0);
1847 fillIndex [0xA] = 0x0F; // FIXME: it won't be needed
1848 for (int cp = 0x2020; cp <= 0x2031; cp++)
1849 if (Char.IsPunctuation ((char) cp))
1850 AddCharMap ((char) cp, 0xA, 1, 0);
1851 // SPECIAL CASES: why?
1852 AddCharMap ('\u203B', 0xA, 1, 0);
1853 AddCharMap ('\u2040', 0xA, 1, 0);
1854 AddCharMap ('\u2041', 0xA, 1, 0);
1855 AddCharMap ('\u2042', 0xA, 1, 0);
1857 for (int cp = 0x20A0; cp <= 0x20AB; cp++)
1858 AddCharMap ((char) cp, 0xA, 1, 0);
1859 fillIndex [0xA] = 0x2F; // FIXME: it won't be needed
1860 for (int cp = 0x2600; cp <= 0x2613; cp++)
1861 AddCharMap ((char) cp, 0xA, 1, 0);
1863 for (int cp = 0x2620; cp <= 0x2770; cp++)
1864 if (Char.IsSymbol ((char) cp))
1865 AddCharMap ((char) cp, 0xA, 1, 0);
1867 for (int i = 0x2440; i < 0x2460; i++)
1868 AddCharMap ((char) i, 0xA, 1, 0);
1872 #region Numbers // 0C 02 - 0C E1
1873 fillIndex [0xC] = 2;
1875 // 9F8 : Bengali "one less than the denominator"
1876 AddCharMap ('\u09F8', 0xC, 1);
1878 ArrayList numbers = new ArrayList ();
1879 for (int i = 0; i < 65536; i++)
1880 if (!IsIgnorable (i) &&
1881 Char.IsNumber ((char) i) &&
1882 (i < 0x3190 || 0x32C0 < i)) // they are CJK characters
1885 ArrayList numberValues = new ArrayList ();
1886 foreach (int i in numbers)
1887 numberValues.Add (new DictionaryEntry (i, decimalValue [(char) i]));
1888 numberValues.Sort (DecimalDictionaryValueComparer.Instance);
1890 //foreach (DictionaryEntry de in numberValues)
1891 //Console.Error.WriteLine ("****** number {0:X04} : {1} {2}", de.Key, de.Value, decompType [(int) de.Key]);
1893 decimal prevValue = -1;
1894 foreach (DictionaryEntry de in numberValues) {
1895 int cp = (int) de.Key;
1896 decimal currValue = (decimal) de.Value;
1897 bool addnew = false;
1898 if (prevValue < currValue &&
1899 prevValue - (int) prevValue == 0 &&
1903 // Process Hangzhou and Roman numbers
1905 // There are some SPECIAL cases.
1906 if (currValue != 4) // no increment for 4
1910 if (currValue <= 10) {
1911 xcp = (int) prevValue + 0x2170 - 1;
1912 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1913 xcp = (int) prevValue + 0x2160 - 1;
1914 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1915 fillIndex [0xC] += 2;
1916 xcp = (int) prevValue + 0x3021 - 1;
1917 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1920 else if (currValue == 11)
1923 if (prevValue < currValue)
1924 prevValue = currValue;
1925 if (map [cp].Defined)
1927 // HangZhou and Roman are add later
1929 else if (0x3021 <= cp && cp < 0x302A
1930 || 0x2160 <= cp && cp < 0x216A
1931 || 0x2170 <= cp && cp < 0x217A)
1934 if (cp == 0x215B) // FIXME: why?
1935 fillIndex [0xC] += 2;
1936 else if (cp == 0x3021) // FIXME: why?
1938 AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp]);
1939 if (addnew || cp <= '9') {
1940 int mod = (int) currValue - 1;
1942 if (1 <= currValue && currValue <= 10) {
1944 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1946 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1948 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1950 if (1 <= currValue && currValue <= 20) {
1952 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1954 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1956 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1960 if (cp != 0x09E7 && cp != 0x09EA)
1963 // Add special cases that are not regarded as
1964 // numbers in UnicodeCategory speak.
1967 AddCharMapGroup ('\u01BD', 0xC, 0, 0);
1968 AddCharMapGroup ('\u01BC', 0xC, 1, 0);
1970 else if (cp == '6') // FIXME: why?
1975 fillIndex [0xC] = 0xFF;
1976 AddCharMap ('\u221E', 0xC, 1);
1979 #region Letters and NonSpacing Marks (general)
1981 // ASCII Latin alphabets
1982 for (int i = 0; i < alphabets.Length; i++)
1983 AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]);
1986 // non-ASCII Latin alphabets
1987 // FIXME: there is no such characters that are placed
1988 // *after* "alphabets" array items. This is nothing
1989 // more than a hack that creates dummy weight for
1990 // primary characters.
1991 for (int i = 0x0080; i < 0x0300; i++) {
1992 if (!Char.IsLetter ((char) i))
1994 // For those Latin Letters which has NFKD are
1995 // not added as independent primary character.
1996 if (decompIndex [i] != 0)
1999 // 1.some alphabets have primarily
2000 // equivalent ASCII alphabets.
2001 // 2.some have independent primary weights,
2002 // but inside a-to-z range.
2003 // 3.there are some expanded characters that
2004 // are not part of Unicode Standard NFKD.
2005 // 4. some characters are letter in IsLetter
2006 // but not in sortkeys (maybe unicode version
2007 // difference caused it).
2009 // 1. skipping them does not make sense
2010 // case 0xD0: case 0xF0: case 0x131: case 0x138:
2011 // case 0x184: case 0x185: case 0x186: case 0x189:
2012 // case 0x18D: case 0x18E: case 0x18F: case 0x190:
2013 // case 0x194: case 0x195: case 0x196: case 0x19A:
2014 // case 0x19B: case 0x19C:
2015 // 2. skipping them does not make sense
2016 // case 0x14A: // Ng
2017 // case 0x14B: // ng
2021 case 0xDE: // Icelandic Thorn
2022 case 0xFE: // Icelandic Thorn
2023 case 0xDF: // German ss
2024 case 0xFF: // German ss
2026 case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3:
2027 // not classified yet
2028 // case 0x1A6: case 0x1A7: case 0x1A8: case 0x1A9:
2029 // case 0x1AA: case 0x1B1: case 0x1B7: case 0x1B8:
2030 // case 0x1B9: case 0x1BA: case 0x1BB: case 0x1BF:
2034 AddCharMapGroup ((char) i, 0xE, 1, 0);
2038 fillIndex [0xF] = 02;
2039 for (int i = 0x0380; i < 0x0390; i++)
2040 if (Char.IsLetter ((char) i))
2041 AddLetterMap ((char) i, 0xF, 1);
2042 fillIndex [0xF] = 02;
2043 for (int i = 0x0391; i < 0x03CF; i++)
2044 if (Char.IsLetter ((char) i))
2045 AddLetterMap ((char) i, 0xF, 1);
2046 fillIndex [0xF] = 0x40;
2047 for (int i = 0x03D0; i < 0x0400; i++)
2048 if (Char.IsLetter ((char) i))
2049 AddLetterMap ((char) i, 0xF, 1);
2052 // Cyrillic letters are sorted like Latin letters i.e.
2053 // containing culture-specific letters between the
2054 // standard Cyrillic sequence.
2056 // We can't use UCA here; it has different sorting.
2057 char [] orderedCyrillic = new char [] {
2058 '\u0430', '\u0431', '\u0432', '\u0433', '\u0434',
2059 '\u0452', // DJE for Serbocroatian
2061 '\u0454', // IE for Ukrainian
2065 '\u0456', // Byelorussian-Ukrainian I
2075 '\u043F', '\u0440', '\u0441', '\u0442',
2076 '\u045B', // TSHE for Serbocroatian
2078 '\u045E', // Short U for Byelorussian
2079 '\u04B1', // Straight U w/ stroke (diacritical!)
2080 '\u0444', '\u0445', '\u0446', '\u0447',
2082 '\u0448', '\u0449', '\u044A', '\u044B', '\u044C',
2083 '\u044D', '\u044E', '\u044F'};
2085 // For some characters here is a map to basic cyrillic
2086 // letters. See UnicodeData.txt character names for
2087 // the sources. Here I simply declare an equiv. array.
2088 // The content characters are map from U+490(,491),
2089 // skipping small letters.
2090 char [] cymap_src = new char [] {
2091 '\u0433', '\u0433', '\u0433', '\u0436',
2092 '\u0437', '\u043A', '\u043A', '\u043A',
2093 '\u043A', '\u043D', '\u043D', '\u043F',
2094 '\u0445', '\u0441', '\u0442', '\u0443',
2095 '\u0443', '\u0445', '\u0446', '\u0447',
2096 '\u0447', '\u0432', '\u0435', '\u0435',
2097 '\u0406', '\u0436', '\u043A', '\u043D',
2098 '\u0447', '\u0435'};
2100 fillIndex [0x10] = 0x8D;
2101 for (int i = 0x0460; i < 0x0481; i++) {
2102 if (Char.IsLetter ((char) i)) {
2104 // U+476/477 have the same
2105 // primary weight as U+474/475.
2106 fillIndex [0x10] -= 3;
2107 AddLetterMap ((char) i, 0x10, 3);
2111 fillIndex [0x10] = 0x6;
2112 for (int i = 0; i < orderedCyrillic.Length; i++) {
2113 char c = Char.ToUpper (orderedCyrillic [i], CultureInfo.InvariantCulture);
2114 if (!IsIgnorable ((int) c) &&
2115 Char.IsLetter (c) &&
2117 AddLetterMap (c, 0x10, 0);
2118 fillIndex [0x10] += 3;
2122 for (int i = 0; i < cymap_src.Length; i++) {
2123 char c = cymap_src [i];
2124 fillIndex [0x10] = map [c].Level1;
2125 AddLetterMap ((char) (0x0490 + i * 2),
2130 fillIndex [0x11] = 0x3;
2131 for (int i = 0x0531; i < 0x0586; i++)
2132 if (Char.IsLetter ((char) i))
2133 AddLetterMap ((char) i, 0x11, 1);
2137 fillIndex [0x12] = 0x2;
2138 for (int i = 0x05D0; i < 0x05FF; i++)
2139 if (Char.IsLetter ((char) i))
2140 AddLetterMap ((char) i, 0x12, 1);
2142 fillIndex [0x1] = 0x3;
2143 for (int i = 0x0591; i <= 0x05C2; i++) {
2144 if (i == 0x05A3 || i == 0x05BB)
2147 AddCharMap ((char) i, 0x1, 1);
2151 fillIndex [0x1] = 0x8E;
2152 fillIndex [0x13] = 0x3;
2153 for (int i = 0x0621; i <= 0x064A; i++) {
2155 if (Char.GetUnicodeCategory ((char) i)
2156 != UnicodeCategory.OtherLetter) {
2157 // FIXME: arabic nonspacing marks are
2158 // in different order.
2159 AddCharMap ((char) i, 0x1, 1);
2162 // map [i] = new CharMapEntry (0x13,
2163 // (byte) arabicLetterPrimaryValues [i], 1);
2165 (byte) arabicLetterPrimaryValues [i];
2166 AddLetterMap ((char) i, 0x13, 0);
2168 fillIndex [0x13] = 0x84;
2169 for (int i = 0x0674; i < 0x06D6; i++)
2170 if (Char.IsLetter ((char) i))
2171 AddLetterMap ((char) i, 0x13, 1);
2174 // FIXME: it does seem straight codepoint mapping.
2175 fillIndex [0x14] = 04;
2176 for (int i = 0x0901; i < 0x0905; i++)
2177 if (!IsIgnorable (i))
2178 AddLetterMap ((char) i, 0x14, 2);
2179 fillIndex [0x14] = 0xB;
2180 for (int i = 0x0905; i < 0x093A; i++) {
2182 AddCharMap ('\u0929', 0x14, 0, 8);
2184 AddCharMap ('\u0931', 0x14, 0, 8);
2186 AddCharMap ('\u0934', 0x14, 0, 8);
2187 if (Char.IsLetter ((char) i))
2188 AddLetterMap ((char) i, 0x14, 4);
2190 AddCharMap ('\u0960', 0x14, 4);
2192 AddCharMap ('\u0961', 0x14, 4);
2194 fillIndex [0x14] = 0xDA;
2195 for (int i = 0x093E; i < 0x0945; i++)
2196 if (!IsIgnorable (i))
2197 AddLetterMap ((char) i, 0x14, 2);
2198 fillIndex [0x14] = 0xEC;
2199 for (int i = 0x0945; i < 0x094F; i++)
2200 if (!IsIgnorable (i))
2201 AddLetterMap ((char) i, 0x14, 2);
2205 fillIndex [0x15] = 02;
2206 for (int i = 0x0980; i < 0x9FF; i++) {
2207 if (IsIgnorable (i))
2210 fillIndex [0x15] = 0x3B;
2211 switch (Char.GetUnicodeCategory ((char) i)) {
2212 case UnicodeCategory.NonSpacingMark:
2213 case UnicodeCategory.DecimalDigitNumber:
2214 case UnicodeCategory.OtherNumber:
2217 AddLetterMap ((char) i, 0x15, 1);
2220 fillIndex [0x1] = 0x3;
2221 for (int i = 0x0981; i < 0x0A00; i++)
2222 if (Char.GetUnicodeCategory ((char) i) ==
2223 UnicodeCategory.NonSpacingMark)
2224 AddCharMap ((char) i, 0x1, 1);
2226 // Gurmukhi. orderedGurmukhi is from UCA
2227 // FIXME: it does not look equivalent to UCA.
2228 fillIndex [0x16] = 04;
2229 fillIndex [0x1] = 3;
2230 for (int i = 0; i < orderedGurmukhi.Length; i++) {
2231 char c = orderedGurmukhi [i];
2232 if (IsIgnorable ((int) c))
2234 if (IsIgnorableNonSpacing (c)) {
2235 AddLetterMap (c, 0x1, 1);
2238 if (c == '\u0A3C' || c == '\u0A4D' ||
2239 '\u0A66' <= c && c <= '\u0A71')
2241 // SPECIAL CASE: U+A38 = U+A36 at primary level (why?)
2243 if (c == '\u0A36' || c == '\u0A16' || c == '\u0A17' || c == '\u0A5B' || c == '\u0A5E')
2245 AddLetterMap (c, 0x16, shift);
2248 // Gujarati. orderedGujarati is from UCA
2249 fillIndex [0x17] = 0x4;
2251 map [0x0A4D] = new CharMapEntry (1, 0, 0x3);
2252 map [0x0ABD] = new CharMapEntry (1, 0, 0x3);
2253 map [0x0A3C] = new CharMapEntry (1, 0, 0x4);
2254 map [0x0A71] = new CharMapEntry (1, 0, 0x6);
2255 map [0x0ABC] = new CharMapEntry (1, 0, 0xB);
2256 map [0x0A70] = new CharMapEntry (1, 0, 0xE);
2257 // letters go first.
2258 for (int i = 0; i < orderedGujarati.Length; i++) {
2260 char c = orderedGujarati [i];
2261 if (Char.IsLetter (c)) {
2263 if (c == '\u0AB3' || c == '\u0A32')
2265 if (c == '\u0A33') {
2266 AddCharMap ('\u0A32', 0x17, 0);
2267 AddCharMap ('\u0A33', 0x17, 4, 4);
2271 AddCharMap ('\u0AE0', 0x17, 0, 5);
2272 AddCharMap (c, 0x17, 4);
2275 AddCharMap ('\u0AB3', 0x17, 6);
2279 byte gujaratiShift = 4;
2280 fillIndex [0x17] = 0xC0;
2281 for (int i = 0; i < orderedGujarati.Length; i++) {
2282 char c = orderedGujarati [i];
2283 if (fillIndex [0x17] == 0xCC)
2285 if (!Char.IsLetter (c)) {
2288 AddCharMap ('\u0A81', 0x17, 2);
2291 AddLetterMap (c, 0x17, gujaratiShift);
2296 fillIndex [0x1] = 03;
2297 fillIndex [0x18] = 02;
2298 for (int i = 0x0B00; i < 0x0B7F; i++) {
2299 switch (Char.GetUnicodeCategory ((char) i)) {
2300 case UnicodeCategory.NonSpacingMark:
2301 case UnicodeCategory.DecimalDigitNumber:
2302 AddLetterMap ((char) i, 0x1, 1);
2305 AddLetterMap ((char) i, 0x18, 1);
2309 fillIndex [0x19] = 2;
2310 AddCharMap ('\u0BD7', 0x19, 0);
2311 fillIndex [0x19] = 0xA;
2313 for (int i = 0x0B82; i <= 0x0B94; i++)
2314 if (!IsIgnorable ((char) i))
2315 AddCharMap ((char) i, 0x19, 2);
2317 fillIndex [0x19] = 0x28;
2318 // The array for Tamil consonants is a constant.
2319 // Windows have almost similar sequence to TAM from
2320 // tamilnet but a bit different in Grantha.
2321 for (int i = 0; i < orderedTamilConsonants.Length; i++)
2322 AddLetterMap (orderedTamilConsonants [i], 0x19, 4);
2324 fillIndex [0x19] = 0x82;
2325 for (int i = 0x0BBE; i < 0x0BCD; i++)
2326 if (Char.GetUnicodeCategory ((char) i) ==
2327 UnicodeCategory.SpacingCombiningMark
2329 AddLetterMap ((char) i, 0x19, 2);
2332 fillIndex [0x1A] = 0x4;
2333 for (int i = 0x0C00; i < 0x0C62; i++) {
2334 if (i == 0x0C55 || i == 0x0C56)
2336 AddCharMap ((char) i, 0x1A, 3);
2337 char supp = (i == 0x0C0B) ? '\u0C60':
2338 i == 0x0C0C ? '\u0C61' : char.MinValue;
2339 if (supp == char.MinValue)
2341 AddCharMap (supp, 0x1A, 3);
2345 fillIndex [0x1B] = 4;
2346 for (int i = 0x0C80; i < 0x0CE5; i++) {
2347 if (i == 0x0CD5 || i == 0x0CD6)
2349 if (i == 0x0CB1 || i == 0x0CB3 || i == 0x0CDE)
2350 continue; // shift after 0xCB9
2351 AddCharMap ((char) i, 0x1B, 3);
2353 // SPECIAL CASES: but why?
2354 AddCharMap ('\u0CB1', 0x1B, 3); // RRA
2355 AddCharMap ('\u0CB3', 0x1B, 3); // LLA
2356 AddCharMap ('\u0CDE', 0x1B, 3); // FA
2359 AddCharMap ('\u0CE1', 0x1B, 3); // vocalic LL
2363 fillIndex [0x1C] = 2;
2364 for (int i = 0x0D02; i < 0x0D61; i++)
2365 // FIXME: I avoided MSCompatUnicodeTable usage
2366 // here (it results in recursion). So check if
2367 // using NonSpacingMark makes sense or not.
2368 if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark)
2369 // if (!MSCompatUnicodeTable.IsIgnorable ((char) i))
2370 AddCharMap ((char) i, 0x1C, 1);
2372 // Thai ... note that it breaks 0x1E wall after E2B!
2373 // Also, all Thai characters have level 2 value 3.
2374 fillIndex [0x1E] = 2;
2375 for (int i = 0xE40; i <= 0xE44; i++)
2376 AddCharMap ((char) i, 0x1E, 1, 3);
2377 for (int i = 0xE01; i < 0xE2B; i++)
2378 AddCharMap ((char) i, 0x1E, 6, 3);
2379 fillIndex [0x1F] = 5;
2380 for (int i = 0xE2B; i < 0xE30; i++)
2381 AddCharMap ((char) i, 0x1F, 6, 3);
2382 fillIndex [0x1F] = 0x1E;
2383 for (int i = 0xE30; i < 0xE3B; i++)
2384 AddCharMap ((char) i, 0x1F, 1, 3);
2385 // some Thai characters remains.
2386 char [] specialThai = new char [] {'\u0E45', '\u0E46',
2387 '\u0E4E', '\u0E4F', '\u0E5A', '\u0E5B'};
2388 foreach (char c in specialThai)
2389 AddCharMap (c, 0x1F, 1);
2392 fillIndex [0x1F] = 2;
2393 for (int i = 0xE80; i < 0xEDF; i++)
2394 if (Char.IsLetter ((char) i))
2395 AddCharMap ((char) i, 0x1F, 1);
2397 // Georgian. orderedGeorgian is from UCA DUCET.
2398 fillIndex [0x21] = 5;
2399 for (int i = 0; i < orderedGeorgian.Length; i++) {
2400 char c = orderedGeorgian [i];
2401 if (map [(int) c].Defined)
2403 AddCharMap (c, 0x21, 0);
2405 AddCharMap ((char) (c - 0x30), 0x21, 0);
2406 fillIndex [0x21] += 5;
2410 fillIndex [0x22] = 2;
2411 int kanaOffset = 0x3041;
2412 byte [] kanaLines = new byte [] {2, 2, 2, 2, 1, 3, 1, 2, 1};
2414 for (int gyo = 0; gyo < 9; gyo++) {
2415 for (int dan = 0; dan < 5; dan++) {
2416 if (gyo == 7 && dan % 2 == 1) {
2419 kanaOffset -= 2; // There is no space for yi and ye.
2422 int cp = kanaOffset + dan * kanaLines [gyo];
2423 // small lines (a-gyo, ya-gyo)
2424 if (gyo == 0 || gyo == 7) {
2425 AddKanaMap (cp, 1); // small
2426 AddKanaMap (cp + 1, 1);
2429 AddKanaMap (cp, kanaLines [gyo]);
2433 // add small 'ka' (before normal one)
2434 AddKanaMap (0x30F5, 1);
2438 // add small 'ke' (before normal one)
2439 AddKanaMap (0x30F6, 1);
2443 // add small 'Tsu' (before normal one)
2444 AddKanaMap (0x3063, 1);
2448 fillIndex [0x22] += 3;
2449 kanaOffset += 5 * kanaLines [gyo];
2452 // Wa-gyo is almost special, so I just manually add.
2453 AddLetterMap ((char) 0x308E, 0x22, 0);
2454 AddLetterMap ((char) (0x308E + 0x60), 0x22, 0);
2455 AddLetterMap ((char) 0x308F, 0x22, 0);
2456 AddLetterMap ((char) (0x308F + 0x60), 0x22, 0);
2458 AddLetterMap ((char) 0x3090, 0x22, 0);
2459 AddLetterMap ((char) (0x3090 + 0x60), 0x22, 0);
2460 fillIndex [0x22] += 2;
2461 // no "Wu" in Japanese.
2462 AddLetterMap ((char) 0x3091, 0x22, 0);
2463 AddLetterMap ((char) (0x3091 + 0x60), 0x22, 0);
2465 AddLetterMap ((char) 0x3092, 0x22, 0);
2466 AddLetterMap ((char) (0x3092 + 0x60), 0x22, 0);
2468 fillIndex [0x22] = 0x80;
2469 AddLetterMap ((char) 0x3093, 0x22, 0);
2470 AddLetterMap ((char) (0x3093 + 0x60), 0x22, 0);
2472 // JIS Japanese square chars.
2473 fillIndex [0x22] = 0x97;
2474 jisJapanese.Sort (JISComparer.Instance);
2475 foreach (JISCharacter j in jisJapanese)
2476 if (0x3300 <= j.CP && j.CP <= 0x3357)
2477 AddCharMap ((char) j.CP, 0x22, 1);
2478 // non-JIS Japanese square chars.
2479 nonJisJapanese.Sort (NonJISComparer.Instance);
2480 foreach (NonJISCharacter j in nonJisJapanese)
2481 AddCharMap ((char) j.CP, 0x22, 1);
2484 fillIndex [0x23] = 0x02;
2485 for (int i = 0x3105; i <= 0x312C; i++)
2486 AddCharMap ((char) i, 0x23, 1);
2488 // Estrangela: ancient Syriac
2489 fillIndex [0x24] = 0x0B;
2490 // FIXME: is 0x71E really alternative form?
2491 ArrayList syriacAlternatives = new ArrayList (
2492 new int [] {0x714, 0x716, 0x71C, 0x71E, 0x724, 0x727});
2493 for (int i = 0x0710; i <= 0x072C; i++) {
2494 if (i == 0x0711) // NonSpacingMark
2496 if (syriacAlternatives.Contains (i))
2498 AddCharMap ((char) i, 0x24, 4);
2503 foreach (int cp in syriacAlternatives)
2504 map [cp] = new CharMapEntry (0x24,
2505 (byte) (map [cp - 1].Level1 + 2),
2507 // FIXME: Syriac NonSpacingMark should go here.
2510 // FIXME: it turned out that it does not look like UCA
2511 fillIndex [0x24] = 0x6E;
2512 for (int i = 0; i < orderedThaana.Length; i++) {
2513 char c = orderedThaana [i];
2514 if (IsIgnorableNonSpacing ((int) c))
2516 AddCharMap (c, 0x24, 2);
2517 if (c == '\u0782') // SPECIAL CASE: why?
2518 fillIndex [0x24] += 2;
2522 // FIXME: Add more culture-specific letters (that are
2523 // not supported in Windows collation) here.
2525 // Surrogate ... they are computed.
2530 // Unlike UCA Windows Hangul sequence mixes Jongseong
2531 // with Choseong sequence as well as Jungseong,
2532 // adjusted to have the same primary weight for the
2533 // same base character. So it is impossible to compute
2536 // Here I introduce an ordered sequence of mixed
2537 // 'commands' and 'characters' that is similar to
2539 // - ',' increases primary weight.
2540 // - [A B] means a range, increasing index
2541 // - {A B} means a range, without increasing index
2542 // - '=' is no operation (it means the characters
2543 // of both sides have the same weight).
2544 // - '>' inserts a Hangul Syllable block that
2545 // contains 0x251 characters.
2546 // - '<' decreases the index
2547 // - '0'-'9' means skip count
2548 // - whitespaces are ignored
2551 string hangulSequence =
2552 + "\u1100=\u11A8 > \u1101=\u11A9 >"
2553 + "\u11C3, \u11AA, \u11C4, \u1102=\u11AB >"
2554 + "<{\u1113 \u1116}, \u3165,"
2555 + "\u11C5, \u11C6=\u3166,, \u11C7, \u11C8,"
2556 + "\u11AC, \u11C9, \u11AD, \u1103=\u11AE >"
2557 + "<\u1117, \u11CA, \u1104, \u11CB > \u1105=\u11AF >"
2558 + "<{\u1118 \u111B}, \u11B0, [\u11CC \u11D0], \u11B1,"
2559 + "[\u11D1 \u11D2], \u11B2,"
2560 + "[\u11D3 \u11D5], \u11B3,"
2561 + "[\u11D6 \u11D7], \u11B4, \u11B5,"
2562 + "\u11B6=\u11D8, \u3140,, \u11D9, \u1106=\u11B7 >"
2563 + "<{\u111C \u111D}, [\u11DA \u11E2], \u1107=\u11B8 >"
2564 + "<{\u111E \u1120}, \u3172,, \u3173, \u11E3, \u1108 >"
2565 + "<{\u1121 \u112C}, \u3144 \u11B9, \u3174, \u3175,,,, "
2566 + "\u3176,, \u3177, [\u11E4 \u11E6] \u3178,"
2567 + "\u3179, \u1109=\u11BA,,, \u3214=\u3274 <>"
2568 + "<{\u112D \u1133}, \u11E7 \u317A, \u317B, \u317C "
2569 + "[\u11E8 \u11E9],, \u11EA \u317D,, \u110A=\u11BB,,, >"
2570 + "<{\u1134 \u1140}, \u317E,,,,,,, \u11EB,"
2571 + "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >"
2572 + "<{\u1141 \u114C}, \u11EE, \u11EC, \u11ED,,,,, "
2573 + "\u11F1,, \u11F2,,,"
2574 + "\u11EF,,, \u11F0, \u110C=\u11BD,, >"
2575 + "<\u114D, \u110D,, >"
2576 + "<{\u114E \u1151},, \u110E=\u11BE,, >"
2577 + "<{\u1152 \u1155},,, \u110F=\u11BF >"
2578 + "\u1110=\u11C0 > \u1111=\u11C1 >"
2579 + "<\u1156=\u1157, \u11F3, \u11F4, \u1112=\u11C2 >"
2580 + "<\u1158=\u1159=\u115F, \u3185, \u11F9,"
2584 byte hangulCat = 0x52;
2585 fillIndex [hangulCat] = 0x2;
2587 int syllableBlock = 0;
2588 for (int n = 0; n < hangulSequence.Length; n++) {
2589 char c = hangulSequence [n];
2591 if (Char.IsWhiteSpace (c))
2597 IncrementSequentialIndex (ref hangulCat);
2600 if (fillIndex [hangulCat] == 2)
2601 throw new Exception ("FIXME: handle it correctly (yes it is hacky, it is really unfortunate).");
2602 fillIndex [hangulCat]--;
2605 IncrementSequentialIndex (ref hangulCat);
2606 for (int l = 0; l < 0x15; l++)
2607 for (int v = 0; v < 0x1C; v++) {
2609 (char) (0xAC00 + syllableBlock * 0x1C * 0x15 + l * 0x1C + v), hangulCat, 0);
2610 IncrementSequentialIndex (ref hangulCat);
2615 start = hangulSequence [n + 1];
2616 end = hangulSequence [n + 3];
2617 for (int i = start; i <= end; i++) {
2618 AddCharMap ((char) i, hangulCat, 0);
2620 IncrementSequentialIndex (ref hangulCat);
2622 n += 4; // consumes 5 characters for this operation
2625 start = hangulSequence [n + 1];
2626 end = hangulSequence [n + 3];
2627 for (int i = start; i <= end; i++)
2628 AddCharMap ((char) i, hangulCat, 0);
2629 n += 4; // consumes 5 characters for this operation
2632 AddCharMap (c, hangulCat, 0);
2638 for (int i = 0x3200; i < 0x3300; i++) {
2639 if (IsIgnorable (i) || map [i].Defined)
2643 if (decompLength [i] == 4 &&
2644 decompValues [decompIndex [i]] == '(')
2645 ch = decompIndex [i] + 1;
2647 else if (decompLength [i] == 2 &&
2648 decompValues [decompIndex [i] + 1] == '\u1161')
2649 ch = decompIndex [i];
2650 else if (decompLength [i] == 1)
2651 ch = decompIndex [i];
2654 ch = decompValues [ch];
2655 if (ch < 0x1100 || 0x1200 < ch &&
2656 ch < 0xAC00 || 0xD800 < ch)
2660 int offset = i < 0x3260 ? 1 : 0;
2661 if (0x326E <= i && i <= 0x3273)
2664 map [i] = new CharMapEntry (map [ch].Category,
2665 (byte) (map [ch].Level1 + offset),
2667 // Console.Error.WriteLine ("Jamo {0:X04} -> {1:X04}", i, decompValues [decompIndex [i] + 1]);
2673 // Letterlike characters and CJK compatibility square
2674 sortableCharNames.Sort (StringDictionaryValueComparer.Instance);
2675 int [] counts = new int ['Z' - 'A' + 1];
2676 char [] namedChars = new char [sortableCharNames.Count];
2678 foreach (DictionaryEntry de in sortableCharNames) {
2679 counts [((string) de.Value) [0] - 'A']++;
2680 namedChars [nCharNames++] = (char) ((int) de.Key);
2682 nCharNames = 0; // reset
2683 for (int a = 0; a < counts.Length; a++) {
2684 fillIndex [0xE] = (byte) (alphaWeights [a + 1] - counts [a]);
2685 for (int i = 0; i < counts [a]; i++)
2686 //Console.Error.WriteLine ("---- {0:X04} : {1:x02} / {2} {3}", (int) namedChars [nCharNames], fillIndex [0xE], ((DictionaryEntry) sortableCharNames [nCharNames]).Value, Char.GetUnicodeCategory (namedChars [nCharNames]));
2687 AddCharMap (namedChars [nCharNames++], 0xE, 1);
2690 // CJK unified ideograph.
2692 fillIndex [cjkCat] = 0x2;
2693 for (int cp = 0x4E00; cp <= 0x9FBB; cp++)
2694 if (!IsIgnorable (cp))
2695 AddCharMapGroupCJK ((char) cp, ref cjkCat);
2696 // CJK Extensions goes here.
2697 // LAMESPEC: With this Windows style CJK layout, it is
2698 // impossible to add more CJK ideograph i.e. 0x9FA6-
2699 // 0x9FBB can never be added w/o breaking compat.
2700 for (int cp = 0xF900; cp <= 0xFA2D; cp++)
2701 if (!IsIgnorable (cp))
2702 AddCharMapGroupCJK ((char) cp, ref cjkCat);
2704 // PrivateUse ... computed.
2705 // remaining Surrogate ... computed.
2707 #region Special "biggest" area (FF FF)
2708 fillIndex [0xFF] = 0xFF;
2709 char [] specialBiggest = new char [] {
2710 '\u3005', '\u3031', '\u3032', '\u309D',
2711 '\u309E', '\u30FC', '\u30FD', '\u30FE',
2712 '\uFE7C', '\uFE7D', '\uFF70'};
2713 foreach (char c in specialBiggest)
2714 AddCharMap (c, 0xFF, 0);
2717 #region 07 - ASCII non-alphanumeric + 3001, 3002 // 07
2718 // non-alphanumeric ASCII except for: + - < = > '
2719 for (int i = 0x21; i < 0x7F; i++) {
2720 if (Char.IsLetterOrDigit ((char) i)
2721 || "+-<=>'".IndexOf ((char) i) >= 0)
2722 continue; // they are not added here.
2723 AddCharMapGroup2 ((char) i, 0x7, 1, 0);
2724 // Insert 3001 after ',' and 3002 after '.'
2726 AddCharMapGroup2 ('\u3001', 0x7, 1, 0);
2728 AddCharMapGroup2 ('\u3002', 0x7, 1, 0);
2730 AddCharMap ('\uFE30', 0x7, 1, 0);
2734 #region 07 - Punctuations and something else
2735 for (int i = 0xA0; i < char.MaxValue; i++) {
2736 if (IsIgnorable (i))
2739 // FIXME: actually those reset should not be
2740 // done but here I put for easy goal.
2742 fillIndex [0x7] = 0xE2;
2744 fillIndex [0x7] = 0x77;
2756 switch (Char.GetUnicodeCategory ((char) i)) {
2757 case UnicodeCategory.OtherPunctuation:
2758 case UnicodeCategory.ClosePunctuation:
2759 case UnicodeCategory.OpenPunctuation:
2760 case UnicodeCategory.InitialQuotePunctuation:
2761 case UnicodeCategory.FinalQuotePunctuation:
2762 case UnicodeCategory.ModifierSymbol:
2763 // SPECIAL CASES: // 0xA
2764 if (0x2020 <= i && i <= 0x2031)
2766 AddCharMapGroup ((char) i, 0x7, 1, 0);
2769 if (i == 0xA6) // SPECIAL CASE. FIXME: why?
2770 goto case UnicodeCategory.OtherPunctuation;
2775 // FIXME: it should not need to reset level 1, but
2776 // it's for easy goal.
2777 fillIndex [0x7] = 0xB6;
2778 for (int i = 0x2400; i <= 0x2421; i++)
2779 AddCharMap ((char) i, 0x7, 1, 0);
2782 // FIXME: for 07 xx we need more love.
2784 // Characters w/ diacritical marks (NFKD)
2785 for (int i = 0; i <= char.MaxValue; i++) {
2786 if (map [i].Defined || IsIgnorable (i))
2788 if (decompIndex [i] == 0)
2791 int start = decompIndex [i];
2792 int primaryChar = decompValues [start];
2795 int length = decompLength [i];
2796 // special processing for parenthesized ones.
2798 decompValues [start] == '(' &&
2799 decompValues [start + 2] == ')') {
2800 primaryChar = decompValues [start + 1];
2804 if (map [primaryChar].Level1 == 0)
2807 for (int l = 1; l < length; l++) {
2808 int c = decompValues [start + l];
2809 if (map [c].Level1 != 0)
2811 secondary += diacritical [c];
2815 map [i] = new CharMapEntry (
2816 map [primaryChar].Category,
2817 map [primaryChar].Level1,
2822 // category 08 - symbols
2823 fillIndex [0x8] = 2;
2824 // Here Windows mapping is not straightforward. It is
2825 // not based on computation but seems manual sorting.
2826 AddCharMapGroup ('+', 0x8, 1, 0); // plus
2827 AddCharMapGroup ('\u2212', 0x8, 1, 0); // minus
2828 AddCharMapGroup ('\u229D', 0x8, 1, 0); // minus
2829 AddCharMapGroup ('\u2297', 0x8, 1, 0); // mul
2830 AddCharMapGroup ('\u2044', 0x8, 1, 0); // div
2831 AddCharMapGroup ('\u2215', 0x8, 1, 0); // div
2832 AddCharMapGroup ('\u2217', 0x8, 1, 0); // mul
2833 AddCharMapGroup ('\u2218', 0x8, 1, 0); // ring
2834 AddCharMapGroup ('\u2219', 0x8, 1, 0); // bullet
2835 AddCharMapGroup ('\u2213', 0x8, 1, 0); // minus-or-plus
2836 AddCharMapGroup ('\u003C', 0x8, 1, 0); // <
2837 AddCharMapGroup ('\u227A', 0x8, 1, 0); // precedes relation
2838 AddCharMapGroup ('\u22B0', 0x8, 1, 0); // precedes under relation
2840 for (int cp = 0; cp < 0x2300; cp++) {
2841 if (cp == 0xAC) // SPECIAL CASE: skip
2844 cp = 0x2200; // skip to 2200
2845 fillIndex [0x8] = 0x21;
2848 fillIndex [0x8] = 0x3;
2850 fillIndex [0x8] = 0xB9;
2851 if (!map [cp].Defined &&
2852 // Char.GetUnicodeCategory ((char) cp) ==
2853 // UnicodeCategory.MathSymbol)
2854 Char.IsSymbol ((char) cp))
2855 AddCharMapGroup ((char) cp, 0x8, 1, diacritical [cp]);
2856 // SPECIAL CASES: no idea why Windows sorts as such
2859 AddCharMap ('\u227B', 0x8, 1, 0);
2860 AddCharMap ('\u22B1', 0x8, 1, 0);
2863 AddCharMapGroup ('\u00AB', 0x8, 1, 0);
2864 AddCharMapGroup ('\u226A', 0x8, 1, 0);
2865 AddCharMapGroup ('\u00BB', 0x8, 1, 0);
2866 AddCharMapGroup ('\u226B', 0x8, 1, 0);
2869 AddCharMap ('\u01C0', 0x8, 1, 0);
2870 AddCharMap ('\u01C1', 0x8, 1, 0);
2871 AddCharMap ('\u01C2', 0x8, 1, 0);
2876 #region Level2 adjustment
2878 diacritical [0x624] = 0x5;
2879 diacritical [0x626] = 0x7;
2880 diacritical [0x622] = 0x9;
2881 diacritical [0x623] = 0xA;
2882 diacritical [0x625] = 0xB;
2883 diacritical [0x649] = 0x5; // 'alif maqs.uurah
2884 diacritical [0x64A] = 0x7; // Yaa'
2886 for (int i = 0; i < char.MaxValue; i++) {
2888 byte cat = map [i].Category;
2890 case 0xE: // Latin diacritics
2891 case 0x22: // Japanese: circled characters
2892 mod = diacritical [i];
2894 case 0x13: // Arabic
2895 if (diacritical [i] == 0 && i >= 0xFE8D)
2896 mod = 0x8; // default for arabic
2899 if (0x52 <= cat && cat <= 0x7F) // Hangul
2900 mod = diacritical [i];
2902 map [i] = new CharMapEntry (
2903 cat, map [i].Level1, mod);
2907 // FIXME: this is hack but those NonSpacingMark
2908 // characters and still undefined are likely to
2910 for (int i = 0; i < char.MaxValue; i++)
2911 if (!map [i].Defined &&
2913 Char.GetUnicodeCategory ((char) i) ==
2914 UnicodeCategory.NonSpacingMark)
2915 AddCharMap ((char) i, 1, 1);
2917 // FIXME: this is hack but those Symbol characters
2918 // are likely to fall into 0xA category.
2919 for (int i = 0; i < char.MaxValue; i++)
2920 if (!map [i].Defined &&
2922 Char.IsSymbol ((char) i))
2923 AddCharMap ((char) i, 0xA, 1);
2926 private void IncrementSequentialIndex (ref byte hangulCat)
2928 fillIndex [hangulCat]++;
2929 if (fillIndex [hangulCat] == 0) { // overflown
2931 fillIndex [hangulCat] = 0x2;
2935 // Reset fillIndex to fixed value and call AddLetterMap().
2936 private void AddAlphaMap (char c, byte category, byte alphaWeight)
2938 fillIndex [category] = alphaWeight;
2939 AddLetterMap (c, category, 0);
2941 ArrayList al = latinMap [c] as ArrayList;
2945 foreach (int cp in al)
2946 AddLetterMap ((char) cp, category, 0);
// Registers a run of kana letters in category 0x22.
//   i      : code point of the first letter of the run.
//   voices : number of consecutive code points to register.
// The first code point gets level 2 weight 0; the n-th following one gets
// n + 2. Every code point is registered together with the one 0x60 above
// it (presumably the Hiragana -> Katakana offset; verify against callers).
// fillIndex is not advanced here (updateCount is 0).
private void AddKanaMap (int i, byte voices)
{
	for (int offset = 0; offset < voices; offset++) {
		char lower = (char) (i + offset);
		char upper = (char) (lower + 0x60);

		byte level2;
		if (offset > 0)
			level2 = (byte) (offset + 2);
		else
			level2 = 0;

		AddLetterMapCore (lower, 0x22, 0, level2);
		AddLetterMapCore (upper, 0x22, 0, level2);
	}
}
// Convenience overload: forwards to AddLetterMapCore() with the level 2
// weight fixed to 0.
private void AddLetterMap (char c, byte category, byte updateCount)
{
	const byte defaultLevel2 = 0;
	AddLetterMapCore (c, category, updateCount, defaultLevel2);
}
2966 private void AddLetterMapCore (char c, byte category, byte updateCount, byte level2)
2969 // <small> updates index
2970 c2 = ToSmallForm (c);
2972 AddCharMapGroup (c2, category, updateCount, level2);
2973 c2 = Char.ToLower (c, CultureInfo.InvariantCulture);
2974 if (c2 != c && !map [(int) c2].Defined)
2975 AddLetterMapCore (c2, category, 0, level2);
2976 bool doUpdate = true;
2977 if (IsIgnorable ((int) c) || map [(int) c].Defined)
2980 AddCharMapGroup (c, category, 0, level2);
2982 fillIndex [category] += updateCount;
// Convenience overload: forwards to the four-argument AddCharMap() with
// the 'alt' weight fixed to 0 and returns its result unchanged.
private bool AddCharMap (char c, byte category, byte increment)
{
	const byte noAlt = 0;
	bool added = AddCharMap (c, category, increment, noAlt);
	return added;
}
2990 private bool AddCharMap (char c, byte category, byte increment, byte alt)
2992 if (IsIgnorable ((int) c) || map [(int) c].Defined)
2993 return false; // do nothing
2994 map [(int) c] = new CharMapEntry (category,
2995 category == 1 ? alt : fillIndex [category],
2996 category == 1 ? fillIndex [category] : alt);
2997 fillIndex [category] += increment;
3001 private void AddCharMapGroupTail (char c, byte category, byte updateCount)
3003 char c2 = ToSmallFormTail (c);
3005 AddCharMap (c2, category, updateCount, 0);
3007 AddCharMap (c, category, updateCount, 0);
3009 c2 = ToFullWidthTail (c);
3011 AddCharMapGroupTail (c2, category, updateCount);
3015 // Adds characters to table in the order below
3016 // (+ increases weight):
3020 // <full> | <super> | <sub>
3021 // <circle> | <wide> (| <narrow>)
3025 // level2 is fixed (does not increase).
3026 int [] sameWeightItems = new int [] {
3027 DecompositionFraction,
3031 DecompositionCircle,
3033 DecompositionNarrow,
3035 private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2)
3037 if (map [(int) c].Defined)
3040 char small = char.MinValue;
3041 char vertical = char.MinValue;
3042 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
3044 object smv = nfkd [(byte) DecompositionSmall];
3046 small = (char) ((int) smv);
3047 object vv = nfkd [(byte) DecompositionVertical];
3049 vertical = (char) ((int) vv);
3052 // <small> updates index
3053 if (small != char.MinValue)
3054 AddCharMap (small, category, updateCount);
3057 AddCharMap (c, category, 0, level2);
3060 foreach (int weight in sameWeightItems) {
3061 object wv = nfkd [(byte) weight];
3063 AddCharMap ((char) ((int) wv), category, 0, level2);
3067 // update index here.
3068 fillIndex [category] += updateCount;
3070 if (vertical != char.MinValue)
3071 AddCharMap (vertical, category, updateCount, level2);
// Maps a single CJK character at the current position of 'category' and
// then advances the sequential index. 'category' is passed by reference
// because IncrementSequentialIndex() receives it by reference as well.
private void AddCharMapCJK (char c, ref byte category)
{
	// Neither fillIndex increment nor alternative weight is requested
	// here; the position is advanced explicitly below.
	AddCharMap (c, category, 0, 0);
	IncrementSequentialIndex (ref category);

	// Special. I wonder why but Windows skips 9E F9.
	if (category == 0x9E) {
		if (fillIndex [category] == 0xF9)
			IncrementSequentialIndex (ref category);
	}
}
3084 private void AddCharMapGroupCJK (char c, ref byte category)
3086 AddCharMapCJK (c, ref category);
3088 // LAMESPEC: see below.
3089 if (c == '\u5B78') {
3090 AddCharMapCJK ('\u32AB', ref category);
3091 AddCharMapCJK ('\u323B', ref category);
3093 if (c == '\u52DE') {
3094 AddCharMapCJK ('\u3298', ref category);
3095 AddCharMapCJK ('\u3238', ref category);
3098 AddCharMapCJK ('\u32A2', ref category);
3100 // Especially this mapping order totally does
3101 // not make sense to me.
3102 AddCharMapCJK ('\u32A9', ref category);
3104 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
3107 for (byte weight = 0; weight <= 0x12; weight++) {
3108 object wv = nfkd [weight];
3113 // Special: they are ignored in this area.
3114 // FIXME: check if it is sane
3115 if (0xF900 <= w && w <= 0xFAD9)
3117 // LAMESPEC: on Windows some of CJK characters
3118 // in 3200-32B0 are incorrectly mapped. They
3119 // mix Chinese and Japanese Kanji when
3120 // ordering those characters.
3122 case 0x32A2: case 0x3298: case 0x3238:
3123 case 0x32A9: case 0x323B: case 0x32AB:
3127 AddCharMapCJK ((char) w, ref category);
3131 // For now it is only for 0x7 category.
3132 private void AddCharMapGroup2 (char c, byte category, byte updateCount, byte level2)
3134 char small = char.MinValue;
3135 char vertical = char.MinValue;
3136 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
3138 object smv = nfkd [(byte) DecompositionSmall];
3140 small = (char) ((int) smv);
3141 object vv = nfkd [(byte) DecompositionVertical];
3143 vertical = (char) ((int) vv);
3146 // <small> updates index
3147 if (small != char.MinValue)
3148 // SPECIAL CASE excluded (FIXME: why?)
3149 if (small != '\u2024')
3150 AddCharMap (small, category, updateCount);
3153 AddCharMap (c, category, updateCount, level2);
3155 // Since nfkdMap is problematic to have two or more
3156 // NFKD to an identical character, here I iterate all.
3157 for (int c2 = 0; c2 < char.MaxValue; c2++) {
3158 if (decompLength [c2] == 1 &&
3159 (int) (decompValues [decompIndex [c2]]) == (int) c) {
3160 switch (decompType [c2]) {
3161 case DecompositionCompat:
3162 AddCharMap ((char) c2, category, updateCount, level2);
3168 if (vertical != char.MinValue)
3169 // SPECIAL CASE excluded (FIXME: why?)
3170 if (vertical != '\uFE33' && vertical != '\uFE34')
3171 AddCharMap (vertical, category, updateCount, level2);
3174 private void AddArabicCharMap (char c)
3177 byte updateCount = 1;
3181 AddCharMap (c, category, 0, level2);
3183 // Since nfkdMap is problematic to have two or more
3184 // NFKD to an identical character, here I iterate all.
3185 for (int c2 = 0; c2 < char.MaxValue; c2++) {
3186 if (decompLength [c2] == 0)
3188 int idx = decompIndex [c2] + decompLength [c2] - 1;
3189 if ((int) (decompValues [idx]) == (int) c)
3190 AddCharMap ((char) c2, category,
3193 fillIndex [category] += updateCount;
// Calls ToDecomposed() for the DecompositionFull type with tail = false
// and returns its result.
char ToFullWidth (char c)
{
	const bool useTail = false;
	return ToDecomposed (c, DecompositionFull, useTail);
}
// Calls ToDecomposed() for the DecompositionFull type with tail = true
// and returns its result.
char ToFullWidthTail (char c)
{
	const bool useTail = true;
	return ToDecomposed (c, DecompositionFull, useTail);
}
// Calls ToDecomposed() for the DecompositionSmall type with tail = false
// and returns its result.
char ToSmallForm (char c)
{
	const bool useTail = false;
	return ToDecomposed (c, DecompositionSmall, useTail);
}
// Calls ToDecomposed() for the DecompositionSmall type with tail = true
// and returns its result.
char ToSmallFormTail (char c)
{
	const bool useTail = true;
	return ToDecomposed (c, DecompositionSmall, useTail);
}
3216 char ToDecomposed (char c, byte d, bool tail)
3218 if (decompType [(int) c] != d)
3220 int idx = decompIndex [(int) c];
3222 idx += decompLength [(int) c] - 1;
3223 return (char) decompValues [idx];
3226 bool ExistsJIS (int cp)
3228 foreach (JISCharacter j in jisJapanese)
3236 #region Level 3 properties (Case/Width)
// Returns the level 3 weight for c: the raw value from
// ComputeLevel3WeightRaw() shifted up by 2, except that a raw value of 0
// is returned as 0.
private byte ComputeLevel3Weight (char c)
{
	byte raw = ComputeLevel3WeightRaw (c);
	if (raw == 0)
		return raw;
	return (byte) (raw + 2);
}
3244 private byte ComputeLevel3WeightRaw (char c) // add 2 for sortkey value
3247 if ('\u3192' <= c && c <= '\u319F')
3249 // Japanese reading marks
3250 if (c == '\u3001' || c == '\u3002')
3253 if ('\u11A8' <= c && c <= '\u11F9')
3255 if ('\uFFA0' <= c && c <= '\uFFDC')
3257 if ('\u3130' <= c && c <= '\u3164')
3259 if ('\u3165' <= c && c <= '\u318E')
3261 // Georgian Capital letters
3262 if ('\u10A0' <= c && c <= '\u10C5')
3265 if ('\u2776' <= c && c <= '\u277F')
3267 if ('\u2780' <= c && c <= '\u2789')
3269 if ('\u2776' <= c && c <= '\u2793')
3271 if ('\u2160' <= c && c <= '\u216F')
3273 if ('\u2181' <= c && c <= '\u2182')
3276 if ('\u2135' <= c && c <= '\u2138')
3278 if ('\uFE80' <= c && c < '\uFF00') {
3279 // 2(Isolated)/8(Final)/0x18(Medial)
3280 switch (decompType [(int) c]) {
3281 case DecompositionIsolated:
3283 case DecompositionFinal:
3285 case DecompositionMedial:
3290 // actually I dunno the reason why they have weights.
3313 switch (decompType [(int) c]) {
3314 case DecompositionWide: // <wide>
3315 case DecompositionSub: // <sub>
3316 case DecompositionSuper: // <super>
3317 ret |= decompType [(int) c];
3320 if (isSmallCapital [(int) c]) // grep "SMALL CAPITAL"
3322 if (isUppercase [(int) c]) // DerivedCoreProperties
3332 static bool IsIgnorable (int i)
3334 if (unicodeAge [i] >= 3.1)
3336 switch (char.GetUnicodeCategory ((char) i)) {
3337 case UnicodeCategory.OtherNotAssigned:
3338 case UnicodeCategory.Format:
3345 // FIXME: In the future use DerivedAge.txt to examine character
3346 // versions and set those ones that have higher version than
3347 // 1.0 as ignorable.
3348 static bool IsIgnorable (int i)
3352 // I guess, those characters are added between
3353 // Unicode 1.0 (LCMapString) and Unicode 3.1
3354 // (UnicodeCategory), so they used to be
3355 // something like OtherNotAssigned as of Unicode 1.1.
3356 case 0x2df: case 0x387:
3357 case 0x3d7: case 0x3d8: case 0x3d9:
3358 case 0x3f3: case 0x3f4: case 0x3f5: case 0x3f6:
3359 case 0x400: case 0x40d: case 0x450: case 0x45d:
3360 case 0x587: case 0x58a: case 0x5c4: case 0x640:
3361 case 0x653: case 0x654: case 0x655: case 0x66d:
3363 case 0x1e9b: case 0x202f: case 0x20ad:
3364 case 0x20ae: case 0x20af:
3365 case 0x20e2: case 0x20e3:
3366 case 0x2139: case 0x213a: case 0x2183:
3367 case 0x2425: case 0x2426: case 0x2619:
3368 case 0x2670: case 0x2671: case 0x3007:
3369 case 0x3190: case 0x3191:
3370 case 0xfffc: case 0xfffd:
3372 // exceptional characters filtered by the
3373 // following conditions. Originally those exceptional
3374 // ranges are incorrect (they should not be ignored)
3375 // and most of those characters are unfortunately in
3377 case 0x4d8: case 0x4d9:
3378 case 0x4e8: case 0x4e9:
3380 case 0x3036: case 0x303f:
3381 case 0x337b: case 0xfb1e:
3386 // The whole Sinhala characters.
3387 0x0D82 <= i && i <= 0x0DF4
3388 // The whole Tibetan characters.
3389 || 0x0F00 <= i && i <= 0x0FD1
3390 // The whole Myanmar characters.
3391 || 0x1000 <= i && i <= 0x1059
3392 // The whole Ethiopic, Cherokee,
3393 // Canadian Syllabics, Ogham, Runic,
3394 // Tagalog, Hanunoo, Philippine,
3395 // Buhid, Tagbanwa, Khmer and Mongolian
3397 || 0x1200 <= i && i <= 0x1DFF
3398 // Greek extension characters.
3399 || 0x1F00 <= i && i <= 0x1FFF
3400 // The whole Braille characters.
3401 || 0x2800 <= i && i <= 0x28FF
3402 // CJK radical characters.
3403 || 0x2E80 <= i && i <= 0x2EF3
3404 // Kangxi radical characters.
3405 || 0x2F00 <= i && i <= 0x2FD5
3406 // Ideographic description characters.
3407 || 0x2FF0 <= i && i <= 0x2FFB
3408 // Bopomofo letter and final
3409 || 0x31A0 <= i && i <= 0x31B7
3410 // White square with quadrant characters.
3411 || 0x25F0 <= i && i <= 0x25F7
3412 // Ideographic telegraph symbols.
3413 || 0x32C0 <= i && i <= 0x32CB
3414 || 0x3358 <= i && i <= 0x3370
3415 || 0x33E0 <= i && i <= 0x33FF
3416 // The whole YI characters.
3417 || 0xA000 <= i && i <= 0xA48C
3418 || 0xA490 <= i && i <= 0xA4C6
3419 // Armenian small ligatures
3420 || 0xFB13 <= i && i <= 0xFB17
3421 // hebrew, arabic, variation selector.
3422 || 0xFB1D <= i && i <= 0xFE2F
3423 // Arabic ligatures.
3424 || 0xFEF5 <= i && i <= 0xFEFC
3425 // FIXME: why are they excluded?
3426 || 0x01F6 <= i && i <= 0x01F9
3427 || 0x0218 <= i && i <= 0x0233
3428 || 0x02A9 <= i && i <= 0x02AD
3429 || 0x02EA <= i && i <= 0x02EE
3430 || 0x0349 <= i && i <= 0x036F
3431 || 0x0488 <= i && i <= 0x048F
3432 || 0x04D0 <= i && i <= 0x04FF
3433 || 0x0500 <= i && i <= 0x050F // actually it matters only for 2.0
3434 || 0x06D6 <= i && i <= 0x06ED
3435 || 0x06FA <= i && i <= 0x06FE
3436 || 0x2048 <= i && i <= 0x204D
3437 || 0x20e4 <= i && i <= 0x20ea
3438 || 0x213C <= i && i <= 0x214B
3439 || 0x21EB <= i && i <= 0x21FF
3440 || 0x22F2 <= i && i <= 0x22FF
3441 || 0x237B <= i && i <= 0x239A
3442 || 0x239B <= i && i <= 0x23CF
3443 || 0x24EB <= i && i <= 0x24FF
3444 || 0x2596 <= i && i <= 0x259F
3445 || 0x25F8 <= i && i <= 0x25FF
3446 || 0x2672 <= i && i <= 0x2689
3447 || 0x2768 <= i && i <= 0x2775
3448 || 0x27d0 <= i && i <= 0x27ff
3449 || 0x2900 <= i && i <= 0x2aff
3450 || 0x3033 <= i && i <= 0x303F
3451 || 0x31F0 <= i && i <= 0x31FF
3452 || 0x3250 <= i && i <= 0x325F
3453 || 0x32B1 <= i && i <= 0x32BF
3454 || 0x3371 <= i && i <= 0x337B
3455 || 0xFA30 <= i && i <= 0xFA6A
3459 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3461 case UnicodeCategory.PrivateUse:
3462 case UnicodeCategory.Surrogate:
3464 // ignored by nature
3465 case UnicodeCategory.Format:
3466 case UnicodeCategory.OtherNotAssigned:
3473 // To check IsIgnorable sanity, try the driver below under MS.NET.
3476 public static void Main ()
3478 for (int i = 0; i <= char.MaxValue; i++)
3479 Dump (i, IsIgnorable (i));
3482 static void Dump (int i, bool ignore)
3484 switch (Char.GetUnicodeCategory ((char) i)) {
3485 case UnicodeCategory.PrivateUse:
3486 case UnicodeCategory.Surrogate:
3487 return; // check nothing
3491 string s2 = new string ((char) i, 10);
3492 int ret = CultureInfo.InvariantCulture.CompareInfo.Compare (s1, s2, CompareOptions.IgnoreCase);
3493 if ((ret == 0) == ignore)
3495 Console.WriteLine ("{0} : {1:x} {2}", ignore ? "o" : "x", i, Char.GetUnicodeCategory ((char) i));
3498 #endregion // IsIgnorable
3500 #region IsIgnorableSymbol
3501 static bool IsIgnorableSymbol (int i)
3503 if (IsIgnorable (i))
3508 case 0x00b5: case 0x01C0: case 0x01C1:
3509 case 0x01C2: case 0x01C3: case 0x01F6:
3510 case 0x01F7: case 0x01F8: case 0x01F9:
3511 case 0x02D0: case 0x02EE: case 0x037A:
3512 case 0x03D7: case 0x03F3:
3513 case 0x0400: case 0x040d:
3514 case 0x0450: case 0x045d:
3515 case 0x048C: case 0x048D:
3516 case 0x048E: case 0x048F:
3517 case 0x0587: case 0x0640: case 0x06E5:
3518 case 0x06E6: case 0x06FA: case 0x06FB:
3519 case 0x06FC: case 0x093D: case 0x0950:
3520 case 0x1E9B: case 0x2139: case 0x3006:
3521 case 0x3033: case 0x3034: case 0x3035:
3522 case 0xFE7E: case 0xFE7F:
3524 case 0x16EE: case 0x16EF: case 0x16F0:
3526 case 0x2183: // ROMAN NUMERAL REVERSED ONE HUNDRED
3527 case 0x3007: // IDEOGRAPHIC NUMBER ZERO
3528 case 0x3038: // HANGZHOU NUMERAL TEN
3529 case 0x3039: // HANGZHOU NUMERAL TWENTY
3530 case 0x303a: // HANGZHOU NUMERAL THIRTY
3536 case 0x02B9: case 0x02BA: case 0x02C2:
3537 case 0x02C3: case 0x02C4: case 0x02C5:
3538 case 0x02C8: case 0x02CC: case 0x02CD:
3539 case 0x02CE: case 0x02CF: case 0x02D2:
3540 case 0x02D3: case 0x02D4: case 0x02D5:
3541 case 0x02D6: case 0x02D7: case 0x02DE:
3542 case 0x02E5: case 0x02E6: case 0x02E7:
3543 case 0x02E8: case 0x02E9:
3544 case 0x309B: case 0x309C:
3546 case 0x055A: // Armenian Apos
3547 case 0x05C0: // Hebrew Punct
3548 case 0x0E4F: // Thai FONGMAN
3549 case 0x0E5A: // Thai ANGKHANKHU
3550 case 0x0E5B: // Thai KHOMUT
3552 case 0x09F2: // Bengali Rupee Mark
3553 case 0x09F3: // Bengali Rupee Sign
3555 case 0x221e: // INF.
3564 if (0xFE70 <= i && i < 0xFE7C // ARABIC LIGATURES B
3566 || 0x0501 <= i && i <= 0x0510 // CYRILLIC KOMI
3567 || 0xFA30 <= i && i < 0xFA70 // CJK COMPAT
3572 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3574 case UnicodeCategory.Surrogate:
3575 return false; // inconsistent
3577 case UnicodeCategory.SpacingCombiningMark:
3578 case UnicodeCategory.EnclosingMark:
3579 case UnicodeCategory.NonSpacingMark:
3580 case UnicodeCategory.PrivateUse:
3582 if (0x064B <= i && i <= 0x0652) // Arabic
3586 case UnicodeCategory.Format:
3587 case UnicodeCategory.OtherNotAssigned:
3594 // latin in a circle
3595 0x249A <= i && i <= 0x24E9
3596 || 0x2100 <= i && i <= 0x2132
3598 || 0x3196 <= i && i <= 0x31A0
3600 || 0x3200 <= i && i <= 0x321C
3602 || 0x322A <= i && i <= 0x3243
3604 || 0x3260 <= i && i <= 0x32B0
3605 || 0x32D0 <= i && i <= 0x3357
3606 || 0x337B <= i && i <= 0x33DD
3608 use = !Char.IsLetterOrDigit ((char) i);
3612 // This "Digit" rule is mystery.
3613 // It filters some symbols out.
3614 if (Char.IsLetterOrDigit ((char) i))
3616 if (Char.IsNumber ((char) i))
3618 if (Char.IsControl ((char) i)
3619 || Char.IsSeparator ((char) i)
3620 || Char.IsPunctuation ((char) i))
3622 if (Char.IsSymbol ((char) i))
3625 // FIXME: should check more
3630 // To check IsIgnorableSymbol sanity, try the driver below under MS.NET.
3632 public static void Main ()
3634 CompareInfo ci = CultureInfo.InvariantCulture.CompareInfo;
3635 for (int i = 0; i <= char.MaxValue; i++) {
3636 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3637 if (uc == UnicodeCategory.Surrogate)
3640 bool ret = IsIgnorableSymbol (i);
3642 string s1 = "TEST ";
3643 string s2 = "TEST " + (char) i;
3645 int result = ci.Compare (s1, s2, CompareOptions.IgnoreSymbols);
3647 if (ret != (result == 0))
3648 Console.WriteLine ("{0} : {1:x}[{2}]({3})",
3649 ret ? "should not ignore" :
3658 static bool IsIgnorableNonSpacing (int i)
3660 if (IsIgnorable (i))
3664 case 0x02C8: case 0x02DE: case 0x0559: case 0x055A:
3665 case 0x05C0: case 0x0ABD: case 0x0CD5: case 0x0CD6:
3666 case 0x309B: case 0x309C: case 0xFF9E: case 0xFF9F:
3668 case 0x02D0: case 0x0670: case 0x0901: case 0x0902:
3669 case 0x094D: case 0x0962: case 0x0963: case 0x0A41:
3670 case 0x0A42: case 0x0A47: case 0x0A48: case 0x0A4B:
3671 case 0x0A4C: case 0x0A81: case 0x0A82: case 0x0B82:
3672 case 0x0BC0: case 0x0CBF: case 0x0CC6: case 0x0CCC:
3673 case 0x0CCD: case 0x0E4E:
3677 if (0x02b9 <= i && i <= 0x02c5
3678 || 0x02cc <= i && i <= 0x02d7
3679 || 0x02e4 <= i && i <= 0x02ef
3680 || 0x20DD <= i && i <= 0x20E0
3684 if (0x064B <= i && i <= 0x00652
3685 || 0x0941 <= i && i <= 0x0948
3686 || 0x0AC1 <= i && i <= 0x0ACD
3687 || 0x0C3E <= i && i <= 0x0C4F
3688 || 0x0E31 <= i && i <= 0x0E3F
3692 return Char.GetUnicodeCategory ((char) i) ==
3693 UnicodeCategory.NonSpacingMark;
3696 // We can reuse IsIgnorableSymbol testcode
3697 // for IsIgnorableNonSpacing.
3703 public byte Category;
3705 public byte Level2; // It is always single byte.
3706 public bool Defined;
3708 public CharMapEntry (byte category, byte level1, byte level2)
3710 Category = category;
3719 public readonly int CP;
3720 public readonly int JIS;
3722 public JISCharacter (int cp, int cpJIS)
// Orders JISCharacter entries by ascending JIS code value.
class JISComparer : IComparer
{
	// Shared instance; the comparer keeps no state.
	public static readonly JISComparer Instance =
		new JISComparer ();

	// Both arguments must be JISCharacter values; anything else makes
	// the casts below throw InvalidCastException.
	// Returns a negative value, zero or a positive value when o1 sorts
	// before, equal to or after o2.
	public int Compare (object o1, object o2)
	{
		JISCharacter j1 = (JISCharacter) o1;
		JISCharacter j2 = (JISCharacter) o2;
		// CompareTo rather than "j1.JIS - j2.JIS": the difference of
		// two ints can overflow and flip the sign of the result.
		return j1.JIS.CompareTo (j2.JIS);
	}
}
3742 class NonJISCharacter
3744 public readonly int CP;
3745 public readonly string Name;
3747 public NonJISCharacter (int cp, string name)
// Orders NonJISCharacter entries by ordinal comparison of their Name.
class NonJISComparer : IComparer
{
	// Shared instance; the comparer keeps no state.
	public static readonly NonJISComparer Instance =
		new NonJISComparer ();

	// Both arguments must be NonJISCharacter instances; the casts
	// below throw InvalidCastException otherwise.
	public int Compare (object o1, object o2)
	{
		string leftName = ((NonJISCharacter) o1).Name;
		string rightName = ((NonJISCharacter) o2).Name;
		int order = string.CompareOrdinal (leftName, rightName);
		return order;
	}
}
3767 class DecimalDictionaryValueComparer : IComparer
3769 public static readonly DecimalDictionaryValueComparer Instance
3770 = new DecimalDictionaryValueComparer ();
3772 private DecimalDictionaryValueComparer ()
3776 public int Compare (object o1, object o2)
3778 DictionaryEntry e1 = (DictionaryEntry) o1;
3779 DictionaryEntry e2 = (DictionaryEntry) o2;
3780 // FIXME: in case of 0, compare decomposition categories
3781 int ret = Decimal.Compare ((decimal) e1.Value, (decimal) e2.Value);
3784 int i1 = (int) e1.Key;
3785 int i2 = (int) e2.Key;
// Non-generic IComparer over DictionaryEntry items whose Value is a string
// and whose Key is a boxed int; the primary ordering is by Value.
3790 class StringDictionaryValueComparer : IComparer
// Static, read-only comparer instance; the constructor is private, so this
// field is how callers obtain the comparer.
3792 public static readonly StringDictionaryValueComparer Instance
3793 = new StringDictionaryValueComparer ();
3795 private StringDictionaryValueComparer ()
// Both arguments are unboxed as DictionaryEntry; any other type fails at the cast.
3799 public int Compare (object o1, object o2)
3801 DictionaryEntry e1 = (DictionaryEntry) o1;
3802 DictionaryEntry e2 = (DictionaryEntry) o2;
// String.Compare (string, string) is the culture-sensitive overload, so the
// resulting order depends on the current culture of the running process.
3803 int ret = String.Compare ((string) e1.Value, (string) e2.Value);
// NOTE(review): the lines that use ret, i1 and i2 are not visible here;
// presumably the int keys act as a tie-breaker when the values are equal -- confirm.
3806 int i1 = (int) e1.Key;
3807 int i2 = (int) e2.Key;
// Non-generic IComparer over boxed char values. It compares the sort keys that
// CollationElementTable reports for each character, position by position.
3812 class UCAComparer : IComparer
// Static, read-only comparer instance; the constructor is private, so this
// field is how callers obtain the comparer.
3814 public static readonly UCAComparer Instance
3815 = new UCAComparer ();
3817 private UCAComparer ()
// Both arguments are unboxed as char; any other type fails at the cast.
3821 public int Compare (object o1, object o2)
3823 char i1 = (char) o1;
3824 char i2 = (char) o2;
// Number of sort keys available for each character; only the first
// min (l1, l2) positions are compared in the loop below.
3826 int l1 = CollationElementTable.GetSortKeyCount (i1);
3827 int l2 = CollationElementTable.GetSortKeyCount (i2);
3828 int l = l1 > l2 ? l2 : l1;
3830 for (int i = 0; i < l; i++) {
3831 SortKeyValue k1 = CollationElementTable.GetSortKey (i1, i);
3832 SortKeyValue k2 = CollationElementTable.GetSortKey (i2, i);
// Differences are taken in the order Primary, Secondary, Thirtiary, Quarternary.
// NOTE(review): the checks between these assignments are not visible here;
// presumably each non-zero difference is returned before the next level
// is examined -- confirm.
3833 int v = k1.Primary - k2.Primary;
3836 v = k1.Secondary - k2.Secondary;
3839 v = k1.Thirtiary - k2.Thirtiary;
3842 v = k1.Quarternary - k2.Quarternary;
// Tailoring entries in insertion order. The Add*Map methods append to this
// list, and ItemToCharArray enumerates it as ITailoringMap.
3855 ArrayList items = new ArrayList ();
// Constructor taking only an LCID.
// NOTE(review): body not visible; how alias is initialized here is unknown.
3857 public Tailoring (int lcid)
// Constructor taking an LCID and an alias value.
// NOTE(review): body not visible; presumably stores both in the fields that
// the getters return -- confirm.
3862 public Tailoring (int lcid, int alias)
// Getter returning the lcid field (the property header is not visible in this chunk).
3869 get { return lcid; }
// Getter returning the alias field (the property header is not visible in this chunk).
3873 get { return alias; }
// Read/write flag backed by the frenchSort field.
3876 public bool FrenchSort {
3877 get { return frenchSort; }
3878 set { frenchSort = value; }
// Appends a DiacriticalMap built from (target, replace) to the items list.
3881 public void AddDiacriticalMap (byte target, byte replace)
3883 items.Add (new DiacriticalMap (target, replace));
// Appends a SortKeyMap built from (source, sortkey) to the items list.
3886 public void AddSortKeyMap (string source, byte [] sortkey)
3888 items.Add (new SortKeyMap (source, sortkey));
// Appends a ReplacementMap built from (source, replace) to the items list.
3891 public void AddReplacementMap (string source, string replace)
3893 items.Add (new ReplacementMap (source, replace));
// Serializes every tailoring item, in insertion order, into one flat char
// array by concatenating each item's ToCharArray () output.
3896 public char [] ItemToCharArray ()
3898 ArrayList al = new ArrayList ();
// Every element of items is enumerated as ITailoringMap.
3899 foreach (ITailoringMap m in items)
3900 al.AddRange (m.ToCharArray ());
// ArrayList.ToArray (Type) returns System.Array, hence the conversion to char [].
3901 return al.ToArray (typeof (char)) as char [];
// Common contract of the tailoring entry types (DiacriticalMap, SortKeyMap,
// ReplacementMap): each one can serialize itself to a char array.
3904 interface ITailoringMap
// Returns the char-array encoding of this entry; in the implementations
// element 0 is a kind tag (01, 02 or 03).
3906 char [] ToCharArray ();
// Tailoring entry that pairs a Target byte with a Replace byte.
3909 class DiacriticalMap : ITailoringMap
3911 public readonly byte Target;
3912 public readonly byte Replace;
// NOTE(review): presumably stores target and replace in the fields above;
// the constructor body is not visible here -- confirm.
3914 public DiacriticalMap (byte target, byte replace)
// Encodes this entry as exactly three chars: [kind tag, Target, Replace].
3920 public char [] ToCharArray ()
3922 char [] ret = new char [3];
// The literal 02 is decimal 2 (C# has no octal literals).
3923 ret [0] = (char) 02; // kind:DiacriticalMap
3924 ret [1] = (char) Target;
3925 ret [2] = (char) Replace;
// Tailoring entry that maps a Source string to an explicit sort key byte array.
3930 class SortKeyMap : ITailoringMap
3932 public readonly string Source;
3933 public readonly byte [] SortKey;
// NOTE(review): presumably stores source and sortkey in the fields above;
// the constructor body is not visible here -- confirm.
3935 public SortKeyMap (string source, byte [] sortkey)
// Encodes this entry as: kind tag at index 0, the Source characters at
// indices 1..Source.Length, then four sort key values starting at index
// Source.Length + 2.
3941 public char [] ToCharArray ()
// Buffer is Source.Length + 7 chars long; new char arrays are zero-filled.
3943 char [] ret = new char [Source.Length + 7];
3944 ret [0] = (char) 01; // kind:SortKeyMap
3945 for (int i = 0; i < Source.Length; i++)
3946 ret [i + 1] = Source [i];
// The visible lines never write index Source.Length + 1, so it keeps the
// default '\0' -- presumably a terminator for Source (confirm).
// Only SortKey [0..3] are read here, so SortKey must hold at least 4 bytes.
3948 for (int i = 0; i < 4; i++)
3949 ret [i + Source.Length + 2] = (char) SortKey [i];
// Tailoring entry that pairs a Source string with a Replace string.
3954 class ReplacementMap : ITailoringMap
3956 public readonly string Source;
3957 public readonly string Replace;
// NOTE(review): presumably stores source and replace in the fields above;
// the constructor body is not visible here -- confirm.
3959 public ReplacementMap (string source, string replace)
// Encodes this entry as a kind tag followed by the Source characters and then
// the Replace characters.
3965 public char [] ToCharArray ()
// Buffer holds both strings plus three extra chars; one of those is the kind tag.
3967 char [] ret = new char [Source.Length + Replace.Length + 3];
3968 ret [0] = (char) 03; // kind:ReplaceMap
// NOTE(review): the declaration of pos and any separator written between the
// two copies are not visible in this chunk -- confirm how the remaining two
// extra chars are used.
3970 for (int i = 0; i < Source.Length; i++)
3971 ret [pos++] = Source [i];
3974 for (int i = 0; i < Replace.Length; i++)
3975 ret [pos++] = Replace [i];