3 // There are two kinds of sort keys: those that are computed and those that
4 // are laid out as an indexed array. Computed sort keys are:
9 // Also, for composite characters it should prepare a different index table.
11 // Though it is possible to "compute" level 3 weights, they are still dumped
12 // to an array to avoid execution cost.
16 // * sortkey getter signature
18 // int GetSortKey (string s, int index, SortKeyBuffer buf)
19 // Stores sort key for corresponding character element into buf and
20 // returns the length of the consumed _source_ character element in s.
22 // * character length to consume
24 // If there are characters whose primary weight is 0, they are consumed
25 // and considered as a part of the character element.
31 using System.Collections;
32 using System.Globalization;
36 namespace Mono.Globalization.Unicode
38 internal class MSCompatSortKeyTableGenerator
// Command-line entry point: creates a generator instance and hands the
// command-line arguments to its Run method unchanged.
public static void Main (string [] args)
{
	MSCompatSortKeyTableGenerator generator =
		new MSCompatSortKeyTableGenerator ();
	generator.Run (args);
}
// Decomposition kind tags. One of these is stored per code point in
// decompType [] while the decomposition field of a Unicode data line is
// processed. The numeric values are unchanged; they are simply listed in
// ascending order and written uniformly in hex.
// NOTE(review): the "fixed" marks are carried over from the original
// declarations -- presumably those values must never be renumbered; confirm.
const int DecompositionWide = 0x01; // fixed
const int DecompositionSub = 0x02; // fixed
const int DecompositionSmall = 0x03;
const int DecompositionIsolated = 0x04;
const int DecompositionInitial = 0x05;
const int DecompositionFinal = 0x06;
const int DecompositionMedial = 0x07;
const int DecompositionNoBreak = 0x08;
const int DecompositionVertical = 0x09;
const int DecompositionFraction = 0x0A;
const int DecompositionFont = 0x0B;
const int DecompositionSuper = 0x0C; // fixed
const int DecompositionNarrow = 0x0D;
const int DecompositionFull = 0x0E;
const int DecompositionCircle = 0x0F;
const int DecompositionSquare = 0x10;
const int DecompositionCompat = 0x11;
const int DecompositionCanonical = 0x12;
64 TextWriter Result = Console.Out;
66 byte [] fillIndex = new byte [256]; // by category
67 CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1];
69 char [] specialIgnore = new char [] {
70 '\u3099', '\u309A', '\u309B', '\u309C', '\u0BCD',
71 '\u0E47', '\u0E4C', '\uFF9E', '\uFF9F'
// FIXME: need more love (as always)
// 29 entries: 'A'..'Z' followed by U+0292, U+01BE and U+0298.
char [] alphabets = new char [] {
	'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
	'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
	'U', 'V', 'W', 'X', 'Y', 'Z',
	'\u0292', '\u01BE', '\u0298'
};
// 29 byte weights, the same count as alphabets.
// NOTE(review): presumably alphaWeights [i] is the weight for
// alphabets [i]; the code that pairs them is not visible here -- confirm.
byte [] alphaWeights = new byte [] {
	0x02, 0x09, 0x0A, 0x1A, 0x21, 0x23, 0x25, 0x2C, 0x32, 0x35,
	0x36, 0x48, 0x51, 0x70, 0x7C, 0x7E, 0x89, 0x8A, 0x91, 0x99,
	0x9F, 0xA2, 0xA4, 0xA6, 0xA7, 0xA9,
	0xAA, 0xB3, 0xB4
};
87 bool [] isSmallCapital = new bool [char.MaxValue + 1];
88 bool [] isUppercase = new bool [char.MaxValue + 1];
90 byte [] decompType = new byte [char.MaxValue + 1];
91 int [] decompIndex = new int [char.MaxValue + 1];
92 int [] decompLength = new int [char.MaxValue + 1];
94 decimal [] decimalValue = new decimal [char.MaxValue + 1];
96 byte [] diacritical = new byte [char.MaxValue + 1];
98 string [] diacritics = new string [] {
100 "WITH VERTICAL LINE ABOVE;",
101 "WITH GRAVE ACCENT;", "WITH ACUTE ACCENT;", "WITH CIRCUMFLEX ACCENT;",
102 "WITH ACUTE;", "WITH GRAVE;", "WITH DOT ABOVE;", " MIDDLE DOT;",
103 "WITH CIRCUMFLEX;", "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;",
104 " DIALYTIKA AND TONOS;", "WITH MACRON;", "WITH TILDE;", "WITH RING ABOVE;",
105 "WITH OGONEK;", "WITH CEDILLA;",
107 " DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;",
108 "WITH STROKE;", " CIRCUMFLEX AND ACUTE;",
110 " DIAERESIS AND ACUTE;", "WITH CIRCUMFLEX AND GRAVE;", " L SLASH;",
111 " DIAERESIS AND GRAVE;",
113 " CARON AND DOT ABOVE;", " BREVE AND GRAVE;",
114 " MACRON AND ACUTE;",
115 " MACRON AND GRAVE;",
117 " DIAERESIS AND CARON", " DOT ABOVE AND MACRON", " TILDE AND ACUTE",
118 " RING ABOVE AND ACUTE",
119 " DIAERESIS AND MACRON", " CEDILLA AND ACUTE", " MACRON AND DIAERESIS",
120 " CIRCUMFLEX AND TILDE",
121 " TILDE AND DIAERESIS",
124 " CEDILLA AND BREVE",
125 " OGONEK AND MACRON",
128 "WITH HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;",
132 " PRECEDED BY APOSTROPHE",
134 " LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE",
137 " RETROFLEX;", "DIAERESIS BELOW",
140 " CIRCUMFLEX BELOW", "HORN AND ACUTE",
141 " BREVE BELOW;", " HORN AND GRAVE",
144 " DOT BELOW AND DOT ABOVE",
145 " RIGHT HALF RING", " HORN AND TILDE",
146 " CIRCUMFLEX AND DOT BELOW",
147 " BREVE AND DOT BELOW",
148 " DOT BELOW AND MACRON",
149 " HORN AND HOOK ABOVE",
151 // CIRCLED, PARENTHESIZED and so on
152 "CIRCLED DIGIT", "CIRCLED NUMBER", "CIRCLED LATIN",
153 "CIRCLED KATAKANA", "CIRCLED SANS-SERIF",
154 "PARENTHESIZED DIGIT", "PARENTHESIZED NUMBER", "PARENTHESIZED LATIN",
156 byte [] diacriticWeights = new byte [] {
160 0xE, 0xF, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
161 0x17, 0x19, 0x1A, 0x1B, 0x1C,
163 0x1D, 0x1D, 0x1E, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F,
164 0x20, 0x21, 0x22, 0x22, 0x23, 0x24,
166 0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28,
167 0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30,
169 0x40, 0x43, 0x43, 0x43, 0x44, 0x46, 0x47, 0x48,
170 0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x5A,
172 0x60, 0x60, 0x61, 0x61, 0x63, 0x68, 0x68,
173 0x69, 0x69, 0x6A, 0x6D, 0x6E,
175 // CIRCLED, PARENTHESIZED and so on.
176 0xEE, 0xEE, 0xEE, 0xEE, 0xEE,
180 int [] numberSecondaryWeightBounds = new int [] {
181 0x660, 0x680, 0x6F0, 0x700, 0x960, 0x970,
182 0x9E0, 0x9F0, 0x9F4, 0xA00, 0xA60, 0xA70,
183 0xAE0, 0xAF0, 0xB60, 0xB70, 0xBE0, 0xC00,
184 0xC60, 0xC70, 0xCE0, 0xCF0, 0xD60, 0xD70,
185 0xE50, 0xE60, 0xED0, 0xEE0
188 char [] orderedCyrillic;
189 char [] orderedGurmukhi;
190 char [] orderedGujarati;
191 char [] orderedGeorgian;
192 char [] orderedThaana;
194 static readonly char [] orderedTamilConsonants = new char [] {
195 // based on traditional Tamil consonants, except for
196 // Grantha (where Microsoft breaks traditionalism).
197 // http://www.angelfire.com/empire/thamizh/padanGaL
198 '\u0B95', '\u0B99', '\u0B9A', '\u0B9E', '\u0B9F',
199 '\u0BA3', '\u0BA4', '\u0BA8', '\u0BAA', '\u0BAE',
200 '\u0BAF', '\u0BB0', '\u0BB2', '\u0BB5', '\u0BB4',
201 '\u0BB3', '\u0BB1', '\u0BA9', '\u0B9C', '\u0BB8',
204 // cp -> character name (only for some characters)
205 ArrayList sortableCharNames = new ArrayList ();
207 // cp -> arrow value (int)
208 ArrayList arrowValues = new ArrayList ();
210 // cp -> box value (int)
211 ArrayList boxValues = new ArrayList ();
213 // cp -> level1 value
214 Hashtable arabicLetterPrimaryValues = new Hashtable ();
215 Hashtable cyrillicLetterPrimaryValues = new Hashtable ();
218 Hashtable arabicNameMap = new Hashtable ();
219 Hashtable cyrillicNameMap = new Hashtable ();
221 // cp -> Hashtable [decompType] -> cp
222 Hashtable nfkdMap = new Hashtable ();
224 // Latin letter -> ArrayList [int]
225 Hashtable latinMap = new Hashtable ();
227 ArrayList jisJapanese = new ArrayList ();
228 ArrayList nonJisJapanese = new ArrayList ();
230 ushort [] cjkJA = new ushort [char.MaxValue +1];// - 0x4E00];
231 ushort [] cjkCHS = new ushort [char.MaxValue +1];// - 0x3100];
232 ushort [] cjkCHT = new ushort [char.MaxValue +1];// - 0x4E00];
233 ushort [] cjkKO = new ushort [char.MaxValue +1];// - 0x4E00];
234 byte [] cjkKOlv2 = new byte [char.MaxValue +1];// - 0x4E00];
236 byte [] ignorableFlags = new byte [char.MaxValue + 1];
238 static double [] unicodeAge = new double [char.MaxValue + 1];
240 ArrayList tailorings = new ArrayList ();
242 void Run (string [] args)
244 string dirname = args.Length == 0 ? "downloaded" : args [0];
245 ParseSources (dirname);
246 Console.Error.WriteLine ("parse done.");
248 ModifyParsedValues ();
250 Console.Error.WriteLine ("generation done.");
252 Console.Error.WriteLine ("serialization done.");
254 StreamWriter sw = new StreamWriter ("agelog.txt");
255 for (int i = 0; i < char.MaxValue; i++) {
256 bool shouldBe = false;
257 switch (Char.GetUnicodeCategory ((char) i)) {
258 case UnicodeCategory.Format: case UnicodeCategory.OtherNotAssigned:
259 shouldBe = true; break;
261 if (unicodeAge [i] >= 3.1)
263 //if (IsIgnorable (i) != shouldBe)
264 sw.WriteLine ("{1} {2} {3} {0:X04} {4} {5}", i, unicodeAge [i], IsIgnorable (i), IsIgnorableSymbol (i), char.GetUnicodeCategory ((char) i), IsIgnorable (i) != shouldBe ? '!' : ' ');
// Typed wrapper for byte tables: forwards source, the element type
// (byte) and the indexer to the untyped CodePointIndexer.CompressArray
// and casts whatever it returns back to byte [].
byte [] CompressArray (byte [] source, CodePointIndexer i)
{
	object compressed = CodePointIndexer.CompressArray (
		source, typeof (byte), i);
	return (byte []) compressed;
}
// Typed wrapper for ushort tables: forwards source, the element type
// (ushort) and the indexer to the untyped CodePointIndexer.CompressArray
// and casts whatever it returns back to ushort [].
ushort [] CompressArray (ushort [] source, CodePointIndexer i)
{
	object compressed = CodePointIndexer.CompressArray (
		source, typeof (ushort), i);
	return (ushort []) compressed;
}
285 SerializeTailorings ();
287 byte [] categories = new byte [map.Length];
288 byte [] level1 = new byte [map.Length];
289 byte [] level2 = new byte [map.Length];
290 byte [] level3 = new byte [map.Length];
291 ushort [] widthCompat = new ushort [map.Length];
292 for (int i = 0; i < map.Length; i++) {
293 categories [i] = map [i].Category;
294 level1 [i] = map [i].Level1;
295 level2 [i] = map [i].Level2;
296 level3 [i] = ComputeLevel3Weight ((char) i);
297 switch (decompType [i]) {
298 case DecompositionNarrow:
299 case DecompositionWide:
300 case DecompositionSuper:
301 case DecompositionSub:
302 // they are always 1 char
303 widthCompat [i] = (ushort) decompValues [decompIndex [i]];
309 ignorableFlags = CompressArray (ignorableFlags,
310 MSCompatUnicodeTableUtil.Ignorable);
311 categories = CompressArray (categories,
312 MSCompatUnicodeTableUtil.Category);
313 level1 = CompressArray (level1,
314 MSCompatUnicodeTableUtil.Level1);
315 level2 = CompressArray (level2,
316 MSCompatUnicodeTableUtil.Level2);
317 level3 = CompressArray (level3,
318 MSCompatUnicodeTableUtil.Level3);
319 widthCompat = (ushort []) CodePointIndexer.CompressArray (
320 widthCompat, typeof (ushort),
321 MSCompatUnicodeTableUtil.WidthCompat);
322 cjkCHS = CompressArray (cjkCHS,
323 MSCompatUnicodeTableUtil.CjkCHS);
324 cjkCHT = CompressArray (cjkCHT,
325 MSCompatUnicodeTableUtil.Cjk);
326 cjkJA = CompressArray (cjkJA,
327 MSCompatUnicodeTableUtil.Cjk);
328 cjkKO = CompressArray (cjkKO,
329 MSCompatUnicodeTableUtil.Cjk);
330 cjkKOlv2 = CompressArray (cjkKOlv2,
331 MSCompatUnicodeTableUtil.Cjk);
334 Result.WriteLine ("internal static readonly byte [] ignorableFlags = new byte [] {");
336 MemoryStream ms = new MemoryStream ();
337 BinaryWriter binary = new BinaryWriter (ms);
338 binary.Write (ignorableFlags.Length);
340 for (int i = 0; i < ignorableFlags.Length; i++) {
341 byte value = ignorableFlags [i];
343 Result.Write ("{0},", value);
345 Result.Write ("0x{0:X02},", value);
347 binary.Write (value);
349 if ((i & 0xF) == 0xF)
350 Result.WriteLine ("// {0:X04}", i - 0xF);
352 Result.WriteLine ("};");
356 Result.WriteLine ("internal static readonly byte [] categories = new byte [] {");
358 binary.Write (categories.Length);
360 for (int i = 0; i < categories.Length; i++) {
361 byte value = categories [i];
363 Result.Write ("{0},", value);
365 Result.Write ("0x{0:X02},", value);
367 binary.Write (value);
369 if ((i & 0xF) == 0xF)
370 Result.WriteLine ("// {0:X04}", i - 0xF);
372 Result.WriteLine ("};");
375 // Primary weight value
376 Result.WriteLine ("internal static readonly byte [] level1 = new byte [] {");
378 binary.Write (level1.Length);
380 for (int i = 0; i < level1.Length; i++) {
381 byte value = level1 [i];
383 Result.Write ("{0},", value);
385 Result.Write ("0x{0:X02},", value);
387 binary.Write (value);
389 if ((i & 0xF) == 0xF)
390 Result.WriteLine ("// {0:X04}", i - 0xF);
392 Result.WriteLine ("};");
396 Result.WriteLine ("internal static readonly byte [] level2 = new byte [] {");
398 binary.Write (level2.Length);
400 for (int i = 0; i < level2.Length; i++) {
401 byte value = level2 [i];
403 Result.Write ("{0},", value);
405 Result.Write ("0x{0:X02},", value);
407 binary.Write (value);
409 if ((i & 0xF) == 0xF)
410 Result.WriteLine ("// {0:X04}", i - 0xF);
412 Result.WriteLine ("};");
416 Result.WriteLine ("internal static readonly byte [] level3 = new byte [] {");
418 binary.Write (level3.Length);
420 for (int i = 0; i < level3.Length; i++) {
421 byte value = level3 [i];
423 Result.Write ("{0},", value);
425 Result.Write ("0x{0:X02},", value);
427 binary.Write (value);
429 if ((i & 0xF) == 0xF)
430 Result.WriteLine ("// {0:X04}", i - 0xF);
432 Result.WriteLine ("};");
435 // Width insensitivity mappings
436 // (for now it is more lightweight than dumping the
437 // entire NFKD table).
438 Result.WriteLine ("internal static readonly ushort [] widthCompat = new ushort [] {");
440 binary.Write (widthCompat.Length);
442 for (int i = 0; i < widthCompat.Length; i++) {
443 ushort value = widthCompat [i];
445 Result.Write ("{0},", value);
447 Result.Write ("0x{0:X02},", value);
449 binary.Write (value);
451 if ((i & 0xF) == 0xF)
452 Result.WriteLine ("// {0:X04}", i - 0xF);
454 Result.WriteLine ("};");
457 using (FileStream fs = File.Create ("../collation.core.bin")) {
458 byte [] array = ms.ToArray ();
459 fs.Write (array, 0, array.Length);
464 SerializeCJK ("cjkCHS", cjkCHS, char.MaxValue);
465 SerializeCJK ("cjkCHT", cjkCHT, 0x9FB0);
466 SerializeCJK ("cjkJA", cjkJA, 0x9FB0);
467 SerializeCJK ("cjkKO", cjkKO, 0x9FB0);
468 SerializeCJK ("cjkKOlv2", cjkKOlv2, 0x9FB0);
471 void SerializeCJK (string name, ushort [] cjk, int max)
473 int offset = 0;//char.MaxValue - cjk.Length;
474 Result.WriteLine ("static ushort [] {0} = new ushort [] {{", name);
476 MemoryStream ms = new MemoryStream ();
477 BinaryWriter binary = new BinaryWriter (ms);
479 for (int i = 0; i < cjk.Length; i++) {
480 if (i + offset == max)
482 ushort value = cjk [i];
484 Result.Write ("{0},", value);
486 Result.Write ("0x{0:X04},", value);
488 binary.Write (value);
490 if ((i & 0xF) == 0xF)
491 Result.WriteLine ("// {0:X04}", i - 0xF + offset);
493 Result.WriteLine ("};");
496 using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) {
497 byte [] array = ms.ToArray ();
498 fs.Write (array, 0, array.Length);
503 void SerializeCJK (string name, byte [] cjk, int max)
505 int offset = 0;//char.MaxValue - cjk.Length;
506 Result.WriteLine ("static byte [] {0} = new byte [] {{", name);
508 MemoryStream ms = new MemoryStream ();
509 BinaryWriter binary = new BinaryWriter (ms);
511 for (int i = 0; i < cjk.Length; i++) {
512 if (i + offset == max)
514 byte value = cjk [i];
516 Result.Write ("{0},", value);
518 Result.Write ("0x{0:X02},", value);
520 binary.Write (value);
522 if ((i & 0xF) == 0xF)
523 Result.WriteLine ("// {0:X04}", i - 0xF + offset);
525 Result.WriteLine ("};");
528 using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) {
529 byte [] array = ms.ToArray ();
530 fs.Write (array, 0, array.Length);
535 void SerializeTailorings ()
537 Hashtable indexes = new Hashtable ();
538 Hashtable counts = new Hashtable ();
539 Result.WriteLine ("static char [] tailorings = new char [] {");
542 MemoryStream ms = new MemoryStream ();
543 BinaryWriter binary = new BinaryWriter (ms);
545 foreach (Tailoring t in tailorings) {
548 Result.Write ("/*{0}*/", t.LCID);
549 indexes.Add (t.LCID, count);
550 char [] values = t.ItemToCharArray ();
551 counts.Add (t.LCID, values.Length);
552 foreach (char c in values) {
553 Result.Write ("'\\x{0:X}', ", (int) c);
554 if (++count % 16 == 0)
555 Result.WriteLine (" // {0:X04}", count - 16);
557 binary.Write ((ushort) c);
561 Result.WriteLine ("};");
563 Result.WriteLine ("static TailoringInfo [] tailoringInfos = new TailoringInfo [] {");
565 byte [] rawdata = ms.ToArray ();
566 ms = new MemoryStream ();
567 binary = new BinaryWriter (ms);
568 binary.Write (tailorings.Count);
570 foreach (Tailoring t in tailorings) {
571 int target = t.Alias != 0 ? t.Alias : t.LCID;
572 if (!indexes.ContainsKey (target)) {
573 throw new Exception (String.Format ("WARNING: no corresponding definition for tailoring alias. From {0} to {1}", t.LCID, t.Alias));
576 int idx = (int) indexes [target];
577 int cnt = (int) counts [target];
578 bool french = t.FrenchSort;
580 foreach (Tailoring t2 in tailorings)
581 if (t2.LCID == t.LCID)
582 french = t2.FrenchSort;
583 Result.WriteLine ("new TailoringInfo ({0}, 0x{1:X}, {2}, {3}), ", t.LCID, idx, cnt, french ? "true" : "false");
585 binary.Write (t.LCID);
588 binary.Write (french);
591 Result.WriteLine ("};");
593 binary.Write ((byte) 0xFF);
594 binary.Write ((byte) 0xFF);
595 binary.Write (rawdata.Length / 2);
596 binary.Write (rawdata, 0, rawdata.Length);
599 using (FileStream fs = File.Create ("../collation.tailoring.bin")) {
600 byte [] array = ms.ToArray ();
601 fs.Write (array, 0, array.Length);
608 void ParseSources (string dirname)
611 dirname + "/UnicodeData.txt";
612 string derivedCoreProps =
613 dirname + "/DerivedCoreProperties.txt";
615 dirname + "/Scripts.txt";
617 dirname + "/CP932.TXT";
619 dirname + "/DerivedAge.txt";
620 string chXML = dirname + "/common/collation/zh.xml";
621 string jaXML = dirname + "/common/collation/ja.xml";
622 string koXML = dirname + "/common/collation/ko.xml";
624 ParseDerivedAge (derivedAge);
628 ParseJISOrder (cp932); // in prior to ParseUnidata()
629 ParseUnidata (unidata);
630 ParseDerivedCoreProperties (derivedCoreProps);
631 ParseScripts (scripts);
632 ParseCJK (chXML, jaXML, koXML);
634 ParseTailorings ("mono-tailoring-source.txt");
637 void ParseTailorings (string filename)
641 using (StreamReader sr = new StreamReader (filename)) {
643 while (sr.Peek () >= 0) {
645 ProcessTailoringLine (ref t,
646 sr.ReadLine ().Trim ());
648 } catch (Exception) {
649 Console.Error.WriteLine ("ERROR at line {0}", line);
655 // For now this is enough.
656 string ParseTailoringSourceValue (string s)
658 StringBuilder sb = new StringBuilder ();
659 for (int i = 0; i < s.Length; i++) {
660 if (s.StartsWith ("\\u")) {
661 sb.Append ((char) int.Parse (
662 s.Substring (2, 4), NumberStyles.HexNumber),
669 return sb.ToString ();
672 void ProcessTailoringLine (ref Tailoring t, string s)
674 int idx = s.IndexOf ('#');
676 s = s.Substring (0, idx).Trim ();
677 if (s.Length == 0 || s [0] == '#')
680 idx = s.IndexOf ('=');
683 int.Parse (s.Substring (1, idx - 1)),
684 int.Parse (s.Substring (idx + 1)));
686 t = new Tailoring (int.Parse (s.Substring (1)));
690 if (s.StartsWith ("*FrenchSort")) {
694 string d = "*Diacritical";
695 if (s.StartsWith (d)) {
696 idx = s.IndexOf ("->");
697 t.AddDiacriticalMap (
698 byte.Parse (s.Substring (d.Length, idx - d.Length).Trim (),
699 NumberStyles.HexNumber),
700 byte.Parse (s.Substring (idx + 2).Trim (),
701 NumberStyles.HexNumber));
704 idx = s.IndexOf (':');
706 string source = s.Substring (0, idx).Trim ();
707 string [] l = s.Substring (idx + 1).Trim ().Split (' ');
708 byte [] b = new byte [4];
709 for (int i = 0; i < 4; i++) {
713 b [i] = byte.Parse (l [i],
714 NumberStyles.HexNumber);
716 t.AddSortKeyMap (ParseTailoringSourceValue (source),
719 idx = s.IndexOf ('=');
721 t.AddReplacementMap (
722 ParseTailoringSourceValue (
723 s.Substring (0, idx).Trim ()),
724 ParseTailoringSourceValue (
725 s.Substring (idx + 1).Trim ()));
728 void ParseDerivedAge (string filename)
730 using (StreamReader file =
731 new StreamReader (filename)) {
732 while (file.Peek () >= 0) {
733 string s = file.ReadLine ();
734 int idx = s.IndexOf ('#');
736 s = s.Substring (0, idx);
737 idx = s.IndexOf (';');
741 string cpspec = s.Substring (0, idx);
742 idx = cpspec.IndexOf ("..");
743 NumberStyles nf = NumberStyles.HexNumber |
744 NumberStyles.AllowTrailingWhite;
745 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
746 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
747 string value = s.Substring (cpspec.Length + 1).Trim ();
750 if (cp > char.MaxValue)
753 double v = double.Parse (value);
754 for (int i = cp; i <= cpEnd; i++)
758 unicodeAge [0] = double.MaxValue; // never be supported
761 void ParseUnidata (string filename)
763 ArrayList decompValues = new ArrayList ();
764 using (StreamReader unidata =
765 new StreamReader (filename)) {
766 for (int line = 1; unidata.Peek () >= 0; line++) {
768 ProcessUnidataLine (unidata.ReadLine (), decompValues);
769 } catch (Exception) {
770 Console.Error.WriteLine ("**** At line " + line);
775 this.decompValues = (int [])
776 decompValues.ToArray (typeof (int));
779 char previousLatinTarget = char.MinValue;
780 byte [] diacriticalOffset = new byte ['Z' - 'A' + 1];
782 void ProcessUnidataLine (string s, ArrayList decompValues)
784 int idx = s.IndexOf ('#');
786 s = s.Substring (0, idx);
787 idx = s.IndexOf (';');
790 int cp = int.Parse (s.Substring (0, idx), NumberStyles.HexNumber);
791 string [] values = s.Substring (idx + 1).Split (';');
794 if (cp > char.MaxValue)
796 if (IsIgnorable (cp))
799 string name = values [0];
801 // SPECIAL CASE: rename some characters for diacritical
802 // remapping. FIXME: why are they different?
803 // FIXME: it's still not working.
804 if (cp == 0x018B || cp == 0x018C)
805 name = name.Replace ("TOPBAR", "STROKE");
808 if (s.IndexOf ("SMALL CAPITAL") > 0)
809 isSmallCapital [cp] = true;
811 // latin mapping by character name
812 if (s.IndexOf ("LATIN") >= 0) {
813 int lidx = s.IndexOf ("LETTER DOTLESS ");
814 int offset = lidx + 15;
816 lidx = s.IndexOf ("LETTER TURNED ");
820 lidx = s.IndexOf ("LETTER CAPITAL ");
824 lidx = s.IndexOf ("LETTER SCRIPT ");
828 lidx = s.IndexOf ("LETTER ");
831 char c = lidx > 0 ? s [offset] : char.MinValue;
832 char n = s [offset + 1];
833 char target = char.MinValue;
// Accept only a one-character letter name: c must be in 'A'..'Z' AND be
// followed by either a space or the ';' field separator.
// The previous grouping "(n == ' ') || n == ';'" let "n == ';'" satisfy
// the whole condition on its own, because && binds tighter than ||, so
// the 'A'..'Z' range check was skipped in that case.
if ('A' <= c && c <= 'Z' &&
	(n == ' ' || n == ';')) {
837 // FIXME: After 'Z', I cannot reset this state.
838 previousLatinTarget = c == 'Z' ? char.MinValue : c;
841 if (s.Substring (offset).StartsWith ("ALPHA"))
843 else if (s.Substring (offset).StartsWith ("TONE SIX"))
845 else if (s.Substring (offset).StartsWith ("OPEN O"))
847 else if (s.Substring (offset).StartsWith ("SCHWA"))
849 else if (s.Substring (offset).StartsWith ("ENG"))
851 else if (s.Substring (offset).StartsWith ("OI;")) // 01A2,01A3
853 else if (s.Substring (offset).StartsWith ("YR;")) // 01A2,01A3
855 else if (s.Substring (offset).StartsWith ("TONE TWO"))
857 else if (s.Substring (offset).StartsWith ("ESH"))
860 if (target == char.MinValue)
861 target = previousLatinTarget;
863 if (target != char.MinValue) {
864 ArrayList entry = (ArrayList) latinMap [target];
866 entry = new ArrayList ();
867 latinMap [target] = entry;
870 // FIXME: This secondary weight is hack.
871 // They are here because they must not
872 // be identical to the corresponding
874 if (c != target && diacritical [cp] == 0) {
875 diacriticalOffset [c - 'A']++;
876 diacritical [cp] = (byte) (diacriticalOffset [c - 'A'] + 0x7C);
882 if (0x2000 <= cp && cp < 0x3000) {
884 // SPECIAL CASES. FIXME: why?
886 case 0x21C5: value = -1; break; // E2
887 case 0x261D: value = 1; break;
888 case 0x27A6: value = 3; break;
889 case 0x21B0: value = 7; break;
890 case 0x21B1: value = 3; break;
891 case 0x21B2: value = 7; break;
892 case 0x21B4: value = 5; break;
893 case 0x21B5: value = 7; break;
894 case 0x21B9: value = -1; break; // E1
895 case 0x21CF: value = 7; break;
896 case 0x21D0: value = 3; break;
898 string [] arrowTargets = new string [] {
910 for (int i = 1; value == 0 && i < arrowTargets.Length; i++)
911 if (s.IndexOf (arrowTargets [i]) > 0 &&
912 s.IndexOf ("BARB " + arrowTargets [i]) < 0 &&
913 s.IndexOf (" OVER") < 0
917 arrowValues.Add (new DictionaryEntry (
922 if (0x2500 <= cp && cp < 0x2600) {
925 // up:1 down:2 right:4 left:8 vert:16 horiz:32
928 // [dr] [dl] [ur] [ul]
932 ArrayList flags = new ArrayList (new int [] {
935 4 + 2, 8 + 2, 4 + 1, 8 + 1,
936 16 + 4, 1 + 2 + 4, 16 + 8, 1 + 2 + 8,
937 32 + 2, 4 + 8 + 2, 32 + 1, 4 + 8 + 1,
938 16 + 32, 1 + 2 + 4 + 8, 4 + 8 + 16, 1 + 2 + 32
940 byte [] offsets = new byte [] {
947 if (s.IndexOf ("BOX DRAWINGS ") >= 0) {
949 if (s.IndexOf (" UP") >= 0)
951 if (s.IndexOf (" DOWN") >= 0)
953 if (s.IndexOf (" RIGHT") >= 0)
955 if (s.IndexOf (" LEFT") >= 0)
957 if (s.IndexOf (" VERTICAL") >= 0)
959 if (s.IndexOf (" HORIZONTAL") >= 0)
962 int fidx = flags.IndexOf (flag);
963 value = fidx < 0 ? fidx : offsets [fidx];
964 } else if (s.IndexOf ("BLOCK") >= 0) {
965 if (s.IndexOf ("ONE EIGHTH") >= 0)
967 else if (s.IndexOf ("ONE QUARTER") >= 0)
969 else if (s.IndexOf ("THREE EIGHTHS") >= 0)
971 else if (s.IndexOf ("HALF") >= 0)
973 else if (s.IndexOf ("FIVE EIGHTHS") >= 0)
975 else if (s.IndexOf ("THREE QUARTERS") >= 0)
977 else if (s.IndexOf ("SEVEN EIGHTHS") >= 0)
982 else if (s.IndexOf ("SHADE") >= 0)
984 else if (s.IndexOf ("SQUARE") >= 0)
986 else if (s.IndexOf ("VERTICAL RECTANGLE") >= 0)
988 else if (s.IndexOf ("RECTANGLE") >= 0)
990 else if (s.IndexOf ("PARALLELOGRAM") >= 0)
992 else if (s.IndexOf ("TRIANGLE") >= 0) {
993 if (s.IndexOf ("UP-POINTING") >= 0)
995 else if (s.IndexOf ("RIGHT-POINTING") >= 0)
997 else if (s.IndexOf ("DOWN-POINTING") >= 0)
999 else if (s.IndexOf ("LEFT-POINTING") >= 0)
1000 value = 0xC3 - 0xE5;
1002 else if (s.IndexOf ("POINTER") >= 0) {
1003 if (s.IndexOf ("RIGHT-POINTING") >= 0)
1004 value = 0xC4 - 0xE5;
1005 else if (s.IndexOf ("LEFT-POINTING") >= 0)
1006 value = 0xC5 - 0xE5;
1008 else if (s.IndexOf ("DIAMOND") >= 0)
1009 value = 0xC6 - 0xE5;
1010 else if (s.IndexOf ("FISHEYE") >= 0)
1011 value = 0xC7 - 0xE5;
1012 else if (s.IndexOf ("LOZENGE") >= 0)
1013 value = 0xC8 - 0xE5;
1014 else if (s.IndexOf ("BULLSEYE") >= 0)
1015 value = 0xC9 - 0xE5;
1016 else if (s.IndexOf ("CIRCLE") >= 0) {
1017 if (cp == 0x25D6) // it could be IndexOf ("LEFT HALF BLACK CIRCLE")
1018 value = 0xCA - 0xE5;
1019 else if (cp == 0x25D7) // it could be IndexOf ("RIGHT HALF BLACK CIRCLE")
1020 value = 0xCB - 0xE5;
1022 value = 0xC9 - 0xE5;
1024 if (0x25DA <= cp && cp <= 0x25E5)
1025 value = 0xCD + cp - 0x25DA - 0xE5;
1027 // SPECIAL CASE: BOX DRAWING DIAGONAL patterns
1029 case 0x2571: value = 0xF; break;
1030 case 0x2572: value = 0x10; break;
1031 case 0x2573: value = 0x11; break;
1034 boxValues.Add (new DictionaryEntry (
1038 // For some characters store the name and sort later
1039 // to determine sorting.
1040 if (0x2100 <= cp && cp <= 0x213F &&
1041 Char.IsSymbol ((char) cp))
1042 sortableCharNames.Add (
1043 new DictionaryEntry (cp, name));
1044 else if (0x3380 <= cp && cp <= 0x33DD)
1045 sortableCharNames.Add (new DictionaryEntry (
1046 cp, name.Substring (7)));
1048 // diacritical weights by character name
1049 if (diacritics.Length != diacriticWeights.Length)
1050 throw new Exception (String.Format ("Should not happen. weights are {0} while labels are {1}", diacriticWeights.Length, diacritics.Length));
1051 for (int d = 0; d < diacritics.Length; d++) {
1052 if (s.IndexOf (diacritics [d]) > 0) {
1053 diacritical [cp] += diacriticWeights [d];
1054 if (s.IndexOf ("COMBINING") >= 0)
1055 diacritical [cp] -= (byte) 2;
1058 // also process "COMBINING blah" here
1059 // For now it is limited to cp < 0x0370
1060 // if (cp < 0x0300 || cp >= 0x0370)
1062 string tmp = diacritics [d].TrimEnd (';');
1063 if (tmp.IndexOf ("WITH ") == 0)
1064 tmp = tmp.Substring (4);
1065 tmp = String.Concat ("COMBINING", (tmp [0] != ' ' ? " " : ""), tmp);
1067 diacritical [cp] = (byte) (diacriticWeights [d] - 2);
1069 //Console.Error.WriteLine ("======= {2:X04} : '{0}' / '{1}'", name, tmp, cp);
1071 // Two-step grep required for it.
1072 if (s.IndexOf ("FULL STOP") > 0 &&
1073 (s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0))
1074 diacritical [cp] |= 0xF4;
1076 // Cyrillic letter name
1077 if (0x0430 <= cp && cp <= 0x0486 &&
1078 Char.IsLetter ((char) cp)) {
1079 byte value = (byte) (cyrillicNameMap.Count * 3 + 0x06);
1080 // Get primary letter name i.e.
1081 // XXX part of CYRILLIC LETTER XXX yyy
1082 // e.g. "IZHITSA" for "IZHITSA DOUBLE GRAVE".
1084 name.Substring (name.IndexOf ("LETTER ") + 7);
1085 int tmpIdx = letterName.IndexOf (' ');
1086 letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
1087 //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
1088 if (cyrillicNameMap.ContainsKey (letterName))
1089 value = (byte) cyrillicLetterPrimaryValues [cyrillicNameMap [letterName]];
1091 cyrillicNameMap [letterName] = cp;
1093 cyrillicLetterPrimaryValues [cp] = value;
1096 // Arabic letter name
1097 if (0x0621 <= cp && cp <= 0x064A &&
1098 Char.GetUnicodeCategory ((char) cp)
1099 == UnicodeCategory.OtherLetter) {
1100 byte value = (byte) (arabicNameMap.Count * 4 + 0x0B);
1105 // hamza, waw, yeh ... special cases.
1110 value = 0x77; // special cases.
1113 // Get primary letter name i.e.
1114 // XXX part of ARABIC LETTER XXX yyy
1115 // e.g. that of "TEH MARBUTA" is "TEH".
1118 // 0x0640 is special: it does
1119 // not start with ARABIC LETTER
1121 name.Substring (14);
1122 int tmpIdx = letterName.IndexOf (' ');
1123 letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
1124 //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
1125 if (arabicNameMap.ContainsKey (letterName))
1126 value = (byte) arabicLetterPrimaryValues [arabicNameMap [letterName]];
1128 arabicNameMap [letterName] = cp;
1131 arabicLetterPrimaryValues [cp] = value;
1134 // Japanese square letter
1135 if (0x3300 <= cp && cp <= 0x3357)
1136 if (!ExistsJIS (cp))
1137 nonJisJapanese.Add (new NonJISCharacter (cp, name));
1139 // normalizationType
1140 string decomp = values [4];
1141 idx = decomp.IndexOf ('<');
1143 switch (decomp.Substring (idx + 1, decomp.IndexOf ('>') - 1)) {
1145 decompType [cp] = DecompositionFull;
1148 decompType [cp] = DecompositionSub;
1151 decompType [cp] = DecompositionSuper;
1154 decompType [cp] = DecompositionSmall;
1157 decompType [cp] = DecompositionIsolated;
1160 decompType [cp] = DecompositionInitial;
1163 decompType [cp] = DecompositionFinal;
1166 decompType [cp] = DecompositionMedial;
1169 decompType [cp] = DecompositionNoBreak;
1172 decompType [cp] = DecompositionCompat;
1175 decompType [cp] = DecompositionFraction;
1178 decompType [cp] = DecompositionFont;
1181 decompType [cp] = DecompositionCircle;
1184 decompType [cp] = DecompositionSquare;
1187 decompType [cp] = DecompositionWide;
1190 decompType [cp] = DecompositionNarrow;
1193 decompType [cp] = DecompositionVertical;
1196 throw new Exception ("Support NFKD type : " + decomp);
1200 decompType [cp] = DecompositionCanonical;
1201 decomp = idx < 0 ? decomp : decomp.Substring (decomp.IndexOf ('>') + 2);
1202 if (decomp.Length > 0) {
1204 string [] velems = decomp.Split (' ');
1205 int didx = decompValues.Count;
1206 decompIndex [cp] = didx;
1207 foreach (string v in velems)
1208 decompValues.Add (int.Parse (v, NumberStyles.HexNumber));
1209 decompLength [cp] = velems.Length;
1211 // [decmpType] -> this_cp
1212 int targetCP = (int) decompValues [didx];
1213 // for "(x)" it specially maps to 'x' .
1214 // FIXME: check if it is sane
1215 if (velems.Length == 3 &&
1216 (int) decompValues [didx] == '(' &&
1217 (int) decompValues [didx + 2] == ')')
1218 targetCP = (int) decompValues [didx + 1];
1219 // special: 0x215F "1/"
1220 else if (cp == 0x215F)
1222 else if (velems.Length > 1 &&
1223 (targetCP < 0x4C00 || 0x9FBB < targetCP))
1224 // skip them, except for CJK ideograph compat
1227 if (targetCP != 0) {
1228 Hashtable entry = (Hashtable) nfkdMap [targetCP];
1229 if (entry == null) {
1230 entry = new Hashtable ();
1231 nfkdMap [targetCP] = entry;
1233 entry [(byte) decompType [cp]] = cp;
1237 if (values [5].Length > 0)
1238 decimalValue [cp] = decimal.Parse (values [5]);
1239 else if (values [6].Length > 0)
1240 decimalValue [cp] = decimal.Parse (values [6]);
1241 else if (values [7].Length > 0) {
1242 string decstr = values [7];
1243 idx = decstr.IndexOf ('/');
1244 if (cp == 0x215F) // special. "1/"
1245 decimalValue [cp] = 0x1;
1249 decimal.Parse (decstr.Substring (0, idx))
1250 / decimal.Parse (decstr.Substring (idx + 1));
1251 else if (decstr [0] == '(' &&
1252 decstr [decstr.Length - 1] == ')')
1255 decimal.Parse (decstr.Substring (1, decstr.Length - 2));
1256 else if (decstr [decstr.Length - 1] == '.')
1259 decimal.Parse (decstr.Substring (0, decstr.Length - 1));
1261 decimalValue [cp] = decimal.Parse (decstr);
// Reads the file named by `filename` one line at a time and hands each line
// to ProcessDerivedCorePropLine. `line` is a 1-based counter that is only
// used in the diagnostic written to stderr when processing a line throws.
1265 void ParseDerivedCoreProperties (string filename)
1268 using (StreamReader file =
1269 new StreamReader (filename)) {
// Peek () returns a negative value at end of stream, which ends the loop.
1270 for (int line = 1; file.Peek () >= 0; line++) {
1272 ProcessDerivedCorePropLine (file.ReadLine ());
1273 } catch (Exception) {
// NOTE(review): confirm whether the exception is rethrown after this log.
1274 Console.Error.WriteLine ("**** At line " + line);
// Handles one line of the derived-core-properties input.
// The text before ';' is parsed as a hexadecimal code point, or as a
// "start..end" range of hexadecimal code points; the trimmed text after ';'
// becomes `value`. Each code point of the range is flagged in isUppercase.
// NOTE(review): presumably only lines whose `value` names the uppercase
// property reach the final loop - confirm against the guarding condition.
1281 void ProcessDerivedCorePropLine (string s)
// Cut the trailing '#' comment off the line (when one is present).
1283 int idx = s.IndexOf ('#');
1285 s = s.Substring (0, idx);
1286 idx = s.IndexOf (';');
// cpspec is the code point field: either "XXXX" or "XXXX..YYYY".
1289 string cpspec = s.Substring (0, idx);
1290 idx = cpspec.IndexOf ("..");
// AllowTrailingWhite lets the field keep the padding that precedes ';'.
1291 NumberStyles nf = NumberStyles.HexNumber |
1292 NumberStyles.AllowTrailingWhite;
// Without ".." the range collapses to the single code point cp.
1293 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1294 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
// cpspec.Length is the position of ';', so "+ 1" skips the separator.
1295 string value = s.Substring (cpspec.Length + 1).Trim ();
// NOTE(review): code points above U+FFFF are presumably skipped here - confirm.
1298 if (cp > char.MaxValue)
// Both ends of the range are inclusive.
1303 for (int x = cp; x <= cpEnd; x++)
1304 isUppercase [x] = true;
// Reads the file named by `filename` and builds, for five scripts
// (Cyrillic, Gurmukhi, Gujarati, Georgian, Thaana), the list of characters
// for which IsIgnorable returns false. Each list is sorted with
// UCAComparer.Instance and stored in the matching ordered* char array.
// NOTE(review): the list that receives a given range is presumably chosen
// by comparing `value` with the script name - confirm against the switch/if
// that wraps the five loops.
1309 void ParseScripts (string filename)
1311 ArrayList cyrillic = new ArrayList ();
1312 ArrayList gurmukhi = new ArrayList ();
1313 ArrayList gujarati = new ArrayList ();
1314 ArrayList georgian = new ArrayList ();
1315 ArrayList thaana = new ArrayList ();
1317 using (StreamReader file =
1318 new StreamReader (filename)) {
1319 while (file.Peek () >= 0) {
1320 string s = file.ReadLine ();
// Cut the trailing '#' comment off the line (when one is present).
1321 int idx = s.IndexOf ('#');
1323 s = s.Substring (0, idx);
1324 idx = s.IndexOf (';');
// cpspec is the code point field: either "XXXX" or "XXXX..YYYY".
1328 string cpspec = s.Substring (0, idx);
1329 idx = cpspec.IndexOf ("..");
1330 NumberStyles nf = NumberStyles.HexNumber |
1331 NumberStyles.AllowTrailingWhite;
// Without ".." the range collapses to the single code point cp.
1332 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1333 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
// The trimmed text after ';'.
1334 string value = s.Substring (cpspec.Length + 1).Trim ();
// NOTE(review): code points above U+FFFF are presumably skipped here - confirm.
1337 if (cp > char.MaxValue)
// Each loop below walks the inclusive range cp..cpEnd.
1342 for (int x = cp; x <= cpEnd; x++)
1343 if (!IsIgnorable (x))
1344 cyrillic.Add ((char) x);
1347 for (int x = cp; x <= cpEnd; x++)
1348 if (!IsIgnorable (x))
1349 gurmukhi.Add ((char) x);
1352 for (int x = cp; x <= cpEnd; x++)
1353 if (!IsIgnorable (x))
1354 gujarati.Add ((char) x);
1357 for (int x = cp; x <= cpEnd; x++)
1358 if (!IsIgnorable (x))
1359 georgian.Add ((char) x);
1362 for (int x = cp; x <= cpEnd; x++)
1363 if (!IsIgnorable (x))
1364 thaana.Add ((char) x);
// Order every collected list with the UCA comparer, then publish it as a char [].
1369 cyrillic.Sort (UCAComparer.Instance);
1370 gurmukhi.Sort (UCAComparer.Instance);
1371 gujarati.Sort (UCAComparer.Instance);
1372 georgian.Sort (UCAComparer.Instance);
1373 thaana.Sort (UCAComparer.Instance);
1374 orderedCyrillic = (char []) cyrillic.ToArray (typeof (char));
1375 orderedGurmukhi = (char []) gurmukhi.ToArray (typeof (char));
1376 orderedGujarati = (char []) gujarati.ToArray (typeof (char));
1377 orderedGeorgian = (char []) georgian.ToArray (typeof (char));
1378 orderedThaana = (char []) thaana.ToArray (typeof (char));
// Reads the file named by `filename` and passes every line to
// ProcessJISOrderLine. `line` counts the lines consumed so far and is
// reported on stderr when an exception is caught.
// NOTE(review): confirm whether the exception is rethrown after the log.
1381 void ParseJISOrder (string filename)
1385 using (StreamReader file =
1386 new StreamReader (filename)) {
// Peek () returns a negative value at end of stream, which ends the loop.
1387 for (;file.Peek () >= 0; line++)
1388 ProcessJISOrderLine (file.ReadLine ());
1390 } catch (Exception) {
1391 Console.Error.WriteLine ("---- line {0}", line);
// Field separators (tab and space) used by ProcessJISOrderLine.
1396 char [] ws = new char [] {'\t', ' '};
// Handles one line of the JIS order input: two "0x"-prefixed hexadecimal
// fields separated by a tab or a space. The first field is read as the JIS
// code, the second as the code point; the pair is appended to jisJapanese
// as a JISCharacter.
1398 void ProcessJISOrderLine (string s)
// Cut the trailing '#' comment off the line (when one is present).
1400 int idx = s.IndexOf ('#');
1402 s = s.Substring (0, idx).Trim ();
// idx becomes the position of the first separator, i.e. the end of the first field.
1405 idx = s.IndexOfAny (ws);
1408 // Both fields start with "0x", so skip those two characters.
1409 int jis = int.Parse (s.Substring (2, idx - 2), NumberStyles.HexNumber);
1410 int cp = int.Parse (s.Substring (idx).Trim ().Substring (2), NumberStyles.HexNumber);
1411 jisJapanese.Add (new JISCharacter (cp, jis));
// Fills the CJK weight tables from LDML collation documents.
// For each table the text of a "pc" rule node is walked character by
// character and a running value `v` is stored at index (c - offset) of `arr`.
// The Korean part instead walks the "reset" rule elements and writes both
// `arr` and cjkKOlv2.
// NOTE(review): zhXML, jaXML and koXML are presumably the paths loaded into
// `doc` before each section - confirm against the Load calls.
1414 void ParseCJK (string zhXML, string jaXML, string koXML)
1416 XmlDocument doc = new XmlDocument ();
// A null resolver means external resources (e.g. DTDs) are not resolved.
1417 doc.XmlResolver = null;
1424 // Chinese Simplified
1427 offset = 0;//char.MaxValue - arr.Length;
// The "pinyin" collation supplies the character order.
1429 s = doc.SelectSingleNode ("/ldml/collations/collation[@type='pinyin']/rules/pc").InnerText;
1431 foreach (char c in s) {
// NOTE(review): confirm which condition triggers this warning.
1433 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
// Characters receive consecutive values in the order they appear in `s`.
1435 arr [(int) c - offset] = (ushort) v++;
1441 // Chinese Traditional
1444 offset = 0;//char.MaxValue - arr.Length;
// The "stroke" collation supplies the character order.
1445 s = doc.SelectSingleNode ("/ldml/collations/collation[@type='stroke']/rules/pc").InnerText;
1447 foreach (char c in s) {
1449 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1451 arr [(int) c - offset] = (ushort) v++;
// NOTE(review): presumably the Japanese table (jaXML) - confirm.
1460 offset = 0;//char.MaxValue - arr.Length;
1462 s = doc.SelectSingleNode ("/ldml/collations/collation/rules/pc").InnerText;
1464 foreach (char c in s) {
1466 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1468 arr [(int) c - offset] = (ushort) v++;
1475 // Korean weight is somewhat complex. It first shifts
1476 // Hangul category from 52-x to 80-x (they are anyways
1477 // computed). CJK ideographs are placed at secondary
1478 // weight, like XX YY 01 zz 01, where XX and YY are
1479 // corresponding "reset" value and zz is 41,43,45...
1481 // Unlike chs,cht and ja, Korean value is a combined
1482 // ushort which is computed as category
1486 offset = 0;//char.MaxValue - arr.Length;
1488 foreach (XmlElement reset in doc.SelectNodes ("/ldml/collations/collation/rules/reset")) {
// The element that directly follows the "reset" element.
1489 XmlElement sc = (XmlElement) reset.NextSibling;
1490 // compute "category" and "level 1" for the
1491 // target "reset" Hangul syllable
1492 char rc = reset.InnerText [0];
// 1-based position of the reset character counted from U+AC00.
1493 int ri = ((int) rc - 0xAC00) + 1;
1495 ((ri / 254) * 256 + (ri % 254) + 2);
1496 // Place the characters after the target.
1499 foreach (char c in s) {
1500 arr [(int) c - offset] = p;
1501 cjkKOlv2 [(int) c - offset] = (byte) v;
// Builds ignorableFlags for every UTF-16 code unit (0 .. char.MaxValue).
// The flags are combined bitwise:
//   1 - IsIgnorable (i) is true
//   2 - IsIgnorableSymbol (i) is true
//   4 - IsIgnorableNonSpacing (i) is true
1511 void FillIgnorables ()
1513 for (int i = 0; i <= char.MaxValue; i++) {
// NOTE(review): unassigned code points are presumably skipped - confirm.
1514 if (Char.GetUnicodeCategory ((char) i) ==
1515 UnicodeCategory.OtherNotAssigned)
1517 if (IsIgnorable (i))
1518 ignorableFlags [i] |= 1;
1519 if (IsIgnorableSymbol (i))
1520 ignorableFlags [i] |= 2;
1521 if (IsIgnorableNonSpacing (i))
1522 ignorableFlags [i] |= 4;
// Applies hand-written corrections to the tables filled by the parsers:
// secondary weights of numbers, the decomposition data of U+FE31 / U+FE32,
// the secondary weights of two Korean ranges, and the names held in
// sortableCharNames (some renamed, some removed).
1526 void ModifyParsedValues ()
1528 // number, secondary weights
// numberSecondaryWeightBounds is read as pairs (start, end); the end of a
// pair is exclusive and `weight` goes up by one per pair.
1530 int [] numarr = numberSecondaryWeightBounds;
1531 for (int i = 0; i < numarr.Length; i += 2, weight++)
1532 for (int cp = numarr [i]; cp < numarr [i + 1]; cp++)
1533 if (Char.IsNumber ((char) cp))
1534 diacritical [cp] = weight;
1536 // Modify some decomposition equivalence
// Zeroes the decomposition type, index and length of U+FE31 and U+FE32.
1537 decompType [0xFE31] = 0;
1538 decompIndex [0xFE31] = 0;
1539 decompLength [0xFE31] = 0;
1540 decompType [0xFE32] = 0;
1541 decompIndex [0xFE32] = 0;
1542 decompLength [0xFE32] = 0;
1544 // Korean parens numbers
1545 for (int i = 0x3200; i <= 0x321C; i++)
1546 diacritical [i] = 0xA;
1547 for (int i = 0x3260; i <= 0x327B; i++)
1548 diacritical [i] = 0xC;
1550 // Update name part of named characters
1551 for (int i = 0; i < sortableCharNames.Count; i++) {
1552 DictionaryEntry de =
1553 (DictionaryEntry) sortableCharNames [i];
1554 int cp = (int) de.Key;
// Stays null unless one of the cases below picks a replacement name.
1555 string renamed = null;
1557 case 0x2101: renamed = "A_1"; break;
1558 case 0x33C3: renamed = "A_2"; break;
1559 case 0x2105: renamed = "C_1"; break;
1560 case 0x2106: renamed = "C_2"; break;
1561 case 0x211E: renamed = "R1"; break;
1562 case 0x211F: renamed = "R2"; break;
1563 // Remove some of them!
// NOTE(review): removing while iterating by index - confirm that `i` is
// adjusted so the entry that slides into slot i is not skipped.
1574 sortableCharNames.RemoveAt (i);
// Replace the entry in place, keeping the same key.
1578 if (renamed != null)
1579 sortableCharNames [i] =
1580 new DictionaryEntry (cp, renamed);
1584 void GenerateCore ()
1588 #region Specially ignored // 01
1589 // This will raise "Defined" flag up.
1590 foreach (char c in specialIgnore)
1591 map [(int) c] = new CharMapEntry (0, 0, 0);
1595 #region Variable weights
1596 // Controls : 06 03 - 06 3D
1598 for (int i = 0; i < 65536; i++) {
1599 if (IsIgnorable (i))
1602 uc = Char.GetUnicodeCategory (c);
1603 // NEL is whitespace but not ignored here.
1604 if (uc == UnicodeCategory.Control &&
1605 !Char.IsWhiteSpace (c) || c == '\u0085')
1606 AddCharMap (c, 6, 1);
1610 fillIndex [6] = 0x80;
1611 AddCharMapGroup ('\'', 6, 1, 0);
1612 AddCharMap ('\uFE63', 6, 1);
1614 // Hyphen/Dash : 06 81 - 06 90
1615 for (int i = 0; i < char.MaxValue; i++) {
1616 if (!IsIgnorable (i) &&
1617 Char.GetUnicodeCategory ((char) i) ==
1618 UnicodeCategory.DashPunctuation) {
1619 AddCharMapGroup2 ((char) i, 6, 1, 0);
1621 // SPECIAL: add 2027 and 2043
1622 // Maybe they are regarded the
1623 // same hyphens in "central"
1625 AddCharMap ('\u2027', 6, 1);
1626 AddCharMap ('\u2043', 6, 1);
1631 // Arabic variable weight chars 06 A0 -
1632 fillIndex [6] = 0xA0;
1634 for (int i = 0x64B; i <= 0x650; i++)
1635 AddArabicCharMap ((char) i);
1637 AddCharMapGroup ('\u0652', 6, 1, 0);
1639 AddCharMapGroup ('\u0651', 6, 1, 0);
1643 #region Nonspacing marks // 01
1644 // FIXME: 01 03 - 01 B6 ... annoyance :(
1646 // Combining diacritical marks: 01 DC -
1648 fillIndex [0x1] = 0x41;
1649 for (int i = 0x030E; i <= 0x0326; i++)
1650 if (!IsIgnorable (i))
1651 AddCharMap ((char) i, 0x1, 1);
1652 for (int i = 0x0329; i <= 0x0334; i++)
1653 if (!IsIgnorable (i))
1654 AddCharMap ((char) i, 0x1, 1);
1655 for (int i = 0x0339; i <= 0x0341; i++)
1656 if (!IsIgnorable (i))
1657 AddCharMap ((char) i, 0x1, 1);
1658 fillIndex [0x1] = 0x72;
1659 for (int i = 0x0346; i <= 0x0348; i++)
1660 if (!IsIgnorable (i))
1661 AddCharMap ((char) i, 0x1, 1);
1662 for (int i = 0x02BE; i <= 0x02BF; i++)
1663 if (!IsIgnorable (i))
1664 AddCharMap ((char) i, 0x1, 1);
1665 for (int i = 0x02C1; i <= 0x02C5; i++)
1666 if (!IsIgnorable (i))
1667 AddCharMap ((char) i, 0x1, 1);
1668 for (int i = 0x02CE; i <= 0x02CF; i++)
1669 if (!IsIgnorable (i))
1670 AddCharMap ((char) i, 0x1, 1);
1671 for (int i = 0x02D1; i <= 0x02D3; i++)
1672 if (!IsIgnorable (i))
1673 AddCharMap ((char) i, 0x1, 1);
1674 AddCharMap ('\u02DE', 0x1, 1);
1675 for (int i = 0x02E4; i <= 0x02E9; i++)
1676 if (!IsIgnorable (i))
1677 AddCharMap ((char) i, 0x1, 1);
1679 // FIXME: needs more love here (it should eliminate
1680 // all the hacky code above).
1681 for (int i = 0x0300; i < 0x0370; i++)
1682 if (!IsIgnorable (i) && diacritical [i] != 0
1683 /* especiall here*/ && !map [i].Defined)
1684 map [i] = new CharMapEntry (
1685 0x1, 0x1, diacritical [i]);
1687 fillIndex [0x1] = 0xAC;
1688 for (int i = 0x07A6; i <= 0x07B0; i++)
1689 if (!IsIgnorable (i))
1690 AddCharMap ((char) i, 0x1, 1);
1692 fillIndex [0x1] = 0x0C;
1693 for (int i = 0x0EC8; i <= 0x0ECD; i++)
1694 if (!IsIgnorable (i))
1695 AddCharMap ((char) i, 0x1, 1);
1697 // LAMESPEC: It should not stop at '\u20E1'. There are
1698 // a few more characters (that however results in
1699 // overflow of level 2 unless we start before 0xDD).
1700 fillIndex [0x1] = 0xDC;
1701 for (int i = 0x20d0; i <= 0x20e1; i++)
1702 AddCharMap ((char) i, 0x1, 1);
1706 #region Whitespaces // 07 03 -
1707 fillIndex [0x7] = 0x2;
1708 AddCharMap (' ', 0x7, 2);
1709 AddCharMap ('\u00A0', 0x7, 1);
1710 for (int i = 9; i <= 0xD; i++)
1711 AddCharMap ((char) i, 0x7, 1);
1712 for (int i = 0x2000; i <= 0x200B; i++)
1713 AddCharMap ((char) i, 0x7, 1);
1715 fillIndex [0x7] = 0x17;
1716 AddCharMapGroup ('\u2028', 0x7, 1, 0);
1717 AddCharMapGroup ('\u2029', 0x7, 1, 0);
1719 // Characters which used to represent layout control.
1720 // LAMESPEC: Windows developers seem to have thought
1721 // that those characters are kind of whitespaces,
1722 // while they aren't.
1723 AddCharMap ('\u2422', 0x7, 1, 0); // blank symbol
1724 AddCharMap ('\u2423', 0x7, 1, 0); // open box
1727 // category 09 - continued symbols from 08
1728 fillIndex [0x9] = 2;
1730 for (int cp = 0x2300; cp <= 0x237A; cp++)
1731 AddCharMap ((char) cp, 0x9, 1, 0);
1734 byte [] arrowLv2 = new byte [] {0, 3, 3, 3, 3, 3, 3, 3, 3};
1735 foreach (DictionaryEntry de in arrowValues) {
1736 int idx = (int) de.Value;
1737 int cp = (int) de.Key;
1738 if (map [cp].Defined)
1740 fillIndex [0x9] = (byte) (0xD8 + idx);
1741 AddCharMapGroup ((char) cp, 0x9, 0, arrowLv2 [idx]);
1745 byte [] boxLv2 = new byte [128];
1746 for (int i = 0; i < boxLv2.Length; i++)
1748 foreach (DictionaryEntry de in boxValues) {
1749 int cp = (int) de.Key;
1750 int off = (int) de.Value;
1751 if (map [cp].Defined)
1754 fillIndex [0x9] = (byte) (0xE5 + off);
1755 AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [-off]++);
1758 fillIndex [0x9] = (byte) (0xE5 + off);
1759 AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [off]++);
1762 // Some special characters (slanted)
1763 fillIndex [0x9] = 0xF4;
1764 AddCharMap ('\u2571', 0x9, 3);
1765 AddCharMap ('\u2572', 0x9, 3);
1766 AddCharMap ('\u2573', 0x9, 3);
1768 // FIXME: implement 0A
1770 fillIndex [0xA] = 2;
1771 // byte currency symbols
1772 for (int cp = 0; cp < 0x100; cp++) {
1773 uc = Char.GetUnicodeCategory ((char) cp);
1774 if (!IsIgnorable (cp) &&
1775 uc == UnicodeCategory.CurrencySymbol &&
1778 AddCharMapGroup ((char) cp, 0xA, 1, 0);
1780 // byte other symbols
1781 for (int cp = 0; cp < 0x100; cp++) {
1783 continue; // SPECIAL: skip FIXME: why?
1784 uc = Char.GetUnicodeCategory ((char) cp);
1785 if (!IsIgnorable (cp) &&
1786 uc == UnicodeCategory.OtherSymbol)
1787 AddCharMapGroup ((char) cp, 0xA, 1, 0);
1790 fillIndex [0xA] = 0x1C; // FIXME: it won't be needed
1791 for (int cp = 0x20A0; cp <= 0x20AB; cp++)
1792 AddCharMap ((char) cp, 0xA, 1, 0);
1793 fillIndex [0xA] = 0x2F; // FIXME: it won't be needed
1794 for (int cp = 0x2600; cp <= 0x2613; cp++)
1795 AddCharMap ((char) cp, 0xA, 1, 0);
1797 for (int cp = 0x2620; cp <= 0x2770; cp++)
1798 if (Char.IsSymbol ((char) cp))
1799 AddCharMap ((char) cp, 0xA, 1, 0);
1801 for (int i = 0x2440; i < 0x2460; i++)
1802 AddCharMap ((char) i, 0xA, 1, 0);
1806 #region Numbers // 0C 02 - 0C E1
1807 fillIndex [0xC] = 2;
1809 // 9F8 : Bengali "one less than the denominator"
1810 AddCharMap ('\u09F8', 0xC, 1);
1812 ArrayList numbers = new ArrayList ();
1813 for (int i = 0; i < 65536; i++)
1814 if (!IsIgnorable (i) &&
1815 Char.IsNumber ((char) i) &&
1816 (i < 0x3190 || 0x32C0 < i)) // they are CJK characters
1819 ArrayList numberValues = new ArrayList ();
1820 foreach (int i in numbers)
1821 numberValues.Add (new DictionaryEntry (i, decimalValue [(char) i]));
1822 numberValues.Sort (DecimalDictionaryValueComparer.Instance);
1824 //foreach (DictionaryEntry de in numberValues)
1825 //Console.Error.WriteLine ("****** number {0:X04} : {1} {2}", de.Key, de.Value, decompType [(int) de.Key]);
1827 decimal prevValue = -1;
1828 foreach (DictionaryEntry de in numberValues) {
1829 int cp = (int) de.Key;
1830 decimal currValue = (decimal) de.Value;
1831 bool addnew = false;
1832 if (prevValue < currValue &&
1833 prevValue - (int) prevValue == 0 &&
1837 // Process Hangzhou and Roman numbers
1839 // There are some SPECIAL cases.
1840 if (currValue != 4) // no increment for 4
1844 if (currValue <= 10) {
1845 xcp = (int) prevValue + 0x2170 - 1;
1846 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1847 xcp = (int) prevValue + 0x2160 - 1;
1848 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1849 fillIndex [0xC] += 2;
1850 xcp = (int) prevValue + 0x3021 - 1;
1851 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1854 else if (currValue == 11)
1857 if (prevValue < currValue)
1858 prevValue = currValue;
1859 if (map [cp].Defined)
1861 // HangZhou and Roman are added later
1863 else if (0x3021 <= cp && cp < 0x302A
1864 || 0x2160 <= cp && cp < 0x216A
1865 || 0x2170 <= cp && cp < 0x217A)
1868 if (cp == 0x215B) // FIXME: why?
1869 fillIndex [0xC] += 2;
1870 else if (cp == 0x3021) // FIXME: why?
1872 AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp]);
1873 if (addnew || cp <= '9') {
1874 int mod = (int) currValue - 1;
1876 if (1 <= currValue && currValue <= 10) {
1878 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1880 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1882 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1884 if (1 <= currValue && currValue <= 20) {
1886 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1888 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1890 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1894 if (cp != 0x09E7 && cp != 0x09EA)
1897 // Add special cases that are not regarded as
1898 // numbers in UnicodeCategory speak.
1901 AddCharMapGroup ('\u01BD', 0xC, 0, 0);
1902 AddCharMapGroup ('\u01BC', 0xC, 1, 0);
1904 else if (cp == '6') // FIXME: why?
1909 fillIndex [0xC] = 0xFF;
1910 AddCharMap ('\u221E', 0xC, 1);
1913 #region Letters and NonSpacing Marks (general)
1915 // ASCII Latin alphabets
1916 for (int i = 0; i < alphabets.Length; i++)
1917 AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]);
1920 // non-ASCII Latin alphabets
1921 // FIXME: there is no such characters that are placed
1922 // *after* "alphabets" array items. This is nothing
1923 // more than a hack that creates dummy weight for
1924 // primary characters.
1925 for (int i = 0x0080; i < 0x0300; i++) {
1926 if (!Char.IsLetter ((char) i))
1928 // For those Latin Letters which has NFKD are
1929 // not added as independent primary character.
1930 if (decompIndex [i] != 0)
1933 // 1.some alphabets have primarily
1934 // equivalent ASCII alphabets.
1935 // 2.some have independent primary weights,
1936 // but inside a-to-z range.
1937 // 3.there are some expanded characters that
1938 // are not part of Unicode Standard NFKD.
1939 // 4. some characters are letter in IsLetter
1940 // but not in sortkeys (maybe unicode version
1941 // difference caused it).
1943 // 1. skipping them does not make sense
1944 // case 0xD0: case 0xF0: case 0x131: case 0x138:
1945 // case 0x184: case 0x185: case 0x186: case 0x189:
1946 // case 0x18D: case 0x18E: case 0x18F: case 0x190:
1947 // case 0x194: case 0x195: case 0x196: case 0x19A:
1948 // case 0x19B: case 0x19C:
1949 // 2. skipping them does not make sense
1950 // case 0x14A: // Ng
1951 // case 0x14B: // ng
1955 case 0xDE: // Icelandic Thorn
1956 case 0xFE: // Icelandic Thorn
1957 case 0xDF: // German ss
1958 case 0xFF: // German ss
1960 case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3:
1961 // not classified yet
1962 // case 0x1A6: case 0x1A7: case 0x1A8: case 0x1A9:
1963 // case 0x1AA: case 0x1B1: case 0x1B7: case 0x1B8:
1964 // case 0x1B9: case 0x1BA: case 0x1BB: case 0x1BF:
1968 AddCharMapGroup ((char) i, 0xE, 1, 0);
1972 fillIndex [0xF] = 02;
1973 for (int i = 0x0380; i < 0x0390; i++)
1974 if (Char.IsLetter ((char) i))
1975 AddLetterMap ((char) i, 0xF, 1);
1976 fillIndex [0xF] = 02;
1977 for (int i = 0x0391; i < 0x03CF; i++)
1978 if (Char.IsLetter ((char) i))
1979 AddLetterMap ((char) i, 0xF, 1);
1980 fillIndex [0xF] = 0x40;
1981 for (int i = 0x03D0; i < 0x0400; i++)
1982 if (Char.IsLetter ((char) i))
1983 AddLetterMap ((char) i, 0xF, 1);
1985 // Cyrillic - character name order
1986 fillIndex [0x10] = 0x6;
1988 for (int i = 0; i < orderedCyrillic.Length; i++)
1989 Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]);
1991 // table which is mostly from UCA DUCET.
1992 for (int i = 0; i < orderedCyrillic.Length; i++) {
1993 char c = Char.ToUpper (orderedCyrillic [i], CultureInfo.InvariantCulture);
1994 if (!IsIgnorable ((int) c) &&
1996 Char.IsLetter (c)) {
1997 AddLetterMap (c, 0x10, 0);
1998 fillIndex [0x10] += 3;
2002 for (int i = 0x0460; i < 0x0481; i++) {
2003 if (Char.IsLetter ((char) i)) {
2004 AddLetterMap ((char) i, 0x10, 0);
2005 fillIndex [0x10] += 3;
2010 for (int i = 0x0400; i <= 0x0486; i++) {
2011 if (!Char.IsLetter ((char) i)) {
2012 // AddCharMap ((char) i, 0x1, 1);
2015 if (!cyrillicLetterPrimaryValues.ContainsKey (i)) {
2016 Console.Error.WriteLine ("no value for {0:x04}", i);
2020 (byte) cyrillicLetterPrimaryValues [i];
2021 AddLetterMap ((char) i, 0x10, 0);
2026 fillIndex [0x11] = 0x3;
2027 for (int i = 0x0531; i < 0x0586; i++)
2028 if (Char.IsLetter ((char) i))
2029 AddLetterMap ((char) i, 0x11, 1);
2033 fillIndex [0x12] = 0x3;
2034 for (int i = 0x05D0; i < 0x05FF; i++)
2035 if (Char.IsLetter ((char) i))
2036 AddLetterMap ((char) i, 0x12, 1);
2038 fillIndex [0x1] = 0x3;
2039 for (int i = 0x0591; i <= 0x05C2; i++)
2041 AddCharMap ((char) i, 0x1, 1);
2044 fillIndex [0x1] = 0x8E;
2045 fillIndex [0x13] = 0x3;
2046 for (int i = 0x0621; i <= 0x064A; i++) {
2048 if (Char.GetUnicodeCategory ((char) i)
2049 != UnicodeCategory.OtherLetter) {
2050 // FIXME: arabic nonspacing marks are
2051 // in different order.
2052 AddCharMap ((char) i, 0x1, 1);
2055 // map [i] = new CharMapEntry (0x13,
2056 // (byte) arabicLetterPrimaryValues [i], 1);
2058 (byte) arabicLetterPrimaryValues [i];
2059 AddLetterMap ((char) i, 0x13, 0);
2061 fillIndex [0x13] = 0x84;
2062 for (int i = 0x0674; i < 0x06D6; i++)
2063 if (Char.IsLetter ((char) i))
2064 AddLetterMap ((char) i, 0x13, 1);
2067 // FIXME: it does seem straight codepoint mapping.
2068 fillIndex [0x14] = 04;
2069 for (int i = 0x0901; i < 0x0905; i++)
2070 if (!IsIgnorable (i))
2071 AddLetterMap ((char) i, 0x14, 2);
2072 fillIndex [0x14] = 0xB;
2073 for (int i = 0x0905; i < 0x093A; i++) {
2075 AddCharMap ('\u0929', 0x14, 0, 8);
2077 AddCharMap ('\u0931', 0x14, 0, 8);
2079 AddCharMap ('\u0934', 0x14, 0, 8);
2080 if (Char.IsLetter ((char) i))
2081 AddLetterMap ((char) i, 0x14, 4);
2083 AddCharMap ('\u0960', 0x14, 4);
2085 AddCharMap ('\u0961', 0x14, 4);
2087 fillIndex [0x14] = 0xDA;
2088 for (int i = 0x093E; i < 0x0945; i++)
2089 if (!IsIgnorable (i))
2090 AddLetterMap ((char) i, 0x14, 2);
2091 fillIndex [0x14] = 0xEC;
2092 for (int i = 0x0945; i < 0x094F; i++)
2093 if (!IsIgnorable (i))
2094 AddLetterMap ((char) i, 0x14, 2);
2098 fillIndex [0x15] = 02;
2099 for (int i = 0x0980; i < 0x9FF; i++) {
2100 if (IsIgnorable (i))
2103 fillIndex [0x15] = 0x3B;
2104 switch (Char.GetUnicodeCategory ((char) i)) {
2105 case UnicodeCategory.NonSpacingMark:
2106 case UnicodeCategory.DecimalDigitNumber:
2107 case UnicodeCategory.OtherNumber:
2110 AddLetterMap ((char) i, 0x15, 1);
2113 fillIndex [0x1] = 0x3;
2114 for (int i = 0x0981; i < 0x0A00; i++)
2115 if (Char.GetUnicodeCategory ((char) i) ==
2116 UnicodeCategory.NonSpacingMark)
2117 AddCharMap ((char) i, 0x1, 1);
2119 // Gurmukhi. orderedGurmukhi is from UCA
2120 // FIXME: it does not look equivalent to UCA.
2121 fillIndex [0x16] = 04;
2122 fillIndex [0x1] = 3;
2123 for (int i = 0; i < orderedGurmukhi.Length; i++) {
2124 char c = orderedGurmukhi [i];
2125 if (IsIgnorable ((int) c))
2127 if (IsIgnorableNonSpacing (c)) {
2128 AddLetterMap (c, 0x1, 1);
2131 if (c == '\u0A3C' || c == '\u0A4D' ||
2132 '\u0A66' <= c && c <= '\u0A71')
2134 // SPECIAL CASE: U+A38 = U+A36 at primary level (why?)
2136 if (c == '\u0A36' || c == '\u0A16' || c == '\u0A17' || c == '\u0A5B' || c == '\u0A5E')
2138 AddLetterMap (c, 0x16, shift);
2141 // Gujarati. orderedGujarati is from UCA
2142 fillIndex [0x17] = 0x4;
2144 map [0x0A4D] = new CharMapEntry (1, 0, 0x3);
2145 map [0x0ABD] = new CharMapEntry (1, 0, 0x3);
2146 map [0x0A3C] = new CharMapEntry (1, 0, 0x4);
2147 map [0x0A71] = new CharMapEntry (1, 0, 0x6);
2148 map [0x0ABC] = new CharMapEntry (1, 0, 0xB);
2149 map [0x0A70] = new CharMapEntry (1, 0, 0xE);
2150 // letters go first.
2151 for (int i = 0; i < orderedGujarati.Length; i++) {
2153 char c = orderedGujarati [i];
2154 if (Char.IsLetter (c)) {
2156 if (c == '\u0AB3' || c == '\u0A32')
2158 if (c == '\u0A33') {
2159 AddCharMap ('\u0A32', 0x17, 0);
2160 AddCharMap ('\u0A33', 0x17, 4, 4);
2164 AddCharMap ('\u0AE0', 0x17, 0, 5);
2165 AddCharMap (c, 0x17, 4);
2168 AddCharMap ('\u0AB3', 0x17, 6);
2172 byte gujaratiShift = 4;
2173 fillIndex [0x17] = 0xC0;
2174 for (int i = 0; i < orderedGujarati.Length; i++) {
2175 char c = orderedGujarati [i];
2176 if (fillIndex [0x17] == 0xCC)
2178 if (!Char.IsLetter (c)) {
2181 AddCharMap ('\u0A81', 0x17, 2);
2184 AddLetterMap (c, 0x17, gujaratiShift);
2189 fillIndex [0x1] = 03;
2190 fillIndex [0x18] = 02;
2191 for (int i = 0x0B00; i < 0x0B7F; i++) {
2192 switch (Char.GetUnicodeCategory ((char) i)) {
2193 case UnicodeCategory.NonSpacingMark:
2194 case UnicodeCategory.DecimalDigitNumber:
2195 AddLetterMap ((char) i, 0x1, 1);
2198 AddLetterMap ((char) i, 0x18, 1);
2202 fillIndex [0x19] = 2;
2203 AddCharMap ('\u0BD7', 0x19, 0);
2204 fillIndex [0x19] = 0xA;
2206 for (int i = 0x0B82; i <= 0x0B94; i++)
2207 if (!IsIgnorable ((char) i))
2208 AddCharMap ((char) i, 0x19, 2);
2210 fillIndex [0x19] = 0x28;
2211 // The array for Tamil consonants is a constant.
2212 // Windows have almost similar sequence to TAM from
2213 // tamilnet but a bit different in Grantha.
2214 for (int i = 0; i < orderedTamilConsonants.Length; i++)
2215 AddLetterMap (orderedTamilConsonants [i], 0x19, 4);
2217 fillIndex [0x19] = 0x82;
2218 for (int i = 0x0BBE; i < 0x0BCD; i++)
2219 if (Char.GetUnicodeCategory ((char) i) ==
2220 UnicodeCategory.SpacingCombiningMark
2222 AddLetterMap ((char) i, 0x19, 2);
2225 fillIndex [0x1A] = 0x4;
2226 for (int i = 0x0C00; i < 0x0C62; i++) {
2227 if (i == 0x0C55 || i == 0x0C56)
2229 AddCharMap ((char) i, 0x1A, 3);
2230 char supp = (i == 0x0C0B) ? '\u0C60':
2231 i == 0x0C0C ? '\u0C61' : char.MinValue;
2232 if (supp == char.MinValue)
2234 AddCharMap (supp, 0x1A, 3);
2238 fillIndex [0x1B] = 4;
2239 for (int i = 0x0C80; i < 0x0CE5; i++) {
2240 if (i == 0x0CD5 || i == 0x0CD6)
2242 if (i == 0x0CB1 || i == 0x0CB3 || i == 0x0CDE)
2243 continue; // shift after 0xCB9
2244 AddCharMap ((char) i, 0x1B, 3);
2246 // SPECIAL CASES: but why?
2247 AddCharMap ('\u0CB1', 0x1B, 3); // RRA
2248 AddCharMap ('\u0CB3', 0x1B, 3); // LLA
2249 AddCharMap ('\u0CDE', 0x1B, 3); // FA
2252 AddCharMap ('\u0CE1', 0x1B, 3); // vocalic LL
2256 fillIndex [0x1C] = 2;
2257 for (int i = 0x0D02; i < 0x0D61; i++)
2258 // FIXME: I avoided MSCompatUnicodeTable usage
2259 // here (it results in recursion). So check if
2260 // using NonSpacingMark makes sense or not.
2261 if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark)
2262 // if (!MSCompatUnicodeTable.IsIgnorable ((char) i))
2263 AddCharMap ((char) i, 0x1C, 1);
2265 // Thai ... note that it breaks 0x1E wall after E2B!
2266 // Also, all Thai characters have level 2 value 3.
2267 fillIndex [0x1E] = 2;
2268 for (int i = 0xE40; i <= 0xE44; i++)
2269 AddCharMap ((char) i, 0x1E, 1, 3);
2270 for (int i = 0xE01; i < 0xE2B; i++)
2271 AddCharMap ((char) i, 0x1E, 6, 3);
2272 fillIndex [0x1F] = 5;
2273 for (int i = 0xE2B; i < 0xE30; i++)
2274 AddCharMap ((char) i, 0x1F, 6, 3);
2275 fillIndex [0x1F] = 0x1E;
2276 for (int i = 0xE30; i < 0xE3B; i++)
2277 AddCharMap ((char) i, 0x1F, 1, 3);
2278 // some Thai characters remain.
2279 char [] specialThai = new char [] {'\u0E45', '\u0E46',
2280 '\u0E4E', '\u0E4F', '\u0E5A', '\u0E5B'};
2281 foreach (char c in specialThai)
2282 AddCharMap (c, 0x1F, 1);
2285 fillIndex [0x1F] = 2;
2286 for (int i = 0xE80; i < 0xEDF; i++)
2287 if (Char.IsLetter ((char) i))
2288 AddCharMap ((char) i, 0x1F, 1);
2290 // Georgian. orderedGeorgian is from UCA DUCET.
2291 fillIndex [0x21] = 5;
2292 for (int i = 0; i < orderedGeorgian.Length; i++) {
2293 char c = orderedGeorgian [i];
2294 if (map [(int) c].Defined)
2296 AddCharMap (c, 0x21, 0);
2298 AddCharMap ((char) (c - 0x30), 0x21, 0);
2299 fillIndex [0x21] += 5;
2303 fillIndex [0x22] = 2;
2304 int kanaOffset = 0x3041;
2305 byte [] kanaLines = new byte [] {2, 2, 2, 2, 1, 3, 1, 2, 1};
2307 for (int gyo = 0; gyo < 9; gyo++) {
2308 for (int dan = 0; dan < 5; dan++) {
2309 if (gyo == 7 && dan % 2 == 1) {
2312 kanaOffset -= 2; // There is no space for yi and ye.
2315 int cp = kanaOffset + dan * kanaLines [gyo];
2316 // small lines (a-gyo, ya-gyo)
2317 if (gyo == 0 || gyo == 7) {
2318 AddKanaMap (cp, 1); // small
2319 AddKanaMap (cp + 1, 1);
2322 AddKanaMap (cp, kanaLines [gyo]);
2326 // add small 'ka' (before normal one)
2327 AddKanaMap (0x30F5, 1);
2331 // add small 'ke' (before normal one)
2332 AddKanaMap (0x30F6, 1);
2336 // add small 'Tsu' (before normal one)
2337 AddKanaMap (0x3063, 1);
2341 fillIndex [0x22] += 3;
2342 kanaOffset += 5 * kanaLines [gyo];
2345 // Wa-gyo is almost special, so I just manually add.
2346 AddLetterMap ((char) 0x308E, 0x22, 0);
2347 AddLetterMap ((char) (0x308E + 0x60), 0x22, 0);
2348 AddLetterMap ((char) 0x308F, 0x22, 0);
2349 AddLetterMap ((char) (0x308F + 0x60), 0x22, 0);
2351 AddLetterMap ((char) 0x3090, 0x22, 0);
2352 AddLetterMap ((char) (0x3090 + 0x60), 0x22, 0);
2353 fillIndex [0x22] += 2;
2354 // no "Wu" in Japanese.
2355 AddLetterMap ((char) 0x3091, 0x22, 0);
2356 AddLetterMap ((char) (0x3091 + 0x60), 0x22, 0);
2358 AddLetterMap ((char) 0x3092, 0x22, 0);
2359 AddLetterMap ((char) (0x3092 + 0x60), 0x22, 0);
2361 fillIndex [0x22] = 0x80;
2362 AddLetterMap ((char) 0x3093, 0x22, 0);
2363 AddLetterMap ((char) (0x3093 + 0x60), 0x22, 0);
2365 // JIS Japanese square chars.
2366 fillIndex [0x22] = 0x97;
2367 jisJapanese.Sort (JISComparer.Instance);
2368 foreach (JISCharacter j in jisJapanese)
2369 if (0x3300 <= j.CP && j.CP <= 0x3357)
2370 AddCharMap ((char) j.CP, 0x22, 1);
2371 // non-JIS Japanese square chars.
2372 nonJisJapanese.Sort (NonJISComparer.Instance);
2373 foreach (NonJISCharacter j in nonJisJapanese)
2374 AddCharMap ((char) j.CP, 0x22, 1);
2377 fillIndex [0x23] = 0x02;
2378 for (int i = 0x3105; i <= 0x312C; i++)
2379 AddCharMap ((char) i, 0x23, 1);
2381 // Estrangela: ancient Syriac
2382 fillIndex [0x24] = 0x0B;
2383 // FIXME: is 0x71E really alternative form?
2384 ArrayList syriacAlternatives = new ArrayList (
2385 new int [] {0x714, 0x716, 0x71C, 0x71E, 0x724, 0x727});
2386 for (int i = 0x0710; i <= 0x072C; i++) {
2387 if (i == 0x0711) // NonSpacingMark
2389 if (syriacAlternatives.Contains (i))
2391 AddCharMap ((char) i, 0x24, 4);
2396 foreach (int cp in syriacAlternatives)
2397 map [cp] = new CharMapEntry (0x24,
2398 (byte) (map [cp - 1].Level1 + 2),
2400 // FIXME: Syriac NonSpacingMark should go here.
2403 // FIXME: it turned out that it does not look like UCA
2404 fillIndex [0x24] = 0x6E;
2405 for (int i = 0; i < orderedThaana.Length; i++) {
2406 char c = orderedThaana [i];
2407 if (IsIgnorableNonSpacing ((int) c))
2409 AddCharMap (c, 0x24, 2);
2410 if (c == '\u0782') // SPECIAL CASE: why?
2411 fillIndex [0x24] += 2;
2415 // FIXME: Add more culture-specific letters (that are
2416 // not supported in Windows collation) here.
2418 // Surrogate ... they are computed.
2423 // Unlike UCA Windows Hangul sequence mixes Jongseong
2424 // with Choseong sequence as well as Jungseong,
2425 // adjusted to have the same primary weight for the
2426 // same base character. So it is impossible to compute
2429 // Here I introduce an ordered sequence of mixed
2430 // 'commands' and 'characters' that is similar to
2432 // - ',' increases primary weight.
2433 // - [A B] means a range, increasing index
2434 // - {A B} means a range, without increasing index
2435 // - '=' is no operation (it means the characters
2436 // of both sides have the same weight).
2437 // - '>' inserts a Hangul Syllable block that
2438 // contains 0x251 characters.
2439 // - '<' decreases the index
2440 // - '0'-'9' means skip count
2441 // - whitespaces are ignored
2444 string hangulSequence =
2445 + "\u1100=\u11A8 > \u1101=\u11A9 >"
2446 + "\u11C3, \u11AA, \u11C4, \u1102=\u11AB >"
2447 + "<{\u1113 \u1116}, \u3165,"
2448 + "\u11C5, \u11C6=\u3166,, \u11C7, \u11C8,"
2449 + "\u11AC, \u11C9, \u11AD, \u1103=\u11AE >"
2450 + "<\u1117, \u11CA, \u1104, \u11CB > \u1105=\u11AF >"
2451 + "<{\u1118 \u111B}, \u11B0, [\u11CC \u11D0], \u11B1,"
2452 + "[\u11D1 \u11D2], \u11B2,"
2453 + "[\u11D3 \u11D5], \u11B3,"
2454 + "[\u11D6 \u11D7], \u11B4, \u11B5,"
2455 + "\u11B6=\u11D8, \u3140,, \u11D9, \u1106=\u11B7 >"
2456 + "<{\u111C \u111D}, [\u11DA \u11E2], \u1107=\u11B8 >"
2457 + "<{\u111E \u1120}, \u3172,, \u3173, \u11E3, \u1108 >"
2458 + "<{\u1121 \u112C}, \u3144 \u11B9, \u3174, \u3175,,,, "
2459 + "\u3176,, \u3177, [\u11E4 \u11E6] \u3178,"
2460 + "\u3179, \u1109=\u11BA,,, \u3214=\u3274 <>"
2461 + "<{\u112D \u1133}, \u11E7 \u317A, \u317B, \u317C "
2462 + "[\u11E8 \u11E9],, \u11EA \u317D,, \u110A=\u11BB,,, >"
2463 + "<{\u1134 \u1140}, \u317E,,,,,,, \u11EB,"
2464 + "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >"
2465 + "<{\u1141 \u114C}, \u11EE, \u11EC, \u11ED,,,,, "
2466 + "\u11F1,, \u11F2,,,"
2467 + "\u11EF,,, \u11F0, \u110C=\u11BD,, >"
2468 + "<\u114D, \u110D,, >"
2469 + "<{\u114E \u1151},, \u110E=\u11BE,, >"
2470 + "<{\u1152 \u1155},,, \u110F=\u11BF >"
2471 + "\u1110=\u11C0 > \u1111=\u11C1 >"
2472 + "<\u1156=\u1157, \u11F3, \u11F4, \u1112=\u11C2 >"
2473 + "<\u1158=\u1159=\u115F, \u3185, \u11F9,"
2477 byte hangulCat = 0x52;
2478 fillIndex [hangulCat] = 0x2;
2480 int syllableBlock = 0;
2481 for (int n = 0; n < hangulSequence.Length; n++) {
2482 char c = hangulSequence [n];
2484 if (Char.IsWhiteSpace (c))
2490 IncrementSequentialIndex (ref hangulCat);
2493 if (fillIndex [hangulCat] == 2)
2494 throw new Exception ("FIXME: handle it correctly (yes it is hacky, it is really unfortunate).");
2495 fillIndex [hangulCat]--;
2498 IncrementSequentialIndex (ref hangulCat);
2499 for (int l = 0; l < 0x15; l++)
2500 for (int v = 0; v < 0x1C; v++) {
2502 (char) (0xAC00 + syllableBlock * 0x1C * 0x15 + l * 0x1C + v), hangulCat, 0);
2503 IncrementSequentialIndex (ref hangulCat);
2508 start = hangulSequence [n + 1];
2509 end = hangulSequence [n + 3];
2510 for (int i = start; i <= end; i++) {
2511 AddCharMap ((char) i, hangulCat, 0);
2513 IncrementSequentialIndex (ref hangulCat);
2515 n += 4; // consumes 5 characters for this operation
2518 start = hangulSequence [n + 1];
2519 end = hangulSequence [n + 3];
2520 for (int i = start; i <= end; i++)
2521 AddCharMap ((char) i, hangulCat, 0);
2522 n += 4; // consumes 5 characters for this operation
2525 AddCharMap (c, hangulCat, 0);
2531 for (int i = 0x3200; i < 0x3300; i++) {
2532 if (IsIgnorable (i) || map [i].Defined)
2536 if (decompLength [i] == 4 &&
2537 decompValues [decompIndex [i]] == '(')
2538 ch = decompIndex [i] + 1;
2540 else if (decompLength [i] == 2 &&
2541 decompValues [decompIndex [i] + 1] == '\u1161')
2542 ch = decompIndex [i];
2543 else if (decompLength [i] == 1)
2544 ch = decompIndex [i];
2547 ch = decompValues [ch];
2548 if (ch < 0x1100 || 0x1200 < ch &&
2549 ch < 0xAC00 || 0xD800 < ch)
2553 int offset = i < 0x3260 ? 1 : 0;
2554 if (0x326E <= i && i <= 0x3273)
2557 map [i] = new CharMapEntry (map [ch].Category,
2558 (byte) (map [ch].Level1 + offset),
2560 // Console.Error.WriteLine ("Jamo {0:X04} -> {1:X04}", i, decompValues [decompIndex [i] + 1]);
2566 // Letterlike characters and CJK compatibility square
2567 sortableCharNames.Sort (StringDictionaryValueComparer.Instance);
2568 int [] counts = new int ['Z' - 'A' + 1];
2569 char [] namedChars = new char [sortableCharNames.Count];
2571 foreach (DictionaryEntry de in sortableCharNames) {
2572 counts [((string) de.Value) [0] - 'A']++;
2573 namedChars [nCharNames++] = (char) ((int) de.Key);
2575 nCharNames = 0; // reset
2576 for (int a = 0; a < counts.Length; a++) {
2577 fillIndex [0xE] = (byte) (alphaWeights [a + 1] - counts [a]);
2578 for (int i = 0; i < counts [a]; i++)
2579 //Console.Error.WriteLine ("---- {0:X04} : {1:x02} / {2} {3}", (int) namedChars [nCharNames], fillIndex [0xE], ((DictionaryEntry) sortableCharNames [nCharNames]).Value, Char.GetUnicodeCategory (namedChars [nCharNames]));
2580 AddCharMap (namedChars [nCharNames++], 0xE, 1);
2583 // CJK unified ideograph.
2585 fillIndex [cjkCat] = 0x2;
2586 for (int cp = 0x4E00; cp <= 0x9FBB; cp++)
2587 if (!IsIgnorable (cp))
2588 AddCharMapGroupCJK ((char) cp, ref cjkCat);
2589 // CJK Extensions go here.
2590 // LAMESPEC: With this Windows style CJK layout, it is
2591 // impossible to add more CJK ideograph i.e. 0x9FA6-
2592 // 0x9FBB can never be added w/o breaking compat.
2593 for (int cp = 0xF900; cp <= 0xFA2D; cp++)
2594 if (!IsIgnorable (cp))
2595 AddCharMapGroupCJK ((char) cp, ref cjkCat);
2597 // PrivateUse ... computed.
2598 // remaining Surrogate ... computed.
2600 #region Special "biggest" area (FF FF)
2601 fillIndex [0xFF] = 0xFF;
2602 char [] specialBiggest = new char [] {
2603 '\u3005', '\u3031', '\u3032', '\u309D',
2604 '\u309E', '\u30FC', '\u30FD', '\u30FE',
2605 '\uFE7C', '\uFE7D', '\uFF70'};
2606 foreach (char c in specialBiggest)
2607 AddCharMap (c, 0xFF, 0);
2610 #region 07 - ASCII non-alphanumeric + 3001, 3002 // 07
2611 // non-alphanumeric ASCII except for: + - < = > '
2612 for (int i = 0x21; i < 0x7F; i++) {
2613 if (Char.IsLetterOrDigit ((char) i)
2614 || "+-<=>'".IndexOf ((char) i) >= 0)
2615 continue; // they are not added here.
2616 AddCharMapGroup2 ((char) i, 0x7, 1, 0);
2617 // Insert 3001 after ',' and 3002 after '.'
2619 AddCharMapGroup2 ('\u3001', 0x7, 1, 0);
2620 else if (i == 0x2E) {
2622 AddCharMapGroup2 ('\u3002', 0x7, 1, 0);
2625 AddCharMap ('\uFE30', 0x7, 1, 0);
2629 #region 07 - Punctuations and something else
2630 for (int i = 0xA0; i < char.MaxValue; i++) {
2631 if (IsIgnorable (i))
2634 // FIXME: actually those reset should not be
2635 // done but here I put for easy goal.
2637 fillIndex [0x7] = 0xE2;
2639 fillIndex [0x7] = 0x77;
2651 switch (Char.GetUnicodeCategory ((char) i)) {
2652 case UnicodeCategory.OtherPunctuation:
2653 case UnicodeCategory.ClosePunctuation:
2654 case UnicodeCategory.OpenPunctuation:
2655 case UnicodeCategory.InitialQuotePunctuation:
2656 case UnicodeCategory.FinalQuotePunctuation:
2657 case UnicodeCategory.ModifierSymbol:
2658 // SPECIAL CASES: // 0xA
2659 if (0x2020 <= i && i <= 0x2042)
2661 AddCharMapGroup ((char) i, 0x7, 1, 0);
2664 if (i == 0xA6) // SPECIAL CASE. FIXME: why?
2665 goto case UnicodeCategory.OtherPunctuation;
2670 // FIXME: it should not need to reset level 1, but
2671 // it's for easy goal.
2672 fillIndex [0x7] = 0xB6;
2673 for (int i = 0x2400; i <= 0x2421; i++)
2674 AddCharMap ((char) i, 0x7, 1, 0);
2677 // FIXME: for 07 xx we need more love.
2679 // Characters w/ diacritical marks (NFKD)
2680 for (int i = 0; i <= char.MaxValue; i++) {
2681 if (map [i].Defined || IsIgnorable (i))
2683 if (decompIndex [i] == 0)
2686 int start = decompIndex [i];
2687 int primaryChar = decompValues [start];
2690 int length = decompLength [i];
2691 // special processing for parenthesized ones.
2693 decompValues [start] == '(' &&
2694 decompValues [start + 2] == ')') {
2695 primaryChar = decompValues [start + 1];
2699 if (map [primaryChar].Level1 == 0)
2702 for (int l = 1; l < length; l++) {
2703 int c = decompValues [start + l];
2704 if (map [c].Level1 != 0)
2706 secondary += diacritical [c];
2710 map [i] = new CharMapEntry (
2711 map [primaryChar].Category,
2712 map [primaryChar].Level1,
2717 // category 08 - symbols
2718 fillIndex [0x8] = 2;
2719 // Here Windows mapping is not straightforward. It is
2720 // not based on computation but seems manual sorting.
2721 AddCharMapGroup ('+', 0x8, 1, 0); // plus
2722 AddCharMapGroup ('\u2212', 0x8, 1, 0); // minus
2723 AddCharMapGroup ('\u229D', 0x8, 1, 0); // minus
2724 AddCharMapGroup ('\u2297', 0x8, 1, 0); // mul
2725 AddCharMapGroup ('\u2044', 0x8, 1, 0); // div
2726 AddCharMapGroup ('\u2215', 0x8, 1, 0); // div
2727 AddCharMapGroup ('\u2217', 0x8, 1, 0); // mul
2728 AddCharMapGroup ('\u2218', 0x8, 1, 0); // ring
2729 AddCharMapGroup ('\u2219', 0x8, 1, 0); // bullet
2730 AddCharMapGroup ('\u2213', 0x8, 1, 0); // minus-or-plus
2731 AddCharMapGroup ('\u003C', 0x8, 1, 0); // <
2732 AddCharMapGroup ('\u227A', 0x8, 1, 0); // precedes relation
2733 AddCharMapGroup ('\u22B0', 0x8, 1, 0); // precedes under relation
2735 for (int cp = 0; cp < 0x2300; cp++) {
2736 if (cp == 0xAC) // SPECIAL CASE: skip
2739 cp = 0x2200; // skip to 2200
2740 fillIndex [0x8] = 0x21;
2743 fillIndex [0x8] = 0x3;
2744 if (!map [cp].Defined &&
2745 // Char.GetUnicodeCategory ((char) cp) ==
2746 // UnicodeCategory.MathSymbol)
2747 Char.IsSymbol ((char) cp))
2748 AddCharMapGroup ((char) cp, 0x8, 1, 0);
2749 // SPECIAL CASES: no idea why Windows sorts as such
2752 AddCharMap ('\u227B', 0x8, 1, 0);
2753 AddCharMap ('\u22B1', 0x8, 1, 0);
2756 AddCharMapGroup ('\u00AB', 0x8, 1, 0);
2757 AddCharMapGroup ('\u226A', 0x8, 1, 0);
2758 AddCharMapGroup ('\u00BB', 0x8, 1, 0);
2759 AddCharMapGroup ('\u226B', 0x8, 1, 0);
2762 AddCharMap ('\u01C0', 0x8, 1, 0);
2763 AddCharMap ('\u01C1', 0x8, 1, 0);
2764 AddCharMap ('\u01C2', 0x8, 1, 0);
2769 #region Level2 adjustment
2771 diacritical [0x624] = 0x5;
2772 diacritical [0x626] = 0x7;
2773 diacritical [0x622] = 0x9;
2774 diacritical [0x623] = 0xA;
2775 diacritical [0x625] = 0xB;
2776 diacritical [0x649] = 0x5; // 'alif maqs.uurah
2777 diacritical [0x64A] = 0x7; // Yaa'
2779 for (int i = 0; i < char.MaxValue; i++) {
2781 byte cat = map [i].Category;
2783 case 0xE: // Latin diacritics
2784 case 0x22: // Japanese: circled characters
2785 mod = diacritical [i];
2787 case 0x13: // Arabic
2788 if (diacritical [i] == 0 && i >= 0xFE8D)
2789 mod = 0x8; // default for arabic
2792 if (0x52 <= cat && cat <= 0x7F) // Hangul
2793 mod = diacritical [i];
2795 map [i] = new CharMapEntry (
2796 cat, map [i].Level1, mod);
2800 // FIXME: this is hack but those NonSpacingMark
2801 // characters and still undefined are likely to
2803 for (int i = 0; i < char.MaxValue; i++)
2804 if (!map [i].Defined &&
2806 Char.GetUnicodeCategory ((char) i) ==
2807 UnicodeCategory.NonSpacingMark)
2808 AddCharMap ((char) i, 1, 1);
2810 // FIXME: this is hack but those Symbol characters
2811 // are likely to fall into 0xA category.
2812 for (int i = 0; i < char.MaxValue; i++)
2813 if (!map [i].Defined &&
2815 Char.IsSymbol ((char) i))
2816 AddCharMap ((char) i, 0xA, 1);
// Advances the fill index of the category referenced by hangulCat by one.
// fillIndex is a byte array, so the increment can wrap around to 0; when
// that happens the index is restarted at 0x2.
// NOTE(review): hangulCat is passed by ref, so the overflow branch
// presumably also moves on to the next category on a line not visible
// here -- confirm against the full source.
2819 private void IncrementSequentialIndex (ref byte hangulCat)
2821 fillIndex [hangulCat]++;
2822 if (fillIndex [hangulCat] == 0) { // overflown
2824 fillIndex [hangulCat] = 0x2;
2828 // Resets fillIndex [category] to the fixed value alphaWeight and calls AddLetterMap().
// After mapping c itself, every code point listed for c in latinMap is
// mapped through AddLetterMap() as well, all with updateCount 0.
// NOTE(review): the lines between the "as ArrayList" cast and the foreach
// are not visible here; presumably they guard against a null list.
2829 private void AddAlphaMap (char c, byte category, byte alphaWeight)
2831 fillIndex [category] = alphaWeight;
2832 AddLetterMap (c, category, 0);
2834 ArrayList al = latinMap [c] as ArrayList;
2838 foreach (int cp in al)
2839 AddLetterMap ((char) cp, category, 0);
// Maps `voices` consecutive code points starting at i into category 0x22.
// The first code point gets level 2 weight 0; each following one (b > 0)
// gets level 2 weight b + 2. For each code point, the character 0x60 above
// it is mapped with the same arguments.
// NOTE(review): the +0x60 offset presumably pairs Hiragana with the
// corresponding Katakana -- confirm against the callers.
2842 private void AddKanaMap (int i, byte voices)
2844 for (byte b = 0; b < voices; b++) {
2845 char c = (char) (i + b);
2846 byte arg = (byte) (b > 0 ? b + 2 : 0);
2848 AddLetterMapCore (c, 0x22, 0, arg);
2850 AddLetterMapCore ((char) (c + 0x60), 0x22, 0, arg);
// Convenience overload: forwards to AddLetterMapCore() with level2 = 0.
2854 private void AddLetterMap (char c, byte category, byte updateCount)
2856 AddLetterMapCore (c, category, updateCount, 0);
// Maps a letter together with its related forms into `category`.
// Steps visible here, in order:
//  - the <small> form of c (ToSmallForm) is added via AddCharMapGroup
//    with updateCount;
//  - the invariant-culture lowercase of c, if it differs from c and is
//    not yet defined, is mapped recursively with updateCount 0;
//  - c itself is added via AddCharMapGroup with updateCount 0;
//  - fillIndex [category] is advanced by updateCount.
// NOTE(review): the conditions guarding the <small> call and the use of
// doUpdate are on lines not visible here; presumably doUpdate is cleared
// when c is ignorable or already defined -- confirm.
2859 private void AddLetterMapCore (char c, byte category, byte updateCount, byte level2)
2862 // <small> updates index
2863 c2 = ToSmallForm (c);
2865 AddCharMapGroup (c2, category, updateCount, level2);
2866 c2 = Char.ToLower (c, CultureInfo.InvariantCulture);
2867 if (c2 != c && !map [(int) c2].Defined)
2868 AddLetterMapCore (c2, category, 0, level2);
2869 bool doUpdate = true;
2870 if (IsIgnorable ((int) c) || map [(int) c].Defined)
2873 AddCharMapGroup (c, category, 0, level2);
2875 fillIndex [category] += updateCount;
// Convenience overload: forwards to the four-argument AddCharMap() with
// alt = 0 and returns its result.
2878 private bool AddCharMap (char c, byte category, byte increment)
2880 return AddCharMap (c, category, increment, 0);
// Assigns a map entry to c using the current fillIndex of `category`, then
// advances that fillIndex by `increment`.
// Returns false without touching anything when c is ignorable or already
// has a defined entry.
// For category 1 the roles are swapped: `alt` is stored as level 1 and the
// fill index as level 2. For every other category the fill index is
// level 1 and `alt` is level 2.
// NOTE(review): the success return is on a line not visible here;
// presumably it returns true.
2883 private bool AddCharMap (char c, byte category, byte increment, byte alt)
2885 if (IsIgnorable ((int) c) || map [(int) c].Defined)
2886 return false; // do nothing
2887 map [(int) c] = new CharMapEntry (category,
2888 category == 1 ? alt : fillIndex [category],
2889 category == 1 ? fillIndex [category] : alt);
2890 fillIndex [category] += increment;
// Adds c and the forms derived from the last character of its
// decomposition: first the ToSmallFormTail() result, then c itself, then
// (recursively) the ToFullWidthTail() result. Each AddCharMap call passes
// updateCount as its increment and 0 as its alt value.
// NOTE(review): the conditions guarding the c2 calls are on lines not
// visible here; presumably they skip c2 == char.MinValue.
2894 private void AddCharMapGroupTail (char c, byte category, byte updateCount)
2896 char c2 = ToSmallFormTail (c);
2898 AddCharMap (c2, category, updateCount, 0);
2900 AddCharMap (c, category, updateCount, 0);
2902 c2 = ToFullWidthTail (c);
2904 AddCharMapGroupTail (c2, category, updateCount);
2908 // Adds characters to the table in the order below
2909 // (+ increases weight):
2913 // <full> | <super> | <sub>
2914 // <circle> | <wide> (| <narrow>)
2918 // level2 is fixed (does not increase).
// sameWeightItems lists the decomposition types whose variants share the
// weight of the base character (some entries are not visible here).
2919 int [] sameWeightItems = new int [] {
2920 DecompositionFraction,
2924 DecompositionCircle,
2926 DecompositionNarrow,
// Maps c and its compatibility variants, looked up in nfkdMap [c] by
// decomposition type, into `category`:
//  - the <small> variant first, advancing the index by updateCount (it is
//    added through the three-argument overload, so its alt value is 0
//    rather than level2);
//  - then c and every sameWeightItems variant at the same index;
//  - then the index is advanced by updateCount;
//  - finally the <vertical> variant, again advancing by updateCount.
// NOTE(review): the handling of an already-defined c and of missing
// nfkdMap entries is on lines not visible here.
2928 private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2)
2930 if (map [(int) c].Defined)
2933 char small = char.MinValue;
2934 char vertical = char.MinValue;
2935 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
2937 object smv = nfkd [(byte) DecompositionSmall];
2939 small = (char) ((int) smv);
2940 object vv = nfkd [(byte) DecompositionVertical];
2942 vertical = (char) ((int) vv);
2945 // <small> updates index
2946 if (small != char.MinValue)
2947 AddCharMap (small, category, updateCount);
2950 AddCharMap (c, category, 0, level2);
2953 foreach (int weight in sameWeightItems) {
2954 object wv = nfkd [(byte) weight];
2956 AddCharMap ((char) ((int) wv), category, 0, level2);
2960 // update index here.
2961 fillIndex [category] += updateCount;
2963 if (vertical != char.MinValue)
2964 AddCharMap (vertical, category, updateCount, level2);
// Maps one CJK character at the current index of `category` and then
// advances the index with IncrementSequentialIndex(), which receives
// `category` by ref.
2967 private void AddCharMapCJK (char c, ref byte category)
2969 AddCharMap (c, category, 0, 0);
2970 IncrementSequentialIndex (ref category);
2972 // Special. I wonder why but Windows skips 9E F9.
// Skip that slot by advancing once more.
2973 if (category == 0x9E && fillIndex [category] == 0xF9)
2974 IncrementSequentialIndex (ref category);
// Maps a CJK ideograph via AddCharMapCJK(), followed by characters related
// to it. A few enclosed/parenthesized characters (U+32xx) are inserted by
// hand after specific ideographs; after that the nfkdMap entry of c is
// walked for decomposition types 0..0x12 and matching characters are
// added.
// NOTE(review): several conditions (null checks, the remaining "if (c ==
// ...)" tests and the switch header) are on lines not visible here.
2977 private void AddCharMapGroupCJK (char c, ref byte category)
2979 AddCharMapCJK (c, ref category);
2981 // LAMESPEC: see below.
2982 if (c == '\u5B78') {
2983 AddCharMapCJK ('\u32AB', ref category);
2984 AddCharMapCJK ('\u323B', ref category);
2986 if (c == '\u52DE') {
2987 AddCharMapCJK ('\u3298', ref category);
2988 AddCharMapCJK ('\u3238', ref category);
2991 AddCharMapCJK ('\u32A2', ref category);
2993 // Especially this mapping order totally does
2994 // not make sense to me.
2995 AddCharMapCJK ('\u32A9', ref category);
2997 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
3000 for (byte weight = 0; weight <= 0x12; weight++) {
3001 object wv = nfkd [weight];
3006 // Special: they are ignored in this area.
3007 // FIXME: check if it is sane
3008 if (0xF900 <= w && w <= 0xFAD9)
3010 // LAMESPEC: on Windows some of CJK characters
3011 // in 3200-32B0 are incorrectly mapped. They
3012 // mix Chinese and Japanese Kanji when
3013 // ordering those characters.
// These are the characters inserted by hand above.
3015 case 0x32A2: case 0x3298: case 0x3238:
3016 case 0x32A9: case 0x323B: case 0x32AB:
3020 AddCharMapCJK ((char) w, ref category);
3024 // For now it is only for 0x7 category.
// Variant of AddCharMapGroup() in which every added character advances
// the index by updateCount:
//  - the <small> variant (except U+2024), through the three-argument
//    AddCharMap overload;
//  - c itself;
//  - every code point whose decomposition is the single character c and
//    whose decomposition type is DecompositionCompat (other types may be
//    handled on lines not visible here);
//  - the <vertical> variant (except U+FE33 and U+FE34).
3025 private void AddCharMapGroup2 (char c, byte category, byte updateCount, byte level2)
3027 char small = char.MinValue;
3028 char vertical = char.MinValue;
3029 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
3031 object smv = nfkd [(byte) DecompositionSmall];
3033 small = (char) ((int) smv);
3034 object vv = nfkd [(byte) DecompositionVertical];
3036 vertical = (char) ((int) vv);
3039 // <small> updates index
3040 if (small != char.MinValue)
3041 // SPECIAL CASE excluded (FIXME: why?)
3042 if (small != '\u2024')
3043 AddCharMap (small, category, updateCount);
3046 AddCharMap (c, category, updateCount, level2);
3048 // nfkdMap cannot represent two or more characters whose NFKD
3049 // maps to an identical character, so iterate over all code points here.
3050 for (int c2 = 0; c2 < char.MaxValue; c2++) {
3051 if (decompLength [c2] == 1 &&
3052 (int) (decompValues [decompIndex [c2]]) == (int) c) {
3053 switch (decompType [c2]) {
3054 case DecompositionCompat:
3055 AddCharMap ((char) c2, category, updateCount, level2);
3061 if (vertical != char.MinValue)
3062 // SPECIAL CASE excluded (FIXME: why?)
3063 if (vertical != '\uFE33' && vertical != '\uFE34')
3064 AddCharMap (vertical, category, updateCount, level2);
// Maps an Arabic character c, then every code point whose decomposition
// ends with c, and finally advances fillIndex [category] by updateCount
// (1).
// NOTE(review): the declarations of `category` and `level2`, and the
// remaining arguments of the AddCharMap call inside the loop, are on
// lines not visible here.
3067 private void AddArabicCharMap (char c)
3070 byte updateCount = 1;
3074 AddCharMap (c, category, 0, level2);
3076 // nfkdMap cannot represent two or more characters whose NFKD
3077 // maps to an identical character, so iterate over all code points here.
3078 for (int c2 = 0; c2 < char.MaxValue; c2++) {
3079 if (decompLength [c2] == 0)
3081 int idx = decompIndex [c2] + decompLength [c2] - 1;
3082 if ((int) (decompValues [idx]) == (int) c)
3083 AddCharMap ((char) c2, category,
3086 fillIndex [category] += updateCount;
// Forwards to ToDecomposed() with DecompositionFull and tail = false
// (first character of the decomposition).
3089 char ToFullWidth (char c)
3091 return ToDecomposed (c, DecompositionFull, false);
// Forwards to ToDecomposed() with DecompositionFull and tail = true
// (last character of the decomposition).
3094 char ToFullWidthTail (char c)
3096 return ToDecomposed (c, DecompositionFull, true);
// Forwards to ToDecomposed() with DecompositionSmall and tail = false
// (first character of the decomposition).
3099 char ToSmallForm (char c)
3101 return ToDecomposed (c, DecompositionSmall, false);
// Forwards to ToDecomposed() with DecompositionSmall and tail = true
// (last character of the decomposition).
3104 char ToSmallFormTail (char c)
3106 return ToDecomposed (c, DecompositionSmall, true);
// Returns one character of c's decomposition when the decomposition type
// of c equals d. The index starts at decompIndex [c] (first character) and
// is moved to the last character by adding decompLength [c] - 1.
// NOTE(review): the value returned on a type mismatch and the "if (tail)"
// guard on the index adjustment are on lines not visible here; callers
// compare results against char.MinValue, so that is presumably the
// mismatch value.
3109 char ToDecomposed (char c, byte d, bool tail)
3111 if (decompType [(int) c] != d)
3113 int idx = decompIndex [(int) c];
3115 idx += decompLength [(int) c] - 1;
3116 return (char) decompValues [idx];
3119 bool ExistsJIS (int cp)
3121 foreach (JISCharacter j in jisJapanese)
3129 #region Level 3 properties (Case/Width)
// Returns the level 3 weight as stored in a sort key: the raw weight from
// ComputeLevel3WeightRaw() plus 2, except that a raw weight of 0 stays 0.
3131 private byte ComputeLevel3Weight (char c)
3133 byte b = ComputeLevel3WeightRaw (c);
3134 return b > 0 ? (byte) (b + 2) : b;
// Computes the raw level 3 weight of c. A chain of range checks handles
// specific blocks first; the tail combines decomposition-type and
// capital-letter information into `ret`.
// NOTE(review): the value returned by each range check is on a line not
// visible here.
3137 private byte ComputeLevel3WeightRaw (char c) // add 2 for sortkey value
3140 if ('\u3192' <= c && c <= '\u319F')
3142 // Japanese reading marks
3143 if (c == '\u3001' || c == '\u3002')
3146 if ('\u11A8' <= c && c <= '\u11F9')
3148 if ('\uFFA0' <= c && c <= '\uFFDC')
3150 if ('\u3130' <= c && c <= '\u3164')
3152 if ('\u3165' <= c && c <= '\u318E')
3154 // Georgian Capital letters
3155 if ('\u10A0' <= c && c <= '\u10C5')
3158 if ('\u2776' <= c && c <= '\u277F')
3160 if ('\u2780' <= c && c <= '\u2789')
// NOTE(review): this range starts at U+2776 and so overlaps the two
// checks above. If those return, only U+278A..U+2793 can reach this
// branch -- confirm whether the lower bound was meant to be U+278A.
3162 if ('\u2776' <= c && c <= '\u2793')
3164 if ('\u2160' <= c && c <= '\u216F')
3166 if ('\u2181' <= c && c <= '\u2182')
3169 if ('\u2135' <= c && c <= '\u2138')
3171 if ('\uFE80' <= c && c < '\uFF00') {
3172 // 2(Isolated)/8(Final)/0x18(Medial)
3173 switch (decompType [(int) c]) {
3174 case DecompositionIsolated:
3176 case DecompositionFinal:
3178 case DecompositionMedial:
3183 // actually I dunno the reason why they have weights.
3206 switch (decompType [(int) c]) {
3207 case DecompositionWide: // <wide>
3208 case DecompositionSub: // <sub>
3209 case DecompositionSuper: // <super>
// These three constants are marked "fixed" at their declarations; their
// numeric value is OR-ed directly into the weight.
3210 ret |= decompType [(int) c];
3213 if (isSmallCapital [(int) c]) // grep "SMALL CAPITAL"
3215 if (isUppercase [(int) c]) // DerivedCoreProperties
3225 static bool IsIgnorable (int i)
3227 if (unicodeAge [i] >= 3.1)
3229 switch (char.GetUnicodeCategory ((char) i)) {
3230 case UnicodeCategory.OtherNotAssigned:
3231 case UnicodeCategory.Format:
3238 // FIXME: In the future use DerivedAge.txt to examine character
3239 // versions and set those ones that have higher version than
3240 // 1.0 as ignorable.
3241 static bool IsIgnorable (int i)
3245 // I guess, those characters are added between
3246 // Unicode 1.0 (LCMapString) and Unicode 3.1
3247 // (UnicodeCategory), so they used to be
3248 // something like OtherNotAssigned as of Unicode 1.1.
3249 case 0x2df: case 0x387:
3250 case 0x3d7: case 0x3d8: case 0x3d9:
3251 case 0x3f3: case 0x3f4: case 0x3f5: case 0x3f6:
3252 case 0x400: case 0x40d: case 0x450: case 0x45d:
3253 case 0x587: case 0x58a: case 0x5c4: case 0x640:
3254 case 0x653: case 0x654: case 0x655: case 0x66d:
3256 case 0x1e9b: case 0x202f: case 0x20ad:
3257 case 0x20ae: case 0x20af:
3258 case 0x20e2: case 0x20e3:
3259 case 0x2139: case 0x213a: case 0x2183:
3260 case 0x2425: case 0x2426: case 0x2619:
3261 case 0x2670: case 0x2671: case 0x3007:
3262 case 0x3190: case 0x3191:
3263 case 0xfffc: case 0xfffd:
3265 // exceptional characters filtered by the
3266 // following conditions. Originally those exceptional
3267 // ranges are incorrect (they should not be ignored)
3268 // and most of those characters are unfortunately in
3270 case 0x4d8: case 0x4d9:
3271 case 0x4e8: case 0x4e9:
3273 case 0x3036: case 0x303f:
3274 case 0x337b: case 0xfb1e:
3279 // The whole Sinhala characters.
3280 0x0D82 <= i && i <= 0x0DF4
3281 // The whole Tibetan characters.
3282 || 0x0F00 <= i && i <= 0x0FD1
3283 // The whole Myanmar characters.
3284 || 0x1000 <= i && i <= 0x1059
3285 // The whole Ethiopic, Cherokee,
3286 // Canadian Syllabics, Ogham, Runic,
3287 // Tagalog, Hanunoo, Philippine,
3288 // Buhid, Tagbanwa, Khmer and Mongolian
3290 || 0x1200 <= i && i <= 0x1DFF
3291 // Greek extension characters.
3292 || 0x1F00 <= i && i <= 0x1FFF
3293 // The whole Braille characters.
3294 || 0x2800 <= i && i <= 0x28FF
3295 // CJK radical characters.
3296 || 0x2E80 <= i && i <= 0x2EF3
3297 // Kangxi radical characters.
3298 || 0x2F00 <= i && i <= 0x2FD5
3299 // Ideographic description characters.
3300 || 0x2FF0 <= i && i <= 0x2FFB
3301 // Bopomofo letter and final
3302 || 0x31A0 <= i && i <= 0x31B7
3303 // White square with quadrant characters.
3304 || 0x25F0 <= i && i <= 0x25F7
3305 // Ideographic telegraph symbols.
3306 || 0x32C0 <= i && i <= 0x32CB
3307 || 0x3358 <= i && i <= 0x3370
3308 || 0x33E0 <= i && i <= 0x33FF
3309 // The whole YI characters.
3310 || 0xA000 <= i && i <= 0xA48C
3311 || 0xA490 <= i && i <= 0xA4C6
3312 // Armenian small ligatures
3313 || 0xFB13 <= i && i <= 0xFB17
3314 // hebrew, arabic, variation selector.
3315 || 0xFB1D <= i && i <= 0xFE2F
3316 // Arabic ligatures.
3317 || 0xFEF5 <= i && i <= 0xFEFC
3318 // FIXME: why are they excluded?
3319 || 0x01F6 <= i && i <= 0x01F9
3320 || 0x0218 <= i && i <= 0x0233
3321 || 0x02A9 <= i && i <= 0x02AD
3322 || 0x02EA <= i && i <= 0x02EE
3323 || 0x0349 <= i && i <= 0x036F
3324 || 0x0488 <= i && i <= 0x048F
3325 || 0x04D0 <= i && i <= 0x04FF
3326 || 0x0500 <= i && i <= 0x050F // actually it matters only for 2.0
3327 || 0x06D6 <= i && i <= 0x06ED
3328 || 0x06FA <= i && i <= 0x06FE
3329 || 0x2048 <= i && i <= 0x204D
3330 || 0x20e4 <= i && i <= 0x20ea
3331 || 0x213C <= i && i <= 0x214B
3332 || 0x21EB <= i && i <= 0x21FF
3333 || 0x22F2 <= i && i <= 0x22FF
3334 || 0x237B <= i && i <= 0x239A
3335 || 0x239B <= i && i <= 0x23CF
3336 || 0x24EB <= i && i <= 0x24FF
3337 || 0x2596 <= i && i <= 0x259F
3338 || 0x25F8 <= i && i <= 0x25FF
3339 || 0x2672 <= i && i <= 0x2689
3340 || 0x2768 <= i && i <= 0x2775
3341 || 0x27d0 <= i && i <= 0x27ff
3342 || 0x2900 <= i && i <= 0x2aff
3343 || 0x3033 <= i && i <= 0x303F
3344 || 0x31F0 <= i && i <= 0x31FF
3345 || 0x3250 <= i && i <= 0x325F
3346 || 0x32B1 <= i && i <= 0x32BF
3347 || 0x3371 <= i && i <= 0x337B
3348 || 0xFA30 <= i && i <= 0xFA6A
3352 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3354 case UnicodeCategory.PrivateUse:
3355 case UnicodeCategory.Surrogate:
3357 // ignored by nature
3358 case UnicodeCategory.Format:
3359 case UnicodeCategory.OtherNotAssigned:
3366 // To check IsIgnorable sanity, try the driver below under MS.NET.
3369 public static void Main ()
3371 for (int i = 0; i <= char.MaxValue; i++)
3372 Dump (i, IsIgnorable (i));
3375 static void Dump (int i, bool ignore)
3377 switch (Char.GetUnicodeCategory ((char) i)) {
3378 case UnicodeCategory.PrivateUse:
3379 case UnicodeCategory.Surrogate:
3380 return; // check nothing
3384 string s2 = new string ((char) i, 10);
3385 int ret = CultureInfo.InvariantCulture.CompareInfo.Compare (s1, s2, CompareOptions.IgnoreCase);
3386 if ((ret == 0) == ignore)
3388 Console.WriteLine ("{0} : {1:x} {2}", ignore ? "o" : "x", i, Char.GetUnicodeCategory ((char) i));
3391 #endregion // IsIgnorable
3393 #region IsIgnorableSymbol
3394 static bool IsIgnorableSymbol (int i)
3396 if (IsIgnorable (i))
3401 case 0x00b5: case 0x01C0: case 0x01C1:
3402 case 0x01C2: case 0x01C3: case 0x01F6:
3403 case 0x01F7: case 0x01F8: case 0x01F9:
3404 case 0x02D0: case 0x02EE: case 0x037A:
3405 case 0x03D7: case 0x03F3:
3406 case 0x0400: case 0x040d:
3407 case 0x0450: case 0x045d:
3408 case 0x048C: case 0x048D:
3409 case 0x048E: case 0x048F:
3410 case 0x0587: case 0x0640: case 0x06E5:
3411 case 0x06E6: case 0x06FA: case 0x06FB:
3412 case 0x06FC: case 0x093D: case 0x0950:
3413 case 0x1E9B: case 0x2139: case 0x3006:
3414 case 0x3033: case 0x3034: case 0x3035:
3415 case 0xFE7E: case 0xFE7F:
3417 case 0x16EE: case 0x16EF: case 0x16F0:
3419 case 0x2183: // ROMAN NUMERAL REVERSED ONE HUNDRED
3420 case 0x3007: // IDEOGRAPHIC NUMBER ZERO
3421 case 0x3038: // HANGZHOU NUMERAL TEN
3422 case 0x3039: // HANGZHOU NUMERAL TWENTY
3423 case 0x303a: // HANGZHOU NUMERAL THIRTY
3429 case 0x02B9: case 0x02BA: case 0x02C2:
3430 case 0x02C3: case 0x02C4: case 0x02C5:
3431 case 0x02C8: case 0x02CC: case 0x02CD:
3432 case 0x02CE: case 0x02CF: case 0x02D2:
3433 case 0x02D3: case 0x02D4: case 0x02D5:
3434 case 0x02D6: case 0x02D7: case 0x02DE:
3435 case 0x02E5: case 0x02E6: case 0x02E7:
3436 case 0x02E8: case 0x02E9:
3437 case 0x309B: case 0x309C:
3439 case 0x055A: // Armenian Apos
3440 case 0x05C0: // Hebrew Punct
3441 case 0x0E4F: // Thai FONGMAN
3442 case 0x0E5A: // Thai ANGKHANKHU
3443 case 0x0E5B: // Thai KHOMUT
3445 case 0x09F2: // Bengali Rupee Mark
3446 case 0x09F3: // Bengali Rupee Sign
3448 case 0x221e: // INF.
3457 if (0xFE70 <= i && i < 0xFE7C // ARABIC LIGATURES B
3459 || 0x0501 <= i && i <= 0x0510 // CYRILLIC KOMI
3460 || 0xFA30 <= i && i < 0xFA70 // CJK COMPAT
3465 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3467 case UnicodeCategory.Surrogate:
3468 return false; // inconsistent
3470 case UnicodeCategory.SpacingCombiningMark:
3471 case UnicodeCategory.EnclosingMark:
3472 case UnicodeCategory.NonSpacingMark:
3473 case UnicodeCategory.PrivateUse:
3475 if (0x064B <= i && i <= 0x0652) // Arabic
3479 case UnicodeCategory.Format:
3480 case UnicodeCategory.OtherNotAssigned:
3487 // latin in a circle
3488 0x249A <= i && i <= 0x24E9
3489 || 0x2100 <= i && i <= 0x2132
3491 || 0x3196 <= i && i <= 0x31A0
3493 || 0x3200 <= i && i <= 0x321C
3495 || 0x322A <= i && i <= 0x3243
3497 || 0x3260 <= i && i <= 0x32B0
3498 || 0x32D0 <= i && i <= 0x3357
3499 || 0x337B <= i && i <= 0x33DD
3501 use = !Char.IsLetterOrDigit ((char) i);
3505 // This "Digit" rule is a mystery.
3506 // It filters some symbols out.
3507 if (Char.IsLetterOrDigit ((char) i))
3509 if (Char.IsNumber ((char) i))
3511 if (Char.IsControl ((char) i)
3512 || Char.IsSeparator ((char) i)
3513 || Char.IsPunctuation ((char) i))
3515 if (Char.IsSymbol ((char) i))
3518 // FIXME: should check more
3523 // To check IsIgnorableSymbol sanity, try the driver below under MS.NET.
3525 public static void Main ()
3527 CompareInfo ci = CultureInfo.InvariantCulture.CompareInfo;
3528 for (int i = 0; i <= char.MaxValue; i++) {
3529 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3530 if (uc == UnicodeCategory.Surrogate)
3533 bool ret = IsIgnorableSymbol (i);
3535 string s1 = "TEST ";
3536 string s2 = "TEST " + (char) i;
3538 int result = ci.Compare (s1, s2, CompareOptions.IgnoreSymbols);
3540 if (ret != (result == 0))
3541 Console.WriteLine ("{0} : {1:x}[{2}]({3})",
3542 ret ? "should not ignore" :
// Tells whether code point i is treated as an ignorable non-spacing
// character. The visible checks run in this order: IsIgnorable(), two
// groups of explicit code points, two groups of ranges, and finally a
// fallback on the Unicode category being NonSpacingMark.
// NOTE(review): the value returned by each group is on a line not visible
// here.
3551 static bool IsIgnorableNonSpacing (int i)
3553 if (IsIgnorable (i))
3557 case 0x02C8: case 0x02DE: case 0x0559: case 0x055A:
3558 case 0x05C0: case 0x0ABD: case 0x0CD5: case 0x0CD6:
3559 case 0x309B: case 0x309C: case 0xFF9E: case 0xFF9F:
3561 case 0x02D0: case 0x0670: case 0x0901: case 0x0902:
3562 case 0x094D: case 0x0962: case 0x0963: case 0x0A41:
3563 case 0x0A42: case 0x0A47: case 0x0A48: case 0x0A4B:
3564 case 0x0A4C: case 0x0A81: case 0x0A82: case 0x0B82:
3565 case 0x0BC0: case 0x0CBF: case 0x0CC6: case 0x0CCC:
3566 case 0x0CCD: case 0x0E4E:
3570 if (0x02b9 <= i && i <= 0x02c5
3571 || 0x02cc <= i && i <= 0x02d7
3572 || 0x02e4 <= i && i <= 0x02ef
3573 || 0x20DD <= i && i <= 0x20E0
// 0x00652 below has an extra leading zero; its value is 0x0652.
3577 if (0x064B <= i && i <= 0x00652
3578 || 0x0941 <= i && i <= 0x0948
3579 || 0x0AC1 <= i && i <= 0x0ACD
3580 || 0x0C3E <= i && i <= 0x0C4F
3581 || 0x0E31 <= i && i <= 0x0E3F
// Fallback: decided by the Unicode category alone.
3585 return Char.GetUnicodeCategory ((char) i) ==
3586 UnicodeCategory.NonSpacingMark;
3589 // We can reuse IsIgnorableSymbol testcode
3590 // for IsIgnorableNonSpacing.
// Members of CharMapEntry (the type declaration line is not visible
// here). One entry per UTF-16 code unit is stored in the `map` array.
// Sort key category of the character.
3596 public byte Category;
3598 public byte Level2; // It is always single byte.
// Callers test this flag to see whether an entry has been assigned;
// presumably the constructor sets it -- confirm.
3599 public bool Defined;
// Creates an entry from the category, level 1 and level 2 weights.
3601 public CharMapEntry (byte category, byte level1, byte level2)
3603 Category = category;
3612 public readonly int CP;
3613 public readonly int JIS;
3615 public JISCharacter (int cp, int cpJIS)
// Orders JISCharacter objects by ascending JIS value.
3622 class JISComparer : IComparer
// Shared instance.
3624 public static readonly JISComparer Instance =
// Both arguments must be JISCharacter; the casts throw otherwise.
3627 public int Compare (object o1, object o2)
3629 JISCharacter j1 = (JISCharacter) o1;
3630 JISCharacter j2 = (JISCharacter) o2;
// The difference is negative, zero or positive as required by IComparer.
3631 return j1.JIS - j2.JIS;
3635 class NonJISCharacter
3637 public readonly int CP;
3638 public readonly string Name;
3640 public NonJISCharacter (int cp, string name)
// Orders NonJISCharacter objects by ordinal comparison of Name.
3647 class NonJISComparer : IComparer
// Shared instance.
3649 public static readonly NonJISComparer Instance =
3650 new NonJISComparer ();
// Both arguments must be NonJISCharacter; the casts throw otherwise.
3652 public int Compare (object o1, object o2)
3654 NonJISCharacter j1 = (NonJISCharacter) o1;
3655 NonJISCharacter j2 = (NonJISCharacter) o2;
3656 return string.CompareOrdinal (j1.Name, j2.Name);
// Compares DictionaryEntry items whose Value is a decimal and whose Key is
// an int. Values are compared first.
// NOTE(review): the keys are presumably used as a tie-breaker when the
// values are equal; the lines that decide this are not visible here.
3660 class DecimalDictionaryValueComparer : IComparer
// Shared instance; the constructor is private.
3662 public static readonly DecimalDictionaryValueComparer Instance
3663 = new DecimalDictionaryValueComparer ();
3665 private DecimalDictionaryValueComparer ()
3669 public int Compare (object o1, object o2)
3671 DictionaryEntry e1 = (DictionaryEntry) o1;
3672 DictionaryEntry e2 = (DictionaryEntry) o2;
3673 // FIXME: in case of 0, compare decomposition categories
3674 int ret = Decimal.Compare ((decimal) e1.Value, (decimal) e2.Value);
3677 int i1 = (int) e1.Key;
3678 int i2 = (int) e2.Key;
// Compares DictionaryEntry items whose Value is a string and whose Key is
// an int. Values are compared first with String.Compare (culture-sensitive
// by default).
// NOTE(review): the keys are presumably used as a tie-breaker when the
// values are equal; the lines that decide this are not visible here.
3683 class StringDictionaryValueComparer : IComparer
// Shared instance; the constructor is private.
3685 public static readonly StringDictionaryValueComparer Instance
3686 = new StringDictionaryValueComparer ();
3688 private StringDictionaryValueComparer ()
3692 public int Compare (object o1, object o2)
3694 DictionaryEntry e1 = (DictionaryEntry) o1;
3695 DictionaryEntry e2 = (DictionaryEntry) o2;
3696 int ret = String.Compare ((string) e1.Value, (string) e2.Value);
3699 int i1 = (int) e1.Key;
3700 int i2 = (int) e2.Key;
// Compares two boxed chars by their sort keys from CollationElementTable.
// For each sort key index shared by both characters, the Primary,
// Secondary, Thirtiary and Quarternary differences are computed in that
// order.
// NOTE(review): the checks on `v` between the assignments, and the result
// when all shared keys are equal, are on lines not visible here;
// presumably the first non-zero difference is returned.
3705 class UCAComparer : IComparer
// Shared instance; the constructor is private.
3707 public static readonly UCAComparer Instance
3708 = new UCAComparer ();
3710 private UCAComparer ()
// Both arguments must be boxed char values; the unboxing casts throw
// otherwise.
3714 public int Compare (object o1, object o2)
3716 char i1 = (char) o1;
3717 char i2 = (char) o2;
3719 int l1 = CollationElementTable.GetSortKeyCount (i1);
3720 int l2 = CollationElementTable.GetSortKeyCount (i2);
// Only the sort keys that both characters have are compared.
3721 int l = l1 > l2 ? l2 : l1;
3723 for (int i = 0; i < l; i++) {
3724 SortKeyValue k1 = CollationElementTable.GetSortKey (i1, i);
3725 SortKeyValue k2 = CollationElementTable.GetSortKey (i2, i);
3726 int v = k1.Primary - k2.Primary;
3729 v = k1.Secondary - k2.Secondary;
3732 v = k1.Thirtiary - k2.Thirtiary;
3735 v = k1.Quarternary - k2.Quarternary;
3748 ArrayList items = new ArrayList ();
3750 public Tailoring (int lcid)
3755 public Tailoring (int lcid, int alias)
3762 get { return lcid; }
3766 get { return alias; }
3769 public bool FrenchSort {
3770 get { return frenchSort; }
3771 set { frenchSort = value; }
3774 public void AddDiacriticalMap (byte target, byte replace)
3776 items.Add (new DiacriticalMap (target, replace));
3779 public void AddSortKeyMap (string source, byte [] sortkey)
3781 items.Add (new SortKeyMap (source, sortkey));
3784 public void AddReplacementMap (string source, string replace)
3786 items.Add (new ReplacementMap (source, replace));
3789 public char [] ItemToCharArray ()
3791 ArrayList al = new ArrayList ();
3792 foreach (ITailoringMap m in items)
3793 al.AddRange (m.ToCharArray ());
3794 return al.ToArray (typeof (char)) as char [];
3797 interface ITailoringMap
3799 char [] ToCharArray ();
// Tailoring item holding a Target byte and a Replace byte.
3802 class DiacriticalMap : ITailoringMap
3804 public readonly byte Target;
3805 public readonly byte Replace;
3807 public DiacriticalMap (byte target, byte replace)
// Serializes the item as three chars: the kind marker 2, then Target, then
// Replace.
3813 public char [] ToCharArray ()
3815 char [] ret = new char [3];
3816 ret [0] = (char) 02; // kind:DiacriticalMap
3817 ret [1] = (char) Target;
3818 ret [2] = (char) Replace;
// Tailoring item holding a Source string and a SortKey byte array.
3823 class SortKeyMap : ITailoringMap
3825 public readonly string Source;
3826 public readonly byte [] SortKey;
3828 public SortKeyMap (string source, byte [] sortkey)
// Serializes the item into Source.Length + 7 chars:
//   [0]                  kind marker 1
//   [1 .. Source.Length] the characters of Source
//   [Source.Length + 2 .. Source.Length + 5] SortKey [0..3]
// SortKey must therefore hold at least four bytes.
// NOTE(review): slots Source.Length + 1 and Source.Length + 6 are not
// written by the lines visible here; they stay '\0' unless a line not
// shown assigns them.
3834 public char [] ToCharArray ()
3836 char [] ret = new char [Source.Length + 7];
3837 ret [0] = (char) 01; // kind:SortKeyMap
3838 for (int i = 0; i < Source.Length; i++)
3839 ret [i + 1] = Source [i];
3841 for (int i = 0; i < 4; i++)
3842 ret [i + Source.Length + 2] = (char) SortKey [i];
3847 class ReplacementMap : ITailoringMap
3849 public readonly string Source;
3850 public readonly string Replace;
3852 public ReplacementMap (string source, string replace)
3858 public char [] ToCharArray ()
3860 char [] ret = new char [Source.Length + Replace.Length + 3];
3861 ret [0] = (char) 03; // kind:ReplaceMap
3863 for (int i = 0; i < Source.Length; i++)
3864 ret [pos++] = Source [i];
3867 for (int i = 0; i < Replace.Length; i++)
3868 ret [pos++] = Replace [i];