3 // There are two kinds of sort keys: those which are computed and those which are laid out
4 // as an indexed array. Computed sort keys are:
9 // Also, for composite characters it should prepare a different index table.
11 // Though it is possible to "compute" level 3 weights, they are still dumped
12 // to an array to avoid execution cost.
16 // * sortkey getter signature
18 // int GetSortKey (string s, int index, SortKeyBuffer buf)
19 // Stores sort key for corresponding character element into buf and
20 // returns the length of the consumed _source_ character element in s.
22 // * character length to consume
24 // If there are characters whose primary weight is 0, they are consumed
25 // and considered as a part of the character element.
31 using System.Collections;
32 using System.Globalization;
36 namespace Mono.Globalization.Unicode
38 internal class MSCompatSortKeyTableGenerator
// Program entry point: creates a generator instance and passes the
// command-line arguments on to its Run method.
public static void Main (string [] args)
{
	MSCompatSortKeyTableGenerator generator = new MSCompatSortKeyTableGenerator ();
	generator.Run (args);
}
// Decomposition type codes. ProcessUnidataLine stores one of these per code
// point in decompType [], and the serialization code switches on
// Narrow/Wide/Super/Sub to fill widthCompat [].
// NOTE(review): the "// fixed" markers are original; presumably those values
// are depended on outside this file — confirm before renumbering.
// NOTE(review): Full (0xE) and Narrow (0xD) are declared out of numeric order;
// the values themselves are distinct.
45 const int DecompositionWide = 1; // fixed
46 const int DecompositionSub = 2; // fixed
47 const int DecompositionSmall = 3;
48 const int DecompositionIsolated = 4;
49 const int DecompositionInitial = 5;
50 const int DecompositionFinal = 6;
51 const int DecompositionMedial = 7;
52 const int DecompositionNoBreak = 8;
53 const int DecompositionVertical = 9;
54 const int DecompositionFraction = 0xA;
55 const int DecompositionFont = 0xB;
56 const int DecompositionSuper = 0xC; // fixed
57 const int DecompositionFull = 0xE;
58 const int DecompositionNarrow = 0xD;
59 const int DecompositionCircle = 0xF;
60 const int DecompositionSquare = 0x10;
61 const int DecompositionCompat = 0x11;
62 const int DecompositionCanonical = 0x12;
// Writer that receives the generated C# array source text (see the
// Result.Write / Result.WriteLine calls in the serialization code).
64 TextWriter Result = Console.Out;
66 byte [] fillIndex = new byte [256]; // by category
// One entry per UTF-16 code unit; Category, Level1 and Level2 are read from
// these entries when the tables are serialized.
67 CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1];
// NOTE(review): the closing line of this initializer is not visible in this view.
69 char [] specialIgnore = new char [] {
70 '\u3099', '\u309A', '\u309B', '\u309C', '\u0BCD',
71 '\u0E47', '\u0E4C', '\uFF9E', '\uFF9F'
74 // FIXME: need more love (as always)
// 26 Latin capitals followed by three extra letters (29 entries).
75 char [] alphabets = new char [] {'A', 'B', 'C', 'D', 'E', 'F',
76 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
77 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
78 '\u0292', '\u01BE', '\u0298'};
// 29 weights; presumably index-parallel to alphabets — TODO confirm at the use site.
79 byte [] alphaWeights = new byte [] {
80 2, 9, 0xA, 0x1A, 0x21,
81 0x23, 0x25, 0x2C, 0x32, 0x35,
82 0x36, 0x48, 0x51, 0x70, 0x7C,
83 0x7E, 0x89, 0x8A, 0x91, 0x99,
84 0x9F, 0xA2, 0xA4, 0xA6, 0xA7,
85 0xA9, 0xAA, 0xB3, 0xB4};
// Per-code-point flags: isSmallCapital is set from the character name in
// ProcessUnidataLine, isUppercase from DerivedCoreProperties ranges.
87 bool [] isSmallCapital = new bool [char.MaxValue + 1];
88 bool [] isUppercase = new bool [char.MaxValue + 1];
// Per-code-point decomposition data: the type code, plus start index and
// element count into the decompValues list built by ParseUnidata.
90 byte [] decompType = new byte [char.MaxValue + 1];
91 int [] decompIndex = new int [char.MaxValue + 1];
92 int [] decompLength = new int [char.MaxValue + 1];
// Numeric value parsed from the UnicodeData numeric fields.
94 decimal [] decimalValue = new decimal [char.MaxValue + 1];
// Per-code-point diacritical weight, accumulated in ProcessUnidataLine.
96 byte [] diacritical = new byte [char.MaxValue + 1];
// Name fragments searched for in each UnicodeData line. When fragment d is
// found, diacriticWeights [d] is added to diacritical [cp], so this array must
// stay index-aligned with diacriticWeights (ProcessUnidataLine throws if the
// lengths differ).
// NOTE(review): some entries and the closing line of the initializer are not
// visible in this view; do not count entries from here.
98 string [] diacritics = new string [] {
99 // LATIN, CYRILLIC etc.
100 "UPTURN", "DOUBLE-STRUCK",
101 "MIDDLE HOOK", "WITH VERTICAL LINE ABOVE;", "WITH TONOS",
102 "WITH ACUTE ACCENT;", "WITH GRAVE ACCENT;",
103 "WITH ACUTE;", "WITH GRAVE;",
105 "WITH DOT ABOVE;", " MIDDLE DOT;",
106 "WITH CIRCUMFLEX ACCENT;", "WITH CIRCUMFLEX;",
108 "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;",
109 "DIALYTIKA TONOS", "DIALYTIKA AND TONOS", "WITH MACRON;", "WITH TILDE;", "WITH RING ABOVE;",
110 "WITH OGONEK;", "WITH CEDILLA;",
112 " DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;",
113 "WITH STROKE;", " CIRCUMFLEX AND ACUTE;",
115 " DIAERESIS AND ACUTE;", "WITH CIRCUMFLEX AND GRAVE;", " L SLASH;",
116 " DIAERESIS AND GRAVE;",
118 " CARON AND DOT ABOVE;", " BREVE AND GRAVE;",
119 " MACRON AND ACUTE;",
120 " MACRON AND GRAVE;",
122 " DIAERESIS AND CARON", " DOT ABOVE AND MACRON", " TILDE AND ACUTE",
123 " RING ABOVE AND ACUTE",
124 " DIAERESIS AND MACRON", " CEDILLA AND ACUTE", " MACRON AND DIAERESIS",
125 " CIRCUMFLEX AND TILDE",
126 " TILDE AND DIAERESIS",
129 " CEDILLA AND BREVE",
130 " OGONEK AND MACRON",
133 "WITH HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;",
137 " PRECEDED BY APOSTROPHE",
139 " LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE",
142 " RETROFLEX;", "DIAERESIS BELOW",
145 " CIRCUMFLEX BELOW", "HORN AND ACUTE",
146 " BREVE BELOW;", " HORN AND GRAVE",
149 " DOT BELOW AND DOT ABOVE",
150 " RIGHT HALF RING", " HORN AND TILDE",
151 " CIRCUMFLEX AND DOT BELOW",
152 " BREVE AND DOT BELOW",
153 " DOT BELOW AND MACRON",
155 " HORN AND HOOK ABOVE",
157 // CIRCLED, PARENTHESIZED and so on
158 "CIRCLED DIGIT", "CIRCLED NUMBER", "CIRCLED LATIN",
159 "CIRCLED KATAKANA", "CIRCLED SANS-SERIF",
160 "PARENTHESIZED DIGIT", "PARENTHESIZED NUMBER", "PARENTHESIZED LATIN",
// Weight added to diacritical [cp] when the diacritics entry at the same
// index matches the character's UnicodeData line.
// NOTE(review): some rows and the closing line of the initializer are not
// visible in this view.
162 byte [] diacriticWeights = new byte [] {
168 0x10, 0x11, 0x12, 0x12, 0x13, 0x13, 0x14, 0x15, 0x16,
169 0x16, 0x17, 0x19, 0x1A, 0x1B, 0x1C,
171 0x1D, 0x1D, 0x1E, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F,
172 0x20, 0x21, 0x22, 0x22, 0x23, 0x24,
174 0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28,
175 0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30,
177 0x40, 0x43, 0x43, 0x43, 0x44, 0x46, 0x47, 0x48,
178 0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x5A,
180 0x60, 0x60, 0x61, 0x61, 0x63, 0x68, 0x68,
181 0x69, 0x69, 0x6A, 0x6D, 0x6E,
183 // CIRCLED, PARENTHESIZED and so on.
184 0xEE, 0xEE, 0xEE, 0xEE, 0xEE,
// Code point boundaries, listed in ascending order.
// NOTE(review): presumably consecutive pairs delimit digit ranges that share a
// secondary weight; the consumer is not visible here — confirm. The closing
// line of the initializer is also not visible.
188 int [] numberSecondaryWeightBounds = new int [] {
189 0x660, 0x680, 0x6F0, 0x700, 0x960, 0x970,
190 0x9E0, 0x9F0, 0x9F4, 0xA00, 0xA60, 0xA70,
191 0xAE0, 0xAF0, 0xB60, 0xB70, 0xBE0, 0xC00,
192 0xC60, 0xC70, 0xCE0, 0xCF0, 0xD60, 0xD70,
193 0xE50, 0xE60, 0xED0, 0xEE0
// Script-specific orderings; assigned outside the visible lines.
196 char [] orderedGurmukhi;
197 char [] orderedGujarati;
198 char [] orderedGeorgian;
199 char [] orderedThaana;
// NOTE(review): the end of this initializer is not visible in this view.
201 static readonly char [] orderedTamilConsonants = new char [] {
202 // based on traditional Tamil consonants, except for
203 // Grantha (where Microsoft breaks traditionalism).
204 // http://www.angelfire.com/empire/thamizh/padanGaL
205 '\u0B95', '\u0B99', '\u0B9A', '\u0B9E', '\u0B9F',
206 '\u0BA3', '\u0BA4', '\u0BA8', '\u0BAA', '\u0BAE',
207 '\u0BAF', '\u0BB0', '\u0BB2', '\u0BB5', '\u0BB4',
208 '\u0BB3', '\u0BB1', '\u0BA9', '\u0B9C', '\u0BB8',
// Lookup structures filled while parsing the source files.
211 // cp -> character name (only for some characters)
// Holds DictionaryEntry (cp, name) items added by ProcessUnidataLine.
212 ArrayList sortableCharNames = new ArrayList ();
214 // cp -> arrow value (int)
215 ArrayList arrowValues = new ArrayList ();
217 // cp -> box value (int)
218 ArrayList boxValues = new ArrayList ();
220 // cp -> level1 value
221 Hashtable arabicLetterPrimaryValues = new Hashtable ();
// Arabic primary letter name -> cp (written in ProcessUnidataLine).
224 Hashtable arabicNameMap = new Hashtable ();
226 // cp -> Hashtable [decompType] -> cp
227 Hashtable nfkdMap = new Hashtable ();
229 // Latin letter -> ArrayList [int]
230 Hashtable latinMap = new Hashtable ();
232 ArrayList jisJapanese = new ArrayList ();
// NonJISCharacter items for cp 0x3300-0x3357 for which ExistsJIS (cp) is false.
233 ArrayList nonJisJapanese = new ArrayList ();
// CJK weight tables indexed by code point; compressed and written out by the
// serialization code via SerializeCJK.
235 ushort [] cjkJA = new ushort [char.MaxValue +1];// - 0x4E00];
236 ushort [] cjkCHS = new ushort [char.MaxValue +1];// - 0x3100];
237 ushort [] cjkCHT = new ushort [char.MaxValue +1];// - 0x4E00];
238 ushort [] cjkKO = new ushort [char.MaxValue +1];// - 0x4E00];
239 byte [] cjkKOlv2 = new byte [char.MaxValue +1];// - 0x4E00];
241 byte [] ignorableFlags = new byte [char.MaxValue + 1];
// Unicode age per code point; element 0 is forced to double.MaxValue in
// ParseDerivedAge.
243 static double [] unicodeAge = new double [char.MaxValue + 1];
// Tailoring objects; enumerated by SerializeTailorings.
245 ArrayList tailorings = new ArrayList ();
// Drives the generator: parses the source files found in the directory named
// by args [0] (default "downloaded"), adjusts the parsed values, and reports
// progress on stderr.
// NOTE(review): this view omits several lines of the method (the embedded
// numbering has gaps between the progress messages), so the generation and
// serialization calls themselves are not visible here.
247 void Run (string [] args)
249 string dirname = args.Length == 0 ? "downloaded" : args [0];
250 ParseSources (dirname);
251 Console.Error.WriteLine ("parse done.");
253 ModifyParsedValues ();
255 Console.Error.WriteLine ("generation done.");
257 Console.Error.WriteLine ("serialization done.");
// Diagnostic dump to agelog.txt: one line per code point with its age,
// ignorable status and Unicode category, flagged with '!' when IsIgnorable
// disagrees with shouldBe.
// NOTE(review): lines around this section are missing from the view — confirm
// whether it is live code and whether sw is closed/disposed.
// NOTE(review): the loop bound "i < char.MaxValue" excludes U+FFFF.
259 StreamWriter sw = new StreamWriter ("agelog.txt");
260 for (int i = 0; i < char.MaxValue; i++) {
261 bool shouldBe = false;
262 switch (Char.GetUnicodeCategory ((char) i)) {
263 case UnicodeCategory.Format: case UnicodeCategory.OtherNotAssigned:
264 shouldBe = true; break;
// NOTE(review): the statement controlled by this condition is not visible.
266 if (unicodeAge [i] >= 3.1)
268 //if (IsIgnorable (i) != shouldBe)
269 sw.WriteLine ("{1} {2} {3} {0:X04} {4} {5}", i, unicodeAge [i], IsIgnorable (i), IsIgnorableSymbol (i), char.GetUnicodeCategory ((char) i), IsIgnorable (i) != shouldBe ? '!' : ' ');
// Typed convenience wrapper: compresses a byte table with the given indexer
// by delegating to CodePointIndexer.CompressArray and casting the result back
// to byte [].
byte [] CompressArray (byte [] source, CodePointIndexer i)
{
	object compressed = CodePointIndexer.CompressArray (source, typeof (byte), i);
	return (byte []) compressed;
}
// Typed convenience wrapper: compresses a ushort table with the given indexer
// by delegating to CodePointIndexer.CompressArray and casting the result back
// to ushort [].
ushort [] CompressArray (ushort [] source, CodePointIndexer i)
{
	object compressed = CodePointIndexer.CompressArray (source, typeof (ushort), i);
	return (ushort []) compressed;
}
// Serialization body: writes the tailorings, then flattens map [] into
// per-code-point tables, compresses them, and emits each table both as C#
// array source text (to Result) and as binary (length followed by values) that
// is saved to ../collation.core.bin. CJK tables are written last.
// NOTE(review): the enclosing method header is not visible in this view, and
// short lines (braces, else, conditions) are missing throughout.
290 SerializeTailorings ();
292 byte [] categories = new byte [map.Length];
293 byte [] level1 = new byte [map.Length];
294 byte [] level2 = new byte [map.Length];
295 byte [] level3 = new byte [map.Length];
296 ushort [] widthCompat = new ushort [map.Length];
297 for (int i = 0; i < map.Length; i++) {
298 categories [i] = map [i].Category;
299 level1 [i] = map [i].Level1;
300 level2 [i] = map [i].Level2;
301 level3 [i] = ComputeLevel3Weight ((char) i);
302 // For Japanese Half-width characters, don't
303 // map widthCompat. It is IgnoreKanaType that
304 // handles those width differences.
// NOTE(review): the statement controlled by this condition is not visible.
305 if (0xFF6D <= i && i <= 0xFF9D)
307 switch (decompType [i]) {
308 case DecompositionNarrow:
309 case DecompositionWide:
310 case DecompositionSuper:
311 case DecompositionSub:
312 // they are always 1 char
313 widthCompat [i] = (ushort) decompValues [decompIndex [i]];
// Compress every table with its own indexer from MSCompatUnicodeTableUtil.
319 ignorableFlags = CompressArray (ignorableFlags,
320 MSCompatUnicodeTableUtil.Ignorable);
321 categories = CompressArray (categories,
322 MSCompatUnicodeTableUtil.Category);
323 level1 = CompressArray (level1,
324 MSCompatUnicodeTableUtil.Level1);
325 level2 = CompressArray (level2,
326 MSCompatUnicodeTableUtil.Level2);
327 level3 = CompressArray (level3,
328 MSCompatUnicodeTableUtil.Level3);
329 widthCompat = (ushort []) CodePointIndexer.CompressArray (
330 widthCompat, typeof (ushort),
331 MSCompatUnicodeTableUtil.WidthCompat);
// cjkCHS uses the CjkCHS indexer; the other four CJK tables share Cjk.
332 cjkCHS = CompressArray (cjkCHS,
333 MSCompatUnicodeTableUtil.CjkCHS);
334 cjkCHT = CompressArray (cjkCHT,
335 MSCompatUnicodeTableUtil.Cjk);
336 cjkJA = CompressArray (cjkJA,
337 MSCompatUnicodeTableUtil.Cjk);
338 cjkKO = CompressArray (cjkKO,
339 MSCompatUnicodeTableUtil.Cjk);
340 cjkKOlv2 = CompressArray (cjkKOlv2,
341 MSCompatUnicodeTableUtil.Cjk);
// Ignorable flags table.
344 Result.WriteLine ("internal static readonly byte [] ignorableFlags = new byte [] {");
346 MemoryStream ms = new MemoryStream ();
347 BinaryWriter binary = new BinaryWriter (ms);
348 binary.Write (ignorableFlags.Length);
350 for (int i = 0; i < ignorableFlags.Length; i++) {
351 byte value = ignorableFlags [i];
// NOTE(review): the condition choosing between the decimal and the hex form
// is not visible; the same applies to each table loop below.
353 Result.Write ("{0},", value);
355 Result.Write ("0x{0:X02},", value);
357 binary.Write (value);
// After every 16th value, end the row with a comment giving the row's first index.
359 if ((i & 0xF) == 0xF)
360 Result.WriteLine ("// {0:X04}", i - 0xF);
362 Result.WriteLine ("};");
// Category table.
366 Result.WriteLine ("internal static readonly byte [] categories = new byte [] {");
368 binary.Write (categories.Length);
370 for (int i = 0; i < categories.Length; i++) {
371 byte value = categories [i];
373 Result.Write ("{0},", value);
375 Result.Write ("0x{0:X02},", value);
377 binary.Write (value);
379 if ((i & 0xF) == 0xF)
380 Result.WriteLine ("// {0:X04}", i - 0xF);
382 Result.WriteLine ("};");
385 // Primary weight value
386 Result.WriteLine ("internal static readonly byte [] level1 = new byte [] {");
388 binary.Write (level1.Length);
390 for (int i = 0; i < level1.Length; i++) {
391 byte value = level1 [i];
393 Result.Write ("{0},", value);
395 Result.Write ("0x{0:X02},", value);
397 binary.Write (value);
399 if ((i & 0xF) == 0xF)
400 Result.WriteLine ("// {0:X04}", i - 0xF);
402 Result.WriteLine ("};");
// Level 2 weight table.
406 Result.WriteLine ("internal static readonly byte [] level2 = new byte [] {");
408 binary.Write (level2.Length);
410 for (int i = 0; i < level2.Length; i++) {
411 byte value = level2 [i];
413 Result.Write ("{0},", value);
415 Result.Write ("0x{0:X02},", value);
417 binary.Write (value);
419 if ((i & 0xF) == 0xF)
420 Result.WriteLine ("// {0:X04}", i - 0xF);
422 Result.WriteLine ("};");
// Level 3 weight table (values come from ComputeLevel3Weight above).
426 Result.WriteLine ("internal static readonly byte [] level3 = new byte [] {");
428 binary.Write (level3.Length);
430 for (int i = 0; i < level3.Length; i++) {
431 byte value = level3 [i];
433 Result.Write ("{0},", value);
435 Result.Write ("0x{0:X02},", value);
437 binary.Write (value);
439 if ((i & 0xF) == 0xF)
440 Result.WriteLine ("// {0:X04}", i - 0xF);
442 Result.WriteLine ("};");
445 // Width insensitivity mappings
446 // (for now it is more lightweight than dumping the
447 // entire NFKD table).
448 Result.WriteLine ("internal static readonly ushort [] widthCompat = new ushort [] {");
450 binary.Write (widthCompat.Length);
452 for (int i = 0; i < widthCompat.Length; i++) {
453 ushort value = widthCompat [i];
455 Result.Write ("{0},", value);
// NOTE(review): ushort values are formatted with X02 here, whereas
// SerializeCJK formats ushort values with X04.
457 Result.Write ("0x{0:X02},", value);
459 binary.Write (value);
461 if ((i & 0xF) == 0xF)
462 Result.WriteLine ("// {0:X04}", i - 0xF);
464 Result.WriteLine ("};");
// Persist everything accumulated in ms as the core binary resource.
467 using (FileStream fs = File.Create ("../collation.core.bin")) {
468 byte [] array = ms.ToArray ();
469 fs.Write (array, 0, array.Length);
// CJK tables: cjkCHS is given char.MaxValue as max, the others 0x9FB0.
474 SerializeCJK ("cjkCHS", cjkCHS, char.MaxValue);
475 SerializeCJK ("cjkCHT", cjkCHT, 0x9FB0);
476 SerializeCJK ("cjkJA", cjkJA, 0x9FB0);
477 SerializeCJK ("cjkKO", cjkKO, 0x9FB0);
478 SerializeCJK ("cjkKOlv2", cjkKOlv2, 0x9FB0);
// Writes a ushort CJK table under the given name: as C# array source text to
// Result and as binary (length, then values) to ../collation.<name>.bin.
// max is compared with i + offset inside the loop.
// NOTE(review): short lines (braces, the statement after the max check, the
// decimal/hex condition) are not visible in this view.
481 void SerializeCJK (string name, ushort [] cjk, int max)
483 int offset = 0;//char.MaxValue - cjk.Length;
484 Result.WriteLine ("static ushort [] {0} = new ushort [] {{", name);
486 MemoryStream ms = new MemoryStream ();
487 BinaryWriter binary = new BinaryWriter (ms);
488 binary.Write (cjk.Length);
490 for (int i = 0; i < cjk.Length; i++) {
// NOTE(review): presumably stops the loop once max is reached — the
// controlled statement is not visible; confirm.
491 if (i + offset == max)
493 ushort value = cjk [i];
495 Result.Write ("{0},", value);
497 Result.Write ("0x{0:X04},", value);
499 binary.Write (value);
// After every 16th value, end the row with a comment giving the row's first index.
501 if ((i & 0xF) == 0xF)
502 Result.WriteLine ("// {0:X04}", i - 0xF + offset);
504 Result.WriteLine ("};");
507 using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) {
508 byte [] array = ms.ToArray ();
509 fs.Write (array, 0, array.Length);
// Byte-table overload of SerializeCJK: writes the table as C# array source
// text to Result and its values as binary to ../collation.<name>.bin.
// NOTE(review): unlike the ushort overload, no binary.Write (cjk.Length) is
// visible before the loop in this view — confirm whether the length prefix is
// written, since a reader of the binary file may expect it.
514 void SerializeCJK (string name, byte [] cjk, int max)
516 int offset = 0;//char.MaxValue - cjk.Length;
517 Result.WriteLine ("static byte [] {0} = new byte [] {{", name);
519 MemoryStream ms = new MemoryStream ();
520 BinaryWriter binary = new BinaryWriter (ms);
522 for (int i = 0; i < cjk.Length; i++) {
// NOTE(review): the statement controlled by this check is not visible.
523 if (i + offset == max)
525 byte value = cjk [i];
527 Result.Write ("{0},", value);
529 Result.Write ("0x{0:X02},", value);
531 binary.Write (value);
// After every 16th value, end the row with a comment giving the row's first index.
533 if ((i & 0xF) == 0xF)
534 Result.WriteLine ("// {0:X04}", i - 0xF + offset);
536 Result.WriteLine ("};");
539 using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) {
540 byte [] array = ms.ToArray ();
541 fs.Write (array, 0, array.Length);
// Emits the tailoring data in two parts: a flat char array holding every
// tailoring's items, and a TailoringInfo table giving each LCID its start
// index, item count and FrenchSort flag. Both are written as C# source text to
// Result and as binary to ../collation.tailoring.bin.
// NOTE(review): the declaration of `count` and several short lines are not
// visible in this view.
546 void SerializeTailorings ()
// LCID -> start offset in the flat char array / LCID -> number of chars.
548 Hashtable indexes = new Hashtable ();
549 Hashtable counts = new Hashtable ();
550 Result.WriteLine ("static char [] tailorings = new char [] {");
553 MemoryStream ms = new MemoryStream ();
554 BinaryWriter binary = new BinaryWriter (ms);
556 foreach (Tailoring t in tailorings) {
559 Result.Write ("/*{0}*/", t.LCID);
560 indexes.Add (t.LCID, count);
561 char [] values = t.ItemToCharArray ();
562 counts.Add (t.LCID, values.Length);
563 foreach (char c in values) {
564 Result.Write ("'\\x{0:X}', ", (int) c);
565 if (++count % 16 == 0)
566 Result.WriteLine (" // {0:X04}", count - 16);
// Each char is stored as a 2-byte ushort in the binary form.
568 binary.Write ((ushort) c);
572 Result.WriteLine ("};");
574 Result.WriteLine ("static TailoringInfo [] tailoringInfos = new TailoringInfo [] {");
// Keep the char data aside and start a new stream: the info table is written
// first, the raw char data is appended after it.
576 byte [] rawdata = ms.ToArray ();
577 ms = new MemoryStream ();
578 binary = new BinaryWriter (ms);
579 binary.Write (tailorings.Count);
581 foreach (Tailoring t in tailorings) {
// An aliased tailoring reuses the index and count of its alias target.
582 int target = t.Alias != 0 ? t.Alias : t.LCID;
583 if (!indexes.ContainsKey (target)) {
// NOTE(review): the message says "WARNING" but this aborts with an exception.
584 throw new Exception (String.Format ("WARNING: no corresponding definition for tailoring alias. From {0} to {1}", t.LCID, t.Alias));
587 int idx = (int) indexes [target];
588 int cnt = (int) counts [target];
589 bool french = t.FrenchSort;
// NOTE(review): this scan matches on t.LCID, i.e. the tailoring's own LCID;
// if the alias target's flag was intended, the comparison should use
// `target` — confirm.
591 foreach (Tailoring t2 in tailorings)
592 if (t2.LCID == t.LCID)
593 french = t2.FrenchSort;
594 Result.WriteLine ("new TailoringInfo ({0}, 0x{1:X}, {2}, {3}), ", t.LCID, idx, cnt, french ? "true" : "false");
596 binary.Write (t.LCID);
599 binary.Write (french);
602 Result.WriteLine ("};");
// Two 0xFF bytes, then the char count (rawdata holds 2 bytes per char), then
// the char data itself.
604 binary.Write ((byte) 0xFF);
605 binary.Write ((byte) 0xFF);
606 binary.Write (rawdata.Length / 2);
607 binary.Write (rawdata, 0, rawdata.Length);
610 using (FileStream fs = File.Create ("../collation.tailoring.bin")) {
611 byte [] array = ms.ToArray ();
612 fs.Write (array, 0, array.Length);
// Builds the input file paths under dirname and runs the individual parsers
// in a fixed order. The tailoring source is read from the current directory,
// not from dirname.
// NOTE(review): several local declarations are cut off in this view (only the
// right-hand side of the path expressions is visible).
619 void ParseSources (string dirname)
622 dirname + "/UnicodeData.txt";
623 string derivedCoreProps =
624 dirname + "/DerivedCoreProperties.txt";
626 dirname + "/Scripts.txt";
628 dirname + "/CP932.TXT";
630 dirname + "/DerivedAge.txt";
631 string chXML = dirname + "/common/collation/zh.xml";
632 string jaXML = dirname + "/common/collation/ja.xml";
633 string koXML = dirname + "/common/collation/ko.xml";
635 ParseDerivedAge (derivedAge);
// ProcessUnidataLine calls ExistsJIS, hence the ordering noted below.
639 ParseJISOrder (cp932); // in prior to ParseUnidata()
640 ParseUnidata (unidata);
642 ParseDerivedCoreProperties (derivedCoreProps);
643 ParseScripts (scripts);
644 ParseCJK (chXML, jaXML, koXML);
646 ParseTailorings ("mono-tailoring-source.txt");
// Reads the tailoring source file line by line, passing each trimmed line to
// ProcessTailoringLine together with the current tailoring (by ref). On an
// exception the line number is reported on stderr.
// NOTE(review): the declarations of `t` and `line`, and whatever follows the
// error message in the catch block, are not visible in this view.
649 void ParseTailorings (string filename)
653 using (StreamReader sr = new StreamReader (filename)) {
655 while (sr.Peek () >= 0) {
657 ProcessTailoringLine (ref t,
658 sr.ReadLine ().Trim ());
660 } catch (Exception) {
661 Console.Error.WriteLine ("ERROR at line {0}", line);
667 // For now this is enough.
// Decodes a tailoring source token. Every "\uXXXX" escape (backslash, 'u' and
// four hex digits) is replaced by the UTF-16 code unit it names; every other
// character is copied through unchanged. A trailing, incomplete escape is
// copied verbatim. int.Parse throws FormatException when the four characters
// after "\u" are not hexadecimal.
//
// Fix: the escape is now detected and read at the current position i. The
// earlier code tested s.StartsWith ("\\u") and read s.Substring (2, 4) on each
// iteration, i.e. it always looked at the start of the string, so input that
// mixed plain characters with escapes was decoded incorrectly.
string ParseTailoringSourceValue (string s)
{
	StringBuilder sb = new StringBuilder ();
	for (int i = 0; i < s.Length; i++) {
		// A complete escape occupies six characters starting at i.
		if (i + 5 < s.Length && s [i] == '\\' && s [i + 1] == 'u') {
			sb.Append ((char) int.Parse (
				s.Substring (i + 2, 4), NumberStyles.HexNumber));
			// Skip the remaining five characters of the escape;
			// the loop increment accounts for the sixth.
			i += 5;
		} else
			sb.Append (s [i]);
	}
	return sb.ToString ();
}
// Interprets one line of the tailoring source. Visible cases: '#' starts a
// comment; a header line creates a new Tailoring (an '=' in it supplies a
// second integer argument); "*FrenchSort" and "*Diacritical a -> b" are
// directives; "source : w w w w" adds a sort key mapping; "source = target"
// adds a replacement mapping.
// NOTE(review): the conditions that select between these cases (and the
// early returns) are mostly not visible in this view.
684 void ProcessTailoringLine (ref Tailoring t, string s)
// Strip a trailing comment.
686 int idx = s.IndexOf ('#');
688 s = s.Substring (0, idx).Trim ();
689 if (s.Length == 0 || s [0] == '#')
692 idx = s.IndexOf ('=');
// The number is read starting at index 1, so the line begins with a
// one-character marker.
695 int.Parse (s.Substring (1, idx - 1)),
696 int.Parse (s.Substring (idx + 1)));
698 t = new Tailoring (int.Parse (s.Substring (1)));
702 if (s.StartsWith ("*FrenchSort")) {
706 string d = "*Diacritical";
707 if (s.StartsWith (d)) {
// Both sides of "->" are parsed as hexadecimal bytes.
708 idx = s.IndexOf ("->");
709 t.AddDiacriticalMap (
710 byte.Parse (s.Substring (d.Length, idx - d.Length).Trim (),
711 NumberStyles.HexNumber),
712 byte.Parse (s.Substring (idx + 2).Trim (),
713 NumberStyles.HexNumber));
// "source : b0 b1 b2 b3" — up to four space-separated hex bytes.
716 idx = s.IndexOf (':');
718 string source = s.Substring (0, idx).Trim ();
719 string [] l = s.Substring (idx + 1).Trim ().Split (' ');
720 byte [] b = new byte [4];
721 for (int i = 0; i < 4; i++) {
725 b [i] = byte.Parse (l [i],
726 NumberStyles.HexNumber);
728 t.AddSortKeyMap (ParseTailoringSourceValue (source),
// "source = replacement"; both sides go through ParseTailoringSourceValue.
731 idx = s.IndexOf ('=');
733 t.AddReplacementMap (
734 ParseTailoringSourceValue (
735 s.Substring (0, idx).Trim ()),
736 ParseTailoringSourceValue (
737 s.Substring (idx + 1).Trim ()));
// Reads DerivedAge.txt. Each data line is "cp ; age" or "cpStart..cpEnd ; age"
// with an optional '#' comment; the code point spec is parsed as hex and the
// age as a double.
// NOTE(review): the guards after the IndexOf calls and the statement inside
// the final for loop are not visible in this view.
740 void ParseDerivedAge (string filename)
742 using (StreamReader file =
743 new StreamReader (filename)) {
744 while (file.Peek () >= 0) {
745 string s = file.ReadLine ();
746 int idx = s.IndexOf ('#');
748 s = s.Substring (0, idx);
749 idx = s.IndexOf (';');
753 string cpspec = s.Substring (0, idx);
754 idx = cpspec.IndexOf ("..");
755 NumberStyles nf = NumberStyles.HexNumber |
756 NumberStyles.AllowTrailingWhite;
757 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
758 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
759 string value = s.Substring (cpspec.Length + 1).Trim ();
// Code points above the BMP do not fit the char-indexed tables.
762 if (cp > char.MaxValue)
// NOTE(review): double.Parse without an explicit culture uses the current
// culture; ages such as "3.1" would parse differently where '.' is not the
// decimal separator.
765 double v = double.Parse (value);
766 for (int i = cp; i <= cpEnd; i++)
770 unicodeAge [0] = double.MaxValue; // never be supported
// Reads UnicodeData.txt line by line through ProcessUnidataLine, collecting
// decomposition values in a local list that is copied into this.decompValues
// as an int [] at the end. On an exception the 1-based line number is printed
// to stderr.
// NOTE(review): whatever follows the message in the catch block is not
// visible in this view.
773 void ParseUnidata (string filename)
775 ArrayList decompValues = new ArrayList ();
776 using (StreamReader unidata =
777 new StreamReader (filename)) {
778 for (int line = 1; unidata.Peek () >= 0; line++) {
780 ProcessUnidataLine (unidata.ReadLine (), decompValues);
781 } catch (Exception) {
782 Console.Error.WriteLine ("**** At line " + line);
787 this.decompValues = (int [])
788 decompValues.ToArray (typeof (int));
// State carried across ProcessUnidataLine calls: the last plain Latin letter
// seen (reset to char.MinValue after 'Z'), used as the fallback target for
// following Latin characters.
791 char previousLatinTarget = char.MinValue;
// Per-letter counter ('A'..'Z') used to derive distinct diacritical weights
// for Latin variants in ProcessUnidataLine.
792 byte [] diacriticalOffset = new byte ['Z' - 'A' + 1];
// Processes one line of UnicodeData.txt for a BMP code point. From the
// character name and fields it fills, as far as visible here: isSmallCapital,
// latinMap, diacritical weights, arrowValues, boxValues, sortableCharNames,
// Arabic letter primary values, nonJisJapanese, decomposition type/index/
// length (appending to decompValues), nfkdMap and decimalValue.
// NOTE(review): this view omits many short lines (braces, else, guards,
// assignments), so the comments below describe only the visible statements.
794 void ProcessUnidataLine (string s, ArrayList decompValues)
796 int idx = s.IndexOf ('#');
798 s = s.Substring (0, idx);
799 idx = s.IndexOf (';');
// First field is the code point in hex; the rest are split on ';'.
802 int cp = int.Parse (s.Substring (0, idx), NumberStyles.HexNumber);
803 string [] values = s.Substring (idx + 1).Split (';');
806 if (cp > char.MaxValue)
808 if (IsIgnorable (cp))
811 string name = values [0];
813 // SPECIAL CASE: rename some characters for diacritical
814 // remapping. FIXME: why are they different?
815 // FIXME: it's still not working.
816 if (cp == 0x018B || cp == 0x018C)
817 name = name.Replace ("TOPBAR", "STROKE");
// Note: most tests below search the whole line `s`, not just `name`.
820 if (s.IndexOf ("SMALL CAPITAL") > 0)
821 isSmallCapital [cp] = true;
823 // latin mapping by character name
824 if (s.IndexOf ("LATIN") >= 0) {
// offset is lidx plus the length of the matched prefix ("LETTER DOTLESS " is
// 15 characters).
825 int lidx = s.IndexOf ("LETTER DOTLESS ");
826 int offset = lidx + 15;
828 lidx = s.IndexOf ("LETTER TURNED ");
832 lidx = s.IndexOf ("LETTER CAPITAL ");
836 lidx = s.IndexOf ("LETTER SCRIPT ");
840 lidx = s.IndexOf ("LETTER ");
// c is the letter following the prefix, n the character after it.
843 char c = lidx > 0 ? s [offset] : char.MinValue;
844 char n = s [offset + 1];
845 char target = char.MinValue;
// NOTE(review): && binds tighter than ||, so this reads as
// ('A' <= c && c <= 'Z' && n == ' ') || n == ';' — the range check on c is
// skipped when n == ';'. The parenthesis placement suggests
// (n == ' ' || n == ';') was intended — confirm.
846 if ('A' <= c && c <= 'Z' &&
847 (n == ' ') || n == ';') {
849 // FIXME: After 'Z', I cannot reset this state.
850 previousLatinTarget = c == 'Z' ? char.MinValue : c;
// Name-based special cases; the assigned targets are not visible in this view.
853 if (s.Substring (offset).StartsWith ("ALPHA"))
855 else if (s.Substring (offset).StartsWith ("TONE SIX"))
857 else if (s.Substring (offset).StartsWith ("OPEN O"))
859 else if (s.Substring (offset).StartsWith ("SCHWA"))
861 else if (s.Substring (offset).StartsWith ("ENG"))
863 else if (s.Substring (offset).StartsWith ("OI;")) // 01A2,01A3
865 else if (s.Substring (offset).StartsWith ("YR;")) // 01A2,01A3
867 else if (s.Substring (offset).StartsWith ("TONE TWO"))
869 else if (s.Substring (offset).StartsWith ("ESH"))
872 // For remaining IPA chars, direct mapping is
875 case 0x0299: target = 'B'; break;
876 case 0x029A: target = 'E'; break;
877 case 0x029B: target = 'G'; break;
878 case 0x029C: target = 'H'; break;
879 case 0x029D: target = 'J'; break;
880 case 0x029E: target = 'K'; break;
881 case 0x029F: target = 'L'; break;
882 case 0x02A0: target = 'Q'; break;
883 case 0x02A7: target = 'T'; break;
884 case 0x02A8: target = 'T'; break;
// Fall back to the most recent plain Latin letter.
887 if (target == char.MinValue)
888 target = previousLatinTarget;
890 if (target != char.MinValue) {
891 ArrayList entry = (ArrayList) latinMap [target];
893 entry = new ArrayList ();
894 latinMap [target] = entry;
897 // FIXME: This secondary weight is hack.
898 // They are here because they must not
899 // be identical to the corresponding
// NOTE(review): diacriticalOffset is indexed with c - 'A'; as written the
// visible condition does not itself guarantee c is within 'A'..'Z' — confirm
// against the missing lines.
901 if (c != target && diacritical [cp] == 0) {
902 diacriticalOffset [c - 'A']++;
903 diacritical [cp] = (byte) (diacriticalOffset [c - 'A'] + 0x7C);
// Arrow values for the 0x2000-0x2FFF range.
909 if (0x2000 <= cp && cp < 0x3000) {
911 // SPECIAL CASES. FIXME: why?
913 case 0x21C5: value = -1; break; // E2
914 case 0x261D: value = 1; break;
915 case 0x27A6: value = 3; break;
916 case 0x21B0: value = 7; break;
917 case 0x21B1: value = 3; break;
918 case 0x21B2: value = 7; break;
919 case 0x21B4: value = 5; break;
920 case 0x21B5: value = 7; break;
921 case 0x21B9: value = -1; break; // E1
922 case 0x21CF: value = 7; break;
923 case 0x21D0: value = 3; break;
925 string [] arrowTargets = new string [] {
// Scans arrowTargets from index 1 while value is still 0.
939 for (int i = 1; value == 0 && i < arrowTargets.Length; i++) {
940 if (s.IndexOf (arrowTargets [i]) > 0 &&
941 s.IndexOf ("BARB " + arrowTargets [i]) < 0 &&
942 s.IndexOf (" OVER") < 0
945 else if (s.IndexOf ("RIGHTWARDS") > 0 &&
946 s.IndexOf ("LEFTWARDS") > 0)
948 else if (s.IndexOf ("UPWARDS") > 0 &&
949 s.IndexOf ("DOWNWARDS") > 0)
953 arrowValues.Add (new DictionaryEntry (
// Box drawing / block / geometric shape values for 0x2500-0x25FF.
958 if (0x2500 <= cp && cp < 0x2600) {
961 // up:1 down:2 right:4 left:8 vert:16 horiz:32
964 // [dr] [dl] [ur] [ul]
968 ArrayList flags = new ArrayList (new int [] {
971 4 + 2, 8 + 2, 4 + 1, 8 + 1,
972 16 + 4, 1 + 2 + 4, 16 + 8, 1 + 2 + 8,
973 32 + 2, 4 + 8 + 2, 32 + 1, 4 + 8 + 1,
974 16 + 32, 1 + 2 + 4 + 8, 4 + 8 + 16, 1 + 2 + 32
976 byte [] offsets = new byte [] {
983 if (s.IndexOf ("BOX DRAWINGS ") >= 0) {
985 if (s.IndexOf (" UP") >= 0)
987 if (s.IndexOf (" DOWN") >= 0)
989 if (s.IndexOf (" RIGHT") >= 0)
991 if (s.IndexOf (" LEFT") >= 0)
993 if (s.IndexOf (" VERTICAL") >= 0)
995 if (s.IndexOf (" HORIZONTAL") >= 0)
// The flag combination selects an entry of offsets; an unknown combination
// leaves value negative (IndexOf returns -1).
998 int fidx = flags.IndexOf (flag);
999 value = fidx < 0 ? fidx : offsets [fidx];
1000 } else if (s.IndexOf ("BLOCK") >= 0) {
1001 if (s.IndexOf ("ONE EIGHTH") >= 0)
1003 else if (s.IndexOf ("ONE QUARTER") >= 0)
1005 else if (s.IndexOf ("THREE EIGHTHS") >= 0)
1007 else if (s.IndexOf ("HALF") >= 0)
1009 else if (s.IndexOf ("FIVE EIGHTHS") >= 0)
1011 else if (s.IndexOf ("THREE QUARTERS") >= 0)
1013 else if (s.IndexOf ("SEVEN EIGHTHS") >= 0)
1018 else if (s.IndexOf ("SHADE") >= 0)
// Shape values below are all written relative to 0xE5.
1020 else if (s.IndexOf ("SQUARE") >= 0)
1021 value = 0xBC - 0xE5;
// "VERTICAL RECTANGLE" must be tested before the plain "RECTANGLE" test.
1022 else if (s.IndexOf ("VERTICAL RECTANGLE") >= 0)
1023 value = 0xBE - 0xE5;
1024 else if (s.IndexOf ("RECTANGLE") >= 0)
1025 value = 0xBD - 0xE5;
1026 else if (s.IndexOf ("PARALLELOGRAM") >= 0)
1027 value = 0xBF - 0xE5;
1028 else if (s.IndexOf ("TRIANGLE") >= 0) {
1029 if (s.IndexOf ("UP-POINTING") >= 0)
1030 value = 0xC0 - 0xE5;
1031 else if (s.IndexOf ("RIGHT-POINTING") >= 0)
1032 value = 0xC1 - 0xE5;
1033 else if (s.IndexOf ("DOWN-POINTING") >= 0)
1034 value = 0xC2 - 0xE5;
1035 else if (s.IndexOf ("LEFT-POINTING") >= 0)
1036 value = 0xC3 - 0xE5;
1038 else if (s.IndexOf ("POINTER") >= 0) {
1039 if (s.IndexOf ("RIGHT-POINTING") >= 0)
1040 value = 0xC4 - 0xE5;
1041 else if (s.IndexOf ("LEFT-POINTING") >= 0)
1042 value = 0xC5 - 0xE5;
1044 else if (s.IndexOf ("DIAMOND") >= 0)
1045 value = 0xC6 - 0xE5;
1046 else if (s.IndexOf ("FISHEYE") >= 0)
1047 value = 0xC7 - 0xE5;
1048 else if (s.IndexOf ("LOZENGE") >= 0)
1049 value = 0xC8 - 0xE5;
1050 else if (s.IndexOf ("BULLSEYE") >= 0)
1051 value = 0xC9 - 0xE5;
1052 else if (s.IndexOf ("CIRCLE") >= 0) {
1053 if (cp == 0x25D6) // it could be IndexOf ("LEFT HALF BLACK CIRCLE")
1054 value = 0xCA - 0xE5;
1055 else if (cp == 0x25D7) // it could be IndexOf ("RIGHT HALF BLACK CIRCLE")
1056 value = 0xCB - 0xE5;
1058 value = 0xC9 - 0xE5;
// 0x25DA-0x25E5 get consecutive values starting at 0xCD - 0xE5.
1060 if (0x25DA <= cp && cp <= 0x25E5)
1061 value = 0xCD + cp - 0x25DA - 0xE5;
1063 // SPECIAL CASE: BOX DRAWING DIAGONAL patterns
1065 case 0x2571: value = 0xF; break;
1066 case 0x2572: value = 0x10; break;
1067 case 0x2573: value = 0x11; break;
1070 boxValues.Add (new DictionaryEntry (
1074 // For some characters store the name and sort later
1075 // to determine sorting.
1076 if (0x2100 <= cp && cp <= 0x213F &&
1077 Char.IsSymbol ((char) cp))
1078 sortableCharNames.Add (
1079 new DictionaryEntry (cp, name));
// For 0x3380-0x33DD the first 7 characters of the name are dropped.
1080 else if (0x3380 <= cp && cp <= 0x33DD)
1081 sortableCharNames.Add (new DictionaryEntry (
1082 cp, name.Substring (7)));
1084 if (Char.GetUnicodeCategory ((char) cp) ==
1085 UnicodeCategory.MathSymbol) {
1086 if (name.StartsWith ("CIRCLED "))
1087 diacritical [cp] = 0xEE;
1088 if (name.StartsWith ("SQUARED "))
1089 diacritical [cp] = 0xEF;
1092 // diacritical weights by character name
1093 if (diacritics.Length != diacriticWeights.Length)
1094 throw new Exception (String.Format ("Should not happen. weights are {0} while labels are {1}", diacriticWeights.Length, diacritics.Length));
1095 for (int d = 0; d < diacritics.Length; d++) {
// "> 0": a fragment found at index 0 of the line does not count as a match.
1096 if (s.IndexOf (diacritics [d]) > 0) {
1097 diacritical [cp] += diacriticWeights [d];
1098 if (s.IndexOf ("COMBINING") >= 0)
1099 diacritical [cp] -= (byte) 2;
1102 // also process "COMBINING blah" here
1103 // For now it is limited to cp < 0x0370
1104 // if (cp < 0x0300 || cp >= 0x0370)
// Build "COMBINING <fragment>" from the diacritics entry: drop a trailing ';'
// and a leading "WITH".
1106 string tmp = diacritics [d].TrimEnd (';');
1107 if (tmp.IndexOf ("WITH ") == 0)
1108 tmp = tmp.Substring (4);
1109 tmp = String.Concat ("COMBINING", (tmp [0] != ' ' ? " " : ""), tmp);
1111 diacritical [cp] = (byte) (diacriticWeights [d] - 2);
1115 //Console.Error.WriteLine ("======= {2:X04} : '{0}' / '{1}'", name, tmp, cp);
1117 // Two-step grep required for it.
1118 if (s.IndexOf ("FULL STOP") > 0 &&
1119 (s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0))
1120 diacritical [cp] |= 0xF4;
1121 if (s.StartsWith ("SCRIPT") || s.IndexOf (" SCRIPT ") > 0)
1122 diacritical [cp] = (byte) (s.IndexOf ("SMALL") > 0 ? 3 :
1123 s.IndexOf ("CAPITAL") > 0 ? 5 : 4);
1125 // Arabic letter name
1126 if (0x0621 <= cp && cp <= 0x064A &&
1127 Char.GetUnicodeCategory ((char) cp)
1128 == UnicodeCategory.OtherLetter) {
// Default primary value: grows by 4 for each distinct letter name seen so far.
1129 byte value = (byte) (arabicNameMap.Count * 4 + 0x0B);
1134 // hamza, waw, yeh ... special cases.
1139 value = 0x77; // special cases.
1142 // Get primary letter name i.e.
1143 // XXX part of ARABIC LETTER XXX yyy
1144 // e.g. that of "TEH MARBUTA" is "TEH".
1147 // 0x0640 is special: it does
1148 // not start with ARABIC LETTER
1150 name.Substring (14);
1151 int tmpIdx = letterName.IndexOf (' ');
1152 letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
1153 //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
// A known letter name reuses the primary value recorded for its mapped cp.
1154 if (arabicNameMap.ContainsKey (letterName))
1155 value = (byte) arabicLetterPrimaryValues [arabicNameMap [letterName]];
1157 arabicNameMap [letterName] = cp;
1160 arabicLetterPrimaryValues [cp] = value;
1163 // Japanese square letter
1164 if (0x3300 <= cp && cp <= 0x3357)
1165 if (!ExistsJIS (cp))
1166 nonJisJapanese.Add (new NonJISCharacter (cp, name));
1168 // normalizationType
// values [4] is the decomposition field; a leading "<tag>" names the
// compatibility decomposition type.
1169 string decomp = values [4];
1170 idx = decomp.IndexOf ('<');
// NOTE(review): the Substring length is IndexOf ('>') - 1, which yields the
// tag text only when '<' is at index 0 — confirm that always holds.
1172 switch (decomp.Substring (idx + 1, decomp.IndexOf ('>') - 1)) {
1174 decompType [cp] = DecompositionFull;
1177 decompType [cp] = DecompositionSub;
1180 decompType [cp] = DecompositionSuper;
1183 decompType [cp] = DecompositionSmall;
1186 decompType [cp] = DecompositionIsolated;
1189 decompType [cp] = DecompositionInitial;
1192 decompType [cp] = DecompositionFinal;
1195 decompType [cp] = DecompositionMedial;
1198 decompType [cp] = DecompositionNoBreak;
1201 decompType [cp] = DecompositionCompat;
1204 decompType [cp] = DecompositionFraction;
1207 decompType [cp] = DecompositionFont;
1210 decompType [cp] = DecompositionCircle;
1213 decompType [cp] = DecompositionSquare;
1216 decompType [cp] = DecompositionWide;
1219 decompType [cp] = DecompositionNarrow;
1222 decompType [cp] = DecompositionVertical;
1225 throw new Exception ("Support NFKD type : " + decomp);
1229 decompType [cp] = DecompositionCanonical;
// Drop the "<tag> " prefix (tag end plus one space) when present.
1230 decomp = idx < 0 ? decomp : decomp.Substring (decomp.IndexOf ('>') + 2);
1231 if (decomp.Length > 0) {
// Append the decomposition code points to decompValues and record where they
// start and how many there are.
1233 string [] velems = decomp.Split (' ');
1234 int didx = decompValues.Count;
1235 decompIndex [cp] = didx;
1236 foreach (string v in velems)
1237 decompValues.Add (int.Parse (v, NumberStyles.HexNumber));
1238 decompLength [cp] = velems.Length;
1240 // [decmpType] -> this_cp
1241 int targetCP = (int) decompValues [didx];
1242 // for "(x)" it specially maps to 'x' .
1243 // FIXME: check if it is sane
1244 if (velems.Length == 3 &&
1245 (int) decompValues [didx] == '(' &&
1246 (int) decompValues [didx + 2] == ')')
1247 targetCP = (int) decompValues [didx + 1];
1248 // special: 0x215F "1/"
1249 else if (cp == 0x215F)
1251 else if (velems.Length > 1 &&
1252 (targetCP < 0x4C00 || 0x9FBB < targetCP))
1253 // skip them, except for CJK ideograph compat
// Record the reverse mapping: target cp -> (decomposition type -> this cp).
1256 if (targetCP != 0) {
1257 Hashtable entry = (Hashtable) nfkdMap [targetCP];
1258 if (entry == null) {
1259 entry = new Hashtable ();
1260 nfkdMap [targetCP] = entry;
1262 entry [(byte) decompType [cp]] = cp;
// Numeric value: the first non-empty of values [5], [6], [7]. values [7] may
// be a fraction "a/b", a parenthesized number, or a number with a trailing '.'.
// NOTE(review): decimal.Parse without an explicit culture uses the current
// culture.
1266 if (values [5].Length > 0)
1267 decimalValue [cp] = decimal.Parse (values [5]);
1268 else if (values [6].Length > 0)
1269 decimalValue [cp] = decimal.Parse (values [6]);
1270 else if (values [7].Length > 0) {
1271 string decstr = values [7];
1272 idx = decstr.IndexOf ('/');
1273 if (cp == 0x215F) // special. "1/"
1274 decimalValue [cp] = 0x1;
1278 decimal.Parse (decstr.Substring (0, idx))
1279 / decimal.Parse (decstr.Substring (idx + 1));
1280 else if (decstr [0] == '(' &&
1281 decstr [decstr.Length - 1] == ')')
1284 decimal.Parse (decstr.Substring (1, decstr.Length - 2));
1285 else if (decstr [decstr.Length - 1] == '.')
1288 decimal.Parse (decstr.Substring (0, decstr.Length - 1));
1290 decimalValue [cp] = decimal.Parse (decstr);
// Reads the file named by filename line by line and hands every line to
// ProcessDerivedCorePropLine. When an exception is caught, the 1-based
// number of the line being processed is written to stderr.
// NOTE(review): the opening "try" and the remainder of the catch handler
// are elided from this listing, so whether the exception is rethrown or
// swallowed cannot be confirmed from here.
1294 void ParseDerivedCoreProperties (string filename)
1297 using (StreamReader file =
1298 new StreamReader (filename)) {
// "line" starts at 1 and is only used for the diagnostic message below.
1299 for (int line = 1; file.Peek () >= 0; line++) {
1301 ProcessDerivedCorePropLine (file.ReadLine ());
1302 } catch (Exception) {
1303 Console.Error.WriteLine ("**** At line " + line);
// Handles a single line of the derived core properties file: the text from
// '#' onwards is cut off, the part before ';' is read as a code point spec
// and the part after it as the property value. For the lines this method
// acts on, isUppercase is set for every code point in the spec's range.
// NOTE(review): the guards around these statements (lines without '#' or
// ';', the test on "value", and what happens when cp > char.MaxValue) are
// elided from this listing. Presumably only the "Uppercase" property is
// consumed here -- confirm against the full source.
1310 void ProcessDerivedCorePropLine (string s)
// Strip the trailing comment.
1312 int idx = s.IndexOf ('#');
1314 s = s.Substring (0, idx);
// Everything before ';' is the code point spec.
1315 idx = s.IndexOf (';');
1318 string cpspec = s.Substring (0, idx);
// The spec is either one hex code point or a "start..end" range.
1319 idx = cpspec.IndexOf ("..");
1320 NumberStyles nf = NumberStyles.HexNumber |
1321 NumberStyles.AllowTrailingWhite;
1322 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
// A single code point is treated as a range of length one.
1323 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
// Property value: the text following the ';' separator, trimmed.
1324 string value = s.Substring (cpspec.Length + 1).Trim ();
// NOTE(review): the statement guarded by this test is elided; presumably
// code points outside the 16-bit char range are skipped -- confirm.
1327 if (cp > char.MaxValue)
// Mark the whole (inclusive) range.
1332 for (int x = cp; x <= cpEnd; x++)
1333 isUppercase [x] = true;
// Collects the code points of four scripts from the file named by filename,
// orders each collection with UCAComparer.Instance and stores the results in
// the orderedGurmukhi, orderedGujarati, orderedGeorgian and orderedThaana
// arrays. Code points for which IsIgnorable returns true are left out.
// NOTE(review): the statements that choose a list based on "value" are
// elided from this listing; presumably "value" is the script name of the
// line (as in Unicode Scripts.txt) -- confirm against the full source.
1338 void ParseScripts (string filename)
// One accumulator per script of interest.
1340 ArrayList gurmukhi = new ArrayList ();
1341 ArrayList gujarati = new ArrayList ();
1342 ArrayList georgian = new ArrayList ();
1343 ArrayList thaana = new ArrayList ();
1345 using (StreamReader file =
1346 new StreamReader (filename)) {
1347 while (file.Peek () >= 0) {
1348 string s = file.ReadLine ();
// Strip the trailing comment (the guard on idx is elided here).
1349 int idx = s.IndexOf ('#');
1351 s = s.Substring (0, idx);
1352 idx = s.IndexOf (';');
// Everything before ';' is the code point spec: either one hex code point
// or a "start..end" range.
1356 string cpspec = s.Substring (0, idx);
1357 idx = cpspec.IndexOf ("..");
1358 NumberStyles nf = NumberStyles.HexNumber |
1359 NumberStyles.AllowTrailingWhite;
1360 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1361 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
// The text following the ';' separator, trimmed.
1362 string value = s.Substring (cpspec.Length + 1).Trim ();
// NOTE(review): the guarded statement is elided; presumably ranges beyond
// the 16-bit char range are skipped -- confirm.
1365 if (cp > char.MaxValue)
// Each loop below appends the non-ignorable code points of the inclusive
// range [cp, cpEnd] to one script list.
1370 for (int x = cp; x <= cpEnd; x++)
1371 if (!IsIgnorable (x))
1372 gurmukhi.Add ((char) x);
1375 for (int x = cp; x <= cpEnd; x++)
1376 if (!IsIgnorable (x))
1377 gujarati.Add ((char) x);
1380 for (int x = cp; x <= cpEnd; x++)
1381 if (!IsIgnorable (x))
1382 georgian.Add ((char) x);
1385 for (int x = cp; x <= cpEnd; x++)
1386 if (!IsIgnorable (x))
1387 thaana.Add ((char) x);
// Order every script with the shared UCA comparer ...
1392 gurmukhi.Sort (UCAComparer.Instance);
1393 gujarati.Sort (UCAComparer.Instance);
1394 georgian.Sort (UCAComparer.Instance);
1395 thaana.Sort (UCAComparer.Instance);
// ... and publish the ordered results as char arrays.
1396 orderedGurmukhi = (char []) gurmukhi.ToArray (typeof (char));
1397 orderedGujarati = (char []) gujarati.ToArray (typeof (char));
1398 orderedGeorgian = (char []) georgian.ToArray (typeof (char));
1399 orderedThaana = (char []) thaana.ToArray (typeof (char));
// Reads the file named by filename line by line and passes every line to
// ProcessJISOrderLine. When an exception is caught, the current value of
// the line counter is written to stderr.
// NOTE(review): the declaration and initial value of "line", the opening
// "try" and the remainder of the catch handler are elided from this
// listing; whether the exception is rethrown cannot be confirmed here.
1402 void ParseJISOrder (string filename)
1406 using (StreamReader file =
1407 new StreamReader (filename)) {
// "line" is advanced once per input line, for diagnostics only.
1408 for (;file.Peek () >= 0; line++)
1409 ProcessJISOrderLine (file.ReadLine ());
1411 } catch (Exception) {
1412 Console.Error.WriteLine ("---- line {0}", line);
// Column separators (tab and space); ProcessJISOrderLine passes this to
// IndexOfAny to find where the first column ends.
1417 char [] ws = new char [] {'\t', ' '};
// Parses one line of the JIS order file. After the text from '#' onwards
// is cut off, the line is read as two whitespace-separated hexadecimal
// columns, each prefixed with "0x": the first is taken as the JIS code and
// the second as the code point. The pair is appended to jisJapanese as a
// JISCharacter.
// NOTE(review): the guards around these statements (lines without '#',
// empty lines, lines without a separator) are elided from this listing.
1419 void ProcessJISOrderLine (string s)
// Strip the trailing comment and surrounding whitespace.
1421 int idx = s.IndexOf ('#');
1423 s = s.Substring (0, idx).Trim ();
// Locate the end of the first column.
1426 idx = s.IndexOfAny (ws);
1429 // Both columns start with "0x", so skip that prefix before parsing.
1430 int jis = int.Parse (s.Substring (2, idx - 2), NumberStyles.HexNumber);
1431 int cp = int.Parse (s.Substring (idx).Trim ().Substring (2), NumberStyles.HexNumber);
1432 jisJapanese.Add (new JISCharacter (cp, jis));
// Fills the CJK weight tables. Four sections are visible, labelled by the
// comments below: Chinese Simplified (characters taken from the "pinyin"
// collation rules), Chinese Traditional (from the "stroke" rules), a
// section driven by the jisJapanese list, and Korean (from the "reset"
// rules, also filling cjkKOlv2).
// NOTE(review): this listing elides the statements that load zhXML, jaXML
// and koXML into "doc", the declarations of "arr", "v", "s", "category" and
// "p", and several guards. Which table "arr" refers to in each section and
// the starting value of "v" must be checked against the full source.
1435 void ParseCJK (string zhXML, string jaXML, string koXML)
1437 XmlDocument doc = new XmlDocument ();
// No resolver is installed, so external resources referenced by the
// documents are not resolved.
1438 doc.XmlResolver = null;
1445 // Chinese Simplified
1448 offset = 0;//char.MaxValue - arr.Length;
// The text of the <pc> element lists the characters in collation order.
1450 s = doc.SelectSingleNode ("/ldml/collations/collation[@type='pinyin']/rules/pc").InnerText;
1452 foreach (char c in s) {
// NOTE(review): the condition guarding this warning is elided.
1454 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
// Assign the next weight to this character.
1456 arr [(int) c - offset] = (ushort) v++;
1462 // Chinese Traditional
1465 offset = 0;//char.MaxValue - arr.Length;
1466 s = doc.SelectSingleNode ("/ldml/collations/collation[@type='stroke']/rules/pc").InnerText;
1468 foreach (char c in s) {
// NOTE(review): the condition guarding this warning is elided.
1470 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1472 arr [(int) c - offset] = (ushort) v++;
// Section driven by jisJapanese (presumably the Japanese table; its
// heading comment is elided from this listing).
1481 offset = 0;//char.MaxValue - arr.Length;
// Fixed weights assigned before the list is walked.
1484 arr [0x4EDD] = 0x8002; // Chinese repetition mark?
1485 arr [0x337B] = 0x8004; // Those 4 characters are Gengou
1486 arr [0x337E] = 0x8005;
1487 arr [0x337D] = 0x8006;
1488 arr [0x337C] = 0x8007;
1491 foreach (JISCharacter jc in jisJapanese) {
// NOTE(review): the statement guarded by this test is elided; presumably
// entries below JIS 0x8800 are skipped -- confirm.
1492 if (jc.JIS < 0x8800)
1494 char c = (char) jc.CP;
1497 // Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1500 arr [(int) c - offset] = (ushort) v++;
// Special characters; the trailing comments name a related code point.
// NOTE(review): the statements guarded by these tests are elided.
1505 if (c == '\u662D') // U+337C
1507 if (c == '\u5927') // U+337D
1509 if (c == '\u5E73') // U+337B
1511 if (c == '\u660E') // U+337E
1513 if (c == '\u9686') // U+F9DC
1516 // FIXME: there are still remaining
1517 // characters after U+FA0C.
1518 // for (int k = 0; k < char.MaxValue; k++) {
// Scan the code points below U+FA0D for characters whose decomposition
// starts with c, or has length 3 with c as its middle element, and give
// each of them the next weight.
1519 for (int k = 0; k < '\uFA0D'; k++) {
// NOTE(review): the guarded statement is elided; presumably characters
// without decomposition data and ignorable ones are skipped.
1520 if (decompIndex [k] == 0 || IsIgnorable (k))
1522 if (decompValues [decompIndex [k]] == c /*&&
1523 decompLength [k] == 1*/ ||
1524 decompLength [k] == 3 &&
1525 decompValues [decompIndex [k] + 1] == c) {
1526 arr [k - offset] = (ushort) v++;
1535 // Korean weight is somewhat complex. It first shifts
1536 // Hangul category from 52-x to 80-x (they are anyways
1537 // computed). CJK ideographs are placed at secondary
1538 // weight, like XX YY 01 zz 01, where XX and YY are
1539 // corresponding "reset" value and zz is 41,43,45...
1541 // Unlike chs,cht and ja, Korean value is a combined
1542 // ushort which is computed as category
1546 offset = 0;//char.MaxValue - arr.Length;
1548 foreach (XmlElement reset in doc.SelectNodes ("/ldml/collations/collation/rules/reset")) {
// The element following each <reset> is read as the one holding the
// characters to be placed after the reset target.
1549 XmlElement sc = (XmlElement) reset.NextSibling;
1550 // compute "category" and "level 1" for the
1551 // target "reset" Hangul syllable
1552 char rc = reset.InnerText [0];
// 1-based position of the reset syllable counted from U+AC00.
1553 int ri = ((int) rc - 0xAC00) + 1;
// NOTE(review): the start of this expression (line 1554) is elided.
1555 ((ri / 254) * 256 + (ri % 254) + 2);
1556 // Place the characters after the target.
1559 foreach (char c in s) {
// Every character shares the value p; cjkKOlv2 receives v as a byte.
1560 arr [(int) c - offset] = p;
1561 cjkKOlv2 [(int) c - offset] = (byte) v;
// Populates ignorableFlags for every value in the 16-bit char range.
// Bit 1 is set when IsIgnorable returns true, bit 2 when IsIgnorableSymbol
// does, and bit 4 when IsIgnorableNonSpacing does.
1571 void FillIgnorables ()
1573 for (int i = 0; i <= char.MaxValue; i++) {
// NOTE(review): the statement guarded by this test is elided; presumably
// unassigned code points are skipped -- confirm.
1574 if (Char.GetUnicodeCategory ((char) i) ==
1575 UnicodeCategory.OtherNotAssigned)
// The three tests are independent, so several bits can be set at once.
1577 if (IsIgnorable (i))
1578 ignorableFlags [i] |= 1;
1579 if (IsIgnorableSymbol (i))
1580 ignorableFlags [i] |= 2;
1581 if (IsIgnorableNonSpacing (i))
1582 ignorableFlags [i] |= 4;
// Patches the parsed decomposition tables (decompType, decompIndex,
// decompLength, decompValues) and a few diacritical weights in place.
// The statement order matters: several assignments read decompIndex
// entries that other assignments in this method overwrite.
1586 void ModifyUnidata ()
1588 // Modify some decomposition equivalence
// Clear the decomposition data recorded for U+FE31 and U+FE32.
1589 decompType [0xFE31] = 0;
1590 decompIndex [0xFE31] = 0;
1591 decompLength [0xFE31] = 0;
1592 decompType [0xFE32] = 0;
1593 decompIndex [0xFE32] = 0;
1594 decompLength [0xFE32] = 0;
1596 // Korean parens numbers
1597 for (int i = 0x3200; i <= 0x321C; i++)
1598 diacritical [i] = 0xA;
1599 for (int i = 0x3260; i <= 0x327B; i++)
1600 diacritical [i] = 0xC;
1602 // LAMESPEC: these remappings should not be done.
1603 // Windows has incorrect CJK compat mappings.
// Each line below replaces the first decomposition value of a character;
// where decompLength is set to 1 the decomposition is also shortened to
// that single value.
1604 decompValues [decompIndex [0x32A9]] = 0x91AB;
1605 decompLength [0x323B] = 1;
1606 decompValues [decompIndex [0x323B]] = 0x5B78;
1607 decompValues [decompIndex [0x32AB]] = 0x5B78;
1608 decompValues [decompIndex [0x32A2]] = 0x5BEB;
1609 decompLength [0x3238] = 1;
1610 decompValues [decompIndex [0x3238]] = 0x52DE;
1611 decompValues [decompIndex [0x3298]] = 0x52DE;
1613 // LAMESPEC: custom remappings (not bugs as such, but non-standard-compliant behavior)
1614 decompIndex [0xFA0C] = decompIndex [0xF929]; // borrow U+F929's slot (it is cleared below)
1615 decompValues [decompIndex [0xFA0C]] = 0x5140;
1616 decompLength [0xFA0C] = 1;
// U+F929 gives up its decomposition now that U+FA0C uses its slot.
1617 decompIndex [0xF929] = decompLength [0xF929] = 0;
1619 decompValues [decompIndex [0xF92C]] = 0x90DE;
// Adjusts values obtained from parsing: hard-codes some Cyrillic
// diacritical weights, assigns secondary weights to number characters by
// range, and renames or removes entries of sortableCharNames.
// NOTE(review): the declaration of "weight", the "switch" over cp and the
// conditions leading to RemoveAt are elided from this listing.
1622 void ModifyParsedValues ()
1624 // some cyrillic diacritical weight. They seem to be
1625 // based on old character names, so it's quicker to
1626 // set them directly here.
1627 diacritical [0x0496] = diacritical [0x0497] = 7;
1628 diacritical [0x0498] = diacritical [0x0499] = 0x1A;
1629 diacritical [0x049A] = diacritical [0x049B] = 0x17;
1630 diacritical [0x049C] = diacritical [0x049D] = 9;
1631 diacritical [0x049E] = diacritical [0x049F] = 4;
1632 diacritical [0x04A0] = diacritical [0x04A1] = 0xA;
1633 diacritical [0x04A2] = diacritical [0x04A3] = 7;
1634 diacritical [0x04A4] = diacritical [0x04A5] = 8;
1636 // number, secondary weights
// numberSecondaryWeightBounds is consumed as (start, end) pairs with an
// exclusive end; every pair uses the next value of "weight", and only
// characters for which Char.IsNumber is true receive it.
1638 int [] numarr = numberSecondaryWeightBounds;
1639 for (int i = 0; i < numarr.Length; i += 2, weight++)
1640 for (int cp = numarr [i]; cp < numarr [i + 1]; cp++)
1641 if (Char.IsNumber ((char) cp))
1642 diacritical [cp] = weight;
1644 // Update name part of named characters
// Entries are DictionaryEntry values whose key is the code point.
1645 for (int i = 0; i < sortableCharNames.Count; i++) {
1646 DictionaryEntry de =
1647 (DictionaryEntry) sortableCharNames [i];
1648 int cp = (int) de.Key;
// Stays null unless one of the cases below supplies a replacement name.
1649 string renamed = null;
1651 case 0x2101: renamed = "A_1"; break;
1652 case 0x33C3: renamed = "A_2"; break;
1653 case 0x2105: renamed = "C_1"; break;
1654 case 0x2106: renamed = "C_2"; break;
1655 case 0x211E: renamed = "R1"; break;
1656 case 0x211F: renamed = "R2"; break;
1657 // Remove some of them!
// NOTE(review): the cases that lead here, and any index adjustment after
// the removal, are elided from this listing.
1668 sortableCharNames.RemoveAt (i);
// Replace the entry with one carrying the new name for the same code point.
1672 if (renamed != null)
1673 sortableCharNames [i] =
1674 new DictionaryEntry (cp, renamed);
1678 void GenerateCore ()
1682 #region Specially ignored // 01
1683 // This will raise "Defined" flag up.
1684 // FIXME: Check If it is really fine. Actually for
1685 // Japanese voice marks this code does remapping.
1686 foreach (char c in specialIgnore)
1687 map [(int) c] = new CharMapEntry (0, 0, 0);
1690 #region Extenders (FF FF)
1691 fillIndex [0xFF] = 0xFF;
1692 char [] specialBiggest = new char [] {
1693 '\u3005', '\u3031', '\u3032', '\u309D',
1694 '\u309E', '\u30FC', '\u30FD', '\u30FE',
1695 '\uFE7C', '\uFE7D', '\uFF70'};
1696 foreach (char c in specialBiggest)
1697 AddCharMap (c, 0xFF, 0);
1700 #region Variable weights
1701 // Controls : 06 03 - 06 3D
1702 fillIndex [0x6] = 3;
1703 for (int i = 0; i < 65536; i++) {
1704 if (IsIgnorable (i))
1707 uc = Char.GetUnicodeCategory (c);
1708 // NEL is whitespace but not ignored here.
1709 if (uc == UnicodeCategory.Control &&
1710 !Char.IsWhiteSpace (c) || c == '\u0085')
1711 AddCharMap (c, 6, 1);
1715 fillIndex [0x6] = 0x80;
1716 AddCharMap ('\'', 6, 0);
1717 AddCharMap ('\uFF07', 6, 1);
1718 AddCharMap ('\uFE63', 6, 1);
1720 // SPECIAL CASE: fill FE32 here in prior to be added
1721 // at 2013. Windows does not always respect NFKD.
1722 map [0xFE32] = new CharMapEntry (6, 0x90, 0);
1724 // Hyphen/Dash : 06 81 - 06 90
1725 for (int i = 0; i < char.MaxValue; i++) {
1726 if (!IsIgnorable (i) &&
1727 Char.GetUnicodeCategory ((char) i) ==
1728 UnicodeCategory.DashPunctuation) {
1729 AddCharMapGroup2 ((char) i, 6, 1, 0);
1731 // SPECIAL: add 2027 and 2043
1732 // Maybe they are regarded the
1733 // same hyphens in "central"
1735 AddCharMap ('\u2027', 6, 1);
1736 AddCharMap ('\u2043', 6, 1);
1740 // They are regarded as primarily equivalent to '-'
1741 map [0x208B] = new CharMapEntry (6, 0x82, 0);
1742 map [0x207B] = new CharMapEntry (6, 0x82, 0);
1743 map [0xFF0D] = new CharMapEntry (6, 0x82, 0);
1745 // Arabic variable weight chars 06 A0 -
1746 fillIndex [6] = 0xA0;
1748 for (int i = 0x64B; i <= 0x650; i++)
1749 AddArabicCharMap ((char) i);
1751 AddCharMapGroup ('\u0652', 6, 1, 0);
1753 AddCharMapGroup ('\u0651', 6, 1, 0);
1757 #region Nonspacing marks // 01
1758 // FIXME: 01 03 - 01 B6 ... annoyance :(
1760 // Combining diacritical marks: 01 DC -
1762 fillIndex [0x1] = 0x41;
1763 for (int i = 0x030E; i <= 0x0326; i++)
1764 if (!IsIgnorable (i))
1765 AddCharMap ((char) i, 0x1, 1);
1766 for (int i = 0x0329; i <= 0x0334; i++)
1767 if (!IsIgnorable (i))
1768 AddCharMap ((char) i, 0x1, 1);
1770 for (int i = 0x0339; i <= 0x0341; i++)
1771 if (!IsIgnorable (i))
1772 AddCharMap ((char) i, 0x1, 1);
1773 fillIndex [0x1] = 0x74;
1774 for (int i = 0x0346; i <= 0x0348; i++)
1775 if (!IsIgnorable (i))
1776 AddCharMap ((char) i, 0x1, 1);
1777 for (int i = 0x02BE; i <= 0x02BF; i++)
1778 if (!IsIgnorable (i))
1779 AddCharMap ((char) i, 0x1, 1);
1780 for (int i = 0x02C1; i <= 0x02C5; i++)
1781 if (!IsIgnorable (i))
1782 AddCharMap ((char) i, 0x1, 1);
1783 for (int i = 0x02CE; i <= 0x02CF; i++)
1784 if (!IsIgnorable (i))
1785 AddCharMap ((char) i, 0x1, 1);
1787 for (int i = 0x02D1; i <= 0x02D3; i++)
1788 if (!IsIgnorable (i))
1789 AddCharMap ((char) i, 0x1, 1);
1790 AddCharMap ('\u02DE', 0x1, 1);
1791 for (int i = 0x02E4; i <= 0x02E9; i++)
1792 if (!IsIgnorable (i))
1793 AddCharMap ((char) i, 0x1, 1);
1795 // FIXME: needs more love here (it should eliminate
1796 // all the hacky code above).
1797 for (int i = 0x0300; i < 0x0370; i++)
1798 if (!IsIgnorable (i) && diacritical [i] != 0
1799 /* especiall here*/ && !map [i].Defined)
1800 map [i] = new CharMapEntry (
1801 0x1, 0x1, diacritical [i]);
1803 // Cyrillic and Armenian nonspacing mark
1804 fillIndex [0x1] = 0x94;
1805 for (int i = 0x400; i < 0x580; i++)
1806 if (!IsIgnorable (i) &&
1807 Char.GetUnicodeCategory ((char) i) ==
1808 UnicodeCategory.NonSpacingMark)
1809 AddCharMap ((char) i, 1, 1);
1811 fillIndex [0x1] = 0x8D;
1812 // syriac dotted nonspacing marks (1)
1813 AddCharMap ('\u0740', 0x1, 1);
1814 AddCharMap ('\u0741', 0x1, 1);
1815 AddCharMap ('\u0742', 0x1, 1);
1816 // syriac oblique nonspacing marks
1817 AddCharMap ('\u0747', 0x1, 1);
1818 AddCharMap ('\u0748', 0x1, 1);
1819 // syriac dotted nonspacing marks (2)
1820 fillIndex [0x1] = 0x94; // this reset is mandatory
1821 AddCharMap ('\u0732', 0x1, 1);
1822 AddCharMap ('\u0735', 0x1, 1);
1823 AddCharMap ('\u0738', 0x1, 1);
1824 AddCharMap ('\u0739', 0x1, 1);
1825 AddCharMap ('\u073C', 0x1, 1);
1826 // SPECIAL CASES: superscripts
1827 AddCharMap ('\u073F', 0x1, 1);
1828 AddCharMap ('\u0711', 0x1, 1);
1830 for (int i = 0x0743; i <= 0x0746; i++)
1831 AddCharMap ((char) i, 0x1, 1);
1832 for (int i = 0x0730; i <= 0x0780; i++)
1833 if (!map [i].Defined &&
1834 Char.GetUnicodeCategory ((char) i) ==
1835 UnicodeCategory.NonSpacingMark)
1836 AddCharMap ((char) i, 0x1, 1);
1838 // LAMESPEC: It should not stop at '\u20E1'. There are
1839 // a few more characters (that however results in
1840 // overflow of level 2 unless we start before 0xDD).
1841 fillIndex [0x1] = 0xDD;
1842 for (int i = 0x20D0; i <= 0x20DC; i++)
1843 AddCharMap ((char) i, 0x1, 1);
1844 fillIndex [0x1] = 0xEC;
1845 for (int i = 0x20DD; i <= 0x20E1; i++)
1846 AddCharMap ((char) i, 0x1, 1);
1847 fillIndex [0x1] = 0x7;
1848 for (int i = 0x302A; i <= 0x302D; i++)
1849 AddCharMap ((char) i, 0x1, 1);
1850 fillIndex [0x1] = 0x50; // I wonder how they are sorted
1851 for (int i = 0x02D4; i <= 0x02D7; i++)
1852 AddCharMap ((char) i, 0x1, 1);
1854 // They are not part of Nonspacing marks, but have
1855 // only diacritical weight.
1856 for (int i = 0x3099; i <= 0x309C; i++)
1857 map [i] = new CharMapEntry (1, 1, 1);
1858 map [0xFF9E] = new CharMapEntry (1, 1, 1);
1859 map [0xFF9F] = new CharMapEntry (1, 1, 2);
1860 map [0x309D] = new CharMapEntry (0xFF, 0xFF, 1);
1861 map [0x309E] = new CharMapEntry (0xFF, 0xFF, 1);
1862 for (int i = 0x30FC; i <= 0x30FE; i++)
1863 map [i] = new CharMapEntry (0xFF, 0xFF, 1);
1868 #region Whitespaces // 07 03 -
1869 fillIndex [0x7] = 0x2;
1870 AddCharMap (' ', 0x7, 2);
1871 AddCharMap ('\u00A0', 0x7, 1);
1872 for (int i = 9; i <= 0xD; i++)
1873 AddCharMap ((char) i, 0x7, 1);
1874 for (int i = 0x2000; i <= 0x200B; i++)
1875 AddCharMap ((char) i, 0x7, 1);
1877 fillIndex [0x7] = 0x17;
1878 AddCharMapGroup ('\u2028', 0x7, 1, 0);
1879 AddCharMapGroup ('\u2029', 0x7, 1, 0);
1881 // Characters which used to represent layout control.
1882 // LAMESPEC: Windows developers seem to have thought
1883 // that those characters are kind of whitespaces,
1884 // while they aren't.
1885 AddCharMap ('\u2422', 0x7, 1, 0); // blank symbol
1886 AddCharMap ('\u2423', 0x7, 1, 0); // open box
1889 // category 09 - continued symbols from 08
1890 fillIndex [0x9] = 2;
1892 for (int cp = 0x2300; cp <= 0x237A; cp++)
1893 AddCharMap ((char) cp, 0x9, 1, 0);
1896 byte [] arrowLv2 = new byte [] {0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
1897 foreach (DictionaryEntry de in arrowValues) {
1898 int idx = (int) de.Value;
1899 int cp = (int) de.Key;
1900 if (map [cp].Defined)
1902 fillIndex [0x9] = (byte) (0xD8 + idx);
1903 AddCharMapGroup ((char) cp, 0x9, 0, arrowLv2 [idx]);
1907 byte [] boxLv2 = new byte [128];
1908 for (int i = 0; i < boxLv2.Length; i++)
1910 foreach (DictionaryEntry de in boxValues) {
1911 int cp = (int) de.Key;
1912 int off = (int) de.Value;
1913 if (map [cp].Defined)
1916 fillIndex [0x9] = (byte) (0xE5 + off);
1917 AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [-off]++);
1920 fillIndex [0x9] = (byte) (0xE5 + off);
1921 AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [off]++);
1924 // Some special characters (slanted)
1925 fillIndex [0x9] = 0xF4;
1926 AddCharMap ('\u2571', 0x9, 3);
1927 AddCharMap ('\u2572', 0x9, 3);
1928 AddCharMap ('\u2573', 0x9, 3);
1930 // FIXME: implement 0A
1932 fillIndex [0xA] = 2;
1933 // byte currency symbols
1934 for (int cp = 0; cp < 0x100; cp++) {
1935 uc = Char.GetUnicodeCategory ((char) cp);
1936 if (!IsIgnorable (cp) &&
1937 uc == UnicodeCategory.CurrencySymbol &&
1940 AddCharMapGroup ((char) cp, 0xA, 1, 0);
1942 // byte other symbols
1943 for (int cp = 0; cp < 0x100; cp++) {
1945 continue; // SPECIAL: skip FIXME: why?
1946 uc = Char.GetUnicodeCategory ((char) cp);
1947 if (!IsIgnorable (cp) &&
1948 uc == UnicodeCategory.OtherSymbol ||
1949 cp == '\u00B5' || cp == '\u00B7')
1950 AddCharMapGroup ((char) cp, 0xA, 1, 0);
1953 AddCharMapGroup ('\u30FB', 0xA, 1, 0);
1955 for (int cp = 0x2020; cp <= 0x2031; cp++)
1956 if (Char.IsPunctuation ((char) cp))
1957 AddCharMap ((char) cp, 0xA, 1, 0);
1958 // SPECIAL CASES: why?
1959 AddCharMap ('\u203B', 0xA, 1, 0);
1960 AddCharMap ('\u2040', 0xA, 1, 0);
1961 AddCharMap ('\u2041', 0xA, 1, 0);
1962 AddCharMap ('\u2042', 0xA, 1, 0);
1964 for (int cp = 0x20A0; cp <= 0x20AB; cp++)
1965 AddCharMap ((char) cp, 0xA, 1, 0);
1966 fillIndex [0xA] = 0x2F; // FIXME: it won't be needed
1967 for (int cp = 0x2600; cp <= 0x2613; cp++)
1968 AddCharMap ((char) cp, 0xA, 1, 0);
1970 for (int cp = 0x2620; cp <= 0x2770; cp++)
1971 if (Char.IsSymbol ((char) cp))
1972 AddCharMap ((char) cp, 0xA, 1, 0);
1974 for (int i = 0x2440; i < 0x2460; i++)
1975 AddCharMap ((char) i, 0xA, 1, 0);
1979 #region Numbers // 0C 02 - 0C E1
1980 fillIndex [0xC] = 2;
1982 // 9F8 : Bengali "one less than the denominator"
1983 AddCharMap ('\u09F8', 0xC, 1, 0x3C);
1985 ArrayList numbers = new ArrayList ();
1986 for (int i = 0; i < 65536; i++)
1987 if (!IsIgnorable (i) &&
1988 Char.IsNumber ((char) i) &&
1989 (i < 0x3190 || 0x32C0 < i)) // they are CJK characters
1992 ArrayList numberValues = new ArrayList ();
1993 foreach (int i in numbers)
1994 numberValues.Add (new DictionaryEntry (i, decimalValue [(char) i]));
1995 numberValues.Sort (DecimalDictionaryValueComparer.Instance);
1997 //foreach (DictionaryEntry de in numberValues)
1998 //Console.Error.WriteLine ("****** number {0:X04} : {1} {2}", de.Key, de.Value, decompType [(int) de.Key]);
2000 decimal prevValue = -1;
2001 foreach (DictionaryEntry de in numberValues) {
2002 int cp = (int) de.Key;
2003 decimal currValue = (decimal) de.Value;
2004 bool addnew = false;
2005 if (prevValue < currValue &&
2006 prevValue - (int) prevValue == 0 &&
2010 // Process Hangzhou and Roman numbers
2012 // There are some SPECIAL cases.
2013 if (currValue != 4) // no increment for 4
2017 if (currValue <= 10) {
2018 xcp = (int) prevValue + 0x2170 - 1;
2019 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2020 xcp = (int) prevValue + 0x2160 - 1;
2021 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2022 fillIndex [0xC] += 2;
2023 xcp = (int) prevValue + 0x3021 - 1;
2024 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2027 else if (currValue == 11)
2030 if (prevValue < currValue)
2031 prevValue = currValue;
2032 if (map [cp].Defined)
2034 // Hangzhou and Roman numbers are added later
2036 else if (0x3021 <= cp && cp < 0x302A
2037 || 0x2160 <= cp && cp < 0x216A
2038 || 0x2170 <= cp && cp < 0x217A)
2041 if (cp == 0x215B) // FIXME: why?
2042 fillIndex [0xC] += 2;
2043 else if (cp == 0x3021) // FIXME: why?
2045 AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp], true);
2046 if (addnew || cp <= '9') {
2047 int mod = (int) currValue - 1;
2049 if (1 <= currValue && currValue <= 10) {
2051 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2053 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2055 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2057 if (1 <= currValue && currValue <= 20) {
2059 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2061 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2063 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2067 if (cp != 0x09E7 && cp != 0x09EA)
2070 // Add special cases that are not regarded as
2071 // numbers in UnicodeCategory speak.
2074 AddCharMapGroup ('\u01BD', 0xC, 0, 0);
2075 AddCharMapGroup ('\u01BC', 0xC, 1, 0);
2077 else if (cp == '6') // FIXME: why?
2082 fillIndex [0xC] = 0xFF;
2083 AddCharMap ('\u221E', 0xC, 1);
2086 #region Letters and NonSpacing Marks (general)
2088 // ASCII Latin alphabets
2089 for (int i = 0; i < alphabets.Length; i++)
2090 AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]);
2092 // non-ASCII Latin alphabets
2093 // FIXME: there are no such characters that are placed
2094 // *after* "alphabets" array items. This is nothing
2095 // more than a hack that creates dummy weights for
2096 // primary characters.
2097 for (int i = 0x0080; i < 0x0300; i++) {
2098 if (!Char.IsLetter ((char) i))
2100 // Latin letters that have an NFKD mapping are
2101 // not added as independent primary characters.
2102 if (decompIndex [i] != 0)
2105 // 1.some alphabets have primarily
2106 // equivalent ASCII alphabets.
2107 // 2.some have independent primary weights,
2108 // but inside a-to-z range.
2109 // 3.there are some expanded characters that
2110 // are not part of Unicode Standard NFKD.
2111 // 4. some characters are letter in IsLetter
2112 // but not in sortkeys (maybe unicode version
2113 // difference caused it).
2115 // 1. skipping them does not make sense
2116 // case 0xD0: case 0xF0: case 0x131: case 0x138:
2117 // case 0x184: case 0x185: case 0x186: case 0x189:
2118 // case 0x18D: case 0x18E: case 0x18F: case 0x190:
2119 // case 0x194: case 0x195: case 0x196: case 0x19A:
2120 // case 0x19B: case 0x19C:
2121 // 2. skipping them does not make sense
2122 // case 0x14A: // Ng
2123 // case 0x14B: // ng
2127 case 0xDE: // Icelandic Thorn
2128 case 0xFE: // Icelandic Thorn
2129 case 0xDF: // German ss
2130 case 0xFF: // German ss
2132 case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3:
2133 // not classified yet
2134 // case 0x1A6: case 0x1A7: case 0x1A8: case 0x1A9:
2135 // case 0x1AA: case 0x1B1: case 0x1B7: case 0x1B8:
2136 // case 0x1B9: case 0x1BA: case 0x1BB: case 0x1BF:
2140 AddCharMapGroup ((char) i, 0xE, 1, 0);
2144 fillIndex [0xF] = 02;
2145 for (int i = 0x0380; i < 0x0390; i++)
2146 if (Char.IsLetter ((char) i))
2147 AddLetterMap ((char) i, 0xF, 1);
2148 fillIndex [0xF] = 02;
2149 for (int i = 0x0391; i < 0x03CF; i++)
2150 if (Char.IsLetter ((char) i))
2151 AddLetterMap ((char) i, 0xF, 1);
2152 fillIndex [0xF] = 0x40;
2153 for (int i = 0x03D0; i < 0x0400; i++)
2154 if (Char.IsLetter ((char) i))
2155 AddLetterMap ((char) i, 0xF, 1);
2158 // Cyrillic letters are sorted like Latin letters i.e.
2159 // containing culture-specific letters between the
2160 // standard Cyrillic sequence.
2162 // We can't use UCA here; it has different sorting.
2163 char [] orderedCyrillic = new char [] {
2164 '\u0430', '\u0431', '\u0432', '\u0433', '\u0434',
2165 '\u0452', // DJE for Serbocroatian
2167 '\u0454', // IE for Ukrainian
2171 '\u0456', // Byelorussian-Ukrainian I
2181 '\u043F', '\u0440', '\u0441', '\u0442',
2182 '\u045B', // TSHE for Serbocroatian
2184 '\u045E', // Short U for Byelorussian
2185 '\u04B1', // Straight U w/ stroke (diacritical!)
2186 '\u0444', '\u0445', '\u0446', '\u0447',
2188 '\u0448', '\u0449', '\u044A', '\u044B', '\u044C',
2189 '\u044D', '\u044E', '\u044F'};
2191 // For some characters here is a map to basic cyrillic
2192 // letters. See UnicodeData.txt character names for
2193 // the sources. Here I simply declare an equiv. array.
2194 // The content characters are map from U+490(,491),
2195 // skipping small letters.
2196 char [] cymap_src = new char [] {
2197 '\u0433', '\u0433', '\u0433', '\u0436',
2198 '\u0437', '\u043A', '\u043A', '\u043A',
2199 '\u043A', '\u043D', '\u043D', '\u043F',
2200 '\u0445', '\u0441', '\u0442', '\u0443',
2201 '\u0443', '\u0445', '\u0446', '\u0447',
2202 '\u0447', '\u0432', '\u0435', '\u0435',
2203 '\u0406', '\u0436', '\u043A', '\u043D',
2204 '\u0447', '\u0435'};
2206 fillIndex [0x10] = 0x8D;
2207 for (int i = 0x0460; i < 0x0481; i++) {
2208 if (Char.IsLetter ((char) i)) {
2210 // U+476/477 have the same
2211 // primary weight as U+474/475.
2212 fillIndex [0x10] -= 3;
2213 AddLetterMap ((char) i, 0x10, 3);
2217 fillIndex [0x10] = 0x6;
2218 for (int i = 0; i < orderedCyrillic.Length; i++) {
2219 char c = Char.ToUpper (orderedCyrillic [i], CultureInfo.InvariantCulture);
2220 if (!IsIgnorable ((int) c) &&
2221 Char.IsLetter (c) &&
2223 AddLetterMap (c, 0x10, 0);
2224 fillIndex [0x10] += 3;
2228 for (int i = 0; i < cymap_src.Length; i++) {
2229 char c = cymap_src [i];
2230 fillIndex [0x10] = map [c].Level1;
2231 int c2 = 0x0490 + i * 2;
2232 AddLetterMapCore ((char) c2, 0x10, 0, diacritical [c2], false);
2236 fillIndex [0x11] = 0x3;
2237 fillIndex [0x1] = 0x98;
2238 for (int i = 0x0531; i < 0x0586; i++) {
2239 if (i == 0x0559 || i == 0x55A)
2240 AddCharMap ((char) i, 1, 1);
2241 if (Char.IsLetter ((char) i))
2242 AddLetterMap ((char) i, 0x11, 1);
2247 fillIndex [0x12] = 0x2;
2248 for (int i = 0x05D0; i < 0x05FF; i++)
2249 if (Char.IsLetter ((char) i))
2250 AddLetterMap ((char) i, 0x12, 1);
2252 fillIndex [0x1] = 0x3;
2253 for (int i = 0x0591; i <= 0x05C2; i++) {
2254 if (i == 0x05A3 || i == 0x05BB)
2257 AddCharMap ((char) i, 0x1, 1);
2261 fillIndex [0x1] = 0x8E;
2262 fillIndex [0x13] = 0x3;
2263 for (int i = 0x0621; i <= 0x064A; i++) {
2265 if (Char.GetUnicodeCategory ((char) i)
2266 != UnicodeCategory.OtherLetter) {
2267 // FIXME: arabic nonspacing marks are
2268 // in different order.
2269 AddCharMap ((char) i, 0x1, 1);
2272 // map [i] = new CharMapEntry (0x13,
2273 // (byte) arabicLetterPrimaryValues [i], 1);
2275 (byte) arabicLetterPrimaryValues [i];
2276 byte formDiacritical = 8; // default
2279 case 0x0622: formDiacritical = 9; break;
2280 case 0x0623: formDiacritical = 0xA; break;
2281 case 0x0624: formDiacritical = 5; break;
2282 case 0x0625: formDiacritical = 0xB; break;
2283 case 0x0626: formDiacritical = 7; break;
2284 case 0x0649: formDiacritical = 5; break;
2285 case 0x064A: formDiacritical = 7; break;
2287 AddLetterMapCore ((char) i, 0x13, 1, formDiacritical, false);
2289 for (int i = 0x0670; i < 0x0673; i++)
2290 map [i] = new CharMapEntry (0x13, 0xB, (byte) (0xC + i - 0x670));
2291 fillIndex [0x13] = 0x84;
2292 for (int i = 0x0674; i < 0x06D6; i++)
2293 if (Char.IsLetter ((char) i))
2294 AddLetterMapCore ((char) i, 0x13, 1, 0, false);
2298 // FIXME: this could be fixed in more decent way
2299 for (int i = 0x0958; i <= 0x095F; i++)
2300 diacritical [i] = 8;
2302 // FIXME: it does seem to be a straight codepoint mapping.
2303 fillIndex [0x14] = 04;
2304 for (int i = 0x0901; i < 0x0905; i++)
2305 if (!IsIgnorable (i))
2306 AddLetterMap ((char) i, 0x14, 2);
2307 fillIndex [0x14] = 0xB;
2308 for (int i = 0x0905; i < 0x093A; i++) {
2310 AddCharMap ('\u0929', 0x14, 0, 8);
2312 AddCharMap ('\u0931', 0x14, 0, 8);
2314 AddCharMap ('\u0934', 0x14, 0, 8);
2315 if (Char.IsLetter ((char) i))
2316 AddLetterMap ((char) i, 0x14, 4);
2318 AddCharMap ('\u0960', 0x14, 4);
2320 AddCharMap ('\u0961', 0x14, 4);
2322 fillIndex [0x14] = 0xDA;
2323 for (int i = 0x093E; i < 0x0945; i++)
2324 if (!IsIgnorable (i))
2325 AddLetterMap ((char) i, 0x14, 2);
2326 fillIndex [0x14] = 0xEC;
2327 for (int i = 0x0945; i < 0x094F; i++)
2328 if (!IsIgnorable (i))
2329 AddLetterMap ((char) i, 0x14, 2);
2333 fillIndex [0x15] = 02;
2334 for (int i = 0x0980; i < 0x9FF; i++) {
2335 if (IsIgnorable (i))
2338 fillIndex [0x15] = 0x3B;
2339 switch (Char.GetUnicodeCategory ((char) i)) {
2340 case UnicodeCategory.NonSpacingMark:
2341 case UnicodeCategory.DecimalDigitNumber:
2342 case UnicodeCategory.OtherNumber:
2345 AddLetterMap ((char) i, 0x15, 1);
2348 fillIndex [0x1] = 0x3;
2349 for (int i = 0x0981; i < 0x0A00; i++)
2350 if (Char.GetUnicodeCategory ((char) i) ==
2351 UnicodeCategory.NonSpacingMark)
2352 AddCharMap ((char) i, 0x1, 1);
2354 // Gurmukhi. orderedGurmukhi is from UCA
2355 // FIXME: it does not look equivalent to UCA.
2356 fillIndex [0x16] = 04;
2357 fillIndex [0x1] = 3;
2358 for (int i = 0; i < orderedGurmukhi.Length; i++) {
2359 char c = orderedGurmukhi [i];
2360 if (IsIgnorable ((int) c))
2362 if (IsIgnorableNonSpacing (c)) {
2363 AddLetterMap (c, 0x1, 1);
2366 if (c == '\u0A3C' || c == '\u0A4D' ||
2367 '\u0A66' <= c && c <= '\u0A71')
2372 case '\u0A33': case '\u0A36': case '\u0A16':
2373 case '\u0A17': case '\u0A5B': case '\u0A5E':
2377 if (c == '\u0A3E') // Skip
2378 fillIndex [0x16] = 0xC0;
2379 AddLetterMap (c, 0x16, shift);
2382 // Gujarati. orderedGujarati is from UCA
2383 fillIndex [0x17] = 0x4;
2385 map [0x0A4D] = new CharMapEntry (1, 0, 0x3);
2386 map [0x0ABD] = new CharMapEntry (1, 0, 0x3);
2387 map [0x0A3C] = new CharMapEntry (1, 0, 0x4);
2388 map [0x0A71] = new CharMapEntry (1, 0, 0x6);
2389 map [0x0ABC] = new CharMapEntry (1, 0, 0xB);
2390 map [0x0A70] = new CharMapEntry (1, 0, 0xE);
2391 // letters go first.
2392 for (int i = 0; i < orderedGujarati.Length; i++) {
2394 char c = orderedGujarati [i];
2395 if (Char.IsLetter (c)) {
2397 if (c == '\u0AB3' || c == '\u0A32')
2399 if (c == '\u0A33') {
2400 AddCharMap ('\u0A32', 0x17, 0);
2401 AddCharMap ('\u0A33', 0x17, 4, 4);
2405 AddCharMap ('\u0AE0', 0x17, 0, 5);
2406 AddCharMap (c, 0x17, 4);
2409 AddCharMap ('\u0AB3', 0x17, 6);
2413 byte gujaratiShift = 4;
2414 fillIndex [0x17] = 0xC0;
2415 for (int i = 0; i < orderedGujarati.Length; i++) {
2416 char c = orderedGujarati [i];
2417 if (fillIndex [0x17] == 0xCC)
2419 if (!Char.IsLetter (c)) {
2422 AddCharMap ('\u0A81', 0x17, 2);
2425 AddLetterMap (c, 0x17, gujaratiShift);
2430 fillIndex [0x1] = 03;
2431 fillIndex [0x18] = 02;
2432 for (int i = 0x0B00; i < 0x0B7F; i++) {
2433 switch (Char.GetUnicodeCategory ((char) i)) {
2434 case UnicodeCategory.NonSpacingMark:
2435 case UnicodeCategory.DecimalDigitNumber:
2436 AddLetterMap ((char) i, 0x1, 1);
2439 AddLetterMap ((char) i, 0x18, 1);
2443 fillIndex [0x19] = 2;
2444 AddCharMap ('\u0BD7', 0x19, 0);
2445 fillIndex [0x19] = 0xA;
2447 for (int i = 0x0B82; i <= 0x0B94; i++)
2448 if (!IsIgnorable ((char) i))
2449 AddCharMap ((char) i, 0x19, 2);
2451 fillIndex [0x19] = 0x28;
2452 // The array for Tamil consonants is a constant.
2453 // Windows have almost similar sequence to TAM from
2454 // tamilnet but a bit different in Grantha.
2455 for (int i = 0; i < orderedTamilConsonants.Length; i++)
2456 AddLetterMap (orderedTamilConsonants [i], 0x19, 4);
2458 fillIndex [0x19] = 0x82;
2459 for (int i = 0x0BBE; i < 0x0BCD; i++)
2460 if (Char.GetUnicodeCategory ((char) i) ==
2461 UnicodeCategory.SpacingCombiningMark
2463 AddLetterMap ((char) i, 0x19, 2);
2466 fillIndex [0x1A] = 0x4;
2467 for (int i = 0x0C00; i < 0x0C62; i++) {
2468 if (i == 0x0C55 || i == 0x0C56)
2470 AddCharMap ((char) i, 0x1A, 3);
2471 char supp = (i == 0x0C0B) ? '\u0C60':
2472 i == 0x0C0C ? '\u0C61' : char.MinValue;
2473 if (supp == char.MinValue)
2475 AddCharMap (supp, 0x1A, 3);
2479 fillIndex [0x1B] = 4;
2480 for (int i = 0x0C80; i < 0x0CE5; i++) {
2481 if (i == 0x0CD5 || i == 0x0CD6)
2483 if (i == 0x0CB1 || i == 0x0CB3 || i == 0x0CDE)
2484 continue; // shift after 0xCB9
2485 AddCharMap ((char) i, 0x1B, 3);
2487 // SPECIAL CASES: but why?
2488 AddCharMap ('\u0CB1', 0x1B, 3); // RRA
2489 AddCharMap ('\u0CB3', 0x1B, 3); // LLA
2490 AddCharMap ('\u0CDE', 0x1B, 3); // FA
2493 AddCharMap ('\u0CE1', 0x1B, 3); // vocalic LL
2497 fillIndex [0x1C] = 2;
2498 fillIndex [0x1] = 3;
2499 for (int i = 0x0D02; i < 0x0D61; i++) {
2500 // FIXME: I avoided MSCompatUnicodeTable usage
2501 // here (it results in recursion). So check if
2502 // using NonSpacingMark makes sense or not.
2503 if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark)
2504 // if (!MSCompatUnicodeTable.IsIgnorable ((char) i))
2505 AddCharMap ((char) i, 0x1C, 1);
2506 else if (!IsIgnorable ((char) i))
2507 AddCharMap ((char) i, 1, 1);
2510 // Thai ... note that it breaks 0x1E wall after E2B!
2511 // Also, all Thai characters have level 2 value 3.
2512 fillIndex [0x1E] = 2;
2513 fillIndex [0x1] = 3;
2514 for (int i = 0xE40; i <= 0xE44; i++)
2515 AddCharMap ((char) i, 0x1E, 1, 3);
2516 for (int i = 0xE01; i < 0xE2B; i++)
2517 AddCharMap ((char) i, 0x1E, 6, 3);
2518 fillIndex [0x1F] = 5;
2519 for (int i = 0xE2B; i < 0xE30; i++)
2520 AddCharMap ((char) i, 0x1F, 6, 3);
2521 fillIndex [0x1F] = 0x1E;
2522 for (int i = 0xE30; i < 0xE3B; i++)
2523 AddCharMap ((char) i, 0x1F, 1, 3);
2524 // some Thai characters remain.
2525 char [] specialThai = new char [] {'\u0E45', '\u0E46',
2526 '\u0E4E', '\u0E4F', '\u0E5A', '\u0E5B'};
2527 foreach (char c in specialThai)
2528 AddCharMap (c, 0x1F, 1, 3);
2530 for (int i = 0xE00; i < 0xE80; i++)
2531 if (Char.GetUnicodeCategory ((char) i) ==
2532 UnicodeCategory.NonSpacingMark)
2533 AddCharMap ((char) i, 1, 1);
2536 fillIndex [0x1F] = 2;
2537 fillIndex [0x1] = 3;
2538 for (int i = 0xE80; i < 0xEDF; i++) {
2539 if (IsIgnorable ((char) i))
2541 else if (Char.IsLetter ((char) i))
2542 AddCharMap ((char) i, 0x1F, 1);
2543 else if (Char.GetUnicodeCategory ((char) i) ==
2544 UnicodeCategory.NonSpacingMark)
2545 AddCharMap ((char) i, 1, 1);
2548 // Georgian. orderedGeorgian is from UCA DUCET.
2549 fillIndex [0x21] = 5;
2550 for (int i = 0; i < orderedGeorgian.Length; i++) {
2551 char c = orderedGeorgian [i];
2552 if (map [(int) c].Defined)
2554 AddCharMap (c, 0x21, 0);
2556 AddCharMap ((char) (c - 0x30), 0x21, 0);
2557 fillIndex [0x21] += 5;
2561 fillIndex [0x22] = 2;
2562 int kanaOffset = 0x3041;
2563 byte [] kanaLines = new byte [] {2, 2, 2, 2, 1, 3, 1, 2, 1};
2565 for (int gyo = 0; gyo < 9; gyo++) {
2566 for (int dan = 0; dan < 5; dan++) {
2567 if (gyo == 7 && dan % 2 == 1) {
2570 kanaOffset -= 2; // There is no space for yi and ye.
2573 int cp = kanaOffset + dan * kanaLines [gyo];
2574 // small lines (a-gyo, ya-gyo)
2575 if (gyo == 0 || gyo == 7) {
2576 AddKanaMap (cp, 1); // small
2577 AddKanaMap (cp + 1, 1);
2580 AddKanaMap (cp, kanaLines [gyo]);
2584 // add small 'ka' (before normal one)
2585 AddKanaMap (0x30F5, 1);
2589 // add small 'ke' (before normal one)
2590 AddKanaMap (0x30F6, 1);
2594 // add small 'Tsu' (before normal one)
2595 AddKanaMap (0x3063, 1);
2599 fillIndex [0x22] += 3;
2600 kanaOffset += 5 * kanaLines [gyo];
2603 // Wa-gyo is almost special, so I just manually add.
2604 AddLetterMap ((char) 0x308E, 0x22, 0);
2605 AddLetterMap ((char) (0x308E + 0x60), 0x22, 0);
2606 AddLetterMap ((char) 0x308F, 0x22, 0);
2607 AddLetterMap ((char) (0x308F + 0x60), 0x22, 0);
2609 AddLetterMap ((char) 0x3090, 0x22, 0);
2610 AddLetterMap ((char) (0x3090 + 0x60), 0x22, 0);
2611 fillIndex [0x22] += 2;
2612 // no "Wu" in Japanese.
2613 AddLetterMap ((char) 0x3091, 0x22, 0);
2614 AddLetterMap ((char) (0x3091 + 0x60), 0x22, 0);
2616 AddLetterMap ((char) 0x3092, 0x22, 0);
2617 AddLetterMap ((char) (0x3092 + 0x60), 0x22, 0);
2619 fillIndex [0x22] = 0x80;
2620 AddLetterMap ((char) 0x3093, 0x22, 0);
2621 AddLetterMap ((char) (0x3093 + 0x60), 0x22, 0);
2623 map [0x3094] = new CharMapEntry (map [0x30A6].Category,
2624 map [0x30A6].Level1, 3);// voiced hiragana U
2625 map [0x30F4] = new CharMapEntry (map [0x30A6].Category,
2626 map [0x30A6].Level1, 3);// voiced katakana U
2628 map [0x30F5] = new CharMapEntry (map [0x30AB].Category,
2629 map [0x30AB].Level1, 0);// small katakana Ka
2630 map [0x30F6] = new CharMapEntry (map [0x30B1].Category,
2631 map [0x30B1].Level1, 0);// small katakana Ke
2633 for (int i = 0x30F7; i < 0x30FB; i++)
2634 map [i] = new CharMapEntry (map [i - 8].Category,
2638 // JIS Japanese square chars.
2639 fillIndex [0x22] = 0x97;
2640 jisJapanese.Sort (JISComparer.Instance);
2641 foreach (JISCharacter j in jisJapanese)
2642 if (0x3300 <= j.CP && j.CP <= 0x3357)
2643 AddCharMap ((char) j.CP, 0x22, 1);
2644 // non-JIS Japanese square chars.
2645 nonJisJapanese.Sort (NonJISComparer.Instance);
2646 foreach (NonJISCharacter j in nonJisJapanese)
2647 AddCharMap ((char) j.CP, 0x22, 1);
2650 fillIndex [0x23] = 0x02;
2651 for (int i = 0x3105; i <= 0x312C; i++)
2652 AddCharMap ((char) i, 0x23, 1);
2654 // Estrangela: ancient Syriac
2655 fillIndex [0x24] = 0x0B;
2656 // FIXME: is 0x71E really alternative form?
2657 ArrayList syriacAlternatives = new ArrayList (
2658 new int [] {0x714, 0x716, 0x71C, 0x71E, 0x724, 0x727});
2659 for (int i = 0x0710; i <= 0x072C; i++) {
2660 if (i == 0x0711) // NonSpacingMark
2662 if (syriacAlternatives.Contains (i))
2664 AddCharMap ((char) i, 0x24, 4);
2669 foreach (int cp in syriacAlternatives)
2670 map [cp] = new CharMapEntry (0x24,
2671 (byte) (map [cp - 1].Level1 + 2),
2673 // FIXME: Syriac NonSpacingMark should go here.
2676 // FIXME: it turned out that it does not look like UCA
2677 fillIndex [0x24] = 0x6E;
2678 fillIndex [0x1] = 0xAC;
2679 for (int i = 0; i < orderedThaana.Length; i++) {
2680 char c = orderedThaana [i];
2681 if (IsIgnorableNonSpacing ((int) c))
2682 AddCharMap (c, 1, 1);
2683 AddCharMap (c, 0x24, 2);
2684 if (c == '\u0782') // SPECIAL CASE: why?
2685 fillIndex [0x24] += 2;
2689 // FIXME: Add more culture-specific letters (that are
2690 // not supported in Windows collation) here.
2692 // Surrogate ... they are computed.
2697 // Unlike UCA, the Windows Hangul sequence mixes Jongseong
2698 // with Choseong sequence as well as Jungseong,
2699 // adjusted to have the same primary weight for the
2700 // same base character. So it is impossible to compute
2703 // Here I introduce an ordered sequence of mixed
2704 // 'commands' and 'characters' that is similar to
2706 // - ',' increases primary weight.
2707 // - [A B] means a range, increasing index
2708 // - {A B} means a range, without increasing index
2709 // - '=' is no operation (it means the characters
2710 // of both sides have the same weight).
2711 // - '>' inserts a Hangul Syllable block that
2712 // contains 0x251 characters.
2713 // - '<' decreases the index
2714 // - '0'-'9' means skip count
2715 // - whitespaces are ignored
2718 string hangulSequence =
2719 + "\u1100=\u11A8 > \u1101=\u11A9 >"
2720 + "\u11C3, \u11AA, \u11C4, \u1102=\u11AB >"
2721 + "<{\u1113 \u1116}, \u3165,"
2722 + "\u11C5, \u11C6=\u3166,, \u11C7, \u11C8,"
2723 + "\u11AC, \u11C9, \u11AD, \u1103=\u11AE >"
2724 + "<\u1117, \u11CA, \u1104, \u11CB > \u1105=\u11AF >"
2725 + "<{\u1118 \u111B}, \u11B0, [\u11CC \u11D0], \u11B1,"
2726 + "[\u11D1 \u11D2], \u11B2,"
2727 + "[\u11D3 \u11D5], \u11B3,"
2728 + "[\u11D6 \u11D7], \u11B4, \u11B5,"
2729 + "\u11B6=\u11D8, \u3140,, \u11D9, \u1106=\u11B7 >"
2730 + "<{\u111C \u111D}, [\u11DA \u11E2], \u1107=\u11B8 >"
2731 + "<{\u111E \u1120}, \u3172,, \u3173, \u11E3, \u1108 >"
2732 + "<{\u1121 \u112C}, \u3144 \u11B9, \u3174, \u3175,,,, "
2733 + "\u3176,, \u3177, [\u11E4 \u11E6] \u3178,"
2734 + "\u3179, \u1109=\u11BA,,, \u3214=\u3274 <>"
2735 + "<{\u112D \u1133}, \u11E7 \u317A, \u317B, \u317C "
2736 + "[\u11E8 \u11E9],, \u11EA \u317D,, \u110A=\u11BB,,, >"
2737 + "<{\u1134 \u1140}, \u317E,,,,,,, \u11EB,"
2738 + "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >"
2739 + "<{\u1141 \u114C}, \u3180=\u11EE, \u11EC, \u11ED,,,,, "
2740 + "\u11F1,, \u11F2,,,"
2741 + "\u11EF,,, \u3181=\u11F0, \u110C=\u11BD,, >"
2742 + "<\u114D, \u110D,, >"
2743 + "<{\u114E \u1151},, \u110E=\u11BE,, >"
2744 + "<{\u1152 \u1155},,, \u110F=\u11BF >"
2745 + "\u1110=\u11C0 > \u1111=\u11C1 >"
2746 + "<\u1156=\u1157, \u11F3, \u11F4, \u1112=\u11C2 >"
2747 + "<\u1158=\u1159=\u115F, \u3185, \u11F9,"
2751 byte hangulCat = 0x52;
2752 fillIndex [hangulCat] = 0x2;
2754 int syllableBlock = 0;
2755 for (int n = 0; n < hangulSequence.Length; n++) {
2756 char c = hangulSequence [n];
2758 if (Char.IsWhiteSpace (c))
2764 IncrementSequentialIndex (ref hangulCat);
2767 if (fillIndex [hangulCat] == 2)
2768 throw new Exception ("FIXME: handle it correctly (yes it is hacky, it is really unfortunate).");
2769 fillIndex [hangulCat]--;
2772 IncrementSequentialIndex (ref hangulCat);
2773 for (int l = 0; l < 0x15; l++)
2774 for (int v = 0; v < 0x1C; v++) {
2776 (char) (0xAC00 + syllableBlock * 0x1C * 0x15 + l * 0x1C + v), hangulCat, 0);
2777 IncrementSequentialIndex (ref hangulCat);
2782 start = hangulSequence [n + 1];
2783 end = hangulSequence [n + 3];
2784 for (int i = start; i <= end; i++) {
2785 AddCharMap ((char) i, hangulCat, 0);
2787 IncrementSequentialIndex (ref hangulCat);
2789 n += 4; // consumes 5 characters for this operation
2792 start = hangulSequence [n + 1];
2793 end = hangulSequence [n + 3];
2794 for (int i = start; i <= end; i++)
2795 AddCharMap ((char) i, hangulCat, 0);
2796 n += 4; // consumes 5 characters for this operation
2799 AddCharMap (c, hangulCat, 0);
2805 for (int i = 0x3200; i < 0x3300; i++) {
2806 if (IsIgnorable (i) || map [i].Defined)
2810 if (decompLength [i] == 4 &&
2811 decompValues [decompIndex [i]] == '(')
2812 ch = decompIndex [i] + 1;
2814 else if (decompLength [i] == 2 &&
2815 decompValues [decompIndex [i] + 1] == '\u1161')
2816 ch = decompIndex [i];
2817 else if (decompLength [i] == 1)
2818 ch = decompIndex [i];
2821 ch = decompValues [ch];
2822 if (ch < 0x1100 || 0x1200 < ch &&
2823 ch < 0xAC00 || 0xD800 < ch)
2827 int offset = i < 0x3260 ? 1 : 0;
2828 if (0x326E <= i && i <= 0x3273)
2831 map [i] = new CharMapEntry (map [ch].Category,
2832 (byte) (map [ch].Level1 + offset),
2834 // Console.Error.WriteLine ("Jamo {0:X04} -> {1:X04}", i, decompValues [decompIndex [i] + 1]);
2840 // Letterlike characters and CJK compatibility square
2841 sortableCharNames.Sort (StringDictionaryValueComparer.Instance);
2842 int [] counts = new int ['Z' - 'A' + 1];
2843 char [] namedChars = new char [sortableCharNames.Count];
2845 foreach (DictionaryEntry de in sortableCharNames) {
2846 counts [((string) de.Value) [0] - 'A']++;
2847 namedChars [nCharNames++] = (char) ((int) de.Key);
2849 nCharNames = 0; // reset
2850 for (int a = 0; a < counts.Length; a++) {
2851 fillIndex [0xE] = (byte) (alphaWeights [a + 1] - counts [a]);
2852 for (int i = 0; i < counts [a]; i++)
2853 //Console.Error.WriteLine ("---- {0:X04} : {1:x02} / {2} {3}", (int) namedChars [nCharNames], fillIndex [0xE], ((DictionaryEntry) sortableCharNames [nCharNames]).Value, Char.GetUnicodeCategory (namedChars [nCharNames]));
2854 AddCharMap (namedChars [nCharNames++], 0xE, 1);
2857 // CJK unified ideograph.
2859 fillIndex [cjkCat] = 0x2;
2860 for (int cp = 0x4E00; cp <= 0x9FBB; cp++)
2861 if (!IsIgnorable (cp))
2862 AddCharMapGroupCJK ((char) cp, ref cjkCat);
2863 // CJK Extensions go here.
2864 // LAMESPEC: With this Windows style CJK layout, it is
2865 // impossible to add more CJK ideograph i.e. 0x9FA6-
2866 // 0x9FBB can never be added w/o breaking compat.
2867 for (int cp = 0xF900; cp <= 0xFA2D; cp++)
2868 if (!IsIgnorable (cp))
2869 AddCharMapGroupCJK ((char) cp, ref cjkCat);
2871 // PrivateUse ... computed.
2872 // remaining Surrogate ... computed.
2874 #region 07 - ASCII non-alphanumeric + 3001, 3002 // 07
2875 // non-alphanumeric ASCII except for: + - < = > '
2876 for (int i = 0x21; i < 0x7F; i++) {
2877 if (Char.IsLetterOrDigit ((char) i)
2878 || "+-<=>'".IndexOf ((char) i) >= 0)
2879 continue; // they are not added here.
2880 AddCharMapGroup2 ((char) i, 0x7, 1, 0);
2881 // Insert 3001 after ',' and 3002 after '.'
2883 AddCharMapGroup2 ('\u3001', 0x7, 1, 0);
2885 AddCharMapGroup2 ('\u3002', 0x7, 1, 0);
2887 AddCharMap ('\uFE30', 0x7, 1, 0);
2891 #region 07 - Punctuations and something else
2892 for (int i = 0xA0; i < char.MaxValue; i++) {
2893 if (IsIgnorable (i))
2896 // FIXME: actually those reset should not be
2897 // done but here I put for easy goal.
2899 fillIndex [0x7] = 0xE2;
2901 fillIndex [0x7] = 0x77;
2915 switch (Char.GetUnicodeCategory ((char) i)) {
2916 case UnicodeCategory.OtherPunctuation:
2917 case UnicodeCategory.ClosePunctuation:
2918 case UnicodeCategory.OpenPunctuation:
2919 case UnicodeCategory.ConnectorPunctuation:
2920 case UnicodeCategory.InitialQuotePunctuation:
2921 case UnicodeCategory.FinalQuotePunctuation:
2922 case UnicodeCategory.ModifierSymbol:
2923 // SPECIAL CASES: // 0xA
2924 if (0x2020 <= i && i <= 0x2031)
2926 AddCharMapGroup ((char) i, 0x7, 1, 0);
2929 if (i == 0xA6 || i == 0x1C3) // SPECIAL CASE. FIXME: why?
2930 goto case UnicodeCategory.OtherPunctuation;
2935 // FIXME: it should not need to reset level 1, but
2936 // it's for easy goal.
2937 fillIndex [0x7] = 0xB6;
2938 for (int i = 0x2400; i <= 0x2421; i++)
2939 AddCharMap ((char) i, 0x7, 1, 0);
2941 // Actually 3008-301F and FE33-FE5D are mixed, so
2942 // it's somewhat countable, but not as a whole. Thus
2943 // manual remapping is quicker.
2944 fillIndex [0x7] = 0x8D;
2945 int [] cjkCompatMarks1 = new int [] {
2946 0xFE33, 0xFE49, 0xFE4A, 0xFE4B, 0xFE4C};
2947 int [] cjkCompatMarks2 = new int [] {
2948 0xFE34, 0xFE3F, 0xFE40, 0xFE3D, 0xFE3E, 0xFE41,
2949 0xFE42, 0xFE43, 0xFE44, 0xFE3B, 0xFE3C/*FE5D*/,
2950 0xFE39/*FE5E*/, 0xFE3A};
2951 for (int i = 0; i < cjkCompatMarks1.Length; i++)
2952 map [cjkCompatMarks1 [i]] = new CharMapEntry (
2953 0x7, fillIndex [0x7]++, 0);
2954 for (int i = 0; i < cjkCompatMarks2.Length; i++) {
2955 map [cjkCompatMarks2 [i]] = new CharMapEntry (
2956 0x7, fillIndex [0x7], 0);
2957 fillIndex [0x7] += 2;
2958 switch (cjkCompatMarks2 [i]) {
2960 map [0xFE5D] = new CharMapEntry (
2961 0x7, fillIndex [0x7]++, 0);
2964 map [0xFE5D] = new CharMapEntry (
2965 0x7, fillIndex [0x7]++, 0);
2970 fillIndex [0x7] = 0x93;
2971 for (int i = 0x3008; i <= 0x3011; i++) {
2972 map [i] = new CharMapEntry (0x7,
2973 fillIndex [0x7], 0);
2974 fillIndex [0x7] += 2;
2976 fillIndex [0x7] += 3;
2977 map [0x3014] = new CharMapEntry (0x7, fillIndex [0x7], 0);
2978 fillIndex [0x7] += 3;
2979 map [0x3015] = new CharMapEntry (0x7, fillIndex [0x7], 0);
2980 fillIndex [0x7] += 2;
2981 for (int i = 0x3016; i < 0x301F; i++)
2982 map [i] = new CharMapEntry (0x7,
2983 fillIndex [0x7]++, 0);
2987 // FIXME: for 07 xx we need more love.
2989 // Characters w/ diacritical marks (NFKD)
2990 for (int i = 0; i <= char.MaxValue; i++) {
2991 if (map [i].Defined || IsIgnorable (i))
2993 if (decompIndex [i] == 0)
2996 int start = decompIndex [i];
2997 int primaryChar = decompValues [start];
2998 int secondary = diacritical [i];
3000 int length = decompLength [i];
3001 // special processing for parenthesized ones.
3003 decompValues [start] == '(' &&
3004 decompValues [start + 2] == ')') {
3005 primaryChar = decompValues [start + 1];
3009 if (map [primaryChar].Level1 == 0)
3012 for (int l = 1; l < length; l++) {
3013 int c = decompValues [start + l];
3014 if (map [c].Level1 != 0)
3016 secondary += diacritical [c];
3020 map [i] = new CharMapEntry (
3021 map [primaryChar].Category,
3022 map [primaryChar].Level1,
3027 // category 08 - symbols
3028 fillIndex [0x8] = 2;
3029 // Here Windows mapping is not straightforward. It is
3030 // not based on computation but seems manual sorting.
3031 AddCharMapGroup ('+', 0x8, 1, 0); // plus
3032 AddCharMapGroup ('\u2212', 0x8, 1, 0); // minus
3033 AddCharMapGroup ('\u229D', 0x8, 1, 0); // minus
3034 AddCharMapGroup ('\u2297', 0x8, 1, 0); // mul
3035 AddCharMapGroup ('\u2044', 0x8, 1, 0); // div
3036 AddCharMapGroup ('\u2215', 0x8, 1, 0); // div
3037 AddCharMapGroup ('\u2217', 0x8, 1, 0); // mul
3038 AddCharMapGroup ('\u2218', 0x8, 1, 0); // ring
3039 AddCharMapGroup ('\u2219', 0x8, 1, 0); // bullet
3040 AddCharMapGroup ('\u2213', 0x8, 1, 0); // minus-or-plus
3041 AddCharMapGroup ('\u003C', 0x8, 1, 0); // <
3042 AddCharMapGroup ('\u227A', 0x8, 1, 0); // precedes relation
3043 AddCharMapGroup ('\u22B0', 0x8, 1, 0); // precedes under relation
3045 for (int cp = 0; cp < 0x2300; cp++) {
3046 if (cp == 0xAC) // SPECIAL CASE: skip
3049 cp = 0x2200; // skip to 2200
3050 fillIndex [0x8] = 0x21;
3053 fillIndex [0x8] = 0x3;
3055 fillIndex [0x8] = 0xB9;
3056 if (!map [cp].Defined &&
3057 // Char.GetUnicodeCategory ((char) cp) ==
3058 // UnicodeCategory.MathSymbol)
3059 Char.IsSymbol ((char) cp))
3060 AddCharMapGroup ((char) cp, 0x8, 1, diacritical [cp]);
3061 // SPECIAL CASES: no idea why Windows sorts as such
3064 AddCharMap ('\u227B', 0x8, 1, 0);
3065 AddCharMap ('\u22B1', 0x8, 1, 0);
3068 AddCharMapGroup ('\u00AB', 0x8, 1, 0);
3069 AddCharMapGroup ('\u226A', 0x8, 1, 0);
3070 AddCharMapGroup ('\u00BB', 0x8, 1, 0);
3071 AddCharMapGroup ('\u226B', 0x8, 1, 0);
3074 AddCharMap ('\u01C0', 0x8, 1, 0);
3075 AddCharMap ('\u01C1', 0x8, 1, 0);
3076 AddCharMap ('\u01C2', 0x8, 1, 0);
3081 #region Level2 adjustment
3083 diacritical [0x624] = 0x5;
3084 diacritical [0x626] = 0x7;
3085 diacritical [0x622] = 0x9;
3086 diacritical [0x623] = 0xA;
3087 diacritical [0x625] = 0xB;
3088 diacritical [0x649] = 0x5; // 'alif maqs.uurah
3089 diacritical [0x64A] = 0x7; // Yaa'
3091 for (int i = 0; i < char.MaxValue; i++) {
3093 byte cat = map [i].Category;
3095 case 0xE: // Latin diacritics
3096 case 0x22: // Japanese: circled characters
3097 mod = diacritical [i];
3099 case 0x13: // Arabic
3100 if (diacritical [i] == 0 && i >= 0xFE8D)
3101 mod = 0x8; // default for arabic
3104 if (0x52 <= cat && cat <= 0x7F) // Hangul
3105 mod = diacritical [i];
3107 map [i] = new CharMapEntry (
3108 cat, map [i].Level1, mod);
3112 // FIXME: this is half a hack, but those NonSpacingMark
3113 // characters that are still undefined are likely to
3115 for (int i = 0; i < char.MaxValue; i++) {
3116 if (map [i].Defined ||
3125 if (Char.GetUnicodeCategory ((char) i) !=
3126 UnicodeCategory.NonSpacingMark)
3130 if (diacritical [i] != 0)
3131 map [i] = new CharMapEntry (1, 1, diacritical [i]);
3133 AddCharMap ((char) i, 1, 1);
3136 // FIXME: this is a hack, but those Symbol characters
3137 // are likely to fall into 0xA category.
3138 for (int i = 0; i < char.MaxValue; i++)
3139 if (!map [i].Defined &&
3141 Char.IsSymbol ((char) i))
3142 AddCharMap ((char) i, 0xA, 1);
// Advances the fill index of the category passed by reference.
// fillIndex holds bytes, so the increment can wrap around to 0; in that
// case the index is restarted at 0x2.
// NOTE(review): hangulCat is a ref parameter, so the overflow branch
// presumably also moves on to the next category -- confirm.
3145 private void IncrementSequentialIndex (ref byte hangulCat)
3147 fillIndex [hangulCat]++;
3148 if (fillIndex [hangulCat] == 0) { // overflown
3150 fillIndex [hangulCat] = 0x2;
3154 // Resets fillIndex [category] to the fixed value alphaWeight and calls AddLetterMap().
// c           - the letter to register.
// category    - category whose fill index is overwritten.
// alphaWeight - value stored into fillIndex [category] before c is added.
3155 private void AddAlphaMap (char c, byte category, byte alphaWeight)
3157 fillIndex [category] = alphaWeight;
3158 AddLetterMap (c, category, 0);
// Code points listed for c in latinMap are registered the same way,
// with an update count of 0.
// NOTE(review): "as" yields null when latinMap has no ArrayList for c;
// presumably a null check precedes the loop -- confirm.
3160 ArrayList al = latinMap [c] as ArrayList;
3164 foreach (int cp in al)
3165 AddLetterMap ((char) cp, category, 0);
// Registers "voices" consecutive code points starting at i in category
// 0x22, each with an update count of 0. The first one gets level 2
// value 0, each following one gets (offset + 2).
// For every code point, the one 0x60 above it is registered with the
// same arguments.
// NOTE(review): presumably the +0x60 code point is the katakana
// counterpart of a hiragana letter -- confirm against the callers.
3168 private void AddKanaMap (int i, byte voices)
3170 for (byte b = 0; b < voices; b++) {
3171 char c = (char) (i + b);
3172 byte arg = (byte) (b > 0 ? b + 2 : 0);
3174 AddLetterMapCore (c, 0x22, 0, arg, false);
3176 AddLetterMapCore ((char) (c + 0x60), 0x22, 0, arg, false);
// Convenience overload: forwards to AddLetterMapCore() with level2 = 0
// and deferLevel2 = true.
3180 private void AddLetterMap (char c, byte category, byte updateCount)
3182 AddLetterMapCore (c, category, updateCount, 0, true);
// Registers letter c in the given category, together with its related
// forms, and finally advances fillIndex [category] by updateCount.
// The value from ToSmallForm (c) is passed to AddCharMapGroup() with
// updateCount. The invariant-culture lowercase form of c, when it
// differs from c and is not yet defined, is registered recursively with
// an update count of 0. c itself is passed to AddCharMapGroup() with an
// update count of 0.
// NOTE(review): the conditions guarding the ToSmallForm() registration
// and the use of doUpdate are not visible here -- confirm.
3185 private void AddLetterMapCore (char c, byte category, byte updateCount, byte level2, bool deferLevel2)
3188 // <small> updates index
3189 c2 = ToSmallForm (c);
3191 AddCharMapGroup (c2, category, updateCount, level2, deferLevel2);
3192 c2 = Char.ToLower (c, CultureInfo.InvariantCulture);
3193 if (c2 != c && !map [(int) c2].Defined)
3194 AddLetterMapCore (c2, category, 0, level2, deferLevel2);
3195 bool doUpdate = true;
3196 if (IsIgnorable ((int) c) || map [(int) c].Defined)
3199 AddCharMapGroup (c, category, 0, level2, deferLevel2);
3201 fillIndex [category] += updateCount;
// Convenience overload: forwards to the four-argument AddCharMap() with
// alt = 0 and returns its result.
3204 private bool AddCharMap (char c, byte category, byte increment)
3206 return AddCharMap (c, category, increment, 0);
// Stores a map entry for c and advances fillIndex [category] by
// "increment". Returns false without touching anything when c is
// ignorable or already has a defined entry.
// For category 1 the two weights are swapped: alt becomes the second
// constructor argument and the current fill index the third. For every
// other category the fill index comes second and alt third.
3209 private bool AddCharMap (char c, byte category, byte increment, byte alt)
3211 if (IsIgnorable ((int) c) || map [(int) c].Defined)
3212 return false; // do nothing
3213 map [(int) c] = new CharMapEntry (category,
3214 category == 1 ? alt : fillIndex [category],
3215 category == 1 ? fillIndex [category] : alt);
3216 fillIndex [category] += increment;
3221 // Adds characters to table in the order below
3222 // (+ increases weight):
3226 // <full> | <super> | <sub>
3227 // <circle> | <wide> (| <narrow>)
3231 // level2 is fixed (does not increase).
// Decomposition type constants that AddCharMapGroup() iterates over
// after adding the character itself.
3232 int [] sameWeightItems = new int [] {
3233 DecompositionFraction,
3237 DecompositionCircle,
3239 DecompositionNarrow,
// Convenience overload: forwards to the five-argument AddCharMapGroup()
// with deferLevel2 = false.
3241 private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2)
3243 AddCharMapGroup (c, category, updateCount, level2, false);
// Registers c together with the related forms found in nfkdMap [c].
// Order of registration: the <small> form, c itself, the forms for each
// decomposition type in sameWeightItems, and finally the <vertical>
// form. The <small> and <vertical> calls pass updateCount to
// AddCharMap(). c and the sameWeightItems forms pass 0, and
// fillIndex [category] is advanced by updateCount once after them.
// When deferLevel2 is set and level2 is still 0, the diacritical []
// value of the small / vertical form is used as level2.
// NOTE(review): the early exit for an already defined c, the condition
// on the first diacritical lookup and the null checks on the nfkd
// lookups are not visible here -- confirm.
3246 private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2, bool deferLevel2)
3248 if (map [(int) c].Defined)
3252 level2 = diacritical [(int) c];
3254 char small = char.MinValue;
3255 char vertical = char.MinValue;
3256 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
3258 object smv = nfkd [(byte) DecompositionSmall];
3260 small = (char) ((int) smv);
3261 object vv = nfkd [(byte) DecompositionVertical];
3263 vertical = (char) ((int) vv);
3266 // <small> updates index
3267 if (small != char.MinValue) {
3268 if (level2 == 0 && deferLevel2)
3269 level2 = diacritical [small];
3270 AddCharMap (small, category, updateCount, level2);
3274 AddCharMap (c, category, 0, level2);
3277 foreach (int weight in sameWeightItems) {
3278 object wv = nfkd [(byte) weight];
3281 level2 = diacritical [(int) wv];
3282 AddCharMap ((char) ((int) wv), category, 0, level2);
3287 // update index here.
3288 fillIndex [category] += updateCount;
3290 if (vertical != char.MinValue) {
3291 if (level2 == 0 && deferLevel2)
3292 level2 = diacritical [vertical];
3293 AddCharMap (vertical, category, updateCount, level2);
// Registers c at the current position of the (ref) category with an
// increment of 0 and alt of 0, then advances the position through
// IncrementSequentialIndex().
3297 private void AddCharMapCJK (char c, ref byte category)
3299 AddCharMap (c, category, 0, 0);
3300 IncrementSequentialIndex (ref category);
3302 // Special. I wonder why but Windows skips 9E F9.
// The position (category 0x9E, index 0xF9) is stepped over by advancing
// once more.
3303 if (category == 0x9E && fillIndex [category] == 0xF9)
3304 IncrementSequentialIndex (ref category);
// Registers CJK character c via AddCharMapCJK(), then related characters:
// a few hard-coded code points in the U+3238-U+32AB range that follow
// specific ideographs, and the characters found in nfkdMap [c] for the
// decomposition types 0 through 0x12.
// NOTE(review): several conditions and the declaration of "w" are not
// visible here; presumably the hard-coded code points in the switch are
// skipped in the loop because they were already added above -- confirm.
3307 private void AddCharMapGroupCJK (char c, ref byte category)
3309 AddCharMapCJK (c, ref category);
3311 // LAMESPEC: see below.
3312 if (c == '\u5B78') {
3313 AddCharMapCJK ('\u32AB', ref category);
3314 AddCharMapCJK ('\u323B', ref category);
3316 if (c == '\u52DE') {
3317 AddCharMapCJK ('\u3298', ref category);
3318 AddCharMapCJK ('\u3238', ref category);
3321 AddCharMapCJK ('\u32A2', ref category);
3323 // Especially this mapping order totally does
3324 // not make sense to me.
3325 AddCharMapCJK ('\u32A9', ref category);
3327 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
3330 for (byte weight = 0; weight <= 0x12; weight++) {
3331 object wv = nfkd [weight];
3336 // Special: they are ignored in this area.
3337 // FIXME: check if it is sane
3338 if (0xF900 <= w && w <= 0xFAD9)
3340 // LAMESPEC: on Windows some of CJK characters
3341 // in 3200-32B0 are incorrectly mapped. They
3342 // mix Chinese and Japanese Kanji when
3343 // ordering those characters.
3345 case 0x32A2: case 0x3298: case 0x3238:
3346 case 0x32A9: case 0x323B: case 0x32AB:
3350 AddCharMapCJK ((char) w, ref category);
3354 // For now it is only for 0x7 category.
// Variant of AddCharMapGroup() in which every AddCharMap() call passes
// updateCount. Order of registration: the <small> form (except U+2024),
// c itself, every code point whose single-character decomposition equals
// c and whose decomposition type matches the switch below, and finally
// the <vertical> form (except U+FE33 and U+FE34).
// NOTE(review): the null checks on the nfkd lookups and any further
// switch cases are not visible here -- confirm.
3355 private void AddCharMapGroup2 (char c, byte category, byte updateCount, byte level2)
3357 char small = char.MinValue;
3358 char vertical = char.MinValue;
3359 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
3361 object smv = nfkd [(byte) DecompositionSmall];
3363 small = (char) ((int) smv);
3364 object vv = nfkd [(byte) DecompositionVertical];
3366 vertical = (char) ((int) vv);
3369 // <small> updates index
3370 if (small != char.MinValue)
3371 // SPECIAL CASE excluded (FIXME: why?)
3372 if (small != '\u2024')
3373 AddCharMap (small, category, updateCount);
3376 AddCharMap (c, category, updateCount, level2);
3378 // Since nfkdMap has trouble when two or more characters
3379 // have an NFKD mapping to the same character, iterate over all here.
// The loop bound excludes char.MaxValue (U+FFFF) itself.
3380 for (int c2 = 0; c2 < char.MaxValue; c2++) {
3381 if (decompLength [c2] == 1 &&
3382 (int) (decompValues [decompIndex [c2]]) == (int) c) {
3383 switch (decompType [c2]) {
3384 case DecompositionCompat:
3385 AddCharMap ((char) c2, category, updateCount, level2);
3391 if (vertical != char.MinValue)
3392 // SPECIAL CASE excluded (FIXME: why?)
3393 if (vertical != '\uFE33' && vertical != '\uFE34')
3394 AddCharMap (vertical, category, updateCount, level2);
// Registers Arabic character c, then every code point whose
// decomposition ends with c, and finally advances fillIndex [category]
// by updateCount (1).
// NOTE(review): the declarations of "category" and "level2" and the
// trailing arguments of the inner AddCharMap() call are not visible
// here -- confirm their values.
3397 private void AddArabicCharMap (char c)
3400 byte updateCount = 1;
3404 AddCharMap (c, category, 0, level2);
3406 // Since nfkdMap has trouble when two or more characters
3407 // have an NFKD mapping to the same character, iterate over all here.
// The loop bound excludes char.MaxValue (U+FFFF) itself.
3408 for (int c2 = 0; c2 < char.MaxValue; c2++) {
3409 if (decompLength [c2] == 0)
// idx addresses the last value of c2's decomposition.
3411 int idx = decompIndex [c2] + decompLength [c2] - 1;
3412 if ((int) (decompValues [idx]) == (int) c)
3413 AddCharMap ((char) c2, category,
3416 fillIndex [category] += updateCount;
// Shorthand for ToDecomposed (c, DecompositionSmall, false).
3419 char ToSmallForm (char c)
3421 return ToDecomposed (c, DecompositionSmall, false);
// Returns a character taken from the decomposition of c.
// idx starts at the first decomposition value and can be moved to the
// last one.
// NOTE(review): presumably c is returned unchanged when its
// decomposition type is not d, and the move to the last value happens
// only when "tail" is set -- the corresponding lines are not visible.
3424 char ToDecomposed (char c, byte d, bool tail)
3426 if (decompType [(int) c] != d)
3428 int idx = decompIndex [(int) c];
3430 idx += decompLength [(int) c] - 1;
3431 return (char) decompValues [idx];
// Iterates over the jisJapanese list.
// NOTE(review): presumably returns true when an entry's code point
// equals cp -- the loop body is not visible here.
3434 bool ExistsJIS (int cp)
3436 foreach (JISCharacter j in jisJapanese)
3444 #region Level 3 properties (Case/Width)
// Returns the level 3 weight for c: the raw weight from
// ComputeLevel3WeightRaw() plus 2 when it is non-zero, or 0 unchanged.
3446 private byte ComputeLevel3Weight (char c)
3448 byte b = ComputeLevel3WeightRaw (c);
3449 return b > 0 ? (byte) (b + 2) : b;
// Computes the raw level 3 weight of c; ComputeLevel3Weight() adds 2 to
// non-zero results. The checks visible here are code point ranges,
// decomposition types, and the isSmallCapital / isUppercase tables.
// NOTE(review): the value returned by each check is not visible here.
3452 private byte ComputeLevel3WeightRaw (char c) // add 2 for sortkey value
3455 if ('\u3192' <= c && c <= '\u319F')
3458 // They have <narrow> NFKD mapping, and on Windows
3459 // those narrow characters are regarded as "normal",
3460 // thus those characters themselves are regarded as
3461 // "wide". grep "<narrow>" and you can pick them up
3462 // (ignoring Kana, Hangul etc.)
3479 if ('\u11A8' <= c && c <= '\u11F9')
3481 if ('\uFFA0' <= c && c <= '\uFFDC')
3483 if ('\u3130' <= c && c <= '\u3164')
3485 if ('\u3165' <= c && c <= '\u318E')
3487 // Georgian Capital letters
3488 if ('\u10A0' <= c && c <= '\u10C5')
3491 if ('\u2776' <= c && c <= '\u277F')
3493 if ('\u2780' <= c && c <= '\u2789')
// NOTE(review): this range overlaps the two checks above
// (U+2776-U+277F and U+2780-U+2789). If those return early, only
// U+278A-U+2793 can reach this line -- confirm the lower bound.
3495 if ('\u2776' <= c && c <= '\u2793')
3497 if ('\u2160' <= c && c <= '\u216F')
3499 if ('\u2181' <= c && c <= '\u2182')
3502 if ('\u2135' <= c && c <= '\u2138')
3504 if ('\uFE80' <= c && c < '\uFF00') {
3505 // 2(Isolated)/8(Final)/0x18(Medial)
3506 switch (decompType [(int) c]) {
3507 case DecompositionIsolated:
3509 case DecompositionFinal:
3511 case DecompositionMedial:
3516 // actually I dunno the reason why they have weights.
3546 switch (decompType [(int) c]) {
3547 case DecompositionWide: // <wide>
3548 case DecompositionSub: // <sub>
3549 case DecompositionSuper: // <super>
// For these three types the decomposition type value itself is OR-ed
// into the result.
3550 ret |= decompType [(int) c];
3553 if (isSmallCapital [(int) c]) // grep "SMALL CAPITAL"
3555 if (isUppercase [(int) c]) // DerivedCoreProperties
// IsIgnorable variant that checks unicodeAge [i] against 3.1 and the
// Unicode category of i (OtherNotAssigned / Format).
// NOTE(review): another IsIgnorable (int) with the same signature
// appears later in this file; presumably only one of them is compiled
// (conditional compilation or commented out) -- confirm.
3565 static bool IsIgnorable (int i)
3567 if (unicodeAge [i] >= 3.1)
3569 switch (char.GetUnicodeCategory ((char) i)) {
3570 case UnicodeCategory.OtherNotAssigned:
3571 case UnicodeCategory.Format:
3578 // FIXME: In the future use DerivedAge.txt to examine character
3579 // versions and set those ones that have higher version than
3580 // 1.0 as ignorable.
// Classifies code point i using hard-coded code point lists, a long
// chain of code point ranges, and finally the Unicode category of i.
// NOTE(review): the result returned for each group of cases is not
// visible here -- confirm which groups yield true and which false.
3581 static bool IsIgnorable (int i)
3585 // I guess, those characters are added between
3586 // Unicode 1.0 (LCMapString) and Unicode 3.1
3587 // (UnicodeCategory), so they used to be
3588 // something like OtherNotAssigned as of Unicode 1.1.
3589 case 0x2df: case 0x387:
3590 case 0x3d7: case 0x3d8: case 0x3d9:
3591 case 0x3f3: case 0x3f4: case 0x3f5: case 0x3f6:
3592 case 0x400: case 0x40d: case 0x450: case 0x45d:
3593 case 0x587: case 0x58a: case 0x5c4: case 0x640:
3594 case 0x653: case 0x654: case 0x655: case 0x66d:
3596 case 0x1e9b: case 0x202f: case 0x20ad:
3597 case 0x20ae: case 0x20af:
3598 case 0x20e2: case 0x20e3:
3599 case 0x2139: case 0x213a: case 0x2183:
3600 case 0x2425: case 0x2426: case 0x2619:
3601 case 0x2670: case 0x2671: case 0x3007:
3602 case 0x3190: case 0x3191:
3603 case 0xfffc: case 0xfffd:
3605 // exceptional characters filtered by the
3606 // following conditions. Originally those exceptional
3607 // ranges are incorrect (they should not be ignored)
3608 // and most of those characters are unfortunately in
3610 case 0x4d8: case 0x4d9:
3611 case 0x4e8: case 0x4e9:
3613 case 0x3036: case 0x303f:
3614 case 0x337b: case 0xfb1e:
3619 // The whole Sinhala characters.
3620 0x0D82 <= i && i <= 0x0DF4
3621 // The whole Tibetan characters.
3622 || 0x0F00 <= i && i <= 0x0FD1
3623 // The whole Myanmar characters.
3624 || 0x1000 <= i && i <= 0x1059
3625 // The whole Ethiopic, Cherokee,
3626 // Canadian Syllabic, Ogham, Runic,
3627 // Tagalog, Hanunoo, Philippine,
3628 // Buhid, Tagbanwa, Khmer and Mongolian
3630 || 0x1200 <= i && i <= 0x1DFF
3631 // Greek extension characters.
3632 || 0x1F00 <= i && i <= 0x1FFF
3633 // The whole Braille characters.
3634 || 0x2800 <= i && i <= 0x28FF
3635 // CJK radical characters.
3636 || 0x2E80 <= i && i <= 0x2EF3
3637 // Kangxi radical characters.
3638 || 0x2F00 <= i && i <= 0x2FD5
3639 // Ideographic description characters.
3640 || 0x2FF0 <= i && i <= 0x2FFB
3641 // Bopomofo letter and final
3642 || 0x31A0 <= i && i <= 0x31B7
3643 // White square with quadrant characters.
3644 || 0x25F0 <= i && i <= 0x25F7
3645 // Ideographic telegraph symbols.
3646 || 0x32C0 <= i && i <= 0x32CB
3647 || 0x3358 <= i && i <= 0x3370
3648 || 0x33E0 <= i && i <= 0x33FF
3649 // The whole YI characters.
3650 || 0xA000 <= i && i <= 0xA48C
3651 || 0xA490 <= i && i <= 0xA4C6
3652 // Armenian small ligatures
3653 || 0xFB13 <= i && i <= 0xFB17
3654 // Hebrew, Arabic, variation selector.
3655 || 0xFB1D <= i && i <= 0xFE2F
3656 // Arabic ligatures.
3657 || 0xFEF5 <= i && i <= 0xFEFC
3658 // FIXME: why are they excluded?
3659 || 0x01F6 <= i && i <= 0x01F9
3660 || 0x0218 <= i && i <= 0x0233
3661 || 0x02A9 <= i && i <= 0x02AD
3662 || 0x02EA <= i && i <= 0x02EE
3663 || 0x0349 <= i && i <= 0x036F
3664 || 0x0488 <= i && i <= 0x048F
3665 || 0x04D0 <= i && i <= 0x04FF
3666 || 0x0500 <= i && i <= 0x050F // actually it matters only for 2.0
3667 || 0x06D6 <= i && i <= 0x06ED
3668 || 0x06FA <= i && i <= 0x06FE
3669 || 0x2048 <= i && i <= 0x204D
3670 || 0x20e4 <= i && i <= 0x20ea
3671 || 0x213C <= i && i <= 0x214B
3672 || 0x21EB <= i && i <= 0x21FF
3673 || 0x22F2 <= i && i <= 0x22FF
3674 || 0x237B <= i && i <= 0x239A
3675 || 0x239B <= i && i <= 0x23CF
3676 || 0x24EB <= i && i <= 0x24FF
3677 || 0x2596 <= i && i <= 0x259F
3678 || 0x25F8 <= i && i <= 0x25FF
3679 || 0x2672 <= i && i <= 0x2689
3680 || 0x2768 <= i && i <= 0x2775
3681 || 0x27d0 <= i && i <= 0x27ff
3682 || 0x2900 <= i && i <= 0x2aff
3683 || 0x3033 <= i && i <= 0x303F
3684 || 0x31F0 <= i && i <= 0x31FF
3685 || 0x3250 <= i && i <= 0x325F
3686 || 0x32B1 <= i && i <= 0x32BF
3687 || 0x3371 <= i && i <= 0x337B
3688 || 0xFA30 <= i && i <= 0xFA6A
// Anything not matched above is classified by its Unicode category.
3692 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3694 case UnicodeCategory.PrivateUse:
3695 case UnicodeCategory.Surrogate:
3697 // ignored by nature
3698 case UnicodeCategory.Format:
3699 case UnicodeCategory.OtherNotAssigned:
3706 // To check IsIgnorable sanity, try the driver below under MS.NET.
// Calls Dump() for every UTF-16 code unit value (0 through
// char.MaxValue inclusive) with the result of IsIgnorable().
// NOTE(review): presumably this driver is commented out or otherwise
// excluded from the normal build -- confirm.
3709 public static void Main ()
3711 for (int i = 0; i <= char.MaxValue; i++)
3712 Dump (i, IsIgnorable (i));
// Helper for the IsIgnorable sanity driver.
//   i      - the character value under test.
//   ignore - the verdict IsIgnorable produced for i.
// Compares that verdict with what the runtime's invariant-culture comparison
// does with the character, and can print a one-line report.
3715 static void Dump (int i, bool ignore)
3717 switch (Char.GetUnicodeCategory ((char) i)) {
// Private-use and surrogate values are not checked at all.
3718 case UnicodeCategory.PrivateUse:
3719 case UnicodeCategory.Surrogate:
3720 return; // check nothing
// s2 is the character repeated ten times. s1 is declared on a line that is
// not visible in this listing.
3724 string s2 = new string ((char) i, 10);
3725 int ret = CultureInfo.InvariantCulture.CompareInfo.Compare (s1, s2, CompareOptions.IgnoreCase);
// (ret == 0) is true when the runtime treated s1 and s2 as equal; the test
// checks whether that agrees with the predicted verdict.
// NOTE(review): the statement guarded by this test is not visible here;
// presumably an early return so that only disagreements are printed.
3726 if ((ret == 0) == ignore)
// Output: "o" when ignore is true, "x" otherwise, then the value in hex and
// its Unicode category.
3728 Console.WriteLine ("{0} : {1:x} {2}", ignore ? "o" : "x", i, Char.GetUnicodeCategory ((char) i));
3731 #endregion // IsIgnorable
3733 #region IsIgnorableSymbol
// Decides whether character value i counts as an ignorable symbol.
// The sanity driver that follows this method compares the result with the
// behaviour of CompareOptions.IgnoreSymbols under MS.NET.
//   i - the character value to classify (cast to char for category lookups).
// NOTE(review): several short lines (braces, switch headers, most return
// statements) are not visible in this listing, so the value returned for each
// case group below cannot be read off here -- confirm against the full file.
3734 static bool IsIgnorableSymbol (int i)
// IsIgnorable is consulted first; the statement it guards is not shown here.
3736 if (IsIgnorable (i))
// Explicit per-character exceptions, listed in groups.
3741 case 0x00b5: case 0x01C0: case 0x01C1:
3742 case 0x01C2: case 0x01C3: case 0x01F6:
3743 case 0x01F7: case 0x01F8: case 0x01F9:
3744 case 0x02D0: case 0x02EE: case 0x037A:
3745 case 0x03D7: case 0x03F3:
3746 case 0x0400: case 0x040d:
3747 case 0x0450: case 0x045d:
3748 case 0x048C: case 0x048D:
3749 case 0x048E: case 0x048F:
3750 case 0x0587: case 0x0640: case 0x06E5:
3751 case 0x06E6: case 0x06FA: case 0x06FB:
3752 case 0x06FC: case 0x093D: case 0x0950:
3753 case 0x1E9B: case 0x2139: case 0x3006:
3754 case 0x3033: case 0x3034: case 0x3035:
3755 case 0xFE7E: case 0xFE7F:
3757 case 0x16EE: case 0x16EF: case 0x16F0:
3759 case 0x2183: // ROMAN NUMERAL REVERSED ONE HUNDRED
3760 case 0x3007: // IDEOGRAPHIC NUMBER ZERO
3761 case 0x3038: // HANGZHOU NUMERAL TEN
3762 case 0x3039: // HANGZHOU NUMERAL TWENTY
3763 case 0x303a: // HANGZHOU NUMERAL THIRTY
3769 case 0x02B9: case 0x02BA: case 0x02C2:
3770 case 0x02C3: case 0x02C4: case 0x02C5:
3771 case 0x02C8: case 0x02CC: case 0x02CD:
3772 case 0x02CE: case 0x02CF: case 0x02D2:
3773 case 0x02D3: case 0x02D4: case 0x02D5:
3774 case 0x02D6: case 0x02D7: case 0x02DE:
3775 case 0x02E5: case 0x02E6: case 0x02E7:
3776 case 0x02E8: case 0x02E9:
3777 case 0x309B: case 0x309C:
3779 case 0x055A: // Armenian Apos
3780 case 0x05C0: // Hebrew Punct
3781 case 0x0E4F: // Thai FONGMAN
3782 case 0x0E5A: // Thai ANGKHANKHU
3783 case 0x0E5B: // Thai KHOMUT
3785 case 0x09F2: // Bengali Rupee Mark
3786 case 0x09F3: // Bengali Rupee Sign
3788 case 0x221e: // INF.
// Range-based exceptions. Note the first and third ranges use an exclusive
// upper bound (<) while the second is inclusive (<=).
3797 if (0xFE70 <= i && i < 0xFE7C // ARABIC LIGATURES B
3799 || 0x0501 <= i && i <= 0x0510 // CYRILLIC KOMI
3800 || 0xFA30 <= i && i < 0xFA70 // CJK COMPAT
// Everything not settled above is decided from the Unicode general category.
3805 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3807 case UnicodeCategory.Surrogate:
3808 return false; // inconsistent
// Combining marks and private-use characters share one branch; inside it the
// Arabic range 0x064B..0x0652 is singled out.
3810 case UnicodeCategory.SpacingCombiningMark:
3811 case UnicodeCategory.EnclosingMark:
3812 case UnicodeCategory.NonSpacingMark:
3813 case UnicodeCategory.PrivateUse:
3815 if (0x064B <= i && i <= 0x0652) // Arabic
3819 case UnicodeCategory.Format:
3820 case UnicodeCategory.OtherNotAssigned:
// Remaining categories: a list of symbol ranges followed by an assignment to
// "use". Presumably the ranges form the condition guarding that assignment
// (the enclosing if is not visible here).
3827 // latin in a circle
3828 0x249A <= i && i <= 0x24E9
3829 || 0x2100 <= i && i <= 0x2132
3831 || 0x3196 <= i && i <= 0x31A0
3833 || 0x3200 <= i && i <= 0x321C
3835 || 0x322A <= i && i <= 0x3243
3837 || 0x3260 <= i && i <= 0x32B0
3838 || 0x32D0 <= i && i <= 0x3357
3839 || 0x337B <= i && i <= 0x33DD
// "use" becomes true exactly when the character is neither a letter nor a
// digit according to Char.IsLetterOrDigit.
3841 use = !Char.IsLetterOrDigit ((char) i);
3845 // This "Digit" rule is a mystery.
3846 // It filters some symbols out.
// A chain of Char.Is* tests follows; the statement each one guards is not
// visible in this listing.
3847 if (Char.IsLetterOrDigit ((char) i))
3849 if (Char.IsNumber ((char) i))
3851 if (Char.IsControl ((char) i)
3852 || Char.IsSeparator ((char) i)
3853 || Char.IsPunctuation ((char) i))
3855 if (Char.IsSymbol ((char) i))
3858 // FIXME: should check more
3863 // To check IsIgnorableSymbol sanity, try the driver below under MS.NET.
// Sanity-check driver for IsIgnorableSymbol, meant to be run under MS.NET.
// For each value from 0 through char.MaxValue it compares "TEST " with
// "TEST " followed by the character, using CompareOptions.IgnoreSymbols, and
// reports values where IsIgnorableSymbol disagrees with the runtime.
// NOTE(review): presumably kept disabled in the real build, like the
// IsIgnorable driver -- confirm.
3865 public static void Main ()
3867 CompareInfo ci = CultureInfo.InvariantCulture.CompareInfo;
3868 for (int i = 0; i <= char.MaxValue; i++) {
3869 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
// Surrogates get special handling; the guarded statement is not visible here
// (presumably they are skipped).
3870 if (uc == UnicodeCategory.Surrogate)
3873 bool ret = IsIgnorableSymbol (i);
3875 string s1 = "TEST ";
3876 string s2 = "TEST " + (char) i;
// result == 0 means the runtime treated the appended character as ignorable.
3878 int result = ci.Compare (s1, s2, CompareOptions.IgnoreSymbols);
// Report only when the prediction and the runtime disagree.
3880 if (ret != (result == 0))
3881 Console.WriteLine ("{0} : {1:x}[{2}]({3})",
// ret is true here only when the runtime did NOT ignore the character.
3882 ret ? "should not ignore" :
// Decides whether character value i counts as an ignorable non-spacing
// character.
//   i - the character value to classify.
// Order of checks visible here: IsIgnorable, two groups of explicit
// characters, two groups of ranges, and finally a fallback on the Unicode
// general category.
// NOTE(review): the switch header and the return statement attached to each
// group are not visible in this listing -- confirm the value each group
// yields against the full file.
3891 static bool IsIgnorableNonSpacing (int i)
3893 if (IsIgnorable (i))
// First group of explicit characters.
3897 case 0x02C8: case 0x02DE: case 0x0559: case 0x055A:
3898 case 0x05C0: case 0x0ABD: case 0x0CD5: case 0x0CD6:
3899 case 0x309B: case 0x309C: case 0xFF9E: case 0xFF9F:
// Second group of explicit characters.
3901 case 0x02D0: case 0x0670: case 0x0901: case 0x0902:
3902 case 0x094D: case 0x0962: case 0x0963: case 0x0A41:
3903 case 0x0A42: case 0x0A47: case 0x0A48: case 0x0A4B:
3904 case 0x0A4C: case 0x0A81: case 0x0A82: case 0x0B82:
3905 case 0x0BC0: case 0x0CBF: case 0x0CC6: case 0x0CCC:
3906 case 0x0CCD: case 0x0E4E:
// First group of ranges.
3910 if (0x02b9 <= i && i <= 0x02c5
3911 || 0x02cc <= i && i <= 0x02d7
3912 || 0x02e4 <= i && i <= 0x02ef
3913 || 0x20DD <= i && i <= 0x20E0
// Second group of ranges. The literal 0x00652 is simply 0x0652 written with
// an extra leading zero; the value is the same.
3917 if (0x064B <= i && i <= 0x00652
3918 || 0x0941 <= i && i <= 0x0948
3919 || 0x0AC1 <= i && i <= 0x0ACD
3920 || 0x0C3E <= i && i <= 0x0C4F
3921 || 0x0E31 <= i && i <= 0x0E3F
// Fallback: true exactly when the general category is NonSpacingMark.
3925 return Char.GetUnicodeCategory ((char) i) ==
3926 UnicodeCategory.NonSpacingMark;
3929 // We can reuse IsIgnorableSymbol testcode
3930 // for IsIgnorableNonSpacing.
// Members of CharMapEntry, the element type of the generator's per-character
// "map" array (one slot for every value up to char.MaxValue).
// NOTE(review): the type header and at least one member are not visible in
// this listing; the constructor takes a level1 argument, so presumably a
// Level1 field exists as well -- confirm.
// Category of the entry (a single byte).
3936 public byte Category;
3938 public byte Level2; // It is always single byte.
// Flag member; where it is set is not visible in this listing.
3939 public bool Defined;
// Builds an entry from its category and its level 1 / level 2 values.
// Only the Category assignment is visible here.
3941 public CharMapEntry (byte category, byte level1, byte level2)
3943 Category = category;
// Members of JISCharacter: a pair of integer codes for one character.
// JISComparer orders instances by the JIS member.
// CP: presumably the Unicode code point -- confirm against callers.
3952 public readonly int CP;
// JIS: the value JISComparer sorts by; presumably the character's JIS code.
3953 public readonly int JIS;
// Takes the two codes; the assignments are not visible in this listing.
3955 public JISCharacter (int cp, int cpJIS)
// IComparer that orders JISCharacter objects by ascending JIS value.
3962 class JISComparer : IComparer
// Shared instance (its initializer is on a line not visible in this listing).
3964 public static readonly JISComparer Instance =
// Both arguments must be JISCharacter; the casts throw otherwise.
// Returns the difference of the two JIS values: negative when o1 sorts
// first, zero when equal, positive when o2 sorts first.
3967 public int Compare (object o1, object o2)
3969 JISCharacter j1 = (JISCharacter) o1;
3970 JISCharacter j2 = (JISCharacter) o2;
3971 return j1.JIS - j2.JIS;
// A character identified by an integer code and a name.
// NonJISComparer orders instances by Name.
3975 class NonJISCharacter
// CP: presumably the Unicode code point -- confirm against callers.
3977 public readonly int CP;
// Name: the string NonJISComparer compares ordinally.
3978 public readonly string Name;
// Takes the code and the name; the assignments are not visible in this listing.
3980 public NonJISCharacter (int cp, string name)
// IComparer that orders NonJISCharacter objects by an ordinal comparison of
// their Name strings.
3987 class NonJISComparer : IComparer
// Shared instance.
3989 public static readonly NonJISComparer Instance =
3990 new NonJISComparer ();
// Both arguments must be NonJISCharacter; the casts throw otherwise.
// Returns the result of string.CompareOrdinal on the two names.
3992 public int Compare (object o1, object o2)
3994 NonJISCharacter j1 = (NonJISCharacter) o1;
3995 NonJISCharacter j2 = (NonJISCharacter) o2;
3996 return string.CompareOrdinal (j1.Name, j2.Name);
// IComparer over DictionaryEntry items whose Value is a boxed decimal and
// whose Key is a boxed int. Entries are compared by value first.
4000 class DecimalDictionaryValueComparer : IComparer
// Shared instance; the private constructor prevents other instances from
// being created outside this class.
4002 public static readonly DecimalDictionaryValueComparer Instance
4003 = new DecimalDictionaryValueComparer ();
4005 private DecimalDictionaryValueComparer ()
// Both arguments must be DictionaryEntry; the unboxing casts throw otherwise.
4009 public int Compare (object o1, object o2)
4011 DictionaryEntry e1 = (DictionaryEntry) o1;
4012 DictionaryEntry e2 = (DictionaryEntry) o2;
4013 // FIXME: in case of 0, compare decomposition categories
4014 int ret = Decimal.Compare ((decimal) e1.Value, (decimal) e2.Value);
// The keys are read as ints. NOTE(review): how ret, i1 and i2 combine into
// the return value is on lines not visible in this listing; presumably the
// keys act as a tie-breaker when the values compare equal.
4017 int i1 = (int) e1.Key;
4018 int i2 = (int) e2.Key;
// IComparer over DictionaryEntry items whose Value is a string and whose Key
// is a boxed int. Entries are compared by value first.
4023 class StringDictionaryValueComparer : IComparer
// Shared instance; the private constructor prevents other instances from
// being created outside this class.
4025 public static readonly StringDictionaryValueComparer Instance
4026 = new StringDictionaryValueComparer ();
4028 private StringDictionaryValueComparer ()
// Both arguments must be DictionaryEntry; the unboxing casts throw otherwise.
4032 public int Compare (object o1, object o2)
4034 DictionaryEntry e1 = (DictionaryEntry) o1;
4035 DictionaryEntry e2 = (DictionaryEntry) o2;
// NOTE(review): String.Compare (string, string) is a culture-sensitive
// comparison, so the ordering may depend on the culture the generator runs
// under -- confirm that this is intended.
4036 int ret = String.Compare ((string) e1.Value, (string) e2.Value);
// The keys are read as ints. NOTE(review): how ret, i1 and i2 combine into
// the return value is on lines not visible in this listing; presumably the
// keys act as a tie-breaker when the values compare equal.
4039 int i1 = (int) e1.Key;
4040 int i2 = (int) e2.Key;
// IComparer over boxed char values that compares them through the sort keys
// CollationElementTable provides for each character.
4045 class UCAComparer : IComparer
// Shared instance; the private constructor prevents other instances from
// being created outside this class.
4047 public static readonly UCAComparer Instance
4048 = new UCAComparer ();
4050 private UCAComparer ()
// Both arguments must be boxed chars; the unboxing casts throw otherwise.
4054 public int Compare (object o1, object o2)
4056 char i1 = (char) o1;
4057 char i2 = (char) o2;
// Only the sort keys both characters have (the smaller of the two counts)
// are walked by the loop.
4059 int l1 = CollationElementTable.GetSortKeyCount (i1);
4060 int l2 = CollationElementTable.GetSortKeyCount (i2);
4061 int l = l1 > l2 ? l2 : l1;
4063 for (int i = 0; i < l; i++) {
4064 SortKeyValue k1 = CollationElementTable.GetSortKey (i1, i);
4065 SortKeyValue k2 = CollationElementTable.GetSortKey (i2, i);
// The four members are compared in the order Primary, Secondary, Thirtiary,
// Quarternary (spelled as declared on SortKeyValue).
// NOTE(review): the tests on v between these assignments, and the value
// returned after the loop, are on lines not visible in this listing;
// presumably the first non-zero difference is returned.
4066 int v = k1.Primary - k2.Primary;
4069 v = k1.Secondary - k2.Secondary;
4072 v = k1.Thirtiary - k2.Thirtiary;
4075 v = k1.Quarternary - k2.Quarternary;
// Members of Tailoring: an ordered list of map objects (diacritical, sort key
// and replacement maps) plus an lcid, an alias and a FrenchSort flag.
// NOTE(review): the class header, the backing fields (lcid, alias,
// frenchSort) and the property headers are not visible in this listing.
// Map objects in the order they were added.
4088 ArrayList items = new ArrayList ();
// Constructors; their bodies are not visible in this listing.
4090 public Tailoring (int lcid)
4095 public Tailoring (int lcid, int alias)
// Getters of two read-only properties that expose lcid and alias.
4102 get { return lcid; }
4106 get { return alias; }
// Read/write flag backed by the frenchSort field.
4109 public bool FrenchSort {
4110 get { return frenchSort; }
4111 set { frenchSort = value; }
// Appends a DiacriticalMap built from the two bytes.
4114 public void AddDiacriticalMap (byte target, byte replace)
4116 items.Add (new DiacriticalMap (target, replace));
// Appends a SortKeyMap built from the source string and sort key bytes.
4119 public void AddSortKeyMap (string source, byte [] sortkey)
4121 items.Add (new SortKeyMap (source, sortkey));
// Appends a ReplacementMap built from the source and replacement strings.
4124 public void AddReplacementMap (string source, string replace)
4126 items.Add (new ReplacementMap (source, replace));
// Concatenates the ToCharArray output of every map, in insertion order, into
// a single char array.
4129 public char [] ItemToCharArray ()
4131 ArrayList al = new ArrayList ();
4132 foreach (ITailoringMap m in items)
4133 al.AddRange (m.ToCharArray ());
4134 return al.ToArray (typeof (char)) as char [];
// Common interface of the map objects stored in a Tailoring.
4137 interface ITailoringMap
// Returns the map encoded as a char array; Tailoring.ItemToCharArray
// concatenates these arrays.
4139 char [] ToCharArray ();
// Tailoring map made of two bytes, Target and Replace.
4142 class DiacriticalMap : ITailoringMap
4144 public readonly byte Target;
4145 public readonly byte Replace;
// Takes the two bytes; the assignments are not visible in this listing.
4147 public DiacriticalMap (byte target, byte replace)
// Encodes the map as three chars: the kind marker 2, then Target, then
// Replace.
4153 public char [] ToCharArray ()
4155 char [] ret = new char [3];
4156 ret [0] = (char) 02; // kind:DiacriticalMap
4157 ret [1] = (char) Target;
4158 ret [2] = (char) Replace;
// Tailoring map that pairs a source string with sort key bytes.
4163 class SortKeyMap : ITailoringMap
4165 public readonly string Source;
4166 public readonly byte [] SortKey;
// Takes the source and the sort key; the assignments are not visible in this
// listing.
4168 public SortKeyMap (string source, byte [] sortkey)
// Encodes the map as Source.Length + 7 chars. Layout written by the visible
// lines:
//   [0]                                    kind marker 1
//   [1 .. Source.Length]                   the chars of Source
//   [Source.Length + 2 .. Source.Length + 5]  SortKey [0] .. SortKey [3]
// The slots at Source.Length + 1 and Source.Length + 6 are not written by any
// visible line, so they keep the array's initial '\0' unless a line missing
// from this listing sets them.
// SortKey must hold at least four bytes or the second loop throws.
4174 public char [] ToCharArray ()
4176 char [] ret = new char [Source.Length + 7];
4177 ret [0] = (char) 01; // kind:SortKeyMap
4178 for (int i = 0; i < Source.Length; i++)
4179 ret [i + 1] = Source [i];
4181 for (int i = 0; i < 4; i++)
4182 ret [i + Source.Length + 2] = (char) SortKey [i];
4187 class ReplacementMap : ITailoringMap
4189 public readonly string Source;
4190 public readonly string Replace;
4192 public ReplacementMap (string source, string replace)
4198 public char [] ToCharArray ()
4200 char [] ret = new char [Source.Length + Replace.Length + 3];
4201 ret [0] = (char) 03; // kind:ReplaceMap
4203 for (int i = 0; i < Source.Length; i++)
4204 ret [pos++] = Source [i];
4207 for (int i = 0; i < Replace.Length; i++)
4208 ret [pos++] = Replace [i];