3 // There are two kinds of sort keys: those that are computed and those that are laid out
4 // as an indexed array. Computed sort keys are:
9 // Also, for composite characters it should prepare a different index table.
11 // Though it is possible to "compute" level 3 weights, they are still dumped
12 // to an array to avoid execution cost.
16 // * sortkey getter signature
18 // int GetSortKey (string s, int index, SortKeyBuffer buf)
19 // Stores the sort key for the corresponding character element into buf and
20 // returns the length of the consumed _source_ character element in s.
22 // * character length to consume
24 // If there are characters whose primary weight is 0, they are consumed
25 // and considered as a part of the character element.
31 using System.Collections;
32 using System.Globalization;
36 namespace Mono.Globalization.Unicode
38 internal class MSCompatSortKeyTableGenerator
// Program entry point: creates a generator instance and runs it with the
// command-line arguments.
40 public static void Main (string [] args)
42 new MSCompatSortKeyTableGenerator ().Run (args);
// Numeric codes for the decomposition kinds. ProcessUnidataLine stores one of
// them per code point in decompType, and the serialization loop switches on
// Narrow/Wide/Super/Sub when it fills widthCompat.
// Note that Full (0xE) and Narrow (0xD) are declared out of numeric order.
45 const int DecompositionWide = 1; // fixed
46 const int DecompositionSub = 2; // fixed
47 const int DecompositionSmall = 3;
48 const int DecompositionIsolated = 4;
49 const int DecompositionInitial = 5;
50 const int DecompositionFinal = 6;
51 const int DecompositionMedial = 7;
52 const int DecompositionNoBreak = 8;
53 const int DecompositionVertical = 9;
54 const int DecompositionFraction = 0xA;
55 const int DecompositionFont = 0xB;
56 const int DecompositionSuper = 0xC; // fixed
57 const int DecompositionFull = 0xE;
58 const int DecompositionNarrow = 0xD;
59 const int DecompositionCircle = 0xF;
60 const int DecompositionSquare = 0x10;
61 const int DecompositionCompat = 0x11;
62 const int DecompositionCanonical = 0x12;
// Writer that receives the generated C# table source; defaults to stdout.
64 TextWriter Result = Console.Out;
66 byte [] fillIndex = new byte [256]; // by category
// One entry per UTF-16 code unit; the serialization loop reads Category,
// Level1 and Level2 from each entry.
67 CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1];
69 char [] specialIgnore = new char [] {
70 '\u3099', '\u309A', '\u309B', '\u309C', '\u0BCD',
71 '\u0E47', '\u0E4C', '\uFF9E', '\uFF9F'
74 // FIXME: need more love (as always)
// NOTE(review): alphabets and alphaWeights both list 29 entries, so they look
// like parallel arrays (letter -> weight) — confirm against their consumer.
75 char [] alphabets = new char [] {'A', 'B', 'C', 'D', 'E', 'F',
76 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
77 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
78 '\u0292', '\u01BE', '\u0298'};
79 byte [] alphaWeights = new byte [] {
80 2, 9, 0xA, 0x1A, 0x21,
81 0x23, 0x25, 0x2C, 0x32, 0x35,
82 0x36, 0x48, 0x51, 0x70, 0x7C,
83 0x7E, 0x89, 0x8A, 0x91, 0x99,
84 0x9F, 0xA2, 0xA4, 0xA6, 0xA7,
85 0xA9, 0xAA, 0xB3, 0xB4};
// Set by ProcessUnidataLine for lines that contain "SMALL CAPITAL".
87 bool [] isSmallCapital = new bool [char.MaxValue + 1];
// Set to true by ProcessDerivedCorePropLine over parsed code point ranges.
88 bool [] isUppercase = new bool [char.MaxValue + 1];
// Per code point: decomposition kind (a Decomposition* constant), plus the
// start index and element count of its mapping inside the decomposition value
// list that ProcessUnidataLine appends to.
90 byte [] decompType = new byte [char.MaxValue + 1];
91 int [] decompIndex = new int [char.MaxValue + 1];
92 int [] decompLength = new int [char.MaxValue + 1];
// Numeric value per code point, parsed by ProcessUnidataLine.
94 decimal [] decimalValue = new decimal [char.MaxValue + 1];
// Diacritical weight per code point, accumulated by ProcessUnidataLine.
96 byte [] diacritical = new byte [char.MaxValue + 1];
// Name fragments that ProcessUnidataLine searches for in each raw UnicodeData
// line. Entry d is paired with diacriticWeights [d]; ProcessUnidataLine throws
// when the two arrays differ in length. Fragments ending in ';' match only
// where they are immediately followed by the ';' field separator.
98 string [] diacritics = new string [] {
99 // LATIN, CYRILLIC etc.
100 "UPTURN", "DOUBLE-STRUCK",
101 "MIDDLE HOOK", "WITH VERTICAL LINE ABOVE;", "WITH TONOS",
102 "WITH ACUTE ACCENT;", "WITH GRAVE ACCENT;",
103 "WITH ACUTE;", "WITH GRAVE;",
105 "WITH DOT ABOVE;", " MIDDLE DOT;",
106 "WITH CIRCUMFLEX ACCENT;", "WITH CIRCUMFLEX;",
108 "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;",
109 "DIALYTIKA TONOS", "DIALYTIKA AND TONOS", "WITH MACRON;", "WITH TILDE;", "WITH RING ABOVE;",
110 "WITH OGONEK;", "WITH CEDILLA;",
112 " DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;",
113 "WITH STROKE;", " CIRCUMFLEX AND ACUTE;",
115 " DIAERESIS AND ACUTE;", "WITH CIRCUMFLEX AND GRAVE;", " L SLASH;",
116 " DIAERESIS AND GRAVE;",
118 " CARON AND DOT ABOVE;", " BREVE AND GRAVE;",
119 " MACRON AND ACUTE;",
120 " MACRON AND GRAVE;",
122 " DIAERESIS AND CARON", " DOT ABOVE AND MACRON", " TILDE AND ACUTE",
123 " RING ABOVE AND ACUTE",
124 " DIAERESIS AND MACRON", " CEDILLA AND ACUTE", " MACRON AND DIAERESIS",
125 " CIRCUMFLEX AND TILDE",
126 " TILDE AND DIAERESIS",
129 " CEDILLA AND BREVE",
130 " OGONEK AND MACRON",
133 "WITH HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;",
137 " PRECEDED BY APOSTROPHE",
139 " LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE",
142 " RETROFLEX;", "DIAERESIS BELOW",
145 " CIRCUMFLEX BELOW", "HORN AND ACUTE",
146 " BREVE BELOW;", " HORN AND GRAVE",
149 " DOT BELOW AND DOT ABOVE",
150 " RIGHT HALF RING", " HORN AND TILDE",
151 " CIRCUMFLEX AND DOT BELOW",
152 " BREVE AND DOT BELOW",
153 " DOT BELOW AND MACRON",
155 " HORN AND HOOK ABOVE",
157 // CIRCLED, PARENTHESIZED and so on
158 "CIRCLED DIGIT", "CIRCLED NUMBER", "CIRCLED LATIN",
159 "CIRCLED KATAKANA", "CIRCLED SANS-SERIF",
160 "PARENTHESIZED DIGIT", "PARENTHESIZED NUMBER", "PARENTHESIZED LATIN",
// Weight added to diacritical [cp] when the same-index entry of diacritics
// is found in the line (see ProcessUnidataLine).
162 byte [] diacriticWeights = new byte [] {
168 0x10, 0x11, 0x12, 0x12, 0x13, 0x13, 0x14, 0x15, 0x16,
169 0x16, 0x17, 0x19, 0x1A, 0x1B, 0x1C,
171 0x1D, 0x1D, 0x1E, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F,
172 0x20, 0x21, 0x22, 0x22, 0x23, 0x24,
174 0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28,
175 0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30,
177 0x40, 0x43, 0x43, 0x43, 0x44, 0x46, 0x47, 0x48,
178 0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x5A,
180 0x60, 0x60, 0x61, 0x61, 0x63, 0x68, 0x68,
181 0x69, 0x69, 0x6A, 0x6D, 0x6E,
183 // CIRCLED, PARENTHESIZED and so on.
184 0xEE, 0xEE, 0xEE, 0xEE, 0xEE,
// NOTE(review): the values read like consecutive (start, end) code point pairs
// for digit ranges — TODO confirm against the code that consumes this array.
188 int [] numberSecondaryWeightBounds = new int [] {
189 0x660, 0x680, 0x6F0, 0x700, 0x960, 0x970,
190 0x9E0, 0x9F0, 0x9F4, 0xA00, 0xA60, 0xA70,
191 0xAE0, 0xAF0, 0xB60, 0xB70, 0xBE0, 0xC00,
192 0xC60, 0xC70, 0xCE0, 0xCF0, 0xD60, 0xD70,
193 0xE50, 0xE60, 0xED0, 0xEE0
// Per-script ordered character lists; declared without an initializer here.
196 char [] orderedGurmukhi;
197 char [] orderedGujarati;
198 char [] orderedGeorgian;
199 char [] orderedThaana;
201 static readonly char [] orderedTamilConsonants = new char [] {
202 // based on traditional Tamil consonants, except for
203 // Grantha (where Microsoft breaks traditionalism).
204 // http://www.angelfire.com/empire/thamizh/padanGaL
205 '\u0B95', '\u0B99', '\u0B9A', '\u0B9E', '\u0B9F',
206 '\u0BA3', '\u0BA4', '\u0BA8', '\u0BAA', '\u0BAE',
207 '\u0BAF', '\u0BB0', '\u0BB2', '\u0BB5', '\u0BB4',
208 '\u0BB3', '\u0BB1', '\u0BA9', '\u0B9C', '\u0BB8',
// The three lists below receive DictionaryEntry items from ProcessUnidataLine.
211 // cp -> character name (only for some characters)
212 ArrayList sortableCharNames = new ArrayList ();
214 // cp -> arrow value (int)
215 ArrayList arrowValues = new ArrayList ();
217 // cp -> box value (int)
218 ArrayList boxValues = new ArrayList ();
220 // cp -> level1 value
221 Hashtable arabicLetterPrimaryValues = new Hashtable ();
// Arabic primary letter name -> code point (filled by ProcessUnidataLine).
224 Hashtable arabicNameMap = new Hashtable ();
226 // cp -> Hashtable [decompType] -> cp
227 Hashtable nfkdMap = new Hashtable ();
229 // Latin letter -> ArrayList [int]
230 Hashtable latinMap = new Hashtable ();
// Japanese characters; ProcessUnidataLine adds NonJISCharacter items to
// nonJisJapanese for U+3300..U+3357 when ExistsJIS reports false.
232 ArrayList jisJapanese = new ArrayList ();
233 ArrayList nonJisJapanese = new ArrayList ();
// CJK weight tables, indexed by code point; compressed and then written out
// through SerializeCJK.
235 ushort [] cjkJA = new ushort [char.MaxValue +1];// - 0x4E00];
236 ushort [] cjkCHS = new ushort [char.MaxValue +1];// - 0x3100];
237 ushort [] cjkCHT = new ushort [char.MaxValue +1];// - 0x4E00];
238 ushort [] cjkKO = new ushort [char.MaxValue +1];// - 0x4E00];
239 byte [] cjkKOlv2 = new byte [char.MaxValue +1];// - 0x4E00];
241 byte [] ignorableFlags = new byte [char.MaxValue + 1];
// Unicode age per code point; ParseDerivedAge forces entry 0 to double.MaxValue.
243 static double [] unicodeAge = new double [char.MaxValue + 1];
// Tailoring objects, iterated by SerializeTailorings.
245 ArrayList tailorings = new ArrayList ();
// Top-level driver. Takes the source directory from args [0] (falling back to
// "downloaded"), parses the inputs, post-processes them via
// ModifyParsedValues, and reports progress on stderr.
247 void Run (string [] args)
249 string dirname = args.Length == 0 ? "downloaded" : args [0];
250 ParseSources (dirname);
251 Console.Error.WriteLine ("parse done.");
253 ModifyParsedValues ();
255 Console.Error.WriteLine ("generation done.");
257 Console.Error.WriteLine ("serialization done.");
// Diagnostic dump of per-code-point age/ignorable data into agelog.txt.
259 StreamWriter sw = new StreamWriter ("agelog.txt");
// NOTE(review): the bound `i < char.MaxValue` leaves out U+FFFF itself —
// confirm that is intended.
260 for (int i = 0; i < char.MaxValue; i++) {
261 bool shouldBe = false;
262 switch (Char.GetUnicodeCategory ((char) i)) {
263 case UnicodeCategory.Format: case UnicodeCategory.OtherNotAssigned:
264 shouldBe = true; break;
266 if (unicodeAge [i] >= 3.1)
268 //if (IsIgnorable (i) != shouldBe)
// Columns: age, IsIgnorable, IsIgnorableSymbol, code point (hex), Unicode
// category, then '!' when IsIgnorable disagrees with shouldBe.
269 sw.WriteLine ("{1} {2} {3} {0:X04} {4} {5}", i, unicodeAge [i], IsIgnorable (i), IsIgnorableSymbol (i), char.GetUnicodeCategory ((char) i), IsIgnorable (i) != shouldBe ? '!' : ' ');
// Typed convenience wrapper: compresses a byte table through
// CodePointIndexer.CompressArray with indexer i and casts the result back
// to byte [].
275 byte [] CompressArray (byte [] source, CodePointIndexer i)
277 return (byte []) CodePointIndexer.CompressArray (
278 source, typeof (byte), i);
// Typed convenience wrapper: compresses a ushort table through
// CodePointIndexer.CompressArray with indexer i and casts the result back
// to ushort [].
281 ushort [] CompressArray (ushort [] source, CodePointIndexer i)
283 return (ushort []) CodePointIndexer.CompressArray (
284 source, typeof (ushort), i);
// Core serialization: emits the tailoring data first, then flattens map into
// one array per weight level, compresses every table, and writes each table
// both as C# source text (to Result) and as binary (../collation.core.bin).
290 SerializeTailorings ();
292 byte [] categories = new byte [map.Length];
293 byte [] level1 = new byte [map.Length];
294 byte [] level2 = new byte [map.Length];
295 byte [] level3 = new byte [map.Length];
296 ushort [] widthCompat = new ushort [map.Length];
// Flatten the per-code-point entries into parallel arrays.
297 for (int i = 0; i < map.Length; i++) {
298 categories [i] = map [i].Category;
299 level1 [i] = map [i].Level1;
300 level2 [i] = map [i].Level2;
301 level3 [i] = ComputeLevel3Weight ((char) i);
302 // For Japanese Half-width characters, don't
303 // map widthCompat. It is IgnoreKanaType that
304 // handles those width differences.
305 if (0xFF6D <= i && i <= 0xFF9D)
307 switch (decompType [i]) {
308 case DecompositionNarrow:
309 case DecompositionWide:
310 case DecompositionSuper:
311 case DecompositionSub:
312 // they are always 1 char
313 widthCompat [i] = (ushort) decompValues [decompIndex [i]];
// Compress each table with the CodePointIndexer that belongs to it.
// Note that cjkCHS uses MSCompatUnicodeTableUtil.CjkCHS while the other CJK
// tables share MSCompatUnicodeTableUtil.Cjk.
319 ignorableFlags = CompressArray (ignorableFlags,
320 MSCompatUnicodeTableUtil.Ignorable);
321 categories = CompressArray (categories,
322 MSCompatUnicodeTableUtil.Category);
323 level1 = CompressArray (level1,
324 MSCompatUnicodeTableUtil.Level1);
325 level2 = CompressArray (level2,
326 MSCompatUnicodeTableUtil.Level2);
327 level3 = CompressArray (level3,
328 MSCompatUnicodeTableUtil.Level3);
329 widthCompat = (ushort []) CodePointIndexer.CompressArray (
330 widthCompat, typeof (ushort),
331 MSCompatUnicodeTableUtil.WidthCompat);
332 cjkCHS = CompressArray (cjkCHS,
333 MSCompatUnicodeTableUtil.CjkCHS);
334 cjkCHT = CompressArray (cjkCHT,
335 MSCompatUnicodeTableUtil.Cjk);
336 cjkJA = CompressArray (cjkJA,
337 MSCompatUnicodeTableUtil.Cjk);
338 cjkKO = CompressArray (cjkKO,
339 MSCompatUnicodeTableUtil.Cjk);
340 cjkKOlv2 = CompressArray (cjkKOlv2,
341 MSCompatUnicodeTableUtil.Cjk);
// From here on every table is dumped the same way: a C# array initializer is
// written to Result, and the binary stream receives the table length followed
// by its elements. After every 16th element the text row is closed with a
// comment giving the index of the first element of that row.
344 Result.WriteLine ("internal static readonly byte [] ignorableFlags = new byte [] {");
346 MemoryStream ms = new MemoryStream ();
347 BinaryWriter binary = new BinaryWriter (ms);
348 binary.Write (ignorableFlags.Length);
350 for (int i = 0; i < ignorableFlags.Length; i++) {
351 byte value = ignorableFlags [i];
353 Result.Write ("{0},", value);
355 Result.Write ("0x{0:X02},", value);
357 binary.Write (value);
359 if ((i & 0xF) == 0xF)
360 Result.WriteLine ("// {0:X04}", i - 0xF);
362 Result.WriteLine ("};");
366 Result.WriteLine ("internal static readonly byte [] categories = new byte [] {");
368 binary.Write (categories.Length);
370 for (int i = 0; i < categories.Length; i++) {
371 byte value = categories [i];
373 Result.Write ("{0},", value);
375 Result.Write ("0x{0:X02},", value);
377 binary.Write (value);
379 if ((i & 0xF) == 0xF)
380 Result.WriteLine ("// {0:X04}", i - 0xF);
382 Result.WriteLine ("};");
385 // Primary weight value
386 Result.WriteLine ("internal static readonly byte [] level1 = new byte [] {");
388 binary.Write (level1.Length);
390 for (int i = 0; i < level1.Length; i++) {
391 byte value = level1 [i];
393 Result.Write ("{0},", value);
395 Result.Write ("0x{0:X02},", value);
397 binary.Write (value);
399 if ((i & 0xF) == 0xF)
400 Result.WriteLine ("// {0:X04}", i - 0xF);
402 Result.WriteLine ("};");
406 Result.WriteLine ("internal static readonly byte [] level2 = new byte [] {");
408 binary.Write (level2.Length);
410 for (int i = 0; i < level2.Length; i++) {
411 byte value = level2 [i];
413 Result.Write ("{0},", value);
415 Result.Write ("0x{0:X02},", value);
417 binary.Write (value);
419 if ((i & 0xF) == 0xF)
420 Result.WriteLine ("// {0:X04}", i - 0xF);
422 Result.WriteLine ("};");
426 Result.WriteLine ("internal static readonly byte [] level3 = new byte [] {");
428 binary.Write (level3.Length);
430 for (int i = 0; i < level3.Length; i++) {
431 byte value = level3 [i];
433 Result.Write ("{0},", value);
435 Result.Write ("0x{0:X02},", value);
437 binary.Write (value);
439 if ((i & 0xF) == 0xF)
440 Result.WriteLine ("// {0:X04}", i - 0xF);
442 Result.WriteLine ("};");
445 // Width insensitivity mappings
446 // (for now it is more lightweight than dumping the
447 // entire NFKD table).
448 Result.WriteLine ("internal static readonly ushort [] widthCompat = new ushort [] {");
450 binary.Write (widthCompat.Length);
452 for (int i = 0; i < widthCompat.Length; i++) {
453 ushort value = widthCompat [i];
455 Result.Write ("{0},", value);
// NOTE(review): widthCompat is ushort but the hex form uses X02 here, whereas
// SerializeCJK formats ushort values with X04 — confirm the width is intended.
457 Result.Write ("0x{0:X02},", value);
459 binary.Write (value);
461 if ((i & 0xF) == 0xF)
462 Result.WriteLine ("// {0:X04}", i - 0xF);
464 Result.WriteLine ("};");
// Flush everything accumulated in the memory stream to the core binary file.
467 using (FileStream fs = File.Create ("../collation.core.bin")) {
468 byte [] array = ms.ToArray ();
469 fs.Write (array, 0, array.Length);
// CJK tables go to their own files; cjkCHS is dumped up to char.MaxValue,
// the others up to 0x9FB0.
474 SerializeCJK ("cjkCHS", cjkCHS, char.MaxValue);
475 SerializeCJK ("cjkCHT", cjkCHT, 0x9FB0);
476 SerializeCJK ("cjkJA", cjkJA, 0x9FB0);
477 SerializeCJK ("cjkKO", cjkKO, 0x9FB0);
478 SerializeCJK ("cjkKOlv2", cjkKOlv2, 0x9FB0);
// Dumps a ushort CJK table: as a C# array named `name` to Result, and as
// binary (Int32 length, then the elements) to ../collation.{name}.bin.
// `max` is compared against the running index on every iteration.
// NOTE(review): presumably the dump stops once index `max` is reached — confirm.
481 void SerializeCJK (string name, ushort [] cjk, int max)
483 int offset = 0;//char.MaxValue - cjk.Length;
484 Result.WriteLine ("static ushort [] {0} = new ushort [] {{", name);
486 MemoryStream ms = new MemoryStream ();
487 BinaryWriter binary = new BinaryWriter (ms);
488 binary.Write (cjk.Length);
490 for (int i = 0; i < cjk.Length; i++) {
491 if (i + offset == max)
493 ushort value = cjk [i];
495 Result.Write ("{0},", value);
497 Result.Write ("0x{0:X04},", value);
499 binary.Write (value);
// Close each row of 16 elements with a comment holding the row's first index.
501 if ((i & 0xF) == 0xF)
502 Result.WriteLine ("// {0:X04}", i - 0xF + offset);
504 Result.WriteLine ("};");
507 using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) {
508 byte [] array = ms.ToArray ();
509 fs.Write (array, 0, array.Length);
// Dumps a byte CJK table: as a C# array named `name` to Result, and as binary
// to ../collation.{name}.bin. `max` is compared against the running index on
// every iteration.
// NOTE(review): presumably the dump stops once index `max` is reached — confirm.
// NOTE(review): unlike the ushort overload, no cjk.Length prefix is written to
// the binary stream here — confirm that readers of the file expect that.
514 void SerializeCJK (string name, byte [] cjk, int max)
516 int offset = 0;//char.MaxValue - cjk.Length;
517 Result.WriteLine ("static byte [] {0} = new byte [] {{", name);
519 MemoryStream ms = new MemoryStream ();
520 BinaryWriter binary = new BinaryWriter (ms);
522 for (int i = 0; i < cjk.Length; i++) {
523 if (i + offset == max)
525 byte value = cjk [i];
527 Result.Write ("{0},", value);
529 Result.Write ("0x{0:X02},", value);
531 binary.Write (value);
// Close each row of 16 elements with a comment holding the row's first index.
533 if ((i & 0xF) == 0xF)
534 Result.WriteLine ("// {0:X04}", i - 0xF + offset);
536 Result.WriteLine ("};");
539 using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) {
540 byte [] array = ms.ToArray ();
541 fs.Write (array, 0, array.Length);
// Writes the tailoring data in two parts: a flat char table holding every
// tailoring's items back to back, and a TailoringInfo table that points into
// it. Both go to Result as C# source and to ../collation.tailoring.bin.
546 void SerializeTailorings ()
// LCID -> start offset of that tailoring inside the flat char table.
548 Hashtable indexes = new Hashtable ();
// LCID -> number of chars that tailoring contributes.
549 Hashtable counts = new Hashtable ();
550 Result.WriteLine ("static char [] tailorings = new char [] {");
553 MemoryStream ms = new MemoryStream ();
554 BinaryWriter binary = new BinaryWriter (ms);
556 foreach (Tailoring t in tailorings) {
559 Result.Write ("/*{0}*/", t.LCID);
560 indexes.Add (t.LCID, count);
561 char [] values = t.ItemToCharArray ();
562 counts.Add (t.LCID, values.Length);
563 foreach (char c in values) {
564 Result.Write ("'\\x{0:X}', ", (int) c);
565 if (++count % 16 == 0)
566 Result.WriteLine (" // {0:X04}", count - 16);
// Each char is stored as a 16-bit value in the binary stream.
568 binary.Write ((ushort) c);
572 Result.WriteLine ("};");
574 Result.WriteLine ("static TailoringInfo [] tailoringInfos = new TailoringInfo [] {");
// Keep the char data aside and start a fresh stream: the info table is
// written first and the raw char data is appended after it.
576 byte [] rawdata = ms.ToArray ();
577 ms = new MemoryStream ();
578 binary = new BinaryWriter (ms);
579 binary.Write (tailorings.Count);
581 foreach (Tailoring t in tailorings) {
// An aliased tailoring reuses the offset/count of the LCID it points at.
582 int target = t.Alias != 0 ? t.Alias : t.LCID;
583 if (!indexes.ContainsKey (target)) {
// NOTE(review): the message is labelled WARNING but it is thrown as an exception.
584 throw new Exception (String.Format ("WARNING: no corresponding definition for tailoring alias. From {0} to {1}", t.LCID, t.Alias));
587 int idx = (int) indexes [target];
588 int cnt = (int) counts [target];
589 bool french = t.FrenchSort;
// NOTE(review): this scan matches on t.LCID, i.e. entries with t's own LCID,
// not on the alias target — if the flag of the aliased tailoring was meant,
// `target` may have been intended here; confirm.
591 foreach (Tailoring t2 in tailorings)
592 if (t2.LCID == t.LCID)
593 french = t2.FrenchSort;
594 Result.WriteLine ("new TailoringInfo ({0}, 0x{1:X}, {2}, {3}), ", t.LCID, idx, cnt, french ? "true" : "false");
596 binary.Write (t.LCID);
599 binary.Write (french);
602 Result.WriteLine ("};");
// Two 0xFF bytes, then the char count (byte length / 2) and the raw char data.
604 binary.Write ((byte) 0xFF);
605 binary.Write ((byte) 0xFF);
606 binary.Write (rawdata.Length / 2);
607 binary.Write (rawdata, 0, rawdata.Length);
610 using (FileStream fs = File.Create ("../collation.tailoring.bin")) {
611 byte [] array = ms.ToArray ();
612 fs.Write (array, 0, array.Length);
// Builds the input file paths under dirname and runs the individual parsers.
// The tailoring source is read from a fixed relative file name rather than
// from dirname.
619 void ParseSources (string dirname)
622 dirname + "/UnicodeData.txt";
623 string derivedCoreProps =
624 dirname + "/DerivedCoreProperties.txt";
626 dirname + "/Scripts.txt";
628 dirname + "/CP932.TXT";
630 dirname + "/DerivedAge.txt";
631 string chXML = dirname + "/common/collation/zh.xml";
632 string jaXML = dirname + "/common/collation/ja.xml";
633 string koXML = dirname + "/common/collation/ko.xml";
635 ParseDerivedAge (derivedAge);
639 ParseJISOrder (cp932); // in prior to ParseUnidata()
640 ParseUnidata (unidata);
642 ParseDerivedCoreProperties (derivedCoreProps);
643 ParseScripts (scripts);
644 ParseCJK (chXML, jaXML, koXML);
646 ParseTailorings ("mono-tailoring-source.txt");
// Reads the tailoring source file line by line and hands every trimmed line
// to ProcessTailoringLine. When an exception occurs, the current line number
// is reported on stderr.
649 void ParseTailorings (string filename)
653 using (StreamReader sr = new StreamReader (filename)) {
655 while (sr.Peek () >= 0) {
657 ProcessTailoringLine (ref t,
658 sr.ReadLine ().Trim ());
660 } catch (Exception) {
661 Console.Error.WriteLine ("ERROR at line {0}", line);
667 // For now this is enough.
// Converts a tailoring source token into a string, turning a "\uXXXX" escape
// into the character with that hexadecimal code.
// NOTE(review): the escape test is s.StartsWith, which looks at the start of s
// rather than at position i, and the hex digits are always taken from
// s.Substring (2, 4) — confirm this behaves as intended for escapes that are
// not at the very beginning of the token.
668 string ParseTailoringSourceValue (string s)
670 StringBuilder sb = new StringBuilder ();
671 for (int i = 0; i < s.Length; i++) {
672 if (s.StartsWith ("\\u")) {
673 sb.Append ((char) int.Parse (
674 s.Substring (2, 4), NumberStyles.HexNumber),
681 return sb.ToString ();
// Interprets one line of the tailoring source and updates the current
// Tailoring t (passed by ref because a header line replaces it).
// Visible line forms: a header carrying an LCID with an optional "=alias",
// "*FrenchSort", "*Diacritical XX -> YY", "source : b0 b1 b2 b3" and
// "source = replacement".
684 void ProcessTailoringLine (ref Tailoring t, string s)
// Strip a trailing '#' comment and ignore lines that end up empty.
686 int idx = s.IndexOf ('#');
688 s = s.Substring (0, idx).Trim ();
689 if (s.Length == 0 || s [0] == '#')
// Header line: one leading marker character, the LCID, and optionally '='
// followed by the LCID this tailoring is an alias of.
692 idx = s.IndexOf ('=');
695 int.Parse (s.Substring (1, idx - 1)),
696 int.Parse (s.Substring (idx + 1)));
698 t = new Tailoring (int.Parse (s.Substring (1)));
702 if (s.StartsWith ("*FrenchSort")) {
// "*Diacritical XX -> YY": both sides are parsed as hexadecimal bytes.
706 string d = "*Diacritical";
707 if (s.StartsWith (d)) {
708 idx = s.IndexOf ("->");
709 t.AddDiacriticalMap (
710 byte.Parse (s.Substring (d.Length, idx - d.Length).Trim (),
711 NumberStyles.HexNumber),
712 byte.Parse (s.Substring (idx + 2).Trim (),
713 NumberStyles.HexNumber));
// "source : b0 b1 b2 b3": a sort key of up to four space-separated hex bytes.
716 idx = s.IndexOf (':');
718 string source = s.Substring (0, idx).Trim ();
719 string [] l = s.Substring (idx + 1).Trim ().Split (' ');
720 byte [] b = new byte [4];
721 for (int i = 0; i < 4; i++) {
725 b [i] = byte.Parse (l [i],
726 NumberStyles.HexNumber);
728 t.AddSortKeyMap (ParseTailoringSourceValue (source),
// "source = replacement": both sides go through ParseTailoringSourceValue.
731 idx = s.IndexOf ('=');
733 t.AddReplacementMap (
734 ParseTailoringSourceValue (
735 s.Substring (0, idx).Trim ()),
736 ParseTailoringSourceValue (
737 s.Substring (idx + 1).Trim ()));
// Reads DerivedAge-style lines of the form "XXXX[..YYYY] ; value # comment"
// and parses the code point (range) and the age value of each.
740 void ParseDerivedAge (string filename)
742 using (StreamReader file =
743 new StreamReader (filename)) {
744 while (file.Peek () >= 0) {
745 string s = file.ReadLine ();
// Drop the trailing '#' comment, then split at the first ';'.
746 int idx = s.IndexOf ('#');
748 s = s.Substring (0, idx);
749 idx = s.IndexOf (';');
753 string cpspec = s.Substring (0, idx);
// A ".." in the spec marks a range; otherwise start and end are the same.
754 idx = cpspec.IndexOf ("..");
755 NumberStyles nf = NumberStyles.HexNumber |
756 NumberStyles.AllowTrailingWhite;
757 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
758 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
759 string value = s.Substring (cpspec.Length + 1).Trim ();
// Only code points that fit in a single char are handled.
762 if (cp > char.MaxValue)
// NOTE(review): double.Parse without an explicit culture uses the current
// culture; the data uses '.' as decimal separator — confirm the tool is run
// under a compatible culture.
765 double v = double.Parse (value);
766 for (int i = cp; i <= cpEnd; i++)
770 unicodeAge [0] = double.MaxValue; // never be supported
// Feeds every line of the UnicodeData file to ProcessUnidataLine, collecting
// decomposition code points in a local list that is finally stored in
// this.decompValues as an int array. When an exception occurs, the line
// number is reported on stderr.
773 void ParseUnidata (string filename)
775 ArrayList decompValues = new ArrayList ();
776 using (StreamReader unidata =
777 new StreamReader (filename)) {
778 for (int line = 1; unidata.Peek () >= 0; line++) {
780 ProcessUnidataLine (unidata.ReadLine (), decompValues);
781 } catch (Exception) {
782 Console.Error.WriteLine ("**** At line " + line);
787 this.decompValues = (int [])
788 decompValues.ToArray (typeof (int));
// State carried across ProcessUnidataLine calls: the Latin base letter seen
// most recently, used as fallback target for lines without their own.
791 char previousLatinTarget = char.MinValue;
// One counter per letter 'A'..'Z', incremented in ProcessUnidataLine to derive
// distinct diacritical weights.
792 byte [] diacriticalOffset = new byte ['Z' - 'A' + 1];
// Processes one raw line of the UnicodeData file.
//   s            - the raw line (code point first, ';'-separated fields).
//   decompValues - shared list receiving the decomposition code points.
// Fills the per-code-point tables (isSmallCapital, latinMap, diacritical,
// arrowValues, boxValues, sortableCharNames, the Arabic maps, decompType,
// decompIndex, decompLength, nfkdMap, decimalValue).
794 void ProcessUnidataLine (string s, ArrayList decompValues)
// Drop a trailing '#' comment, then split off the code point.
796 int idx = s.IndexOf ('#');
798 s = s.Substring (0, idx);
799 idx = s.IndexOf (';');
802 int cp = int.Parse (s.Substring (0, idx), NumberStyles.HexNumber);
// values [0] is the character name; later fields follow in file order.
803 string [] values = s.Substring (idx + 1).Split (';');
806 if (cp > char.MaxValue)
808 if (IsIgnorable (cp))
811 string name = values [0];
813 // SPECIAL CASE: rename some characters for diacritical
814 // remapping. FIXME: why are they different?
815 // FIXME: it's still not working.
816 if (cp == 0x018B || cp == 0x018C)
817 name = name.Replace ("TOPBAR", "STROKE");
// The searches below run on the raw line s, which begins with the hex code
// point, so a name fragment can never be found at index 0.
820 if (s.IndexOf ("SMALL CAPITAL") > 0)
821 isSmallCapital [cp] = true;
823 // latin mapping by character name
824 if (s.IndexOf ("LATIN") >= 0) {
// Locate the letter that follows one of the known "LETTER ..." prefixes;
// offset starts out as the position right after "LETTER DOTLESS " (15 chars).
825 int lidx = s.IndexOf ("LETTER DOTLESS ");
826 int offset = lidx + 15;
828 lidx = s.IndexOf ("LETTER TURNED ");
832 lidx = s.IndexOf ("LETTER CAPITAL ");
836 lidx = s.IndexOf ("LETTER SCRIPT ");
840 lidx = s.IndexOf ("LETTER ");
// c is the candidate base letter, n the character right after it.
843 char c = lidx > 0 ? s [offset] : char.MinValue;
844 char n = s [offset + 1];
845 char target = char.MinValue;
// NOTE(review): && binds tighter than ||, so this reads as
// ('A' <= c && c <= 'Z' && n == ' ') || n == ';' — the parenthesis placement
// suggests (n == ' ' || n == ';') may have been intended; confirm.
846 if ('A' <= c && c <= 'Z' &&
847 (n == ' ') || n == ';') {
849 // FIXME: After 'Z', I cannot reset this state.
850 previousLatinTarget = c == 'Z' ? char.MinValue : c;
// Letters whose name is a word rather than a single letter.
853 if (s.Substring (offset).StartsWith ("ALPHA"))
855 else if (s.Substring (offset).StartsWith ("TONE SIX"))
857 else if (s.Substring (offset).StartsWith ("OPEN O"))
859 else if (s.Substring (offset).StartsWith ("SCHWA"))
861 else if (s.Substring (offset).StartsWith ("ENG"))
863 else if (s.Substring (offset).StartsWith ("OI;")) // 01A2,01A3
865 else if (s.Substring (offset).StartsWith ("YR;")) // 01A2,01A3
867 else if (s.Substring (offset).StartsWith ("TONE TWO"))
869 else if (s.Substring (offset).StartsWith ("ESH"))
872 // For remaining IPA chars, direct mapping is
875 case 0x0299: target = 'B'; break;
876 case 0x029A: target = 'E'; break;
877 case 0x029B: target = 'G'; break;
878 case 0x029C: target = 'H'; break;
879 case 0x029D: target = 'J'; break;
880 case 0x029E: target = 'K'; break;
881 case 0x029F: target = 'L'; break;
882 case 0x02A0: target = 'Q'; break;
883 case 0x02A7: target = 'T'; break;
884 case 0x02A8: target = 'T'; break;
// Fall back to the base letter remembered from an earlier line.
887 if (target == char.MinValue)
888 target = previousLatinTarget;
890 if (target != char.MinValue) {
891 ArrayList entry = (ArrayList) latinMap [target];
893 entry = new ArrayList ();
894 latinMap [target] = entry;
897 // FIXME: This secondary weight is hack.
898 // They are here because they must not
899 // be identical to the corresponding
// NOTE(review): assumes c lies within 'A'..'Z' — otherwise c - 'A' is outside
// the 26-entry diacriticalOffset array; TODO confirm.
901 if (c != target && diacritical [cp] == 0) {
902 diacriticalOffset [c - 'A']++;
903 diacritical [cp] = (byte) (diacriticalOffset [c - 'A'] + 0x7C);
// Arrow values for code points in U+2000..U+2FFF.
909 if (0x2000 <= cp && cp < 0x3000) {
911 // SPECIAL CASES. FIXME: why?
913 case 0x21C5: value = -1; break; // E2
914 case 0x261D: value = 1; break;
915 case 0x27A6: value = 3; break;
916 case 0x21B0: value = 7; break;
917 case 0x21B1: value = 3; break;
918 case 0x21B2: value = 7; break;
919 case 0x21B4: value = 5; break;
920 case 0x21B5: value = 7; break;
921 case 0x21B9: value = -1; break; // E1
922 case 0x21CF: value = 7; break;
923 case 0x21D0: value = 3; break;
925 string [] arrowTargets = new string [] {
// NOTE(review): the scan starts at index 1, so arrowTargets [0] is never
// matched here — presumably a placeholder entry; confirm.
937 for (int i = 1; value == 0 && i < arrowTargets.Length; i++)
938 if (s.IndexOf (arrowTargets [i]) > 0 &&
939 s.IndexOf ("BARB " + arrowTargets [i]) < 0 &&
940 s.IndexOf (" OVER") < 0
944 arrowValues.Add (new DictionaryEntry (
// Box drawing / block / geometric shape values for U+2500..U+25FF.
949 if (0x2500 <= cp && cp < 0x2600) {
952 // up:1 down:2 right:4 left:8 vert:16 horiz:32
955 // [dr] [dl] [ur] [ul]
959 ArrayList flags = new ArrayList (new int [] {
962 4 + 2, 8 + 2, 4 + 1, 8 + 1,
963 16 + 4, 1 + 2 + 4, 16 + 8, 1 + 2 + 8,
964 32 + 2, 4 + 8 + 2, 32 + 1, 4 + 8 + 1,
965 16 + 32, 1 + 2 + 4 + 8, 4 + 8 + 16, 1 + 2 + 32
967 byte [] offsets = new byte [] {
974 if (s.IndexOf ("BOX DRAWINGS ") >= 0) {
976 if (s.IndexOf (" UP") >= 0)
978 if (s.IndexOf (" DOWN") >= 0)
980 if (s.IndexOf (" RIGHT") >= 0)
982 if (s.IndexOf (" LEFT") >= 0)
984 if (s.IndexOf (" VERTICAL") >= 0)
986 if (s.IndexOf (" HORIZONTAL") >= 0)
// Look the direction combination up in flags; an unknown combination keeps
// the negative IndexOf result as the value.
989 int fidx = flags.IndexOf (flag);
990 value = fidx < 0 ? fidx : offsets [fidx];
991 } else if (s.IndexOf ("BLOCK") >= 0) {
992 if (s.IndexOf ("ONE EIGHTH") >= 0)
994 else if (s.IndexOf ("ONE QUARTER") >= 0)
996 else if (s.IndexOf ("THREE EIGHTHS") >= 0)
998 else if (s.IndexOf ("HALF") >= 0)
1000 else if (s.IndexOf ("FIVE EIGHTHS") >= 0)
1002 else if (s.IndexOf ("THREE QUARTERS") >= 0)
1004 else if (s.IndexOf ("SEVEN EIGHTHS") >= 0)
1009 else if (s.IndexOf ("SHADE") >= 0)
// Shape values are written relative to 0xE5.
// NOTE(review): presumably 0xE5 is a base that is added back by the consumer
// of boxValues — confirm.
1011 else if (s.IndexOf ("SQUARE") >= 0)
1012 value = 0xBC - 0xE5;
// "VERTICAL RECTANGLE" has to be tested before the plain "RECTANGLE" it contains.
1013 else if (s.IndexOf ("VERTICAL RECTANGLE") >= 0)
1014 value = 0xBE - 0xE5;
1015 else if (s.IndexOf ("RECTANGLE") >= 0)
1016 value = 0xBD - 0xE5;
1017 else if (s.IndexOf ("PARALLELOGRAM") >= 0)
1018 value = 0xBF - 0xE5;
1019 else if (s.IndexOf ("TRIANGLE") >= 0) {
1020 if (s.IndexOf ("UP-POINTING") >= 0)
1021 value = 0xC0 - 0xE5;
1022 else if (s.IndexOf ("RIGHT-POINTING") >= 0)
1023 value = 0xC1 - 0xE5;
1024 else if (s.IndexOf ("DOWN-POINTING") >= 0)
1025 value = 0xC2 - 0xE5;
1026 else if (s.IndexOf ("LEFT-POINTING") >= 0)
1027 value = 0xC3 - 0xE5;
1029 else if (s.IndexOf ("POINTER") >= 0) {
1030 if (s.IndexOf ("RIGHT-POINTING") >= 0)
1031 value = 0xC4 - 0xE5;
1032 else if (s.IndexOf ("LEFT-POINTING") >= 0)
1033 value = 0xC5 - 0xE5;
1035 else if (s.IndexOf ("DIAMOND") >= 0)
1036 value = 0xC6 - 0xE5;
1037 else if (s.IndexOf ("FISHEYE") >= 0)
1038 value = 0xC7 - 0xE5;
1039 else if (s.IndexOf ("LOZENGE") >= 0)
1040 value = 0xC8 - 0xE5;
1041 else if (s.IndexOf ("BULLSEYE") >= 0)
1042 value = 0xC9 - 0xE5;
1043 else if (s.IndexOf ("CIRCLE") >= 0) {
1044 if (cp == 0x25D6) // it could be IndexOf ("LEFT HALF BLACK CIRCLE")
1045 value = 0xCA - 0xE5;
1046 else if (cp == 0x25D7) // it could be IndexOf ("RIGHT HALF BLACK CIRCLE")
1047 value = 0xCB - 0xE5;
1049 value = 0xC9 - 0xE5;
// U+25DA..U+25E5 receive consecutive values starting at 0xCD - 0xE5.
1051 if (0x25DA <= cp && cp <= 0x25E5)
1052 value = 0xCD + cp - 0x25DA - 0xE5;
1054 // SPECIAL CASE: BOX DRAWING DIAGONAL patterns
1056 case 0x2571: value = 0xF; break;
1057 case 0x2572: value = 0x10; break;
1058 case 0x2573: value = 0x11; break;
1061 boxValues.Add (new DictionaryEntry (
1065 // For some characters store the name and sort later
1066 // to determine sorting.
1067 if (0x2100 <= cp && cp <= 0x213F &&
1068 Char.IsSymbol ((char) cp))
1069 sortableCharNames.Add (
1070 new DictionaryEntry (cp, name));
// NOTE(review): Substring (7) presumably strips a 7-character name prefix such
// as "SQUARE " — confirm.
1071 else if (0x3380 <= cp && cp <= 0x33DD)
1072 sortableCharNames.Add (new DictionaryEntry (
1073 cp, name.Substring (7)));
1075 if (Char.GetUnicodeCategory ((char) cp) ==
1076 UnicodeCategory.MathSymbol) {
1077 if (name.StartsWith ("CIRCLED "))
1078 diacritical [cp] = 0xEE;
1079 if (name.StartsWith ("SQUARED "))
1080 diacritical [cp] = 0xEF;
1083 // diacritical weights by character name
// Sanity check: the label and weight tables must stay in lockstep.
1084 if (diacritics.Length != diacriticWeights.Length)
1085 throw new Exception (String.Format ("Should not happen. weights are {0} while labels are {1}", diacriticWeights.Length, diacritics.Length));
1086 for (int d = 0; d < diacritics.Length; d++) {
// Weights accumulate when several labels match the same line.
1087 if (s.IndexOf (diacritics [d]) > 0) {
1088 diacritical [cp] += diacriticWeights [d];
1089 if (s.IndexOf ("COMBINING") >= 0)
1090 diacritical [cp] -= (byte) 2;
1093 // also process "COMBINING blah" here
1094 // For now it is limited to cp < 0x0370
1095 // if (cp < 0x0300 || cp >= 0x0370)
// Build the matching "COMBINING ..." name from the label: drop the trailing
// ';' and a leading "WITH", then prefix "COMBINING".
1097 string tmp = diacritics [d].TrimEnd (';');
1098 if (tmp.IndexOf ("WITH ") == 0)
1099 tmp = tmp.Substring (4);
1100 tmp = String.Concat ("COMBINING", (tmp [0] != ' ' ? " " : ""), tmp);
1102 diacritical [cp] = (byte) (diacriticWeights [d] - 2);
1106 //Console.Error.WriteLine ("======= {2:X04} : '{0}' / '{1}'", name, tmp, cp);
1108 // Two-step grep required for it.
1109 if (s.IndexOf ("FULL STOP") > 0 &&
1110 (s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0))
1111 diacritical [cp] |= 0xF4;
// NOTE(review): s is the raw line, which begins with the hex code point, so
// s.StartsWith ("SCRIPT") looks like it can never be true — perhaps `name`
// was intended; confirm.
1112 if (s.StartsWith ("SCRIPT") || s.IndexOf (" SCRIPT ") > 0)
1113 diacritical [cp] = (byte) (s.IndexOf ("SMALL") > 0 ? 3 :
1114 s.IndexOf ("CAPITAL") > 0 ? 5 : 4);
1116 // Arabic letter name
1117 if (0x0621 <= cp && cp <= 0x064A &&
1118 Char.GetUnicodeCategory ((char) cp)
1119 == UnicodeCategory.OtherLetter) {
// Default primary value: grows by 4 for each distinct letter name seen so far.
1120 byte value = (byte) (arabicNameMap.Count * 4 + 0x0B);
1125 // hamza, waw, yeh ... special cases.
1130 value = 0x77; // special cases.
1133 // Get primary letter name i.e.
1134 // XXX part of ARABIC LETTER XXX yyy
1135 // e.g. that of "TEH MARBUTA" is "TEH".
1138 // 0x0640 is special: it does
1139 // not start with ARABIC LETTER
// 14 is the length of the "ARABIC LETTER " prefix.
1141 name.Substring (14);
1142 int tmpIdx = letterName.IndexOf (' ');
1143 letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
1144 //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
// A letter name seen before reuses the primary value recorded for it.
1145 if (arabicNameMap.ContainsKey (letterName))
1146 value = (byte) arabicLetterPrimaryValues [arabicNameMap [letterName]];
1148 arabicNameMap [letterName] = cp;
1151 arabicLetterPrimaryValues [cp] = value;
1154 // Japanese square letter
1155 if (0x3300 <= cp && cp <= 0x3357)
1156 if (!ExistsJIS (cp))
1157 nonJisJapanese.Add (new NonJISCharacter (cp, name));
1159 // normalizationType
1160 string decomp = values [4];
1161 idx = decomp.IndexOf ('<');
// NOTE(review): the Substring length IndexOf ('>') - 1 only yields the tag
// text when '<' is at index 0 — holds if the field always starts with the
// tag; confirm.
1163 switch (decomp.Substring (idx + 1, decomp.IndexOf ('>') - 1)) {
1165 decompType [cp] = DecompositionFull;
1168 decompType [cp] = DecompositionSub;
1171 decompType [cp] = DecompositionSuper;
1174 decompType [cp] = DecompositionSmall;
1177 decompType [cp] = DecompositionIsolated;
1180 decompType [cp] = DecompositionInitial;
1183 decompType [cp] = DecompositionFinal;
1186 decompType [cp] = DecompositionMedial;
1189 decompType [cp] = DecompositionNoBreak;
1192 decompType [cp] = DecompositionCompat;
1195 decompType [cp] = DecompositionFraction;
1198 decompType [cp] = DecompositionFont;
1201 decompType [cp] = DecompositionCircle;
1204 decompType [cp] = DecompositionSquare;
1207 decompType [cp] = DecompositionWide;
1210 decompType [cp] = DecompositionNarrow;
1213 decompType [cp] = DecompositionVertical;
1216 throw new Exception ("Support NFKD type : " + decomp);
1220 decompType [cp] = DecompositionCanonical;
// Skip past the "<tag> " part (the '>' and the character after it), if any.
1221 decomp = idx < 0 ? decomp : decomp.Substring (decomp.IndexOf ('>') + 2);
1222 if (decomp.Length > 0) {
// Append the mapped code points to the shared list and remember where they
// start and how many there are.
1224 string [] velems = decomp.Split (' ');
1225 int didx = decompValues.Count;
1226 decompIndex [cp] = didx;
1227 foreach (string v in velems)
1228 decompValues.Add (int.Parse (v, NumberStyles.HexNumber));
1229 decompLength [cp] = velems.Length;
1231 // [decmpType] -> this_cp
1232 int targetCP = (int) decompValues [didx];
1233 // for "(x)" it specially maps to 'x' .
1234 // FIXME: check if it is sane
1235 if (velems.Length == 3 &&
1236 (int) decompValues [didx] == '(' &&
1237 (int) decompValues [didx + 2] == ')')
1238 targetCP = (int) decompValues [didx + 1];
1239 // special: 0x215F "1/"
1240 else if (cp == 0x215F)
// NOTE(review): the lower bound here is 0x4C00, while the CJK table comments
// elsewhere in this file mention 0x4E00 — confirm which is intended.
1242 else if (velems.Length > 1 &&
1243 (targetCP < 0x4C00 || 0x9FBB < targetCP))
1244 // skip them, except for CJK ideograph compat
// Register the reverse mapping: target code point -> (decomposition kind -> cp).
1247 if (targetCP != 0) {
1248 Hashtable entry = (Hashtable) nfkdMap [targetCP];
1249 if (entry == null) {
1250 entry = new Hashtable ();
1251 nfkdMap [targetCP] = entry;
1253 entry [(byte) decompType [cp]] = cp;
// Numeric value: the first non-empty of values [5], [6] and [7] wins.
// NOTE(review): decimal.Parse is called without an explicit culture, so it
// follows the current culture — confirm.
1257 if (values [5].Length > 0)
1258 decimalValue [cp] = decimal.Parse (values [5]);
1259 else if (values [6].Length > 0)
1260 decimalValue [cp] = decimal.Parse (values [6]);
1261 else if (values [7].Length > 0) {
// values [7] may be a fraction "a/b", a parenthesised "(n)" or a number with
// a trailing '.'; each form is reduced to a plain decimal.
1262 string decstr = values [7];
1263 idx = decstr.IndexOf ('/');
1264 if (cp == 0x215F) // special. "1/"
1265 decimalValue [cp] = 0x1;
1269 decimal.Parse (decstr.Substring (0, idx))
1270 / decimal.Parse (decstr.Substring (idx + 1));
1271 else if (decstr [0] == '(' &&
1272 decstr [decstr.Length - 1] == ')')
1275 decimal.Parse (decstr.Substring (1, decstr.Length - 2));
1276 else if (decstr [decstr.Length - 1] == '.')
1279 decimal.Parse (decstr.Substring (0, decstr.Length - 1));
1281 decimalValue [cp] = decimal.Parse (decstr);
// Reads the file at `filename` one line at a time and passes each line to
// ProcessDerivedCorePropLine, keeping a 1-based line counter for diagnostics.
// NOTE(review): the `try` that pairs with the catch below, and whatever
// follows the error message inside the catch (e.g. a rethrow), are not
// visible in this excerpt — confirm against the full file.
1285 void ParseDerivedCoreProperties (string filename)
1288 using (StreamReader file =
1289 new StreamReader (filename)) {
// Loop while Peek () still reports a character (non-negative result).
1290 for (int line = 1; file.Peek () >= 0; line++) {
1292 ProcessDerivedCorePropLine (file.ReadLine ());
1293 } catch (Exception) {
// Report on standard error which input line was being processed.
1294 Console.Error.WriteLine ("**** At line " + line);
// Handles one line of the derived-core-properties input. The text from the
// first '#' onward is dropped, the part before the first ';' is taken as the
// code point spec (a single hex value or a "start..end" range) and the part
// after it as the property value. The visible tail sets isUppercase for each
// code point of the range.
// NOTE(review): the guards around the Substring calls and the test on `value`
// that selects the isUppercase update are not visible in this excerpt —
// confirm against the full file.
1301 void ProcessDerivedCorePropLine (string s)
1303 int idx = s.IndexOf ('#');
1305 s = s.Substring (0, idx);
1306 idx = s.IndexOf (';');
1309 string cpspec = s.Substring (0, idx);
// A ".." inside the spec separates the first and last code point of a range.
1310 idx = cpspec.IndexOf ("..");
// Code points are parsed as hexadecimal; trailing whitespace is tolerated.
1311 NumberStyles nf = NumberStyles.HexNumber |
1312 NumberStyles.AllowTrailingWhite;
1313 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
// Without a range the end code point equals the start.
1314 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
// Everything after the spec and its one-character separator, trimmed.
1315 string value = s.Substring (cpspec.Length + 1).Trim ();
// Start code points above char.MaxValue are treated specially; the statement
// this condition guards is not shown here (presumably the line is skipped).
1318 if (cp > char.MaxValue)
1323 for (int x = cp; x <= cpEnd; x++)
1324 isUppercase [x] = true;
// Reads the scripts data file at `filename` and collects the non-ignorable
// code points into four lists (gurmukhi, gujarati, georgian, thaana). After
// the file is read, each list is sorted with UCAComparer.Instance and copied
// into the matching ordered* char array.
// NOTE(review): the tests on `value` that decide which list a range goes to,
// and the guards around the Substring calls, are not visible in this excerpt —
// presumably a match on the script name; confirm against the full file.
1329 void ParseScripts (string filename)
1331 ArrayList gurmukhi = new ArrayList ();
1332 ArrayList gujarati = new ArrayList ();
1333 ArrayList georgian = new ArrayList ();
1334 ArrayList thaana = new ArrayList ();
1336 using (StreamReader file =
1337 new StreamReader (filename)) {
// Loop while Peek () still reports a character (non-negative result).
1338 while (file.Peek () >= 0) {
1339 string s = file.ReadLine ();
// Text from the first '#' onward is cut off before parsing.
1340 int idx = s.IndexOf ('#');
1342 s = s.Substring (0, idx);
1343 idx = s.IndexOf (';');
// Text before the first ';' is the code point spec: one hex value or
// a "start..end" range.
1347 string cpspec = s.Substring (0, idx);
1348 idx = cpspec.IndexOf ("..");
1349 NumberStyles nf = NumberStyles.HexNumber |
1350 NumberStyles.AllowTrailingWhite;
1351 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1352 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
1353 string value = s.Substring (cpspec.Length + 1).Trim ();
// Start code points above char.MaxValue are treated specially; the guarded
// statement is not shown here (presumably the line is skipped).
1356 if (cp > char.MaxValue)
// Each loop below appends every non-ignorable code point of the range,
// as a char, to one script's list.
1361 for (int x = cp; x <= cpEnd; x++)
1362 if (!IsIgnorable (x))
1363 gurmukhi.Add ((char) x);
1366 for (int x = cp; x <= cpEnd; x++)
1367 if (!IsIgnorable (x))
1368 gujarati.Add ((char) x);
1371 for (int x = cp; x <= cpEnd; x++)
1372 if (!IsIgnorable (x))
1373 georgian.Add ((char) x);
1376 for (int x = cp; x <= cpEnd; x++)
1377 if (!IsIgnorable (x))
1378 thaana.Add ((char) x);
// Order each script's characters with the UCA comparer.
1383 gurmukhi.Sort (UCAComparer.Instance);
1384 gujarati.Sort (UCAComparer.Instance);
1385 georgian.Sort (UCAComparer.Instance);
1386 thaana.Sort (UCAComparer.Instance);
// Publish the sorted lists as char arrays.
1387 orderedGurmukhi = (char []) gurmukhi.ToArray (typeof (char));
1388 orderedGujarati = (char []) gujarati.ToArray (typeof (char));
1389 orderedGeorgian = (char []) georgian.ToArray (typeof (char));
1390 orderedThaana = (char []) thaana.ToArray (typeof (char));
// Reads the file at `filename` one line at a time and passes each line to
// ProcessJISOrderLine, advancing the `line` counter once per line read.
// NOTE(review): the declaration of `line`, the `try` that pairs with the catch
// below, and whatever follows the message inside the catch are not visible in
// this excerpt — confirm against the full file.
1393 void ParseJISOrder (string filename)
1397 using (StreamReader file =
1398 new StreamReader (filename)) {
// Loop while Peek () still reports a character (non-negative result).
1399 for (;file.Peek () >= 0; line++)
1400 ProcessJISOrderLine (file.ReadLine ());
1402 } catch (Exception) {
// Report the current value of the line counter on standard error.
1403 Console.Error.WriteLine ("---- line {0}", line);
// Separator characters (tab and space) that ProcessJISOrderLine passes to
// IndexOfAny to find the end of the first token on a line.
1408 char [] ws = new char [] {'\t', ' '};
// Parses one line of the JIS order data and appends the result to jisJapanese.
// The text before the first '#' is trimmed, then split at the first tab or
// space into two tokens. Each token has its first two characters removed and
// is parsed as hexadecimal: the first becomes the JIS value, the second the
// code point.
// NOTE(review): the guards around the Substring calls (e.g. for lines with no
// '#' or no separator) are not visible in this excerpt — confirm against the
// full file.
1410 void ProcessJISOrderLine (string s)
1412 int idx = s.IndexOf ('#');
1414 s = s.Substring (0, idx).Trim ();
// Position of the first tab/space, i.e. the end of the first token.
1417 idx = s.IndexOfAny (ws);
1420 // Both tokens start with "0x", so cut that prefix off before parsing.
1421 int jis = int.Parse (s.Substring (2, idx - 2), NumberStyles.HexNumber);
1422 int cp = int.Parse (s.Substring (idx).Trim ().Substring (2), NumberStyles.HexNumber);
1423 jisJapanese.Add (new JISCharacter (cp, jis));
// Fills the CJK weight tables from LDML collation XML and from the JIS order
// list collected earlier. The visible code handles four sections in turn:
// Chinese Simplified (pinyin rules), Chinese Traditional (stroke rules),
// Japanese (jisJapanese plus compatibility decompositions) and Korean
// (characters placed after "reset" Hangul syllables).
// NOTE(review): this excerpt does not show where each XML file is loaded into
// `doc`, nor the declarations/assignments of `arr`, `offset`, `s`, `v`,
// `category` and `p`, nor several guarded statements — the comments below
// describe only what the visible lines do.
1426 void ParseCJK (string zhXML, string jaXML, string koXML)
1428 XmlDocument doc = new XmlDocument ();
// No resolver is installed for external resources of the XML documents.
1429 doc.XmlResolver = null;
1436 // Chinese Simplified
1439 offset = 0;//char.MaxValue - arr.Length;
// The inner text of the pinyin collation's <pc> element is the ordered
// character sequence.
1441 s = doc.SelectSingleNode ("/ldml/collations/collation[@type='pinyin']/rules/pc").InnerText;
1443 foreach (char c in s) {
// The condition that triggers this warning is not visible here.
1445 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
// Assign the next weight value to this character, then advance v.
1447 arr [(int) c - offset] = (ushort) v++;
1453 // Chinese Traditional
1456 offset = 0;//char.MaxValue - arr.Length;
// Same procedure as above, using the stroke collation's <pc> element.
1457 s = doc.SelectSingleNode ("/ldml/collations/collation[@type='stroke']/rules/pc").InnerText;
1459 foreach (char c in s) {
1461 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1463 arr [(int) c - offset] = (ushort) v++;
// Japanese section (judging by the jisJapanese loop below): a few code
// points get fixed values first.
1472 offset = 0;//char.MaxValue - arr.Length;
1475 arr [0x4EDD] = 0x8002; // Chinese repetition mark?
1476 arr [0x337B] = 0x8004; // Those 4 characters are Gengou
1477 arr [0x337E] = 0x8005;
1478 arr [0x337D] = 0x8006;
1479 arr [0x337C] = 0x8007;
1482 foreach (JISCharacter jc in jisJapanese) {
// Entries with a JIS value below 0x8800 are treated specially; the guarded
// statement is not visible here (presumably they are skipped).
1483 if (jc.JIS < 0x8800)
1485 char c = (char) jc.CP;
1488 // Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1491 arr [(int) c - offset] = (ushort) v++;
// Special cases for five ideographs; the trailing comments name a related
// code point for each. The guarded statements are not visible here.
1496 if (c == '\u662D') // U+337C
1498 if (c == '\u5927') // U+337D
1500 if (c == '\u5E73') // U+337B
1502 if (c == '\u660E') // U+337E
1504 if (c == '\u9686') // U+F9DC
1507 // FIXME: there are still remaining
1508 // characters after U+FA0C.
1509 // for (int k = 0; k < char.MaxValue; k++) {
// Scan code points below U+FA0D for decompositions that refer to c.
1510 for (int k = 0; k < '\uFA0D'; k++) {
// Code points with decompIndex 0 or that are ignorable are treated specially
// (guarded statement not visible; presumably skipped).
1511 if (decompIndex [k] == 0 || IsIgnorable (k))
// Match when the first decomposition value is c, or when the decomposition
// has length 3 and its second value is c; then k gets the next weight.
1513 if (decompValues [decompIndex [k]] == c /*&&
1514 decompLength [k] == 1*/ ||
1515 decompLength [k] == 3 &&
1516 decompValues [decompIndex [k] + 1] == c) {
1517 arr [k - offset] = (ushort) v++;
1526 // Korean weight is somewhat complex. It first shifts
1527 // Hangul category from 52-x to 80-x (they are anyways
1528 // computed). CJK ideographs are placed at secondary
1529 // weight, like XX YY 01 zz 01, where XX and YY are
1530 // corresponding "reset" value and zz is 41,43,45...
1532 // Unlike chs,cht and ja, Korean value is a combined
1533 // ushort which is computed as category
1537 offset = 0;//char.MaxValue - arr.Length;
1539 foreach (XmlElement reset in doc.SelectNodes ("/ldml/collations/collation/rules/reset")) {
// The element immediately following each <reset> is taken as its companion.
1540 XmlElement sc = (XmlElement) reset.NextSibling;
1541 // compute "category" and "level 1" for the
1542 // target "reset" Hangul syllable
1543 char rc = reset.InnerText [0];
// 1-based position of the reset character counted from U+AC00.
1544 int ri = ((int) rc - 0xAC00) + 1;
// Continuation of an expression whose first line is not visible here
// (presumably the computation of `p`).
1546 ((ri / 254) * 256 + (ri % 254) + 2);
1547 // Place the characters after the target.
1550 foreach (char c in s) {
// Every listed character receives the same combined value p; its level 2
// value goes to cjkKOlv2 (how v changes between characters is not visible).
1551 arr [(int) c - offset] = p;
1552 cjkKOlv2 [(int) c - offset] = (byte) v;
// Populates ignorableFlags for every char value (0 through char.MaxValue) as
// a bit set: bit 1 when IsIgnorable holds, bit 2 when IsIgnorableSymbol holds,
// bit 4 when IsIgnorableNonSpacing holds.
1562 void FillIgnorables ()
1564 for (int i = 0; i <= char.MaxValue; i++) {
// Unassigned code points are treated specially; the statement this condition
// guards is not visible in this excerpt (presumably they are skipped).
1565 if (Char.GetUnicodeCategory ((char) i) ==
1566 UnicodeCategory.OtherNotAssigned)
1568 if (IsIgnorable (i))
1569 ignorableFlags [i] |= 1;
1570 if (IsIgnorableSymbol (i))
1571 ignorableFlags [i] |= 2;
1572 if (IsIgnorableNonSpacing (i))
1573 ignorableFlags [i] |= 4;
// Applies hand-written corrections to the tables parsed from the Unicode data
// (decompType, decompIndex, decompLength, decompValues, diacritical) so that
// later steps see the adjusted values. The statements depend on each other's
// order (see the U+FA0C / U+F929 swap below).
1577 void ModifyUnidata ()
1579 // Modify some decomposition equivalences: clear the decomposition
// information of U+FE31 and U+FE32 entirely.
1580 decompType [0xFE31] = 0;
1581 decompIndex [0xFE31] = 0;
1582 decompLength [0xFE31] = 0;
1583 decompType [0xFE32] = 0;
1584 decompIndex [0xFE32] = 0;
1585 decompLength [0xFE32] = 0;
1587 // Korean parens numbers
1588 for (int i = 0x3200; i <= 0x321C; i++)
1589 diacritical [i] = 0xA;
1590 for (int i = 0x3260; i <= 0x327B; i++)
1591 diacritical [i] = 0xC;
1593 // LAMESPEC: these remappings should not be done.
1594 // Windows has incorrect CJK compat mappings.
// Each assignment overwrites the first decomposition value of a code point;
// U+323B and U+3238 are also shortened to a one-element decomposition.
1595 decompValues [decompIndex [0x32A9]] = 0x91AB;
1596 decompLength [0x323B] = 1;
1597 decompValues [decompIndex [0x323B]] = 0x5B78;
1598 decompValues [decompIndex [0x32AB]] = 0x5B78;
1599 decompValues [decompIndex [0x32A2]] = 0x5BEB;
1600 decompLength [0x3238] = 1;
1601 decompValues [decompIndex [0x3238]] = 0x52DE;
1602 decompValues [decompIndex [0x3298]] = 0x52DE;
1604 // LAMESPEC: custom remapping (not a bug, but non-standard behavior)
// U+FA0C takes over the decompValues slot of U+F929 and maps to 0x5140;
// U+F929 then loses its decomposition. The order of these four lines matters.
1605 decompIndex [0xFA0C] = decompIndex [0xF929]; // borrow U+F929 room (being empty)
1606 decompValues [decompIndex [0xFA0C]] = 0x5140;
1607 decompLength [0xFA0C] = 1;
1608 decompIndex [0xF929] = decompLength [0xF929] = 0;
1610 decompValues [decompIndex [0xF92C]] = 0x90DE;
// Adjusts values derived from the parsed data in three steps: fixed
// diacritical weights for some Cyrillic letters, diacritical (secondary)
// weights for number characters in the ranges listed in
// numberSecondaryWeightBounds, and renaming or removal of some entries in
// sortableCharNames.
// NOTE(review): the declaration of `weight`, the `switch` header, the case
// labels of the entries to remove, and the statements after RemoveAt are not
// visible in this excerpt — confirm against the full file.
1613 void ModifyParsedValues ()
1615 // some cyrillic diacritical weight. They seem to be
1616 // based on old character names, so it's quicker to
1617 // set them directly here.
1618 diacritical [0x0496] = diacritical [0x0497] = 7;
1619 diacritical [0x0498] = diacritical [0x0499] = 0x1A;
1620 diacritical [0x049A] = diacritical [0x049B] = 0x17;
1621 diacritical [0x049C] = diacritical [0x049D] = 9;
1622 diacritical [0x049E] = diacritical [0x049F] = 4;
1623 diacritical [0x04A0] = diacritical [0x04A1] = 0xA;
1624 diacritical [0x04A2] = diacritical [0x04A3] = 7;
1625 diacritical [0x04A4] = diacritical [0x04A5] = 8;
1627 // number, secondary weights
// The bounds array is read as consecutive pairs (start, exclusive end);
// `weight` advances by one per pair, and only characters for which
// Char.IsNumber is true receive it.
1629 int [] numarr = numberSecondaryWeightBounds;
1630 for (int i = 0; i < numarr.Length; i += 2, weight++)
1631 for (int cp = numarr [i]; cp < numarr [i + 1]; cp++)
1632 if (Char.IsNumber ((char) cp))
1633 diacritical [cp] = weight;
1635 // Update name part of named characters
1636 for (int i = 0; i < sortableCharNames.Count; i++) {
1637 DictionaryEntry de =
1638 (DictionaryEntry) sortableCharNames [i];
1639 int cp = (int) de.Key;
// Stays null unless one of the cases below supplies a replacement name.
1640 string renamed = null;
1642 case 0x2101: renamed = "A_1"; break;
1643 case 0x33C3: renamed = "A_2"; break;
1644 case 0x2105: renamed = "C_1"; break;
1645 case 0x2106: renamed = "C_2"; break;
1646 case 0x211E: renamed = "R1"; break;
1647 case 0x211F: renamed = "R2"; break;
1648 // Remove some of them!
// NOTE(review): removing at index i while iterating forward needs an index
// adjustment so the next entry is not skipped; that code is not visible here.
1659 sortableCharNames.RemoveAt (i);
// Replace the entry in place, keeping its code point key.
1663 if (renamed != null)
1664 sortableCharNames [i] =
1665 new DictionaryEntry (cp, renamed);
1669 void GenerateCore ()
1673 #region Specially ignored // 01
1674 // This will raise "Defined" flag up.
1675 // FIXME: Check If it is really fine. Actually for
1676 // Japanese voice marks this code does remapping.
1677 foreach (char c in specialIgnore)
1678 map [(int) c] = new CharMapEntry (0, 0, 0);
1681 #region Extenders (FF FF)
1682 fillIndex [0xFF] = 0xFF;
1683 char [] specialBiggest = new char [] {
1684 '\u3005', '\u3031', '\u3032', '\u309D',
1685 '\u309E', '\u30FC', '\u30FD', '\u30FE',
1686 '\uFE7C', '\uFE7D', '\uFF70'};
1687 foreach (char c in specialBiggest)
1688 AddCharMap (c, 0xFF, 0);
1691 #region Variable weights
1692 // Controls : 06 03 - 06 3D
1693 fillIndex [0x6] = 3;
1694 for (int i = 0; i < 65536; i++) {
1695 if (IsIgnorable (i))
1698 uc = Char.GetUnicodeCategory (c);
1699 // NEL is whitespace but not ignored here.
1700 if (uc == UnicodeCategory.Control &&
1701 !Char.IsWhiteSpace (c) || c == '\u0085')
1702 AddCharMap (c, 6, 1);
1706 fillIndex [0x6] = 0x80;
1707 AddCharMap ('\'', 6, 0);
1708 AddCharMap ('\uFF07', 6, 1);
1709 AddCharMap ('\uFE63', 6, 1);
1711 // SPECIAL CASE: fill FE32 here in prior to be added
1712 // at 2013. Windows does not always respect NFKD.
1713 map [0xFE32] = new CharMapEntry (6, 0x90, 0);
1715 // Hyphen/Dash : 06 81 - 06 90
1716 for (int i = 0; i < char.MaxValue; i++) {
1717 if (!IsIgnorable (i) &&
1718 Char.GetUnicodeCategory ((char) i) ==
1719 UnicodeCategory.DashPunctuation) {
1720 AddCharMapGroup2 ((char) i, 6, 1, 0);
1722 // SPECIAL: add 2027 and 2043
1723 // Maybe they are regarded the
1724 // same hyphens in "central"
1726 AddCharMap ('\u2027', 6, 1);
1727 AddCharMap ('\u2043', 6, 1);
1731 // They are regarded as primarily equivalent to '-'
1732 map [0x208B] = new CharMapEntry (6, 0x82, 0);
1733 map [0x207B] = new CharMapEntry (6, 0x82, 0);
1734 map [0xFF0D] = new CharMapEntry (6, 0x82, 0);
1736 // Arabic variable weight chars 06 A0 -
1737 fillIndex [6] = 0xA0;
1739 for (int i = 0x64B; i <= 0x650; i++)
1740 AddArabicCharMap ((char) i);
1742 AddCharMapGroup ('\u0652', 6, 1, 0);
1744 AddCharMapGroup ('\u0651', 6, 1, 0);
1748 #region Nonspacing marks // 01
1749 // FIXME: 01 03 - 01 B6 ... annoyance :(
1751 // Combining diacritical marks: 01 DC -
1753 fillIndex [0x1] = 0x41;
1754 for (int i = 0x030E; i <= 0x0326; i++)
1755 if (!IsIgnorable (i))
1756 AddCharMap ((char) i, 0x1, 1);
1757 for (int i = 0x0329; i <= 0x0334; i++)
1758 if (!IsIgnorable (i))
1759 AddCharMap ((char) i, 0x1, 1);
1761 for (int i = 0x0339; i <= 0x0341; i++)
1762 if (!IsIgnorable (i))
1763 AddCharMap ((char) i, 0x1, 1);
1764 fillIndex [0x1] = 0x74;
1765 for (int i = 0x0346; i <= 0x0348; i++)
1766 if (!IsIgnorable (i))
1767 AddCharMap ((char) i, 0x1, 1);
1768 for (int i = 0x02BE; i <= 0x02BF; i++)
1769 if (!IsIgnorable (i))
1770 AddCharMap ((char) i, 0x1, 1);
1771 for (int i = 0x02C1; i <= 0x02C5; i++)
1772 if (!IsIgnorable (i))
1773 AddCharMap ((char) i, 0x1, 1);
1774 for (int i = 0x02CE; i <= 0x02CF; i++)
1775 if (!IsIgnorable (i))
1776 AddCharMap ((char) i, 0x1, 1);
1778 for (int i = 0x02D1; i <= 0x02D3; i++)
1779 if (!IsIgnorable (i))
1780 AddCharMap ((char) i, 0x1, 1);
1781 AddCharMap ('\u02DE', 0x1, 1);
1782 for (int i = 0x02E4; i <= 0x02E9; i++)
1783 if (!IsIgnorable (i))
1784 AddCharMap ((char) i, 0x1, 1);
1786 // FIXME: needs more love here (it should eliminate
1787 // all the hacky code above).
1788 for (int i = 0x0300; i < 0x0370; i++)
1789 if (!IsIgnorable (i) && diacritical [i] != 0
1790 /* especiall here*/ && !map [i].Defined)
1791 map [i] = new CharMapEntry (
1792 0x1, 0x1, diacritical [i]);
1794 // Cyrillic and Armenian nonspacing mark
1795 fillIndex [0x1] = 0x94;
1796 for (int i = 0x400; i < 0x580; i++)
1797 if (!IsIgnorable (i) &&
1798 Char.GetUnicodeCategory ((char) i) ==
1799 UnicodeCategory.NonSpacingMark)
1800 AddCharMap ((char) i, 1, 1);
1802 fillIndex [0x1] = 0x8D;
1803 // syriac dotted nonspacing marks (1)
1804 AddCharMap ('\u0740', 0x1, 1);
1805 AddCharMap ('\u0741', 0x1, 1);
1806 AddCharMap ('\u0742', 0x1, 1);
1807 // syriac oblique nonspacing marks
1808 AddCharMap ('\u0747', 0x1, 1);
1809 AddCharMap ('\u0748', 0x1, 1);
1810 // syriac dotted nonspacing marks (2)
1811 fillIndex [0x1] = 0x94; // this reset is mandatory
1812 AddCharMap ('\u0732', 0x1, 1);
1813 AddCharMap ('\u0735', 0x1, 1);
1814 AddCharMap ('\u0738', 0x1, 1);
1815 AddCharMap ('\u0739', 0x1, 1);
1816 AddCharMap ('\u073C', 0x1, 1);
1817 // SPECIAL CASES: superscripts
1818 AddCharMap ('\u073F', 0x1, 1);
1819 AddCharMap ('\u0711', 0x1, 1);
1821 for (int i = 0x0743; i <= 0x0746; i++)
1822 AddCharMap ((char) i, 0x1, 1);
1823 for (int i = 0x0730; i <= 0x0780; i++)
1824 if (!map [i].Defined &&
1825 Char.GetUnicodeCategory ((char) i) ==
1826 UnicodeCategory.NonSpacingMark)
1827 AddCharMap ((char) i, 0x1, 1);
1829 // LAMESPEC: It should not stop at '\u20E1'. There are
1830 // a few more characters (that however results in
1831 // overflow of level 2 unless we start before 0xDD).
1832 fillIndex [0x1] = 0xDD;
1833 for (int i = 0x20D0; i <= 0x20DC; i++)
1834 AddCharMap ((char) i, 0x1, 1);
1835 fillIndex [0x1] = 0xEC;
1836 for (int i = 0x20DD; i <= 0x20E1; i++)
1837 AddCharMap ((char) i, 0x1, 1);
1838 fillIndex [0x1] = 0x7;
1839 for (int i = 0x302A; i <= 0x302D; i++)
1840 AddCharMap ((char) i, 0x1, 1);
1841 fillIndex [0x1] = 0x50; // I wonder how they are sorted
1842 for (int i = 0x02D4; i <= 0x02D7; i++)
1843 AddCharMap ((char) i, 0x1, 1);
1845 // They are not part of Nonspacing marks, but have
1846 // only diacritical weight.
1847 for (int i = 0x3099; i <= 0x309C; i++)
1848 map [i] = new CharMapEntry (1, 1, 1);
1849 map [0xFF9E] = new CharMapEntry (1, 1, 1);
1850 map [0xFF9F] = new CharMapEntry (1, 1, 2);
1851 map [0x309D] = new CharMapEntry (0xFF, 0xFF, 1);
1852 map [0x309E] = new CharMapEntry (0xFF, 0xFF, 1);
1853 for (int i = 0x30FC; i <= 0x30FE; i++)
1854 map [i] = new CharMapEntry (0xFF, 0xFF, 1);
1859 #region Whitespaces // 07 03 -
1860 fillIndex [0x7] = 0x2;
1861 AddCharMap (' ', 0x7, 2);
1862 AddCharMap ('\u00A0', 0x7, 1);
1863 for (int i = 9; i <= 0xD; i++)
1864 AddCharMap ((char) i, 0x7, 1);
1865 for (int i = 0x2000; i <= 0x200B; i++)
1866 AddCharMap ((char) i, 0x7, 1);
1868 fillIndex [0x7] = 0x17;
1869 AddCharMapGroup ('\u2028', 0x7, 1, 0);
1870 AddCharMapGroup ('\u2029', 0x7, 1, 0);
1872 // Characters which used to represent layout control.
1873 // LAMESPEC: Windows developers seem to have thought
1874 // that those characters are kind of whitespaces,
1875 // while they aren't.
1876 AddCharMap ('\u2422', 0x7, 1, 0); // blank symbol
1877 AddCharMap ('\u2423', 0x7, 1, 0); // open box
1880 // category 09 - continued symbols from 08
1881 fillIndex [0x9] = 2;
1883 for (int cp = 0x2300; cp <= 0x237A; cp++)
1884 AddCharMap ((char) cp, 0x9, 1, 0);
1887 byte [] arrowLv2 = new byte [] {0, 3, 3, 3, 3, 3, 3, 3, 3};
1888 foreach (DictionaryEntry de in arrowValues) {
1889 int idx = (int) de.Value;
1890 int cp = (int) de.Key;
1891 if (map [cp].Defined)
1893 fillIndex [0x9] = (byte) (0xD8 + idx);
1894 AddCharMapGroup ((char) cp, 0x9, 0, arrowLv2 [idx]);
1898 byte [] boxLv2 = new byte [128];
1899 for (int i = 0; i < boxLv2.Length; i++)
1901 foreach (DictionaryEntry de in boxValues) {
1902 int cp = (int) de.Key;
1903 int off = (int) de.Value;
1904 if (map [cp].Defined)
1907 fillIndex [0x9] = (byte) (0xE5 + off);
1908 AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [-off]++);
1911 fillIndex [0x9] = (byte) (0xE5 + off);
1912 AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [off]++);
1915 // Some special characters (slanted)
1916 fillIndex [0x9] = 0xF4;
1917 AddCharMap ('\u2571', 0x9, 3);
1918 AddCharMap ('\u2572', 0x9, 3);
1919 AddCharMap ('\u2573', 0x9, 3);
1921 // FIXME: implement 0A
1923 fillIndex [0xA] = 2;
1924 // byte currency symbols
1925 for (int cp = 0; cp < 0x100; cp++) {
1926 uc = Char.GetUnicodeCategory ((char) cp);
1927 if (!IsIgnorable (cp) &&
1928 uc == UnicodeCategory.CurrencySymbol &&
1931 AddCharMapGroup ((char) cp, 0xA, 1, 0);
1933 // byte other symbols
1934 for (int cp = 0; cp < 0x100; cp++) {
1936 continue; // SPECIAL: skip FIXME: why?
1937 uc = Char.GetUnicodeCategory ((char) cp);
1938 if (!IsIgnorable (cp) &&
1939 uc == UnicodeCategory.OtherSymbol ||
1940 cp == '\u00B5' || cp == '\u00B7')
1941 AddCharMapGroup ((char) cp, 0xA, 1, 0);
1944 AddCharMapGroup ('\u30FB', 0xA, 1, 0);
1946 for (int cp = 0x2020; cp <= 0x2031; cp++)
1947 if (Char.IsPunctuation ((char) cp))
1948 AddCharMap ((char) cp, 0xA, 1, 0);
1949 // SPECIAL CASES: why?
1950 AddCharMap ('\u203B', 0xA, 1, 0);
1951 AddCharMap ('\u2040', 0xA, 1, 0);
1952 AddCharMap ('\u2041', 0xA, 1, 0);
1953 AddCharMap ('\u2042', 0xA, 1, 0);
1955 for (int cp = 0x20A0; cp <= 0x20AB; cp++)
1956 AddCharMap ((char) cp, 0xA, 1, 0);
1957 fillIndex [0xA] = 0x2F; // FIXME: it won't be needed
1958 for (int cp = 0x2600; cp <= 0x2613; cp++)
1959 AddCharMap ((char) cp, 0xA, 1, 0);
1961 for (int cp = 0x2620; cp <= 0x2770; cp++)
1962 if (Char.IsSymbol ((char) cp))
1963 AddCharMap ((char) cp, 0xA, 1, 0);
1965 for (int i = 0x2440; i < 0x2460; i++)
1966 AddCharMap ((char) i, 0xA, 1, 0);
1970 #region Numbers // 0C 02 - 0C E1
1971 fillIndex [0xC] = 2;
1973 // 9F8 : Bengali "one less than the denominator"
1974 AddCharMap ('\u09F8', 0xC, 1, 0x3C);
1976 ArrayList numbers = new ArrayList ();
1977 for (int i = 0; i < 65536; i++)
1978 if (!IsIgnorable (i) &&
1979 Char.IsNumber ((char) i) &&
1980 (i < 0x3190 || 0x32C0 < i)) // they are CJK characters
1983 ArrayList numberValues = new ArrayList ();
1984 foreach (int i in numbers)
1985 numberValues.Add (new DictionaryEntry (i, decimalValue [(char) i]));
1986 numberValues.Sort (DecimalDictionaryValueComparer.Instance);
1988 //foreach (DictionaryEntry de in numberValues)
1989 //Console.Error.WriteLine ("****** number {0:X04} : {1} {2}", de.Key, de.Value, decompType [(int) de.Key]);
1991 decimal prevValue = -1;
1992 foreach (DictionaryEntry de in numberValues) {
1993 int cp = (int) de.Key;
1994 decimal currValue = (decimal) de.Value;
1995 bool addnew = false;
1996 if (prevValue < currValue &&
1997 prevValue - (int) prevValue == 0 &&
2001 // Process Hangzhou and Roman numbers
2003 // There are some SPECIAL cases.
2004 if (currValue != 4) // no increment for 4
2008 if (currValue <= 10) {
2009 xcp = (int) prevValue + 0x2170 - 1;
2010 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2011 xcp = (int) prevValue + 0x2160 - 1;
2012 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2013 fillIndex [0xC] += 2;
2014 xcp = (int) prevValue + 0x3021 - 1;
2015 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2018 else if (currValue == 11)
2021 if (prevValue < currValue)
2022 prevValue = currValue;
2023 if (map [cp].Defined)
2025 // Hangzhou and Roman numerals are added later
2027 else if (0x3021 <= cp && cp < 0x302A
2028 || 0x2160 <= cp && cp < 0x216A
2029 || 0x2170 <= cp && cp < 0x217A)
2032 if (cp == 0x215B) // FIXME: why?
2033 fillIndex [0xC] += 2;
2034 else if (cp == 0x3021) // FIXME: why?
2036 AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp], true);
2037 if (addnew || cp <= '9') {
2038 int mod = (int) currValue - 1;
2040 if (1 <= currValue && currValue <= 10) {
2042 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2044 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2046 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2048 if (1 <= currValue && currValue <= 20) {
2050 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2052 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2054 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2058 if (cp != 0x09E7 && cp != 0x09EA)
2061 // Add special cases that are not regarded as
2062 // numbers in UnicodeCategory speak.
2065 AddCharMapGroup ('\u01BD', 0xC, 0, 0);
2066 AddCharMapGroup ('\u01BC', 0xC, 1, 0);
2068 else if (cp == '6') // FIXME: why?
2073 fillIndex [0xC] = 0xFF;
2074 AddCharMap ('\u221E', 0xC, 1);
2077 #region Letters and NonSpacing Marks (general)
2079 // ASCII Latin alphabets
2080 for (int i = 0; i < alphabets.Length; i++)
2081 AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]);
2083 // non-ASCII Latin alphabets
2084 // FIXME: there is no such characters that are placed
2085 // *after* "alphabets" array items. This is nothing
2086 // more than a hack that creates dummy weight for
2087 // primary characters.
2088 for (int i = 0x0080; i < 0x0300; i++) {
2089 if (!Char.IsLetter ((char) i))
2091 // Latin letters that have an NFKD decomposition are
2092 // not added as independent primary characters.
2093 if (decompIndex [i] != 0)
2096 // 1.some alphabets have primarily
2097 // equivalent ASCII alphabets.
2098 // 2.some have independent primary weights,
2099 // but inside a-to-z range.
2100 // 3.there are some expanded characters that
2101 // are not part of Unicode Standard NFKD.
2102 // 4. some characters are letter in IsLetter
2103 // but not in sortkeys (maybe unicode version
2104 // difference caused it).
2106 // 1. skipping them does not make sense
2107 // case 0xD0: case 0xF0: case 0x131: case 0x138:
2108 // case 0x184: case 0x185: case 0x186: case 0x189:
2109 // case 0x18D: case 0x18E: case 0x18F: case 0x190:
2110 // case 0x194: case 0x195: case 0x196: case 0x19A:
2111 // case 0x19B: case 0x19C:
2112 // 2. skipping them does not make sense
2113 // case 0x14A: // Ng
2114 // case 0x14B: // ng
2118 case 0xDE: // Icelandic Thorn
2119 case 0xFE: // Icelandic Thorn
2120 case 0xDF: // German ss
2121 case 0xFF: // German ss
2123 case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3:
2124 // not classified yet
2125 // case 0x1A6: case 0x1A7: case 0x1A8: case 0x1A9:
2126 // case 0x1AA: case 0x1B1: case 0x1B7: case 0x1B8:
2127 // case 0x1B9: case 0x1BA: case 0x1BB: case 0x1BF:
2131 AddCharMapGroup ((char) i, 0xE, 1, 0);
2135 fillIndex [0xF] = 02;
2136 for (int i = 0x0380; i < 0x0390; i++)
2137 if (Char.IsLetter ((char) i))
2138 AddLetterMap ((char) i, 0xF, 1);
2139 fillIndex [0xF] = 02;
2140 for (int i = 0x0391; i < 0x03CF; i++)
2141 if (Char.IsLetter ((char) i))
2142 AddLetterMap ((char) i, 0xF, 1);
2143 fillIndex [0xF] = 0x40;
2144 for (int i = 0x03D0; i < 0x0400; i++)
2145 if (Char.IsLetter ((char) i))
2146 AddLetterMap ((char) i, 0xF, 1);
2149 // Cyrillic letters are sorted like Latin letters i.e.
2150 // containing culture-specific letters between the
2151 // standard Cyrillic sequence.
2153 // We can't use UCA here; it has different sorting.
2154 char [] orderedCyrillic = new char [] {
2155 '\u0430', '\u0431', '\u0432', '\u0433', '\u0434',
2156 '\u0452', // DJE for Serbocroatian
2158 '\u0454', // IE for Ukrainian
2162 '\u0456', // Byelorussian-Ukrainian I
2172 '\u043F', '\u0440', '\u0441', '\u0442',
2173 '\u045B', // TSHE for Serbocroatian
2175 '\u045E', // Short U for Byelorussian
2176 '\u04B1', // Straight U w/ stroke (diacritical!)
2177 '\u0444', '\u0445', '\u0446', '\u0447',
2179 '\u0448', '\u0449', '\u044A', '\u044B', '\u044C',
2180 '\u044D', '\u044E', '\u044F'};
2182 // For some characters here is a map to basic cyrillic
2183 // letters. See UnicodeData.txt character names for
2184 // the sources. Here I simply declare an equiv. array.
2185 // The content characters are map from U+490(,491),
2186 // skipping small letters.
2187 char [] cymap_src = new char [] {
2188 '\u0433', '\u0433', '\u0433', '\u0436',
2189 '\u0437', '\u043A', '\u043A', '\u043A',
2190 '\u043A', '\u043D', '\u043D', '\u043F',
2191 '\u0445', '\u0441', '\u0442', '\u0443',
2192 '\u0443', '\u0445', '\u0446', '\u0447',
2193 '\u0447', '\u0432', '\u0435', '\u0435',
2194 '\u0406', '\u0436', '\u043A', '\u043D',
2195 '\u0447', '\u0435'};
2197 fillIndex [0x10] = 0x8D;
2198 for (int i = 0x0460; i < 0x0481; i++) {
2199 if (Char.IsLetter ((char) i)) {
2201 // U+476/477 have the same
2202 // primary weight as U+474/475.
2203 fillIndex [0x10] -= 3;
2204 AddLetterMap ((char) i, 0x10, 3);
2208 fillIndex [0x10] = 0x6;
2209 for (int i = 0; i < orderedCyrillic.Length; i++) {
2210 char c = Char.ToUpper (orderedCyrillic [i], CultureInfo.InvariantCulture);
2211 if (!IsIgnorable ((int) c) &&
2212 Char.IsLetter (c) &&
2214 AddLetterMap (c, 0x10, 0);
2215 fillIndex [0x10] += 3;
2219 for (int i = 0; i < cymap_src.Length; i++) {
2220 char c = cymap_src [i];
2221 fillIndex [0x10] = map [c].Level1;
2222 int c2 = 0x0490 + i * 2;
2223 AddLetterMapCore ((char) c2, 0x10, 0, diacritical [c2], false);
2227 fillIndex [0x11] = 0x3;
2228 fillIndex [0x1] = 0x98;
2229 for (int i = 0x0531; i < 0x0586; i++) {
2230 if (i == 0x0559 || i == 0x55A)
2231 AddCharMap ((char) i, 1, 1);
2232 if (Char.IsLetter ((char) i))
2233 AddLetterMap ((char) i, 0x11, 1);
2238 fillIndex [0x12] = 0x2;
2239 for (int i = 0x05D0; i < 0x05FF; i++)
2240 if (Char.IsLetter ((char) i))
2241 AddLetterMap ((char) i, 0x12, 1);
2243 fillIndex [0x1] = 0x3;
2244 for (int i = 0x0591; i <= 0x05C2; i++) {
2245 if (i == 0x05A3 || i == 0x05BB)
2248 AddCharMap ((char) i, 0x1, 1);
2252 fillIndex [0x1] = 0x8E;
2253 fillIndex [0x13] = 0x3;
2254 for (int i = 0x0621; i <= 0x064A; i++) {
2256 if (Char.GetUnicodeCategory ((char) i)
2257 != UnicodeCategory.OtherLetter) {
2258 // FIXME: arabic nonspacing marks are
2259 // in different order.
2260 AddCharMap ((char) i, 0x1, 1);
2263 // map [i] = new CharMapEntry (0x13,
2264 // (byte) arabicLetterPrimaryValues [i], 1);
2266 (byte) arabicLetterPrimaryValues [i];
2267 byte formDiacritical = 8; // default
2270 case 0x0622: formDiacritical = 9; break;
2271 case 0x0623: formDiacritical = 0xA; break;
2272 case 0x0624: formDiacritical = 5; break;
2273 case 0x0625: formDiacritical = 0xB; break;
2274 case 0x0626: formDiacritical = 7; break;
2275 case 0x0649: formDiacritical = 5; break;
2276 case 0x064A: formDiacritical = 7; break;
2278 AddLetterMapCore ((char) i, 0x13, 1, formDiacritical, false);
2280 for (int i = 0x0670; i < 0x0673; i++)
2281 map [i] = new CharMapEntry (0x13, 0xB, (byte) (0xC + i - 0x670));
2282 fillIndex [0x13] = 0x84;
2283 for (int i = 0x0674; i < 0x06D6; i++)
2284 if (Char.IsLetter ((char) i))
2285 AddLetterMapCore ((char) i, 0x13, 1, 0, false);
2288 // FIXME: it does seem straight codepoint mapping.
2289 fillIndex [0x14] = 04;
2290 for (int i = 0x0901; i < 0x0905; i++)
2291 if (!IsIgnorable (i))
2292 AddLetterMap ((char) i, 0x14, 2);
2293 fillIndex [0x14] = 0xB;
2294 for (int i = 0x0905; i < 0x093A; i++) {
2296 AddCharMap ('\u0929', 0x14, 0, 8);
2298 AddCharMap ('\u0931', 0x14, 0, 8);
2300 AddCharMap ('\u0934', 0x14, 0, 8);
2301 if (Char.IsLetter ((char) i))
2302 AddLetterMap ((char) i, 0x14, 4);
2304 AddCharMap ('\u0960', 0x14, 4);
2306 AddCharMap ('\u0961', 0x14, 4);
2308 fillIndex [0x14] = 0xDA;
2309 for (int i = 0x093E; i < 0x0945; i++)
2310 if (!IsIgnorable (i))
2311 AddLetterMap ((char) i, 0x14, 2);
2312 fillIndex [0x14] = 0xEC;
2313 for (int i = 0x0945; i < 0x094F; i++)
2314 if (!IsIgnorable (i))
2315 AddLetterMap ((char) i, 0x14, 2);
2319 fillIndex [0x15] = 02;
2320 for (int i = 0x0980; i < 0x9FF; i++) {
2321 if (IsIgnorable (i))
2324 fillIndex [0x15] = 0x3B;
2325 switch (Char.GetUnicodeCategory ((char) i)) {
2326 case UnicodeCategory.NonSpacingMark:
2327 case UnicodeCategory.DecimalDigitNumber:
2328 case UnicodeCategory.OtherNumber:
2331 AddLetterMap ((char) i, 0x15, 1);
2334 fillIndex [0x1] = 0x3;
2335 for (int i = 0x0981; i < 0x0A00; i++)
2336 if (Char.GetUnicodeCategory ((char) i) ==
2337 UnicodeCategory.NonSpacingMark)
2338 AddCharMap ((char) i, 0x1, 1);
2340 // Gurmukhi. orderedGurmukhi is from UCA
2341 // FIXME: it does not look equivalent to UCA.
2342 fillIndex [0x16] = 04;
2343 fillIndex [0x1] = 3;
2344 for (int i = 0; i < orderedGurmukhi.Length; i++) {
2345 char c = orderedGurmukhi [i];
2346 if (IsIgnorable ((int) c))
2348 if (IsIgnorableNonSpacing (c)) {
2349 AddLetterMap (c, 0x1, 1);
2352 if (c == '\u0A3C' || c == '\u0A4D' ||
2353 '\u0A66' <= c && c <= '\u0A71')
2358 case '\u0A33': case '\u0A36': case '\u0A16':
2359 case '\u0A17': case '\u0A5B': case '\u0A5E':
2363 if (c == '\u0A3E') // Skip
2364 fillIndex [0x16] = 0xC0;
2365 AddLetterMap (c, 0x16, shift);
2368 // Gujarati. orderedGujarati is from UCA
2369 fillIndex [0x17] = 0x4;
2371 map [0x0A4D] = new CharMapEntry (1, 0, 0x3);
2372 map [0x0ABD] = new CharMapEntry (1, 0, 0x3);
2373 map [0x0A3C] = new CharMapEntry (1, 0, 0x4);
2374 map [0x0A71] = new CharMapEntry (1, 0, 0x6);
2375 map [0x0ABC] = new CharMapEntry (1, 0, 0xB);
2376 map [0x0A70] = new CharMapEntry (1, 0, 0xE);
2377 // letters go first.
2378 for (int i = 0; i < orderedGujarati.Length; i++) {
2380 char c = orderedGujarati [i];
2381 if (Char.IsLetter (c)) {
2383 if (c == '\u0AB3' || c == '\u0A32')
2385 if (c == '\u0A33') {
2386 AddCharMap ('\u0A32', 0x17, 0);
2387 AddCharMap ('\u0A33', 0x17, 4, 4);
2391 AddCharMap ('\u0AE0', 0x17, 0, 5);
2392 AddCharMap (c, 0x17, 4);
2395 AddCharMap ('\u0AB3', 0x17, 6);
2399 byte gujaratiShift = 4;
2400 fillIndex [0x17] = 0xC0;
2401 for (int i = 0; i < orderedGujarati.Length; i++) {
2402 char c = orderedGujarati [i];
2403 if (fillIndex [0x17] == 0xCC)
2405 if (!Char.IsLetter (c)) {
2408 AddCharMap ('\u0A81', 0x17, 2);
2411 AddLetterMap (c, 0x17, gujaratiShift);
2416 fillIndex [0x1] = 03;
2417 fillIndex [0x18] = 02;
2418 for (int i = 0x0B00; i < 0x0B7F; i++) {
2419 switch (Char.GetUnicodeCategory ((char) i)) {
2420 case UnicodeCategory.NonSpacingMark:
2421 case UnicodeCategory.DecimalDigitNumber:
2422 AddLetterMap ((char) i, 0x1, 1);
2425 AddLetterMap ((char) i, 0x18, 1);
2429 fillIndex [0x19] = 2;
2430 AddCharMap ('\u0BD7', 0x19, 0);
2431 fillIndex [0x19] = 0xA;
2433 for (int i = 0x0B82; i <= 0x0B94; i++)
2434 if (!IsIgnorable ((char) i))
2435 AddCharMap ((char) i, 0x19, 2);
2437 fillIndex [0x19] = 0x28;
2438 // The array for Tamil consonants is a constant.
2439 // Windows have almost similar sequence to TAM from
2440 // tamilnet but a bit different in Grantha.
2441 for (int i = 0; i < orderedTamilConsonants.Length; i++)
2442 AddLetterMap (orderedTamilConsonants [i], 0x19, 4);
2444 fillIndex [0x19] = 0x82;
2445 for (int i = 0x0BBE; i < 0x0BCD; i++)
2446 if (Char.GetUnicodeCategory ((char) i) ==
2447 UnicodeCategory.SpacingCombiningMark
2449 AddLetterMap ((char) i, 0x19, 2);
2452 fillIndex [0x1A] = 0x4;
2453 for (int i = 0x0C00; i < 0x0C62; i++) {
2454 if (i == 0x0C55 || i == 0x0C56)
2456 AddCharMap ((char) i, 0x1A, 3);
2457 char supp = (i == 0x0C0B) ? '\u0C60':
2458 i == 0x0C0C ? '\u0C61' : char.MinValue;
2459 if (supp == char.MinValue)
2461 AddCharMap (supp, 0x1A, 3);
2465 fillIndex [0x1B] = 4;
2466 for (int i = 0x0C80; i < 0x0CE5; i++) {
2467 if (i == 0x0CD5 || i == 0x0CD6)
2469 if (i == 0x0CB1 || i == 0x0CB3 || i == 0x0CDE)
2470 continue; // shift after 0xCB9
2471 AddCharMap ((char) i, 0x1B, 3);
2473 // SPECIAL CASES: but why?
2474 AddCharMap ('\u0CB1', 0x1B, 3); // RRA
2475 AddCharMap ('\u0CB3', 0x1B, 3); // LLA
2476 AddCharMap ('\u0CDE', 0x1B, 3); // FA
2479 AddCharMap ('\u0CE1', 0x1B, 3); // vocalic LL
2483 fillIndex [0x1C] = 2;
2484 fillIndex [0x1] = 3;
2485 for (int i = 0x0D02; i < 0x0D61; i++) {
2486 // FIXME: I avoided MSCompatUnicodeTable usage
2487 // here (it results in recursion). So check if
2488 // using NonSpacingMark makes sense or not.
2489 if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark)
2490 // if (!MSCompatUnicodeTable.IsIgnorable ((char) i))
2491 AddCharMap ((char) i, 0x1C, 1);
2492 else if (!IsIgnorable ((char) i))
2493 AddCharMap ((char) i, 1, 1);
2496 // Thai ... note that it breaks 0x1E wall after E2B!
2497 // Also, all Thai characters have level 2 value 3.
2498 fillIndex [0x1E] = 2;
2499 fillIndex [0x1] = 3;
2500 for (int i = 0xE40; i <= 0xE44; i++)
2501 AddCharMap ((char) i, 0x1E, 1, 3);
2502 for (int i = 0xE01; i < 0xE2B; i++)
2503 AddCharMap ((char) i, 0x1E, 6, 3);
2504 fillIndex [0x1F] = 5;
2505 for (int i = 0xE2B; i < 0xE30; i++)
2506 AddCharMap ((char) i, 0x1F, 6, 3);
2507 fillIndex [0x1F] = 0x1E;
2508 for (int i = 0xE30; i < 0xE3B; i++)
2509 AddCharMap ((char) i, 0x1F, 1, 3);
2510 // some Thai characters remain.
2511 char [] specialThai = new char [] {'\u0E45', '\u0E46',
2512 '\u0E4E', '\u0E4F', '\u0E5A', '\u0E5B'};
2513 foreach (char c in specialThai)
2514 AddCharMap (c, 0x1F, 1, 3);
2516 for (int i = 0xE00; i < 0xE80; i++)
2517 if (Char.GetUnicodeCategory ((char) i) ==
2518 UnicodeCategory.NonSpacingMark)
2519 AddCharMap ((char) i, 1, 1);
2522 fillIndex [0x1F] = 2;
2523 fillIndex [0x1] = 3;
2524 for (int i = 0xE80; i < 0xEDF; i++) {
2525 if (IsIgnorable ((char) i))
2527 else if (Char.IsLetter ((char) i))
2528 AddCharMap ((char) i, 0x1F, 1);
2529 else if (Char.GetUnicodeCategory ((char) i) ==
2530 UnicodeCategory.NonSpacingMark)
2531 AddCharMap ((char) i, 1, 1);
2534 // Georgian. orderedGeorgian is from UCA DUCET.
2535 fillIndex [0x21] = 5;
2536 for (int i = 0; i < orderedGeorgian.Length; i++) {
2537 char c = orderedGeorgian [i];
2538 if (map [(int) c].Defined)
2540 AddCharMap (c, 0x21, 0);
2542 AddCharMap ((char) (c - 0x30), 0x21, 0);
2543 fillIndex [0x21] += 5;
2547 fillIndex [0x22] = 2;
2548 int kanaOffset = 0x3041;
2549 byte [] kanaLines = new byte [] {2, 2, 2, 2, 1, 3, 1, 2, 1};
2551 for (int gyo = 0; gyo < 9; gyo++) {
2552 for (int dan = 0; dan < 5; dan++) {
2553 if (gyo == 7 && dan % 2 == 1) {
2556 kanaOffset -= 2; // There is no space for yi and ye.
2559 int cp = kanaOffset + dan * kanaLines [gyo];
2560 // small lines (a-gyo, ya-gyo)
2561 if (gyo == 0 || gyo == 7) {
2562 AddKanaMap (cp, 1); // small
2563 AddKanaMap (cp + 1, 1);
2566 AddKanaMap (cp, kanaLines [gyo]);
2570 // add small 'ka' (before normal one)
2571 AddKanaMap (0x30F5, 1);
2575 // add small 'ke' (before normal one)
2576 AddKanaMap (0x30F6, 1);
2580 // add small 'Tsu' (before normal one)
2581 AddKanaMap (0x3063, 1);
2585 fillIndex [0x22] += 3;
2586 kanaOffset += 5 * kanaLines [gyo];
2589 // Wa-gyo is almost special, so I just manually add.
2590 AddLetterMap ((char) 0x308E, 0x22, 0);
2591 AddLetterMap ((char) (0x308E + 0x60), 0x22, 0);
2592 AddLetterMap ((char) 0x308F, 0x22, 0);
2593 AddLetterMap ((char) (0x308F + 0x60), 0x22, 0);
2595 AddLetterMap ((char) 0x3090, 0x22, 0);
2596 AddLetterMap ((char) (0x3090 + 0x60), 0x22, 0);
2597 fillIndex [0x22] += 2;
2598 // no "Wu" in Japanese.
2599 AddLetterMap ((char) 0x3091, 0x22, 0);
2600 AddLetterMap ((char) (0x3091 + 0x60), 0x22, 0);
2602 AddLetterMap ((char) 0x3092, 0x22, 0);
2603 AddLetterMap ((char) (0x3092 + 0x60), 0x22, 0);
2605 fillIndex [0x22] = 0x80;
2606 AddLetterMap ((char) 0x3093, 0x22, 0);
2607 AddLetterMap ((char) (0x3093 + 0x60), 0x22, 0);
2609 map [0x3094] = new CharMapEntry (map [0x30A6].Category,
2610 map [0x30A6].Level1, 3);// voiced hiragana U
2611 map [0x30F4] = new CharMapEntry (map [0x30A6].Category,
2612 map [0x30A6].Level1, 3);// voiced katakana U
2614 map [0x30F5] = new CharMapEntry (map [0x30AB].Category,
2615 map [0x30AB].Level1, 0);// small katakana Ka
2616 map [0x30F6] = new CharMapEntry (map [0x30B1].Category,
2617 map [0x30B1].Level1, 0);// small katakana Ke
2619 for (int i = 0x30F7; i < 0x30FB; i++)
2620 map [i] = new CharMapEntry (map [i - 8].Category,
2624 // JIS Japanese square chars.
2625 fillIndex [0x22] = 0x97;
2626 jisJapanese.Sort (JISComparer.Instance);
2627 foreach (JISCharacter j in jisJapanese)
2628 if (0x3300 <= j.CP && j.CP <= 0x3357)
2629 AddCharMap ((char) j.CP, 0x22, 1);
2630 // non-JIS Japanese square chars.
2631 nonJisJapanese.Sort (NonJISComparer.Instance);
2632 foreach (NonJISCharacter j in nonJisJapanese)
2633 AddCharMap ((char) j.CP, 0x22, 1);
2636 fillIndex [0x23] = 0x02;
2637 for (int i = 0x3105; i <= 0x312C; i++)
2638 AddCharMap ((char) i, 0x23, 1);
2640 // Estrangela: ancient Syriac
2641 fillIndex [0x24] = 0x0B;
2642 // FIXME: is 0x71E really alternative form?
2643 ArrayList syriacAlternatives = new ArrayList (
2644 new int [] {0x714, 0x716, 0x71C, 0x71E, 0x724, 0x727});
2645 for (int i = 0x0710; i <= 0x072C; i++) {
2646 if (i == 0x0711) // NonSpacingMark
2648 if (syriacAlternatives.Contains (i))
2650 AddCharMap ((char) i, 0x24, 4);
2655 foreach (int cp in syriacAlternatives)
2656 map [cp] = new CharMapEntry (0x24,
2657 (byte) (map [cp - 1].Level1 + 2),
2659 // FIXME: Syriac NonSpacingMark should go here.
2662 // FIXME: it turned out that it does not look like UCA
2663 fillIndex [0x24] = 0x6E;
2664 fillIndex [0x1] = 0xAC;
2665 for (int i = 0; i < orderedThaana.Length; i++) {
2666 char c = orderedThaana [i];
2667 if (IsIgnorableNonSpacing ((int) c))
2668 AddCharMap (c, 1, 1);
2669 AddCharMap (c, 0x24, 2);
2670 if (c == '\u0782') // SPECIAL CASE: why?
2671 fillIndex [0x24] += 2;
2675 // FIXME: Add more culture-specific letters (that are
2676 // not supported in Windows collation) here.
2678 // Surrogate ... they are computed.
2683 // Unlike UCA Windows Hangul sequence mixes Jongseong
2684 // with Choseong sequence as well as Jungseong,
2685 // adjusted to have the same primary weight for the
2686 // same base character. So it is impossible to compute
2689 // Here I introduce an ordered sequence of mixed
2690 // 'commands' and 'characters' that is similar to
2692 // - ',' increases primary weight.
2693 // - [A B] means a range, increasing index
2694 // - {A B} means a range, without increasing index
2695 // - '=' is no operation (it means the characters
2696 // of both sides have the same weight).
2697 // - '>' inserts a Hangul Syllable block that
2698 // contains 0x251 characters.
2699 // - '<' decreases the index
2700 // - '0'-'9' means skip count
2701 // - whitespaces are ignored
2704 string hangulSequence =
2705 + "\u1100=\u11A8 > \u1101=\u11A9 >"
2706 + "\u11C3, \u11AA, \u11C4, \u1102=\u11AB >"
2707 + "<{\u1113 \u1116}, \u3165,"
2708 + "\u11C5, \u11C6=\u3166,, \u11C7, \u11C8,"
2709 + "\u11AC, \u11C9, \u11AD, \u1103=\u11AE >"
2710 + "<\u1117, \u11CA, \u1104, \u11CB > \u1105=\u11AF >"
2711 + "<{\u1118 \u111B}, \u11B0, [\u11CC \u11D0], \u11B1,"
2712 + "[\u11D1 \u11D2], \u11B2,"
2713 + "[\u11D3 \u11D5], \u11B3,"
2714 + "[\u11D6 \u11D7], \u11B4, \u11B5,"
2715 + "\u11B6=\u11D8, \u3140,, \u11D9, \u1106=\u11B7 >"
2716 + "<{\u111C \u111D}, [\u11DA \u11E2], \u1107=\u11B8 >"
2717 + "<{\u111E \u1120}, \u3172,, \u3173, \u11E3, \u1108 >"
2718 + "<{\u1121 \u112C}, \u3144 \u11B9, \u3174, \u3175,,,, "
2719 + "\u3176,, \u3177, [\u11E4 \u11E6] \u3178,"
2720 + "\u3179, \u1109=\u11BA,,, \u3214=\u3274 <>"
2721 + "<{\u112D \u1133}, \u11E7 \u317A, \u317B, \u317C "
2722 + "[\u11E8 \u11E9],, \u11EA \u317D,, \u110A=\u11BB,,, >"
2723 + "<{\u1134 \u1140}, \u317E,,,,,,, \u11EB,"
2724 + "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >"
2725 + "<{\u1141 \u114C}, \u3180=\u11EE, \u11EC, \u11ED,,,,, "
2726 + "\u11F1,, \u11F2,,,"
2727 + "\u11EF,,, \u3181=\u11F0, \u110C=\u11BD,, >"
2728 + "<\u114D, \u110D,, >"
2729 + "<{\u114E \u1151},, \u110E=\u11BE,, >"
2730 + "<{\u1152 \u1155},,, \u110F=\u11BF >"
2731 + "\u1110=\u11C0 > \u1111=\u11C1 >"
2732 + "<\u1156=\u1157, \u11F3, \u11F4, \u1112=\u11C2 >"
2733 + "<\u1158=\u1159=\u115F, \u3185, \u11F9,"
2737 byte hangulCat = 0x52;
2738 fillIndex [hangulCat] = 0x2;
2740 int syllableBlock = 0;
2741 for (int n = 0; n < hangulSequence.Length; n++) {
2742 char c = hangulSequence [n];
2744 if (Char.IsWhiteSpace (c))
2750 IncrementSequentialIndex (ref hangulCat);
2753 if (fillIndex [hangulCat] == 2)
2754 throw new Exception ("FIXME: handle it correctly (yes it is hacky, it is really unfortunate).");
2755 fillIndex [hangulCat]--;
2758 IncrementSequentialIndex (ref hangulCat);
2759 for (int l = 0; l < 0x15; l++)
2760 for (int v = 0; v < 0x1C; v++) {
2762 (char) (0xAC00 + syllableBlock * 0x1C * 0x15 + l * 0x1C + v), hangulCat, 0);
2763 IncrementSequentialIndex (ref hangulCat);
2768 start = hangulSequence [n + 1];
2769 end = hangulSequence [n + 3];
2770 for (int i = start; i <= end; i++) {
2771 AddCharMap ((char) i, hangulCat, 0);
2773 IncrementSequentialIndex (ref hangulCat);
2775 n += 4; // consumes 5 characters for this operation
2778 start = hangulSequence [n + 1];
2779 end = hangulSequence [n + 3];
2780 for (int i = start; i <= end; i++)
2781 AddCharMap ((char) i, hangulCat, 0);
2782 n += 4; // consumes 5 characters for this operation
2785 AddCharMap (c, hangulCat, 0);
2791 for (int i = 0x3200; i < 0x3300; i++) {
2792 if (IsIgnorable (i) || map [i].Defined)
2796 if (decompLength [i] == 4 &&
2797 decompValues [decompIndex [i]] == '(')
2798 ch = decompIndex [i] + 1;
2800 else if (decompLength [i] == 2 &&
2801 decompValues [decompIndex [i] + 1] == '\u1161')
2802 ch = decompIndex [i];
2803 else if (decompLength [i] == 1)
2804 ch = decompIndex [i];
2807 ch = decompValues [ch];
2808 if (ch < 0x1100 || 0x1200 < ch &&
2809 ch < 0xAC00 || 0xD800 < ch)
2813 int offset = i < 0x3260 ? 1 : 0;
2814 if (0x326E <= i && i <= 0x3273)
2817 map [i] = new CharMapEntry (map [ch].Category,
2818 (byte) (map [ch].Level1 + offset),
2820 // Console.Error.WriteLine ("Jamo {0:X04} -> {1:X04}", i, decompValues [decompIndex [i] + 1]);
2826 // Letterlike characters and CJK compatibility square
2827 sortableCharNames.Sort (StringDictionaryValueComparer.Instance);
2828 int [] counts = new int ['Z' - 'A' + 1];
2829 char [] namedChars = new char [sortableCharNames.Count];
2831 foreach (DictionaryEntry de in sortableCharNames) {
2832 counts [((string) de.Value) [0] - 'A']++;
2833 namedChars [nCharNames++] = (char) ((int) de.Key);
2835 nCharNames = 0; // reset
2836 for (int a = 0; a < counts.Length; a++) {
2837 fillIndex [0xE] = (byte) (alphaWeights [a + 1] - counts [a]);
2838 for (int i = 0; i < counts [a]; i++)
2839 //Console.Error.WriteLine ("---- {0:X04} : {1:x02} / {2} {3}", (int) namedChars [nCharNames], fillIndex [0xE], ((DictionaryEntry) sortableCharNames [nCharNames]).Value, Char.GetUnicodeCategory (namedChars [nCharNames]));
2840 AddCharMap (namedChars [nCharNames++], 0xE, 1);
2843 // CJK unified ideograph.
2845 fillIndex [cjkCat] = 0x2;
2846 for (int cp = 0x4E00; cp <= 0x9FBB; cp++)
2847 if (!IsIgnorable (cp))
2848 AddCharMapGroupCJK ((char) cp, ref cjkCat);
2849 // CJK Extensions go here.
2850 // LAMESPEC: With this Windows style CJK layout, it is
2851 // impossible to add more CJK ideograph i.e. 0x9FA6-
2852 // 0x9FBB can never be added w/o breaking compat.
2853 for (int cp = 0xF900; cp <= 0xFA2D; cp++)
2854 if (!IsIgnorable (cp))
2855 AddCharMapGroupCJK ((char) cp, ref cjkCat);
2857 // PrivateUse ... computed.
2858 // remaining Surrogate ... computed.
2860 #region 07 - ASCII non-alphanumeric + 3001, 3002 // 07
2861 // non-alphanumeric ASCII except for: + - < = > '
2862 for (int i = 0x21; i < 0x7F; i++) {
2863 if (Char.IsLetterOrDigit ((char) i)
2864 || "+-<=>'".IndexOf ((char) i) >= 0)
2865 continue; // they are not added here.
2866 AddCharMapGroup2 ((char) i, 0x7, 1, 0);
2867 // Insert 3001 after ',' and 3002 after '.'
2869 AddCharMapGroup2 ('\u3001', 0x7, 1, 0);
2871 AddCharMapGroup2 ('\u3002', 0x7, 1, 0);
2873 AddCharMap ('\uFE30', 0x7, 1, 0);
2877 #region 07 - Punctuations and something else
2878 for (int i = 0xA0; i < char.MaxValue; i++) {
2879 if (IsIgnorable (i))
2882 // FIXME: actually those reset should not be
2883 // done but here I put for easy goal.
2885 fillIndex [0x7] = 0xE2;
2887 fillIndex [0x7] = 0x77;
2901 switch (Char.GetUnicodeCategory ((char) i)) {
2902 case UnicodeCategory.OtherPunctuation:
2903 case UnicodeCategory.ClosePunctuation:
2904 case UnicodeCategory.OpenPunctuation:
2905 case UnicodeCategory.ConnectorPunctuation:
2906 case UnicodeCategory.InitialQuotePunctuation:
2907 case UnicodeCategory.FinalQuotePunctuation:
2908 case UnicodeCategory.ModifierSymbol:
2909 // SPECIAL CASES: // 0xA
2910 if (0x2020 <= i && i <= 0x2031)
2912 AddCharMapGroup ((char) i, 0x7, 1, 0);
2915 if (i == 0xA6 || i == 0x1C3) // SPECIAL CASE. FIXME: why?
2916 goto case UnicodeCategory.OtherPunctuation;
2921 // FIXME: it should not need to reset level 1, but
2922 // it's for easy goal.
2923 fillIndex [0x7] = 0xB6;
2924 for (int i = 0x2400; i <= 0x2421; i++)
2925 AddCharMap ((char) i, 0x7, 1, 0);
2927 // Actually 3008-301F and FE33-FE5D are mixed, so
2928 // it's somewhat countable, but not as a whole. Thus
2929 // manual remapping is quicker.
2930 fillIndex [0x7] = 0x8D;
2931 int [] cjkCompatMarks1 = new int [] {
2932 0xFE33, 0xFE49, 0xFE4A, 0xFE4B, 0xFE4C};
2933 int [] cjkCompatMarks2 = new int [] {
2934 0xFE34, 0xFE3F, 0xFE40, 0xFE3D, 0xFE3E, 0xFE41,
2935 0xFE42, 0xFE43, 0xFE44, 0xFE3B, 0xFE3C/*FE5D*/,
2936 0xFE39/*FE5E*/, 0xFE3A};
2937 for (int i = 0; i < cjkCompatMarks1.Length; i++)
2938 map [cjkCompatMarks1 [i]] = new CharMapEntry (
2939 0x7, fillIndex [0x7]++, 0);
2940 for (int i = 0; i < cjkCompatMarks2.Length; i++) {
2941 map [cjkCompatMarks2 [i]] = new CharMapEntry (
2942 0x7, fillIndex [0x7], 0);
2943 fillIndex [0x7] += 2;
2944 switch (cjkCompatMarks2 [i]) {
2946 map [0xFE5D] = new CharMapEntry (
2947 0x7, fillIndex [0x7]++, 0);
2950 map [0xFE5D] = new CharMapEntry (
2951 0x7, fillIndex [0x7]++, 0);
2956 fillIndex [0x7] = 0x93;
2957 for (int i = 0x3008; i <= 0x3011; i++) {
2958 map [i] = new CharMapEntry (0x7,
2959 fillIndex [0x7], 0);
2960 fillIndex [0x7] += 2;
2962 fillIndex [0x7] += 3;
2963 map [0x3014] = new CharMapEntry (0x7, fillIndex [0x7], 0);
2964 fillIndex [0x7] += 3;
2965 map [0x3015] = new CharMapEntry (0x7, fillIndex [0x7], 0);
2966 fillIndex [0x7] += 2;
2967 for (int i = 0x3016; i < 0x301F; i++)
2968 map [i] = new CharMapEntry (0x7,
2969 fillIndex [0x7]++, 0);
2973 // FIXME: for 07 xx we need more love.
2975 // Characters w/ diacritical marks (NFKD)
2976 for (int i = 0; i <= char.MaxValue; i++) {
2977 if (map [i].Defined || IsIgnorable (i))
2979 if (decompIndex [i] == 0)
2982 int start = decompIndex [i];
2983 int primaryChar = decompValues [start];
2986 int length = decompLength [i];
2987 // special processing for parenthesized ones.
2989 decompValues [start] == '(' &&
2990 decompValues [start + 2] == ')') {
2991 primaryChar = decompValues [start + 1];
2995 if (map [primaryChar].Level1 == 0)
2998 for (int l = 1; l < length; l++) {
2999 int c = decompValues [start + l];
3000 if (map [c].Level1 != 0)
3002 secondary += diacritical [c];
3006 map [i] = new CharMapEntry (
3007 map [primaryChar].Category,
3008 map [primaryChar].Level1,
3013 // category 08 - symbols
3014 fillIndex [0x8] = 2;
3015 // Here Windows mapping is not straightforward. It is
3016 // not based on computation but seems manual sorting.
3017 AddCharMapGroup ('+', 0x8, 1, 0); // plus
3018 AddCharMapGroup ('\u2212', 0x8, 1, 0); // minus
3019 AddCharMapGroup ('\u229D', 0x8, 1, 0); // minus
3020 AddCharMapGroup ('\u2297', 0x8, 1, 0); // mul
3021 AddCharMapGroup ('\u2044', 0x8, 1, 0); // div
3022 AddCharMapGroup ('\u2215', 0x8, 1, 0); // div
3023 AddCharMapGroup ('\u2217', 0x8, 1, 0); // mul
3024 AddCharMapGroup ('\u2218', 0x8, 1, 0); // ring
3025 AddCharMapGroup ('\u2219', 0x8, 1, 0); // bullet
3026 AddCharMapGroup ('\u2213', 0x8, 1, 0); // minus-or-plus
3027 AddCharMapGroup ('\u003C', 0x8, 1, 0); // <
3028 AddCharMapGroup ('\u227A', 0x8, 1, 0); // precedes relation
3029 AddCharMapGroup ('\u22B0', 0x8, 1, 0); // precedes under relation
3031 for (int cp = 0; cp < 0x2300; cp++) {
3032 if (cp == 0xAC) // SPECIAL CASE: skip
3035 cp = 0x2200; // skip to 2200
3036 fillIndex [0x8] = 0x21;
3039 fillIndex [0x8] = 0x3;
3041 fillIndex [0x8] = 0xB9;
3042 if (!map [cp].Defined &&
3043 // Char.GetUnicodeCategory ((char) cp) ==
3044 // UnicodeCategory.MathSymbol)
3045 Char.IsSymbol ((char) cp))
3046 AddCharMapGroup ((char) cp, 0x8, 1, diacritical [cp]);
3047 // SPECIAL CASES: no idea why Windows sorts as such
3050 AddCharMap ('\u227B', 0x8, 1, 0);
3051 AddCharMap ('\u22B1', 0x8, 1, 0);
3054 AddCharMapGroup ('\u00AB', 0x8, 1, 0);
3055 AddCharMapGroup ('\u226A', 0x8, 1, 0);
3056 AddCharMapGroup ('\u00BB', 0x8, 1, 0);
3057 AddCharMapGroup ('\u226B', 0x8, 1, 0);
3060 AddCharMap ('\u01C0', 0x8, 1, 0);
3061 AddCharMap ('\u01C1', 0x8, 1, 0);
3062 AddCharMap ('\u01C2', 0x8, 1, 0);
3067 #region Level2 adjustment
3069 diacritical [0x624] = 0x5;
3070 diacritical [0x626] = 0x7;
3071 diacritical [0x622] = 0x9;
3072 diacritical [0x623] = 0xA;
3073 diacritical [0x625] = 0xB;
3074 diacritical [0x649] = 0x5; // 'alif maqs.uurah
3075 diacritical [0x64A] = 0x7; // Yaa'
3077 for (int i = 0; i < char.MaxValue; i++) {
3079 byte cat = map [i].Category;
3081 case 0xE: // Latin diacritics
3082 case 0x22: // Japanese: circled characters
3083 mod = diacritical [i];
3085 case 0x13: // Arabic
3086 if (diacritical [i] == 0 && i >= 0xFE8D)
3087 mod = 0x8; // default for arabic
3090 if (0x52 <= cat && cat <= 0x7F) // Hangul
3091 mod = diacritical [i];
3093 map [i] = new CharMapEntry (
3094 cat, map [i].Level1, mod);
3098 // FIXME: this is half a hack, but those NonSpacingMark
3099 // characters that are still undefined are likely to
3101 for (int i = 0; i < char.MaxValue; i++) {
3102 if (map [i].Defined ||
3111 if (Char.GetUnicodeCategory ((char) i) !=
3112 UnicodeCategory.NonSpacingMark)
3116 if (diacritical [i] != 0)
3117 map [i] = new CharMapEntry (1, 1, diacritical [i]);
3119 AddCharMap ((char) i, 1, 1);
3122 // FIXME: this is hack but those Symbol characters
3123 // are likely to fall into 0xA category.
3124 for (int i = 0; i < char.MaxValue; i++)
3125 if (!map [i].Defined &&
3127 Char.IsSymbol ((char) i))
3128 AddCharMap ((char) i, 0xA, 1);
// Advances the running index fillIndex [hangulCat] by one. fillIndex is a
// byte array, so the increment can wrap around to 0; when that happens the
// index is restarted at 0x2.
// NOTE(review): the statement between the wrap test and the reset is not
// visible in this listing. Since hangulCat is passed by ref, it presumably
// moves on to the next category there -- confirm against the full file.
3131 private void IncrementSequentialIndex (ref byte hangulCat)
3133 fillIndex [hangulCat]++;
3134 if (fillIndex [hangulCat] == 0) { // overflown
3136 fillIndex [hangulCat] = 0x2;
3140 // Reset fillIndex to fixed value and call AddLetterMap().
//   c           - the letter to map.
//   category    - the fillIndex slot (sort key category) to write into.
//   alphaWeight - the value fillIndex [category] is forced to before c is
//                 mapped.
// After c itself, every code point stored in latinMap [c] is mapped through
// AddLetterMap as well, each with updateCount 0.
3141 private void AddAlphaMap (char c, byte category, byte alphaWeight)
3143 fillIndex [category] = alphaWeight;
3144 AddLetterMap (c, category, 0);
// latinMap [c] is read as an ArrayList of int code points.
// NOTE(review): the lines between this lookup and the loop are not visible
// in this listing; presumably they bail out when no list exists -- confirm.
3146 ArrayList al = latinMap [c] as ArrayList;
3150 foreach (int cp in al)
3151 AddLetterMap ((char) cp, category, 0);
// Maps a run of `voices` consecutive code points starting at i into
// category 0x22, together with the code point 0x60 above each of them
// (presumably the Katakana counterpart of a Hiragana letter -- confirm).
//   i      - first code point of the run.
//   voices - number of consecutive code points to map.
3154 private void AddKanaMap (int i, byte voices)
3156 for (byte b = 0; b < voices; b++) {
3157 char c = (char) (i + b);
// The first code point gets level 2 value 0; the b-th following one gets
// b + 2.
3158 byte arg = (byte) (b > 0 ? b + 2 : 0);
// updateCount is 0 and deferLevel2 is false in both calls.
3160 AddLetterMapCore (c, 0x22, 0, arg, false);
3162 AddLetterMapCore ((char) (c + 0x60), 0x22, 0, arg, false);
// Convenience overload: forwards to AddLetterMapCore with level2 = 0 and
// deferLevel2 = true.
3166 private void AddLetterMap (char c, byte category, byte updateCount)
3168 AddLetterMapCore (c, category, updateCount, 0, true);
3171 private void AddLetterMapCore (char c, byte category, byte updateCount, byte level2, bool deferLevel2)
3174 // <small> updates index
3175 c2 = ToSmallForm (c);
3177 AddCharMapGroup (c2, category, updateCount, level2, deferLevel2);
3178 c2 = Char.ToLower (c, CultureInfo.InvariantCulture);
3179 if (c2 != c && !map [(int) c2].Defined)
3180 AddLetterMapCore (c2, category, 0, level2, deferLevel2);
3181 bool doUpdate = true;
3182 if (IsIgnorable ((int) c) || map [(int) c].Defined)
3185 AddCharMapGroup (c, category, 0, level2, deferLevel2);
3187 fillIndex [category] += updateCount;
// Convenience overload: forwards to the four-argument AddCharMap with
// alt = 0 and returns its result.
3190 private bool AddCharMap (char c, byte category, byte increment)
3192 return AddCharMap (c, category, increment, 0);
// Registers c in map using the current fillIndex [category], then advances
// that index by `increment`.
//   c         - character to register.
//   category  - sort key category; also the fillIndex slot that is used.
//   increment - amount added to fillIndex [category] after registering.
//   alt       - the other weight stored in the entry (see below).
// Returns false, leaving map and fillIndex untouched, when c is ignorable
// or already has an entry.
// NOTE(review): the return on the success path is not visible in this
// listing; presumably it returns true -- confirm.
3195 private bool AddCharMap (char c, byte category, byte increment, byte alt)
3197 if (IsIgnorable ((int) c) || map [(int) c].Defined)
3198 return false; // do nothing
// For category 1 the two weights swap places: alt goes into the second
// constructor argument and fillIndex [category] into the third. For every
// other category fillIndex [category] is second and alt is third.
3199 map [(int) c] = new CharMapEntry (category,
3200 category == 1 ? alt : fillIndex [category],
3201 category == 1 ? fillIndex [category] : alt);
3202 fillIndex [category] += increment;
3207 // Adds characters to table in the order below
3208 // (+ increases weight):
3212 // <full> | <super> | <sub>
3213 // <circle> | <wide> (| <narrow>)
3217 // level2 is fixed (does not increase).
3218 int [] sameWeightItems = new int [] {
3219 DecompositionFraction,
3223 DecompositionCircle,
3225 DecompositionNarrow,
// Convenience overload: forwards to the five-argument AddCharMapGroup with
// deferLevel2 = false.
3227 private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2)
3229 AddCharMapGroup (c, category, updateCount, level2, false);
3232 private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2, bool deferLevel2)
3234 if (map [(int) c].Defined)
3238 level2 = diacritical [(int) c];
3240 char small = char.MinValue;
3241 char vertical = char.MinValue;
3242 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
3244 object smv = nfkd [(byte) DecompositionSmall];
3246 small = (char) ((int) smv);
3247 object vv = nfkd [(byte) DecompositionVertical];
3249 vertical = (char) ((int) vv);
3252 // <small> updates index
3253 if (small != char.MinValue) {
3254 if (level2 == 0 && deferLevel2)
3255 level2 = diacritical [small];
3256 AddCharMap (small, category, updateCount, level2);
3260 AddCharMap (c, category, 0, level2);
3263 foreach (int weight in sameWeightItems) {
3264 object wv = nfkd [(byte) weight];
3267 level2 = diacritical [(int) wv];
3268 AddCharMap ((char) ((int) wv), category, 0, level2);
3273 // update index here.
3274 fillIndex [category] += updateCount;
3276 if (vertical != char.MinValue) {
3277 if (level2 == 0 && deferLevel2)
3278 level2 = diacritical [vertical];
3279 AddCharMap (vertical, category, updateCount, level2);
// Registers c at the current position of `category` (increment 0, alt 0)
// and then steps to the next position with IncrementSequentialIndex.
// category is passed by ref and handed on to IncrementSequentialIndex by
// ref as well.
3283 private void AddCharMapCJK (char c, ref byte category)
3285 AddCharMap (c, category, 0, 0);
3286 IncrementSequentialIndex (ref category);
3288 // SPECIAL CASE: Windows skips 9E F9 (category 0x9E, index 0xF9); the reason is unknown.
3289 if (category == 0x9E && fillIndex [category] == 0xF9)
3290 IncrementSequentialIndex (ref category);
3293 private void AddCharMapGroupCJK (char c, ref byte category)
3295 AddCharMapCJK (c, ref category);
3297 // LAMESPEC: see below.
3298 if (c == '\u5B78') {
3299 AddCharMapCJK ('\u32AB', ref category);
3300 AddCharMapCJK ('\u323B', ref category);
3302 if (c == '\u52DE') {
3303 AddCharMapCJK ('\u3298', ref category);
3304 AddCharMapCJK ('\u3238', ref category);
3307 AddCharMapCJK ('\u32A2', ref category);
3309 // Especially this mapping order totally does
3310 // not make sense to me.
3311 AddCharMapCJK ('\u32A9', ref category);
3313 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
3316 for (byte weight = 0; weight <= 0x12; weight++) {
3317 object wv = nfkd [weight];
3322 // Special: they are ignored in this area.
3323 // FIXME: check if it is sane
3324 if (0xF900 <= w && w <= 0xFAD9)
3326 // LAMESPEC: on Windows some of CJK characters
3327 // in 3200-32B0 are incorrectly mapped. They
3328 // mix Chinese and Japanese Kanji when
3329 // ordering those characters.
3331 case 0x32A2: case 0x3298: case 0x3238:
3332 case 0x32A9: case 0x323B: case 0x32AB:
3336 AddCharMapCJK ((char) w, ref category);
3340 // For now it is only for 0x7 category.
3341 private void AddCharMapGroup2 (char c, byte category, byte updateCount, byte level2)
3343 char small = char.MinValue;
3344 char vertical = char.MinValue;
3345 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
3347 object smv = nfkd [(byte) DecompositionSmall];
3349 small = (char) ((int) smv);
3350 object vv = nfkd [(byte) DecompositionVertical];
3352 vertical = (char) ((int) vv);
3355 // <small> updates index
3356 if (small != char.MinValue)
3357 // SPECIAL CASE excluded (FIXME: why?)
3358 if (small != '\u2024')
3359 AddCharMap (small, category, updateCount);
3362 AddCharMap (c, category, updateCount, level2);
3364 // nfkdMap cannot properly hold two or more characters whose NFKD
3365 // maps to the same character, so here I iterate over all of them.
3366 for (int c2 = 0; c2 < char.MaxValue; c2++) {
3367 if (decompLength [c2] == 1 &&
3368 (int) (decompValues [decompIndex [c2]]) == (int) c) {
3369 switch (decompType [c2]) {
3370 case DecompositionCompat:
3371 AddCharMap ((char) c2, category, updateCount, level2);
3377 if (vertical != char.MinValue)
3378 // SPECIAL CASE excluded (FIXME: why?)
3379 if (vertical != '\uFE33' && vertical != '\uFE34')
3380 AddCharMap (vertical, category, updateCount, level2);
3383 private void AddArabicCharMap (char c)
3386 byte updateCount = 1;
3390 AddCharMap (c, category, 0, level2);
3392 // nfkdMap cannot properly hold two or more characters whose NFKD
3393 // maps to the same character, so here I iterate over all of them.
3394 for (int c2 = 0; c2 < char.MaxValue; c2++) {
3395 if (decompLength [c2] == 0)
3397 int idx = decompIndex [c2] + decompLength [c2] - 1;
3398 if ((int) (decompValues [idx]) == (int) c)
3399 AddCharMap ((char) c2, category,
3402 fillIndex [category] += updateCount;
// Returns the result of ToDecomposed for c with the DecompositionSmall
// decomposition type and tail = false.
3405 char ToSmallForm (char c)
3407 return ToDecomposed (c, DecompositionSmall, false);
3410 char ToDecomposed (char c, byte d, bool tail)
3412 if (decompType [(int) c] != d)
3414 int idx = decompIndex [(int) c];
3416 idx += decompLength [(int) c] - 1;
3417 return (char) decompValues [idx];
3420 bool ExistsJIS (int cp)
3422 foreach (JISCharacter j in jisJapanese)
3430 #region Level 3 properties (Case/Width)
// Returns the level 3 weight for c: the value from ComputeLevel3WeightRaw
// plus 2 when that value is nonzero; a raw value of 0 is returned as 0.
3432 private byte ComputeLevel3Weight (char c)
3434 byte b = ComputeLevel3WeightRaw (c);
3435 return b > 0 ? (byte) (b + 2) : b;
3438 private byte ComputeLevel3WeightRaw (char c) // add 2 for sortkey value
3441 if ('\u3192' <= c && c <= '\u319F')
3444 // They have <narrow> NFKD mapping, and on Windows
3445 // those narrow characters are regarded as "normal",
3446 // thus those characters themselves are regarded as
3447 // "wide". grep "<narrow>" and you can pick them up
3448 // (ignoring Kana, Hangul etc.)
3465 if ('\u11A8' <= c && c <= '\u11F9')
3467 if ('\uFFA0' <= c && c <= '\uFFDC')
3469 if ('\u3130' <= c && c <= '\u3164')
3471 if ('\u3165' <= c && c <= '\u318E')
3473 // Georgian Capital letters
3474 if ('\u10A0' <= c && c <= '\u10C5')
3477 if ('\u2776' <= c && c <= '\u277F')
3479 if ('\u2780' <= c && c <= '\u2789')
3481 if ('\u2776' <= c && c <= '\u2793')
3483 if ('\u2160' <= c && c <= '\u216F')
3485 if ('\u2181' <= c && c <= '\u2182')
3488 if ('\u2135' <= c && c <= '\u2138')
3490 if ('\uFE80' <= c && c < '\uFF00') {
3491 // 2(Isolated)/8(Final)/0x18(Medial)
3492 switch (decompType [(int) c]) {
3493 case DecompositionIsolated:
3495 case DecompositionFinal:
3497 case DecompositionMedial:
3502 // actually I dunno the reason why they have weights.
3532 switch (decompType [(int) c]) {
3533 case DecompositionWide: // <wide>
3534 case DecompositionSub: // <sub>
3535 case DecompositionSuper: // <super>
3536 ret |= decompType [(int) c];
3539 if (isSmallCapital [(int) c]) // grep "SMALL CAPITAL"
3541 if (isUppercase [(int) c]) // DerivedCoreProperties
3551 static bool IsIgnorable (int i)
3553 if (unicodeAge [i] >= 3.1)
3555 switch (char.GetUnicodeCategory ((char) i)) {
3556 case UnicodeCategory.OtherNotAssigned:
3557 case UnicodeCategory.Format:
3564 // FIXME: In the future use DerivedAge.txt to examine character
3565 // versions and set those ones that have higher version than
3566 // 1.0 as ignorable.
3567 static bool IsIgnorable (int i)
3571 // I guess, those characters are added between
3572 // Unicode 1.0 (LCMapString) and Unicode 3.1
3573 // (UnicodeCategory), so they used to be
3574 // something like OtherNotAssigned as of Unicode 1.1.
3575 case 0x2df: case 0x387:
3576 case 0x3d7: case 0x3d8: case 0x3d9:
3577 case 0x3f3: case 0x3f4: case 0x3f5: case 0x3f6:
3578 case 0x400: case 0x40d: case 0x450: case 0x45d:
3579 case 0x587: case 0x58a: case 0x5c4: case 0x640:
3580 case 0x653: case 0x654: case 0x655: case 0x66d:
3582 case 0x1e9b: case 0x202f: case 0x20ad:
3583 case 0x20ae: case 0x20af:
3584 case 0x20e2: case 0x20e3:
3585 case 0x2139: case 0x213a: case 0x2183:
3586 case 0x2425: case 0x2426: case 0x2619:
3587 case 0x2670: case 0x2671: case 0x3007:
3588 case 0x3190: case 0x3191:
3589 case 0xfffc: case 0xfffd:
3591 // exceptional characters filtered by the
3592 // following conditions. Originally those exceptional
3593 // ranges are incorrect (they should not be ignored)
3594 // and most of those characters are unfortunately in
3596 case 0x4d8: case 0x4d9:
3597 case 0x4e8: case 0x4e9:
3599 case 0x3036: case 0x303f:
3600 case 0x337b: case 0xfb1e:
3605 // The whole Sinhala characters.
3606 0x0D82 <= i && i <= 0x0DF4
3607 // The whole Tibetan characters.
3608 || 0x0F00 <= i && i <= 0x0FD1
3609 // The whole Myanmar characters.
3610 || 0x1000 <= i && i <= 0x1059
3611 // The whole Ethiopic, Cherokee,
3612 // Canadian Syllabic, Ogham, Runic,
3613 // Tagalog, Hanunoo, Philippine,
3614 // Buhid, Tagbanwa, Khmer and Mongolian
3616 || 0x1200 <= i && i <= 0x1DFF
3617 // Greek extension characters.
3618 || 0x1F00 <= i && i <= 0x1FFF
3619 // The whole Braille characters.
3620 || 0x2800 <= i && i <= 0x28FF
3621 // CJK radical characters.
3622 || 0x2E80 <= i && i <= 0x2EF3
3623 // Kangxi radical characters.
3624 || 0x2F00 <= i && i <= 0x2FD5
3625 // Ideographic description characters.
3626 || 0x2FF0 <= i && i <= 0x2FFB
3627 // Bopomofo letter and final
3628 || 0x31A0 <= i && i <= 0x31B7
3629 // White square with quadrant characters.
3630 || 0x25F0 <= i && i <= 0x25F7
3631 // Ideographic telegraph symbols.
3632 || 0x32C0 <= i && i <= 0x32CB
3633 || 0x3358 <= i && i <= 0x3370
3634 || 0x33E0 <= i && i <= 0x33FF
3635 // The whole YI characters.
3636 || 0xA000 <= i && i <= 0xA48C
3637 || 0xA490 <= i && i <= 0xA4C6
3638 // Armenian small ligatures
3639 || 0xFB13 <= i && i <= 0xFB17
3640 // hebrew, arabic, variation selector.
3641 || 0xFB1D <= i && i <= 0xFE2F
3642 // Arabic ligatures.
3643 || 0xFEF5 <= i && i <= 0xFEFC
3644 // FIXME: why are they excluded?
3645 || 0x01F6 <= i && i <= 0x01F9
3646 || 0x0218 <= i && i <= 0x0233
3647 || 0x02A9 <= i && i <= 0x02AD
3648 || 0x02EA <= i && i <= 0x02EE
3649 || 0x0349 <= i && i <= 0x036F
3650 || 0x0488 <= i && i <= 0x048F
3651 || 0x04D0 <= i && i <= 0x04FF
3652 || 0x0500 <= i && i <= 0x050F // actually it matters only for 2.0
3653 || 0x06D6 <= i && i <= 0x06ED
3654 || 0x06FA <= i && i <= 0x06FE
3655 || 0x2048 <= i && i <= 0x204D
3656 || 0x20e4 <= i && i <= 0x20ea
3657 || 0x213C <= i && i <= 0x214B
3658 || 0x21EB <= i && i <= 0x21FF
3659 || 0x22F2 <= i && i <= 0x22FF
3660 || 0x237B <= i && i <= 0x239A
3661 || 0x239B <= i && i <= 0x23CF
3662 || 0x24EB <= i && i <= 0x24FF
3663 || 0x2596 <= i && i <= 0x259F
3664 || 0x25F8 <= i && i <= 0x25FF
3665 || 0x2672 <= i && i <= 0x2689
3666 || 0x2768 <= i && i <= 0x2775
3667 || 0x27d0 <= i && i <= 0x27ff
3668 || 0x2900 <= i && i <= 0x2aff
3669 || 0x3033 <= i && i <= 0x303F
3670 || 0x31F0 <= i && i <= 0x31FF
3671 || 0x3250 <= i && i <= 0x325F
3672 || 0x32B1 <= i && i <= 0x32BF
3673 || 0x3371 <= i && i <= 0x337B
3674 || 0xFA30 <= i && i <= 0xFA6A
3678 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3680 case UnicodeCategory.PrivateUse:
3681 case UnicodeCategory.Surrogate:
3683 // ignored by nature
3684 case UnicodeCategory.Format:
3685 case UnicodeCategory.OtherNotAssigned:
3692 // To check IsIgnorable sanity, try the driver below under MS.NET.
3695 public static void Main ()
3697 for (int i = 0; i <= char.MaxValue; i++)
3698 Dump (i, IsIgnorable (i));
3701 static void Dump (int i, bool ignore)
3703 switch (Char.GetUnicodeCategory ((char) i)) {
3704 case UnicodeCategory.PrivateUse:
3705 case UnicodeCategory.Surrogate:
3706 return; // check nothing
3710 string s2 = new string ((char) i, 10);
3711 int ret = CultureInfo.InvariantCulture.CompareInfo.Compare (s1, s2, CompareOptions.IgnoreCase);
3712 if ((ret == 0) == ignore)
3714 Console.WriteLine ("{0} : {1:x} {2}", ignore ? "o" : "x", i, Char.GetUnicodeCategory ((char) i));
3717 #endregion // IsIgnorable
3719 #region IsIgnorableSymbol
// Classifies the character with code point i for "ignorable symbol"
// purposes. The sanity-check driver described after this method compares
// its result against CompareInfo.Compare () run with
// CompareOptions.IgnoreSymbols under MS.NET.
// NOTE(review): the result returned for each case group below should be
// confirmed against the return statements that follow each group.
3720 static bool IsIgnorableSymbol (int i)
// Characters that IsIgnorable () already accepts are handled first.
3722 if (IsIgnorable (i))
// Individually listed code points that share one result.
3727 case 0x00b5: case 0x01C0: case 0x01C1:
3728 case 0x01C2: case 0x01C3: case 0x01F6:
3729 case 0x01F7: case 0x01F8: case 0x01F9:
3730 case 0x02D0: case 0x02EE: case 0x037A:
3731 case 0x03D7: case 0x03F3:
3732 case 0x0400: case 0x040d:
3733 case 0x0450: case 0x045d:
3734 case 0x048C: case 0x048D:
3735 case 0x048E: case 0x048F:
3736 case 0x0587: case 0x0640: case 0x06E5:
3737 case 0x06E6: case 0x06FA: case 0x06FB:
3738 case 0x06FC: case 0x093D: case 0x0950:
3739 case 0x1E9B: case 0x2139: case 0x3006:
3740 case 0x3033: case 0x3034: case 0x3035:
3741 case 0xFE7E: case 0xFE7F:
// Runic golden-number symbols (U+16EE..U+16F0) and other numeral-like
// characters.
3743 case 0x16EE: case 0x16EF: case 0x16F0:
3745 case 0x2183: // ROMAN NUMERAL REVERSED ONE HUNDRED
3746 case 0x3007: // IDEOGRAPHIC NUMBER ZERO
3747 case 0x3038: // HANGZHOU NUMERAL TEN
3748 case 0x3039: // HANGZHOU NUMERAL TWENTY
3749 case 0x303a: // HANGZHOU NUMERAL THIRTY
// Spacing modifier letters/symbols (U+02B9..U+02E9) and the kana
// voiced / semi-voiced sound marks (U+309B, U+309C).
3755 case 0x02B9: case 0x02BA: case 0x02C2:
3756 case 0x02C3: case 0x02C4: case 0x02C5:
3757 case 0x02C8: case 0x02CC: case 0x02CD:
3758 case 0x02CE: case 0x02CF: case 0x02D2:
3759 case 0x02D3: case 0x02D4: case 0x02D5:
3760 case 0x02D6: case 0x02D7: case 0x02DE:
3761 case 0x02E5: case 0x02E6: case 0x02E7:
3762 case 0x02E8: case 0x02E9:
3763 case 0x309B: case 0x309C:
// Script-specific punctuation.
3765 case 0x055A: // Armenian Apos
3766 case 0x05C0: // Hebrew Punct
3767 case 0x0E4F: // Thai FONGMAN
3768 case 0x0E5A: // Thai ANGKHANKHU
3769 case 0x0E5B: // Thai KHOMUT
// Currency marks.
3771 case 0x09F2: // Bengali Rupee Mark
3772 case 0x09F3: // Bengali Rupee Sign
3774 case 0x221e: // INF.
// Range-based exceptions, tested before the Unicode category is consulted.
// Note that the first and third ranges use an exclusive upper bound.
3783 if (0xFE70 <= i && i < 0xFE7C // ARABIC LIGATURES B
3785 || 0x0501 <= i && i <= 0x0510 // CYRILLIC KOMI
3786 || 0xFA30 <= i && i < 0xFA70 // CJK COMPAT
// From here on the decision depends on the Unicode general category.
3791 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3793 case UnicodeCategory.Surrogate:
3794 return false; // inconsistent
// Combining marks and private-use characters share one branch; the
// Arabic range U+064B..U+0652 is singled out inside it.
3796 case UnicodeCategory.SpacingCombiningMark:
3797 case UnicodeCategory.EnclosingMark:
3798 case UnicodeCategory.NonSpacingMark:
3799 case UnicodeCategory.PrivateUse:
3801 if (0x064B <= i && i <= 0x0652) // Arabic
3805 case UnicodeCategory.Format:
3806 case UnicodeCategory.OtherNotAssigned:
// For the ranges below, 'use' is set to false for letters and digits and
// to true for everything else.
3813 // latin in a circle
3814 0x249A <= i && i <= 0x24E9
3815 || 0x2100 <= i && i <= 0x2132
3817 || 0x3196 <= i && i <= 0x31A0
3819 || 0x3200 <= i && i <= 0x321C
3821 || 0x322A <= i && i <= 0x3243
3823 || 0x3260 <= i && i <= 0x32B0
3824 || 0x32D0 <= i && i <= 0x3357
3825 || 0x337B <= i && i <= 0x33DD
3827 use = !Char.IsLetterOrDigit ((char) i);
3831 // This "Digit" rule is mystery.
3832 // It filters some symbols out.
// Remaining checks run in this order: letter-or-digit, number,
// control/separator/punctuation, symbol.
3833 if (Char.IsLetterOrDigit ((char) i))
3835 if (Char.IsNumber ((char) i))
3837 if (Char.IsControl ((char) i)
3838 || Char.IsSeparator ((char) i)
3839 || Char.IsPunctuation ((char) i))
3841 if (Char.IsSymbol ((char) i))
3844 // FIXME: should check more
3849 // To check IsIgnorableSymbol sanity, try the driver below under MS.NET.
3851 public static void Main ()
3853 CompareInfo ci = CultureInfo.InvariantCulture.CompareInfo;
3854 for (int i = 0; i <= char.MaxValue; i++) {
3855 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3856 if (uc == UnicodeCategory.Surrogate)
3859 bool ret = IsIgnorableSymbol (i);
3861 string s1 = "TEST ";
3862 string s2 = "TEST " + (char) i;
3864 int result = ci.Compare (s1, s2, CompareOptions.IgnoreSymbols);
3866 if (ret != (result == 0))
3867 Console.WriteLine ("{0} : {1:x}[{2}]({3})",
3868 ret ? "should not ignore" :
// Classifies the character with code point i for "ignorable non-spacing"
// purposes. Explicit code points and ranges are tested first; anything
// not matched falls through to a check of the Unicode general category.
// NOTE(review): confirm the value returned by each explicit group below.
3877 static bool IsIgnorableNonSpacing (int i)
// Characters that IsIgnorable () already accepts are handled first.
3879 if (IsIgnorable (i))
// First group of individually listed code points.
3883 case 0x02C8: case 0x02DE: case 0x0559: case 0x055A:
3884 case 0x05C0: case 0x0ABD: case 0x0CD5: case 0x0CD6:
3885 case 0x309B: case 0x309C: case 0xFF9E: case 0xFF9F:
// Second group of individually listed code points.
3887 case 0x02D0: case 0x0670: case 0x0901: case 0x0902:
3888 case 0x094D: case 0x0962: case 0x0963: case 0x0A41:
3889 case 0x0A42: case 0x0A47: case 0x0A48: case 0x0A4B:
3890 case 0x0A4C: case 0x0A81: case 0x0A82: case 0x0B82:
3891 case 0x0BC0: case 0x0CBF: case 0x0CC6: case 0x0CCC:
3892 case 0x0CCD: case 0x0E4E:
// First set of ranges: spacing modifier letters and U+20DD..U+20E0.
3896 if (0x02b9 <= i && i <= 0x02c5
3897 || 0x02cc <= i && i <= 0x02d7
3898 || 0x02e4 <= i && i <= 0x02ef
3899 || 0x20DD <= i && i <= 0x20E0
// Second set of ranges. The literal 0x00652 has a redundant leading zero
// and is the same value as 0x0652.
3903 if (0x064B <= i && i <= 0x00652
3904 || 0x0941 <= i && i <= 0x0948
3905 || 0x0AC1 <= i && i <= 0x0ACD
3906 || 0x0C3E <= i && i <= 0x0C4F
3907 || 0x0E31 <= i && i <= 0x0E3F
// Fallback: true exactly when the character's category is NonSpacingMark.
3911 return Char.GetUnicodeCategory ((char) i) ==
3912 UnicodeCategory.NonSpacingMark;
3915 // We can reuse IsIgnorableSymbol testcode
3916 // for IsIgnorableNonSpacing.
// Sort-key components recorded for one character; the generator keeps one
// entry per UTF-16 code unit in its 'map' array.
// Category byte of the sort key.
3922 public byte Category;
3924 public byte Level2; // It is always single byte.
// NOTE(review): presumably true once an entry has been explicitly
// assigned - confirm where this flag is set.
3925 public bool Defined;
// Creates an entry from its category, level 1 and level 2 weights.
3927 public CharMapEntry (byte category, byte level1, byte level2)
3929 Category = category;
// Pairs a code point (CP) with a JIS value (JIS). JISComparer orders
// instances by the JIS field.
3938 public readonly int CP;
3939 public readonly int JIS;
// cp: the code point; cpJIS: the corresponding JIS value.
// NOTE(review): presumably stored into CP and JIS respectively - confirm.
3941 public JISCharacter (int cp, int cpJIS)
// Non-generic IComparer that orders JISCharacter values by ascending JIS
// value.
3948 class JISComparer : IComparer
// Shared instance of the comparer.
3950 public static readonly JISComparer Instance =
// Both arguments must be JISCharacter values; the casts fail otherwise.
// Returns a negative, zero or positive number when o1 sorts before, equal
// to or after o2.
3953 public int Compare (object o1, object o2)
3955 JISCharacter j1 = (JISCharacter) o1;
3956 JISCharacter j2 = (JISCharacter) o2;
// Plain subtraction is used as the ordering.
// NOTE(review): assumes the JIS values are small enough that the
// difference cannot overflow an int.
3957 return j1.JIS - j2.JIS;
// Pairs a code point (CP) with a name (Name). NonJISComparer orders
// instances by the Name field.
3961 class NonJISCharacter
3963 public readonly int CP;
3964 public readonly string Name;
// cp: the code point; name: the name used for ordering.
// NOTE(review): presumably stored into CP and Name respectively - confirm.
3966 public NonJISCharacter (int cp, string name)
// Non-generic IComparer that orders NonJISCharacter objects by Name using
// an ordinal (culture-independent) string comparison.
3973 class NonJISComparer : IComparer
// Shared instance of the comparer.
3975 public static readonly NonJISComparer Instance =
3976 new NonJISComparer ();
// Both arguments must be NonJISCharacter objects; the casts fail otherwise.
3978 public int Compare (object o1, object o2)
3980 NonJISCharacter j1 = (NonJISCharacter) o1;
3981 NonJISCharacter j2 = (NonJISCharacter) o2;
3982 return string.CompareOrdinal (j1.Name, j2.Name);
// Non-generic IComparer for DictionaryEntry items whose Value is a boxed
// decimal and whose Key is a boxed int. Entries are compared by Value.
3986 class DecimalDictionaryValueComparer : IComparer
// The only instance; the constructor is private.
3988 public static readonly DecimalDictionaryValueComparer Instance
3989 = new DecimalDictionaryValueComparer ();
3991 private DecimalDictionaryValueComparer ()
// Both arguments must be DictionaryEntry values; the unboxing casts fail
// otherwise.
3995 public int Compare (object o1, object o2)
3997 DictionaryEntry e1 = (DictionaryEntry) o1;
3998 DictionaryEntry e2 = (DictionaryEntry) o2;
3999 // FIXME: in case of 0, compare decomposition categories
// Primary ordering: numeric comparison of the two decimal values.
4000 int ret = Decimal.Compare ((decimal) e1.Value, (decimal) e2.Value);
// NOTE(review): the int keys are presumably the tie-breaker when the
// values compare equal - confirm.
4003 int i1 = (int) e1.Key;
4004 int i2 = (int) e2.Key;
// Non-generic IComparer for DictionaryEntry items whose Value is a string
// and whose Key is a boxed int. Entries are compared by Value.
4009 class StringDictionaryValueComparer : IComparer
// The only instance; the constructor is private.
4011 public static readonly StringDictionaryValueComparer Instance
4012 = new StringDictionaryValueComparer ();
4014 private StringDictionaryValueComparer ()
// Both arguments must be DictionaryEntry values; the unboxing casts fail
// otherwise.
4018 public int Compare (object o1, object o2)
4020 DictionaryEntry e1 = (DictionaryEntry) o1;
4021 DictionaryEntry e2 = (DictionaryEntry) o2;
// NOTE(review): String.Compare (string, string) is culture-sensitive and
// uses the current culture, so the resulting order can depend on the
// culture the generator runs under.
4022 int ret = String.Compare ((string) e1.Value, (string) e2.Value);
// NOTE(review): the int keys are presumably the tie-breaker when the
// values compare equal - confirm.
4025 int i1 = (int) e1.Key;
4026 int i2 = (int) e2.Key;
// Non-generic IComparer for boxed chars that compares the sort keys
// CollationElementTable reports for each character.
4031 class UCAComparer : IComparer
// The only instance; the constructor is private.
4033 public static readonly UCAComparer Instance
4034 = new UCAComparer ();
4036 private UCAComparer ()
// Both arguments must be boxed char values; the unboxing casts fail
// otherwise.
4040 public int Compare (object o1, object o2)
4042 char i1 = (char) o1;
4043 char i2 = (char) o2;
// Number of sort keys available for each character.
4045 int l1 = CollationElementTable.GetSortKeyCount (i1);
4046 int l2 = CollationElementTable.GetSortKeyCount (i2);
// Only the positions present for both characters are compared pairwise.
4047 int l = l1 > l2 ? l2 : l1;
// At each position the weights are differenced in this order: primary,
// secondary, tertiary, quaternary.
// NOTE(review): presumably the first non-zero difference is returned, and
// the key counts decide when all shared positions tie - confirm.
4049 for (int i = 0; i < l; i++) {
4050 SortKeyValue k1 = CollationElementTable.GetSortKey (i1, i);
4051 SortKeyValue k2 = CollationElementTable.GetSortKey (i2, i);
4052 int v = k1.Primary - k2.Primary;
4055 v = k1.Secondary - k2.Secondary;
4058 v = k1.Thirtiary - k2.Thirtiary;
4061 v = k1.Quarternary - k2.Quarternary;
// Tailoring entries (ITailoringMap objects) in the order they were added.
4074 ArrayList items = new ArrayList ();
// Creates a tailoring for the given LCID.
4076 public Tailoring (int lcid)
// Creates a tailoring for the given LCID together with an alias value.
// NOTE(review): the alias presumably names another LCID whose tailoring
// is shared - confirm.
4081 public Tailoring (int lcid, int alias)
// Read-only accessors for the lcid and alias values.
4088 get { return lcid; }
4092 get { return alias; }
// Read/write flag backed by the frenchSort field.
4095 public bool FrenchSort {
4096 get { return frenchSort; }
4097 set { frenchSort = value; }
// Appends a DiacriticalMap entry (target byte -> replacement byte).
4100 public void AddDiacriticalMap (byte target, byte replace)
4102 items.Add (new DiacriticalMap (target, replace));
// Appends a SortKeyMap entry (source string -> explicit sort key bytes).
4105 public void AddSortKeyMap (string source, byte [] sortkey)
4107 items.Add (new SortKeyMap (source, sortkey));
// Appends a ReplacementMap entry (source string -> replacement string).
4110 public void AddReplacementMap (string source, string replace)
4112 items.Add (new ReplacementMap (source, replace));
// Serializes every entry with ITailoringMap.ToCharArray () and returns
// the results concatenated, in insertion order, as one char array.
4115 public char [] ItemToCharArray ()
4117 ArrayList al = new ArrayList ();
4118 foreach (ITailoringMap m in items)
4119 al.AddRange (m.ToCharArray ());
4120 return al.ToArray (typeof (char)) as char [];
// Common contract of the tailoring entries kept by Tailoring.
4123 interface ITailoringMap
// Returns the entry serialized as a char array. In the implementations
// in this file, element 0 is a kind tag (1 = SortKeyMap,
// 2 = DiacriticalMap, 3 = ReplacementMap).
4125 char [] ToCharArray ();
// Tailoring entry that pairs a target byte with a replacement byte.
4128 class DiacriticalMap : ITailoringMap
4130 public readonly byte Target;
4131 public readonly byte Replace;
// NOTE(review): presumably stores target and replace into the fields of
// the same name - confirm.
4133 public DiacriticalMap (byte target, byte replace)
// Serialized form is three chars: kind tag 2, Target, Replace.
4139 public char [] ToCharArray ()
4141 char [] ret = new char [3];
4142 ret [0] = (char) 02; // kind:DiacriticalMap
4143 ret [1] = (char) Target;
4144 ret [2] = (char) Replace;
// Tailoring entry that pairs a source string with explicit sort key bytes.
4149 class SortKeyMap : ITailoringMap
4151 public readonly string Source;
4152 public readonly byte [] SortKey;
// NOTE(review): presumably stores source and sortkey into the fields of
// the same name - confirm.
4154 public SortKeyMap (string source, byte [] sortkey)
// Serialized layout (Source.Length + 7 chars):
//   [0]                  kind tag 1
//   [1 .. Length]        the chars of Source
//   [Length + 1]         never assigned, so it stays '\0'
//   [Length + 2 .. + 5]  SortKey [0] .. SortKey [3]
//   [Length + 6]         never assigned, so it stays '\0'
// Only the first four SortKey bytes are written, so SortKey must hold at
// least four elements.
4160 public char [] ToCharArray ()
4162 char [] ret = new char [Source.Length + 7];
4163 ret [0] = (char) 01; // kind:SortKeyMap
4164 for (int i = 0; i < Source.Length; i++)
4165 ret [i + 1] = Source [i];
// The "+ 2" offset skips one slot after Source.
4167 for (int i = 0; i < 4; i++)
4168 ret [i + Source.Length + 2] = (char) SortKey [i];
4173 class ReplacementMap : ITailoringMap
4175 public readonly string Source;
4176 public readonly string Replace;
4178 public ReplacementMap (string source, string replace)
4184 public char [] ToCharArray ()
4186 char [] ret = new char [Source.Length + Replace.Length + 3];
4187 ret [0] = (char) 03; // kind:ReplaceMap
4189 for (int i = 0; i < Source.Length; i++)
4190 ret [pos++] = Source [i];
4193 for (int i = 0; i < Replace.Length; i++)
4194 ret [pos++] = Replace [i];