3 // There are two kinds of sort keys: those which are computed and those which
4 // are laid out as an indexed array. Computed sort keys are:
9 // Also, for composite characters it should prepare a different index table.
11 // Though it is possible to "compute" level 3 weights, they are still dumped
12 // to an array to avoid execution cost.
16 // * sortkey getter signature
18 // int GetSortKey (string s, int index, SortKeyBuffer buf)
19 // Stores sort key for corresponding character element into buf and
20 // returns the length of the consumed _source_ character element in s.
22 // * character length to consume
24 // If there are characters whose primary weight is 0, they are consumed
25 // and considered as a part of the character element.
31 using System.Collections;
32 using System.Globalization;
36 namespace Mono.Globalization.Unicode
38 internal class MSCompatSortKeyTableGenerator
// Program entry point: creates a generator instance and passes the
// command-line arguments on to Run.
40 public static void Main (string [] args)
42 new MSCompatSortKeyTableGenerator ().Run (args);
// Codes stored in decompType [] to record which kind of decomposition
// mapping a code point has (assigned in ProcessUnidataLine).
// The "fixed" markers are from the original author; presumably those
// values are relied on elsewhere and must not be renumbered - TODO confirm.
45 const int DecompositionWide = 1; // fixed
46 const int DecompositionSub = 2; // fixed
47 const int DecompositionSmall = 3;
48 const int DecompositionIsolated = 4;
49 const int DecompositionInitial = 5;
50 const int DecompositionFinal = 6;
51 const int DecompositionMedial = 7;
52 const int DecompositionNoBreak = 8;
53 const int DecompositionVertical = 9;
54 const int DecompositionFraction = 0xA;
55 const int DecompositionFont = 0xB;
56 const int DecompositionSuper = 0xC; // fixed
// Note: Full (0xE) is declared ahead of Narrow (0xD); the values are
// still distinct, only the declaration order is swapped.
57 const int DecompositionFull = 0xE;
58 const int DecompositionNarrow = 0xD;
59 const int DecompositionCircle = 0xF;
60 const int DecompositionSquare = 0x10;
61 const int DecompositionCompat = 0x11;
// Used when the decomposition field carries no <tag>.
62 const int DecompositionCanonical = 0x12;
// Destination for the generated C# table text (standard output).
64 TextWriter Result = Console.Out;
66 byte [] fillIndex = new byte [256]; // by category
// One entry per UTF-16 code unit. Category, Level1 and Level2 are read
// back from these entries when the tables are serialized.
67 CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1];
// NOTE(review): not referenced in the visible part of this file; its
// exact role is unconfirmed.
69 char [] specialIgnore = new char [] {
70 '\u3099', '\u309A', '\u309B', '\u309C', '\u0BCD',
71 '\u0E47', '\u0E4C', '\uFF9E', '\uFF9F'
// alphabets and alphaWeights each list 29 entries; presumably
// alphaWeights [i] is the weight for alphabets [i] - TODO confirm.
74 // FIXME: need more love (as always)
75 char [] alphabets = new char [] {'A', 'B', 'C', 'D', 'E', 'F',
76 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
77 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
78 '\u0292', '\u01BE', '\u0298'};
79 byte [] alphaWeights = new byte [] {
80 2, 9, 0xA, 0x1A, 0x21,
81 0x23, 0x25, 0x2C, 0x32, 0x35,
82 0x36, 0x48, 0x51, 0x70, 0x7C,
83 0x7E, 0x89, 0x8A, 0x91, 0x99,
84 0x9F, 0xA2, 0xA4, 0xA6, 0xA7,
85 0xA9, 0xAA, 0xB3, 0xB4};
// Per-code-point flags. isSmallCapital is set by ProcessUnidataLine for
// lines containing "SMALL CAPITAL"; isUppercase is set by
// ProcessDerivedCorePropLine.
87 bool [] isSmallCapital = new bool [char.MaxValue + 1];
88 bool [] isUppercase = new bool [char.MaxValue + 1];
// Decomposition data per code point: the Decomposition* code, the start
// index into the decomposition value list, and the number of elements.
90 byte [] decompType = new byte [char.MaxValue + 1];
91 int [] decompIndex = new int [char.MaxValue + 1];
92 int [] decompLength = new int [char.MaxValue + 1];
// Numeric value per code point, parsed from the numeric fields of the
// Unicode data line.
94 decimal [] decimalValue = new decimal [char.MaxValue + 1];
// Diacritical weight per code point, accumulated in ProcessUnidataLine.
96 byte [] diacritical = new byte [char.MaxValue + 1];
// Name fragments searched for in each Unicode data line, and the weight
// added to diacritical [cp] when a fragment matches. diacritics [d] pairs
// with diacriticWeights [d]; ProcessUnidataLine throws if the two arrays
// differ in length. Fragments ending in ';' can only match at the end of
// the name field, because the whole ';'-separated line is searched.
98 string [] diacritics = new string [] {
99 // LATIN, CYRILLIC etc.
100 "UPTURN", "DOUBLE-STRUCK",
101 "MIDDLE HOOK", "WITH VERTICAL LINE ABOVE;", "WITH TONOS",
102 "WITH ACUTE ACCENT;", "WITH GRAVE ACCENT;",
103 "WITH ACUTE;", "WITH GRAVE;",
105 "WITH DOT ABOVE;", " MIDDLE DOT;",
106 "WITH CIRCUMFLEX ACCENT;", "WITH CIRCUMFLEX;",
108 "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;",
109 "DIALYTIKA TONOS", "DIALYTIKA AND TONOS", "WITH MACRON;", "WITH TILDE;", "WITH RING ABOVE;",
110 "WITH OGONEK;", "WITH CEDILLA;",
112 " DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;",
113 "WITH STROKE;", " CIRCUMFLEX AND ACUTE;",
115 " DIAERESIS AND ACUTE;", "WITH CIRCUMFLEX AND GRAVE;", " L SLASH;",
116 " DIAERESIS AND GRAVE;",
118 " CARON AND DOT ABOVE;", " BREVE AND GRAVE;",
119 " MACRON AND ACUTE;",
120 " MACRON AND GRAVE;",
122 " DIAERESIS AND CARON", " DOT ABOVE AND MACRON", " TILDE AND ACUTE",
123 " RING ABOVE AND ACUTE",
124 " DIAERESIS AND MACRON", " CEDILLA AND ACUTE", " MACRON AND DIAERESIS",
125 " CIRCUMFLEX AND TILDE",
126 " TILDE AND DIAERESIS",
129 " CEDILLA AND BREVE",
130 " OGONEK AND MACRON",
133 "WITH HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;",
137 " PRECEDED BY APOSTROPHE",
139 " LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE",
142 " RETROFLEX;", "DIAERESIS BELOW",
145 " CIRCUMFLEX BELOW", "HORN AND ACUTE",
146 " BREVE BELOW;", " HORN AND GRAVE",
149 " DOT BELOW AND DOT ABOVE",
150 " RIGHT HALF RING", " HORN AND TILDE",
151 " CIRCUMFLEX AND DOT BELOW",
152 " BREVE AND DOT BELOW",
153 " DOT BELOW AND MACRON",
155 " HORN AND HOOK ABOVE",
157 // CIRCLED, PARENTHESIZED and so on
158 "CIRCLED DIGIT", "CIRCLED NUMBER", "CIRCLED LATIN",
159 "CIRCLED KATAKANA", "CIRCLED SANS-SERIF",
160 "PARENTHESIZED DIGIT", "PARENTHESIZED NUMBER", "PARENTHESIZED LATIN",
// Weights matching the fragments above, position by position.
162 byte [] diacriticWeights = new byte [] {
168 0x10, 0x11, 0x12, 0x12, 0x13, 0x13, 0x14, 0x15, 0x16,
169 0x16, 0x17, 0x19, 0x1A, 0x1B, 0x1C,
171 0x1D, 0x1D, 0x1E, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F,
172 0x20, 0x21, 0x22, 0x22, 0x23, 0x24,
174 0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28,
175 0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30,
177 0x40, 0x43, 0x43, 0x43, 0x44, 0x46, 0x47, 0x48,
178 0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x5A,
180 0x60, 0x60, 0x61, 0x61, 0x63, 0x68, 0x68,
181 0x69, 0x69, 0x6A, 0x6D, 0x6E,
183 // CIRCLED, PARENTHESIZED and so on.
184 0xEE, 0xEE, 0xEE, 0xEE, 0xEE,
// Code point boundaries for digit ranges; presumably consecutive
// (start, end) pairs - TODO confirm against the code that reads it.
188 int [] numberSecondaryWeightBounds = new int [] {
189 0x660, 0x680, 0x6F0, 0x700, 0x960, 0x970,
190 0x9E0, 0x9F0, 0x9F4, 0xA00, 0xA60, 0xA70,
191 0xAE0, 0xAF0, 0xB60, 0xB70, 0xBE0, 0xC00,
192 0xC60, 0xC70, 0xCE0, 0xCF0, 0xD60, 0xD70,
193 0xE50, 0xE60, 0xED0, 0xEE0
// Script-specific orderings. They are not assigned in the visible part
// of this file; presumably ParseScripts fills them - TODO confirm.
196 char [] orderedGurmukhi;
197 char [] orderedGujarati;
198 char [] orderedGeorgian;
199 char [] orderedThaana;
201 static readonly char [] orderedTamilConsonants = new char [] {
202 // based on traditional Tamil consonants, except for
203 // Grantha (where Microsoft breaks traditionalism).
204 // http://www.angelfire.com/empire/thamizh/padanGaL
205 '\u0B95', '\u0B99', '\u0B9A', '\u0B9E', '\u0B9F',
206 '\u0BA3', '\u0BA4', '\u0BA8', '\u0BAA', '\u0BAE',
207 '\u0BAF', '\u0BB0', '\u0BB2', '\u0BB5', '\u0BB4',
208 '\u0BB3', '\u0BB1', '\u0BA9', '\u0B9C', '\u0BB8',
// The three lists below hold DictionaryEntry items added by
// ProcessUnidataLine.
211 // cp -> character name (only for some characters)
212 ArrayList sortableCharNames = new ArrayList ();
214 // cp -> arrow value (int)
215 ArrayList arrowValues = new ArrayList ();
217 // cp -> box value (int)
218 ArrayList boxValues = new ArrayList ();
220 // cp -> level1 value
221 Hashtable arabicLetterPrimaryValues = new Hashtable ();
// Arabic primary letter name -> first code point seen with that name.
224 Hashtable arabicNameMap = new Hashtable ();
226 // cp -> Hashtable [decompType] -> cp
227 Hashtable nfkdMap = new Hashtable ();
229 // Latin letter -> ArrayList [int]
230 Hashtable latinMap = new Hashtable ();
// nonJisJapanese receives NonJISCharacter items for U+3300..U+3357 code
// points for which ExistsJIS returns false.
232 ArrayList jisJapanese = new ArrayList ();
233 ArrayList nonJisJapanese = new ArrayList ();
// CJK tables, allocated for the full UTF-16 range; the trailing
// comments keep the offsets of an earlier, smaller layout.
235 ushort [] cjkJA = new ushort [char.MaxValue +1];// - 0x4E00];
236 ushort [] cjkCHS = new ushort [char.MaxValue +1];// - 0x3100];
237 ushort [] cjkCHT = new ushort [char.MaxValue +1];// - 0x4E00];
238 ushort [] cjkKO = new ushort [char.MaxValue +1];// - 0x4E00];
239 byte [] cjkKOlv2 = new byte [char.MaxValue +1];// - 0x4E00];
241 byte [] ignorableFlags = new byte [char.MaxValue + 1];
// Unicode version ("age") per code point, filled by ParseDerivedAge.
// Static, unlike the other tables.
243 static double [] unicodeAge = new double [char.MaxValue + 1];
// Tailoring items; iterated by SerializeTailorings.
245 ArrayList tailorings = new ArrayList ();
// Top-level driver. Parses the source data found under args [0]
// ("downloaded" when no argument is given), calls ModifyParsedValues,
// and finally writes a per-code-point diagnostic log to agelog.txt.
// Progress messages go to standard error.
// NOTE(review): the statements that belong between the progress
// messages are not present in this copy of the file.
247 void Run (string [] args)
249 string dirname = args.Length == 0 ? "downloaded" : args [0];
250 ParseSources (dirname);
251 Console.Error.WriteLine ("parse done.");
253 ModifyParsedValues ();
255 Console.Error.WriteLine ("generation done.");
257 Console.Error.WriteLine ("serialization done.");
259 StreamWriter sw = new StreamWriter ("agelog.txt");
// NOTE(review): the bound is exclusive, so U+FFFF itself is never
// visited - confirm that this is intended.
260 for (int i = 0; i < char.MaxValue; i++) {
// shouldBe: the ignorability expected from the Unicode category alone
// (Format and OtherNotAssigned are expected to be ignorable).
261 bool shouldBe = false;
262 switch (Char.GetUnicodeCategory ((char) i)) {
263 case UnicodeCategory.Format: case UnicodeCategory.OtherNotAssigned:
264 shouldBe = true; break;
// NOTE(review): the statement guarded by this test is not present here.
266 if (unicodeAge [i] >= 3.1)
268 //if (IsIgnorable (i) != shouldBe)
// Columns: age, IsIgnorable, IsIgnorableSymbol, code point (hex),
// Unicode category, and '!' when IsIgnorable disagrees with shouldBe.
269 sw.WriteLine ("{1} {2} {3} {0:X04} {4} {5}", i, unicodeAge [i], IsIgnorable (i), IsIgnorableSymbol (i), char.GetUnicodeCategory ((char) i), IsIgnorable (i) != shouldBe ? '!' : ' ');
// Typed convenience wrapper: runs CodePointIndexer.CompressArray over a
// byte table with the indexer i and casts the result back to byte [].
275 byte [] CompressArray (byte [] source, CodePointIndexer i)
277 return (byte []) CodePointIndexer.CompressArray (
278 source, typeof (byte), i);
// Typed convenience wrapper: runs CodePointIndexer.CompressArray over a
// ushort table with the indexer i and casts the result back to ushort [].
281 ushort [] CompressArray (ushort [] source, CodePointIndexer i)
283 return (ushort []) CodePointIndexer.CompressArray (
284 source, typeof (ushort), i);
// NOTE(review): the method header for this body is not present in this
// copy of the file. The statements below emit the tailoring tables, split
// map [] into per-code-point arrays, compress them, and write each table
// both as C# array text (to Result) and as raw values (to a BinaryWriter
// that ends up in ../collation.core.bin).
290 SerializeTailorings ();
292 byte [] categories = new byte [map.Length];
293 byte [] level1 = new byte [map.Length];
294 byte [] level2 = new byte [map.Length];
295 byte [] level3 = new byte [map.Length];
296 ushort [] widthCompat = new ushort [map.Length];
// Flatten each CharMapEntry into parallel arrays; level 3 is computed
// per code point rather than read from the entry.
297 for (int i = 0; i < map.Length; i++) {
298 categories [i] = map [i].Category;
299 level1 [i] = map [i].Level1;
300 level2 [i] = map [i].Level2;
301 level3 [i] = ComputeLevel3Weight ((char) i);
302 // For Japanese Half-width characters, don't
303 // map widthCompat. It is IgnoreKanaType that
304 // handles those width differences.
// NOTE(review): the statement guarded by this test is not present here;
// presumably it skips the rest of the iteration.
305 if (0xFF6D <= i && i <= 0xFF9D)
307 switch (decompType [i]) {
308 case DecompositionNarrow:
309 case DecompositionWide:
310 case DecompositionSuper:
311 case DecompositionSub:
312 // they are always 1 char
313 widthCompat [i] = (ushort) decompValues [decompIndex [i]];
// Shrink every table with its matching CodePointIndexer.
319 ignorableFlags = CompressArray (ignorableFlags,
320 MSCompatUnicodeTableUtil.Ignorable);
321 categories = CompressArray (categories,
322 MSCompatUnicodeTableUtil.Category);
323 level1 = CompressArray (level1,
324 MSCompatUnicodeTableUtil.Level1);
325 level2 = CompressArray (level2,
326 MSCompatUnicodeTableUtil.Level2);
327 level3 = CompressArray (level3,
328 MSCompatUnicodeTableUtil.Level3);
// widthCompat calls CodePointIndexer.CompressArray directly; this is
// what the ushort CompressArray helper does as well.
329 widthCompat = (ushort []) CodePointIndexer.CompressArray (
330 widthCompat, typeof (ushort),
331 MSCompatUnicodeTableUtil.WidthCompat);
// cjkCHS has its own indexer; the other CJK tables share Cjk.
332 cjkCHS = CompressArray (cjkCHS,
333 MSCompatUnicodeTableUtil.CjkCHS);
334 cjkCHT = CompressArray (cjkCHT,
335 MSCompatUnicodeTableUtil.Cjk);
336 cjkJA = CompressArray (cjkJA,
337 MSCompatUnicodeTableUtil.Cjk);
338 cjkKO = CompressArray (cjkKO,
339 MSCompatUnicodeTableUtil.Cjk);
340 cjkKOlv2 = CompressArray (cjkKOlv2,
341 MSCompatUnicodeTableUtil.Cjk);
// --- ignorableFlags ---
// Every table below follows the same pattern: write the array header to
// Result, write the element count to the binary stream, then write each
// value to both. After every 16th value a trailing comment records the
// index of the first value on that row.
344 Result.WriteLine ("internal static readonly byte [] ignorableFlags = new byte [] {");
346 MemoryStream ms = new MemoryStream ();
347 BinaryWriter binary = new BinaryWriter (ms);
348 binary.Write (ignorableFlags.Length);
350 for (int i = 0; i < ignorableFlags.Length; i++) {
351 byte value = ignorableFlags [i];
// Two text forms exist (plain decimal and 0x-prefixed hex); the test
// that selects between them is not present in this copy.
353 Result.Write ("{0},", value);
355 Result.Write ("0x{0:X02},", value);
357 binary.Write (value);
359 if ((i & 0xF) == 0xF)
360 Result.WriteLine ("// {0:X04}", i - 0xF);
362 Result.WriteLine ("};");
// --- categories ---
366 Result.WriteLine ("internal static readonly byte [] categories = new byte [] {");
368 binary.Write (categories.Length);
370 for (int i = 0; i < categories.Length; i++) {
371 byte value = categories [i];
373 Result.Write ("{0},", value);
375 Result.Write ("0x{0:X02},", value);
377 binary.Write (value);
379 if ((i & 0xF) == 0xF)
380 Result.WriteLine ("// {0:X04}", i - 0xF);
382 Result.WriteLine ("};");
385 // Primary weight value
386 Result.WriteLine ("internal static readonly byte [] level1 = new byte [] {");
388 binary.Write (level1.Length);
390 for (int i = 0; i < level1.Length; i++) {
391 byte value = level1 [i];
393 Result.Write ("{0},", value);
395 Result.Write ("0x{0:X02},", value);
397 binary.Write (value);
399 if ((i & 0xF) == 0xF)
400 Result.WriteLine ("// {0:X04}", i - 0xF);
402 Result.WriteLine ("};");
// --- level2 ---
406 Result.WriteLine ("internal static readonly byte [] level2 = new byte [] {");
408 binary.Write (level2.Length);
410 for (int i = 0; i < level2.Length; i++) {
411 byte value = level2 [i];
413 Result.Write ("{0},", value);
415 Result.Write ("0x{0:X02},", value);
417 binary.Write (value);
419 if ((i & 0xF) == 0xF)
420 Result.WriteLine ("// {0:X04}", i - 0xF);
422 Result.WriteLine ("};");
// --- level3 ---
426 Result.WriteLine ("internal static readonly byte [] level3 = new byte [] {");
428 binary.Write (level3.Length);
430 for (int i = 0; i < level3.Length; i++) {
431 byte value = level3 [i];
433 Result.Write ("{0},", value);
435 Result.Write ("0x{0:X02},", value);
437 binary.Write (value);
439 if ((i & 0xF) == 0xF)
440 Result.WriteLine ("// {0:X04}", i - 0xF);
442 Result.WriteLine ("};");
445 // Width insensitivity mappings
446 // (for now it is more lightweight than dumping the
447 // entire NFKD table).
448 Result.WriteLine ("internal static readonly ushort [] widthCompat = new ushort [] {");
450 binary.Write (widthCompat.Length);
452 for (int i = 0; i < widthCompat.Length; i++) {
// ushort elements here, so binary.Write emits two bytes per value.
453 ushort value = widthCompat [i];
455 Result.Write ("{0},", value);
457 Result.Write ("0x{0:X02},", value);
459 binary.Write (value);
461 if ((i & 0xF) == 0xF)
462 Result.WriteLine ("// {0:X04}", i - 0xF);
464 Result.WriteLine ("};");
// Persist everything accumulated in the binary stream.
467 using (FileStream fs = File.Create ("../collation.core.bin")) {
468 byte [] array = ms.ToArray ();
469 fs.Write (array, 0, array.Length);
// cjkCHS is given char.MaxValue as its limit; the other CJK tables are
// given 0x9FB0.
474 SerializeCJK ("cjkCHS", cjkCHS, char.MaxValue);
475 SerializeCJK ("cjkCHT", cjkCHT, 0x9FB0);
476 SerializeCJK ("cjkJA", cjkJA, 0x9FB0);
477 SerializeCJK ("cjkKO", cjkKO, 0x9FB0);
478 SerializeCJK ("cjkKOlv2", cjkKOlv2, 0x9FB0);
// Writes a ushort CJK table twice: as a C# array named `name` to Result,
// and as raw values (preceded by the element count) to
// ../collation.{name}.bin.
//   name - identifier used in the generated text and in the file name
//   cjk  - table to dump
//   max  - index at which the loop's limit test fires
481 void SerializeCJK (string name, ushort [] cjk, int max)
// offset is always 0 now; the trailing comment shows the earlier formula.
483 int offset = 0;//char.MaxValue - cjk.Length;
484 Result.WriteLine ("static ushort [] {0} = new ushort [] {{", name);
486 MemoryStream ms = new MemoryStream ();
487 BinaryWriter binary = new BinaryWriter (ms);
488 binary.Write (cjk.Length);
490 for (int i = 0; i < cjk.Length; i++) {
// NOTE(review): the statement guarded by this test is not present here;
// presumably it ends the loop at max - TODO confirm.
491 if (i + offset == max)
493 ushort value = cjk [i];
495 Result.Write ("{0},", value);
497 Result.Write ("0x{0:X04},", value);
499 binary.Write (value);
// After every 16th value, note the index of the first value on the row.
501 if ((i & 0xF) == 0xF)
502 Result.WriteLine ("// {0:X04}", i - 0xF + offset);
504 Result.WriteLine ("};");
507 using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) {
508 byte [] array = ms.ToArray ();
509 fs.Write (array, 0, array.Length);
// Byte-table counterpart of the ushort overload: writes the table as a
// C# array named `name` to Result and as raw bytes to
// ../collation.{name}.bin.
// NOTE(review): unlike the ushort overload, no write of cjk.Length to
// the binary stream is visible here - confirm whether readers of the
// file expect a length prefix.
514 void SerializeCJK (string name, byte [] cjk, int max)
// offset is always 0 now; the trailing comment shows the earlier formula.
516 int offset = 0;//char.MaxValue - cjk.Length;
517 Result.WriteLine ("static byte [] {0} = new byte [] {{", name);
519 MemoryStream ms = new MemoryStream ();
520 BinaryWriter binary = new BinaryWriter (ms);
522 for (int i = 0; i < cjk.Length; i++) {
// NOTE(review): the statement guarded by this test is not present here;
// presumably it ends the loop at max - TODO confirm.
523 if (i + offset == max)
525 byte value = cjk [i];
527 Result.Write ("{0},", value);
529 Result.Write ("0x{0:X02},", value);
531 binary.Write (value);
// After every 16th value, note the index of the first value on the row.
533 if ((i & 0xF) == 0xF)
534 Result.WriteLine ("// {0:X04}", i - 0xF + offset);
536 Result.WriteLine ("};");
539 using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) {
540 byte [] array = ms.ToArray ();
541 fs.Write (array, 0, array.Length);
// Emits the tailoring data in two parts: a flat char array holding every
// tailoring's items, and a TailoringInfo table that points into it.
// Both go to Result as C# text; a binary form is written to
// ../collation.tailoring.bin.
546 void SerializeTailorings ()
// indexes: LCID -> running char count at the point the tailoring starts.
// counts:  LCID -> number of chars the tailoring contributes.
548 Hashtable indexes = new Hashtable ();
549 Hashtable counts = new Hashtable ();
550 Result.WriteLine ("static char [] tailorings = new char [] {");
553 MemoryStream ms = new MemoryStream ();
554 BinaryWriter binary = new BinaryWriter (ms);
556 foreach (Tailoring t in tailorings) {
559 Result.Write ("/*{0}*/", t.LCID);
560 indexes.Add (t.LCID, count);
561 char [] values = t.ItemToCharArray ();
562 counts.Add (t.LCID, values.Length);
563 foreach (char c in values) {
564 Result.Write ("'\\x{0:X}', ", (int) c);
// Break the text line after every 16 chars and note the row's start.
565 if (++count % 16 == 0)
566 Result.WriteLine (" // {0:X04}", count - 16);
// Each char is stored as a 2-byte ushort in the raw data.
568 binary.Write ((ushort) c);
572 Result.WriteLine ("};");
574 Result.WriteLine ("static TailoringInfo [] tailoringInfos = new TailoringInfo [] {");
// Keep the char data aside and start a fresh stream: the final file has
// the info table first and the char data after it.
576 byte [] rawdata = ms.ToArray ();
577 ms = new MemoryStream ();
578 binary = new BinaryWriter (ms);
579 binary.Write (tailorings.Count);
581 foreach (Tailoring t in tailorings) {
// An alias (non-zero Alias) reuses the char data of the LCID it names.
582 int target = t.Alias != 0 ? t.Alias : t.LCID;
583 if (!indexes.ContainsKey (target)) {
// Despite the "WARNING" wording, this is thrown, not just reported.
584 throw new Exception (String.Format ("WARNING: no corresponding definition for tailoring alias. From {0} to {1}", t.LCID, t.Alias));
587 int idx = (int) indexes [target];
588 int cnt = (int) counts [target];
589 bool french = t.FrenchSort;
// NOTE(review): this scan compares against t.LCID, not target, so it
// takes FrenchSort from the last list entry sharing t's own LCID -
// confirm that this is what is wanted for aliases.
591 foreach (Tailoring t2 in tailorings)
592 if (t2.LCID == t.LCID)
593 french = t2.FrenchSort;
594 Result.WriteLine ("new TailoringInfo ({0}, 0x{1:X}, {2}, {3}), ", t.LCID, idx, cnt, french ? "true" : "false");
596 binary.Write (t.LCID);
599 binary.Write (french);
602 Result.WriteLine ("};");
// Two 0xFF bytes, then the char count (raw byte length / 2), then the
// char data itself.
604 binary.Write ((byte) 0xFF);
605 binary.Write ((byte) 0xFF);
606 binary.Write (rawdata.Length / 2);
607 binary.Write (rawdata, 0, rawdata.Length);
610 using (FileStream fs = File.Create ("../collation.tailoring.bin")) {
611 byte [] array = ms.ToArray ();
612 fs.Write (array, 0, array.Length);
// Builds the paths of the input files under dirname and runs the
// individual parsers. The tailoring source is the exception: it is
// opened by a fixed relative name, not from dirname.
// NOTE(review): several of the local declarations these path
// expressions belong to are not present in this copy of the file.
619 void ParseSources (string dirname)
622 dirname + "/UnicodeData.txt";
623 string derivedCoreProps =
624 dirname + "/DerivedCoreProperties.txt";
626 dirname + "/Scripts.txt";
628 dirname + "/CP932.TXT";
630 dirname + "/DerivedAge.txt";
631 string chXML = dirname + "/common/collation/zh.xml";
632 string jaXML = dirname + "/common/collation/ja.xml";
633 string koXML = dirname + "/common/collation/ko.xml";
635 ParseDerivedAge (derivedAge);
// Order matters here: JIS order data is needed while the Unicode data
// is being parsed.
639 ParseJISOrder (cp932); // in prior to ParseUnidata()
640 ParseUnidata (unidata);
642 ParseDerivedCoreProperties (derivedCoreProps);
643 ParseScripts (scripts);
644 ParseCJK (chXML, jaXML, koXML);
646 ParseTailorings ("mono-tailoring-source.txt");
// Reads the tailoring source file line by line and feeds each trimmed
// line to ProcessTailoringLine, carrying the current Tailoring in t.
// When a line fails, the line number is reported on standard error.
// NOTE(review): whether the exception is rethrown afterwards cannot be
// seen in this copy of the file.
649 void ParseTailorings (string filename)
653 using (StreamReader sr = new StreamReader (filename)) {
655 while (sr.Peek () >= 0) {
657 ProcessTailoringLine (ref t,
658 sr.ReadLine ().Trim ());
660 } catch (Exception) {
661 Console.Error.WriteLine ("ERROR at line {0}", line);
// Converts a tailoring source value into the string it denotes,
// turning a "\uXXXX" escape (four hex digits) into the corresponding
// char. Returns the accumulated text.
// NOTE(review): the escape test below looks at the start of s and does
// not use the loop index i, and the hex digits are always taken from
// position 2. As visible here, an escape seems to be recognised only
// at the beginning of s - confirm against the statements of the loop
// body that are missing from this copy.
667 // For now this is enough.
668 string ParseTailoringSourceValue (string s)
670 StringBuilder sb = new StringBuilder ();
671 for (int i = 0; i < s.Length; i++) {
672 if (s.StartsWith ("\\u")) {
673 sb.Append ((char) int.Parse (
674 s.Substring (2, 4), NumberStyles.HexNumber),
681 return sb.ToString ();
// Interprets one line of the tailoring source and updates t (passed by
// reference because a header line replaces it with a new Tailoring).
// Forms that can be recognised from the visible code:
//   - text after '#' is a comment and is dropped
//   - a header line with a one-char prefix followed by an LCID, with
//     an optional "=<number>" second constructor argument
//   - "*FrenchSort"
//   - "*Diacritical <hex> -> <hex>"
//   - "<source> : <hex> <hex> <hex> <hex>" (sort key map)
//   - "<source> = <replacement>" (replacement map)
// NOTE(review): the tests that choose between these forms are partly
// missing from this copy of the file.
684 void ProcessTailoringLine (ref Tailoring t, string s)
686 int idx = s.IndexOf ('#');
688 s = s.Substring (0, idx).Trim ();
689 if (s.Length == 0 || s [0] == '#')
692 idx = s.IndexOf ('=');
// Header with '=': the number between the prefix char and '=' is the
// first argument, the number after '=' the second.
695 int.Parse (s.Substring (1, idx - 1)),
696 int.Parse (s.Substring (idx + 1)));
// Header without '=': everything after the prefix char is the LCID.
698 t = new Tailoring (int.Parse (s.Substring (1)));
702 if (s.StartsWith ("*FrenchSort")) {
706 string d = "*Diacritical";
707 if (s.StartsWith (d)) {
// Both sides of "->" are parsed as hexadecimal bytes.
708 idx = s.IndexOf ("->");
709 t.AddDiacriticalMap (
710 byte.Parse (s.Substring (d.Length, idx - d.Length).Trim (),
711 NumberStyles.HexNumber),
712 byte.Parse (s.Substring (idx + 2).Trim (),
713 NumberStyles.HexNumber));
716 idx = s.IndexOf (':');
// Sort key map: the text before ':' is the source; after it come up to
// four space-separated hexadecimal bytes.
718 string source = s.Substring (0, idx).Trim ();
719 string [] l = s.Substring (idx + 1).Trim ().Split (' ');
720 byte [] b = new byte [4];
721 for (int i = 0; i < 4; i++) {
725 b [i] = byte.Parse (l [i],
726 NumberStyles.HexNumber);
728 t.AddSortKeyMap (ParseTailoringSourceValue (source),
731 idx = s.IndexOf ('=');
// Replacement map: both sides of '=' are decoded with
// ParseTailoringSourceValue.
733 t.AddReplacementMap (
734 ParseTailoringSourceValue (
735 s.Substring (0, idx).Trim ()),
736 ParseTailoringSourceValue (
737 s.Substring (idx + 1).Trim ()));
// Reads the derived-age data file. Each data line has the shape
// "<cp>[..<cpEnd>] ; <version> # comment"; the version is parsed as a
// double for the code points cp..cpEnd.
// NOTE(review): the statement inside the final for loop is not present
// in this copy; presumably it stores v into unicodeAge [i].
740 void ParseDerivedAge (string filename)
742 using (StreamReader file =
743 new StreamReader (filename)) {
744 while (file.Peek () >= 0) {
745 string s = file.ReadLine ();
// Drop the trailing '#' comment, then split at the first ';'.
746 int idx = s.IndexOf ('#');
748 s = s.Substring (0, idx);
749 idx = s.IndexOf (';');
753 string cpspec = s.Substring (0, idx);
// A ".." in the code point field marks an inclusive range.
754 idx = cpspec.IndexOf ("..");
755 NumberStyles nf = NumberStyles.HexNumber |
756 NumberStyles.AllowTrailingWhite;
757 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
758 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
759 string value = s.Substring (cpspec.Length + 1).Trim ();
// Code points beyond the UTF-16 code unit range get special treatment
// (the guarded statement is not present in this copy).
762 if (cp > char.MaxValue)
// NOTE(review): double.Parse without a format provider uses the current
// culture; under a culture with ',' as decimal separator a value such
// as "3.1" may parse differently - consider the invariant culture.
765 double v = double.Parse (value);
766 for (int i = cp; i <= cpEnd; i++)
770 unicodeAge [0] = double.MaxValue; // never be supported
// Reads the Unicode data file line by line through ProcessUnidataLine.
// All lines share one decompValues list, which is converted to the
// int [] field this.decompValues once the file has been read.
// A failing line is reported on standard error with its 1-based number.
// NOTE(review): whether the exception is rethrown afterwards cannot be
// seen in this copy of the file.
773 void ParseUnidata (string filename)
775 ArrayList decompValues = new ArrayList ();
776 using (StreamReader unidata =
777 new StreamReader (filename)) {
778 for (int line = 1; unidata.Peek () >= 0; line++) {
780 ProcessUnidataLine (unidata.ReadLine (), decompValues);
781 } catch (Exception) {
782 Console.Error.WriteLine ("**** At line " + line);
787 this.decompValues = (int [])
788 decompValues.ToArray (typeof (int));
// State carried from one ProcessUnidataLine call to the next: the most
// recent basic Latin letter seen (reset to char.MinValue after 'Z').
791 char previousLatinTarget = char.MinValue;
// One counter per letter 'A'..'Z' (26 slots), incremented each time a
// derived secondary weight is handed out for that letter.
792 byte [] diacriticalOffset = new byte ['Z' - 'A' + 1];
// Processes one line of the Unicode data file.
//   s            - the raw line ("<cp>;<name>;...")
//   decompValues - list shared by all lines; decomposition code points
//                  are appended to it
// From the visible code, a line can update: isSmallCapital, latinMap,
// diacritical, arrowValues, boxValues, sortableCharNames, the Arabic
// name/primary-value maps, nonJisJapanese, decompType / decompIndex /
// decompLength, nfkdMap and decimalValue.
// Most name tests search the whole line s rather than the name field,
// which is why some patterns end in the ';' field separator.
// NOTE(review): many short statements (assignments, else, return) are
// missing from this copy of the file, so several tests below appear
// without the statement they guard.
794 void ProcessUnidataLine (string s, ArrayList decompValues)
796 int idx = s.IndexOf ('#');
798 s = s.Substring (0, idx);
799 idx = s.IndexOf (';');
// The first field is the code point in hex; the remaining fields are
// split on ';', so values [0] is the character name.
802 int cp = int.Parse (s.Substring (0, idx), NumberStyles.HexNumber);
803 string [] values = s.Substring (idx + 1).Split (';');
806 if (cp > char.MaxValue)
808 if (IsIgnorable (cp))
811 string name = values [0];
813 // SPECIAL CASE: rename some characters for diacritical
814 // remapping. FIXME: why are they different?
815 // FIXME: it's still not working.
816 if (cp == 0x018B || cp == 0x018C)
817 name = name.Replace ("TOPBAR", "STROKE");
820 if (s.IndexOf ("SMALL CAPITAL") > 0)
821 isSmallCapital [cp] = true;
823 // latin mapping by character name
824 if (s.IndexOf ("LATIN") >= 0) {
// Find where the letter name starts: try the more specific
// "LETTER <qualifier> " prefixes before plain "LETTER ".
825 int lidx = s.IndexOf ("LETTER DOTLESS ");
826 int offset = lidx + 15;
828 lidx = s.IndexOf ("LETTER TURNED ");
832 lidx = s.IndexOf ("LETTER CAPITAL ");
836 lidx = s.IndexOf ("LETTER SCRIPT ");
840 lidx = s.IndexOf ("LETTER ");
// c: first char of the letter name; n: the char following it.
843 char c = lidx > 0 ? s [offset] : char.MinValue;
844 char n = s [offset + 1];
845 char target = char.MinValue;
// NOTE(review): && binds tighter than ||, so this reads as
// ('A' <= c && c <= 'Z' && n == ' ') || n == ';'. The parentheses
// suggest (n == ' ' || n == ';') was intended - confirm.
846 if ('A' <= c && c <= 'Z' &&
847 (n == ' ') || n == ';') {
849 // FIXME: After 'Z', I cannot reset this state.
850 previousLatinTarget = c == 'Z' ? char.MinValue : c;
// Letters whose names are words rather than a single A-Z letter.
853 if (s.Substring (offset).StartsWith ("ALPHA"))
855 else if (s.Substring (offset).StartsWith ("TONE SIX"))
857 else if (s.Substring (offset).StartsWith ("OPEN O"))
859 else if (s.Substring (offset).StartsWith ("SCHWA"))
861 else if (s.Substring (offset).StartsWith ("ENG"))
863 else if (s.Substring (offset).StartsWith ("OI;")) // 01A2,01A3
865 else if (s.Substring (offset).StartsWith ("YR;")) // 01A2,01A3
867 else if (s.Substring (offset).StartsWith ("TONE TWO"))
869 else if (s.Substring (offset).StartsWith ("ESH"))
872 // For remaining IPA chars, direct mapping is
875 case 0x0299: target = 'B'; break;
876 case 0x029A: target = 'E'; break;
877 case 0x029B: target = 'G'; break;
878 case 0x029C: target = 'H'; break;
879 case 0x029D: target = 'J'; break;
880 case 0x029E: target = 'K'; break;
881 case 0x029F: target = 'L'; break;
882 case 0x02A0: target = 'Q'; break;
883 case 0x02A7: target = 'T'; break;
884 case 0x02A8: target = 'T'; break;
// Nothing matched: fall back to the last basic letter seen.
887 if (target == char.MinValue)
888 target = previousLatinTarget;
890 if (target != char.MinValue) {
891 ArrayList entry = (ArrayList) latinMap [target];
893 entry = new ArrayList ();
894 latinMap [target] = entry;
897 // FIXME: This secondary weight is hack.
898 // They are here because they must not
899 // be identical to the corresponding
// NOTE(review): c can be char.MinValue or a char outside 'A'..'Z' at
// this point (target may come from the name tests or from
// previousLatinTarget); confirm that c - 'A' always stays inside
// diacriticalOffset.
901 if (c != target && diacritical [cp] == 0) {
902 diacriticalOffset [c - 'A']++;
903 diacritical [cp] = (byte) (diacriticalOffset [c - 'A'] + 0x7C);
// Arrow handling, for code points in U+2000..U+2FFF.
909 if (0x2000 <= cp && cp < 0x3000) {
911 // SPECIAL CASES. FIXME: why?
913 case 0x21C5: value = -1; break; // E2
914 case 0x261D: value = 1; break;
915 case 0x27A6: value = 3; break;
916 case 0x21B0: value = 7; break;
917 case 0x21B1: value = 3; break;
918 case 0x21B2: value = 7; break;
919 case 0x21B4: value = 5; break;
920 case 0x21B5: value = 7; break;
921 case 0x21B9: value = -1; break; // E1
922 case 0x21CF: value = 7; break;
923 case 0x21D0: value = 3; break;
925 string [] arrowTargets = new string [] {
938 if (s.IndexOf ("RIGHTWARDS") >= 0 &&
939 s.IndexOf ("LEFTWARDS") >= 0)
941 else if (s.IndexOf ("UPWARDS") >= 0 &&
942 s.IndexOf ("DOWNWARDS") >= 0)
944 else if (s.IndexOf ("ARROW") >= 0 &&
945 s.IndexOf ("COMBINING") < 0 &&
946 s.IndexOf ("CLOCKWISE") >= 0)
947 value = s.IndexOf ("ANTICLOCKWISE") >= 0 ? 0xE4 - 0xD8 : 0xE3 - 0xD8;
// Otherwise search the direction words; index 0 is skipped, and the
// scan stops as soon as value becomes non-zero.
949 for (int i = 1; value == 0 && i < arrowTargets.Length; i++)
950 if (s.IndexOf (arrowTargets [i]) > 0 &&
951 s.IndexOf ("BARB " + arrowTargets [i]) < 0 &&
952 s.IndexOf (" OVER") < 0
956 arrowValues.Add (new DictionaryEntry (
// Box drawing, block elements and geometric shapes (U+2500..U+25FF).
// int.MinValue means "no box value assigned".
961 if (0x2500 <= cp && cp < 0x2600) {
962 int value = int.MinValue;
964 // up:1 down:2 right:4 left:8 vert:16 horiz:32
967 // [dr] [dl] [ur] [ul]
// flags lists the recognised direction bit combinations; the position
// of a combination selects the entry of offsets used as the value.
971 ArrayList flags = new ArrayList (new int [] {
974 4 + 2, 8 + 2, 4 + 1, 8 + 1,
975 16 + 4, 1 + 2 + 4, 16 + 8, 1 + 2 + 8,
976 32 + 2, 4 + 8 + 2, 32 + 1, 4 + 8 + 1,
977 16 + 32, 1 + 2 + 4 + 8, 4 + 8 + 16, 1 + 2 + 32
979 byte [] offsets = new byte [] {
986 if (s.IndexOf ("BOX DRAWINGS ") >= 0) {
988 if (s.IndexOf (" UP") >= 0)
990 if (s.IndexOf (" DOWN") >= 0)
992 if (s.IndexOf (" RIGHT") >= 0)
994 if (s.IndexOf (" LEFT") >= 0)
996 if (s.IndexOf (" VERTICAL") >= 0)
998 if (s.IndexOf (" HORIZONTAL") >= 0)
1001 int fidx = flags.IndexOf (flag);
1003 value = offsets [fidx];
1004 } else if (s.IndexOf ("BLOCK") >= 0) {
1005 if (s.IndexOf ("ONE EIGHTH") >= 0)
1007 else if (s.IndexOf ("ONE QUARTER") >= 0)
1009 else if (s.IndexOf ("THREE EIGHTHS") >= 0)
1011 else if (s.IndexOf ("HALF") >= 0)
1013 else if (s.IndexOf ("FIVE EIGHTHS") >= 0)
1015 else if (s.IndexOf ("THREE QUARTERS") >= 0)
1017 else if (s.IndexOf ("SEVEN EIGHTHS") >= 0)
1022 else if (s.IndexOf ("SHADE") >= 0)
// Geometric shapes: values are written as differences from 0xE5.
// "VERTICAL RECTANGLE" must be tested before the plain "RECTANGLE".
1024 else if (s.IndexOf ("SQUARE") >= 0)
1025 value = 0xBC - 0xE5;
1026 else if (s.IndexOf ("VERTICAL RECTANGLE") >= 0)
1027 value = 0xBE - 0xE5;
1028 else if (s.IndexOf ("RECTANGLE") >= 0)
1029 value = 0xBD - 0xE5;
1030 else if (s.IndexOf ("PARALLELOGRAM") >= 0)
1031 value = 0xBF - 0xE5;
1032 else if (s.IndexOf ("TRIANGLE") >= 0) {
1033 if (s.IndexOf ("UP-POINTING") >= 0)
1034 value = 0xC0 - 0xE5;
1035 else if (s.IndexOf ("RIGHT-POINTING") >= 0)
1036 value = 0xC1 - 0xE5;
1037 else if (s.IndexOf ("DOWN-POINTING") >= 0)
1038 value = 0xC2 - 0xE5;
1039 else if (s.IndexOf ("LEFT-POINTING") >= 0)
1040 value = 0xC3 - 0xE5;
1042 else if (s.IndexOf ("POINTER") >= 0) {
1043 if (s.IndexOf ("RIGHT-POINTING") >= 0)
1044 value = 0xC4 - 0xE5;
1045 else if (s.IndexOf ("LEFT-POINTING") >= 0)
1046 value = 0xC5 - 0xE5;
1048 else if (s.IndexOf ("DIAMOND") >= 0)
1049 value = 0xC6 - 0xE5;
1050 else if (s.IndexOf ("FISHEYE") >= 0)
1051 value = 0xC7 - 0xE5;
1052 else if (s.IndexOf ("LOZENGE") >= 0)
1053 value = 0xC8 - 0xE5;
1054 else if (s.IndexOf ("BULLSEYE") >= 0)
1055 value = 0xC9 - 0xE5;
1056 else if (s.IndexOf ("CIRCLE") >= 0) {
1057 if (cp == 0x25D6) // it could be IndexOf ("LEFT HALF BLACK CIRCLE")
1058 value = 0xCA - 0xE5;
1059 else if (cp == 0x25D7) // it could be IndexOf ("RIGHT HALF BLACK CIRCLE")
1060 value = 0xCB - 0xE5;
1062 value = 0xC9 - 0xE5;
1064 else if (s.IndexOf ("BULLET") >= 0)
1065 value = 0xCC - 0xE5;
// U+25DA..U+25E5 get consecutive values, overriding anything chosen
// by the name tests above.
1066 if (0x25DA <= cp && cp <= 0x25E5)
1067 value = 0xCD + cp - 0x25DA - 0xE5;
1069 // SPECIAL CASE: BOX DRAWING DIAGONAL patterns
1071 case 0x2571: value = 0xF; break;
1072 case 0x2572: value = 0x10; break;
1073 case 0x2573: value = 0x11; break;
1075 if (value != int.MinValue)
1076 boxValues.Add (new DictionaryEntry (
1080 // For some characters store the name and sort later
1081 // to determine sorting.
1082 if (0x2100 <= cp && cp <= 0x213F &&
1083 Char.IsSymbol ((char) cp))
1084 sortableCharNames.Add (
1085 new DictionaryEntry (cp, name));
// For U+3380..U+33DD the first 7 chars of the name are dropped
// before it is stored.
1086 else if (0x3380 <= cp && cp <= 0x33DD)
1087 sortableCharNames.Add (new DictionaryEntry (
1088 cp, name.Substring (7)));
1090 if (Char.GetUnicodeCategory ((char) cp) ==
1091 UnicodeCategory.MathSymbol) {
1092 if (name.StartsWith ("CIRCLED "))
1093 diacritical [cp] = 0xEE;
1094 if (name.StartsWith ("SQUARED "))
1095 diacritical [cp] = 0xEF;
1098 // diacritical weights by character name
1099 if (diacritics.Length != diacriticWeights.Length)
1100 throw new Exception (String.Format ("Should not happen. weights are {0} while labels are {1}", diacriticWeights.Length, diacritics.Length));
1101 for (int d = 0; d < diacritics.Length; d++) {
// "> 0" (not ">= 0"): a fragment at the very start of the line does
// not count. Weights are added, so several matching fragments
// accumulate; COMBINING characters get 2 less per match.
1102 if (s.IndexOf (diacritics [d]) > 0) {
1103 diacritical [cp] += diacriticWeights [d];
1104 if (s.IndexOf ("COMBINING") >= 0)
1105 diacritical [cp] -= (byte) 2;
1108 // also process "COMBINING blah" here
1109 // For now it is limited to cp < 0x0370
1110 // if (cp < 0x0300 || cp >= 0x0370)
// Build the "COMBINING <mark>" name for this fragment: strip the
// trailing ';', drop a leading "WITH" (Substring (4) keeps the
// space), and add a separating space only when one is not already
// there.
1112 string tmp = diacritics [d].TrimEnd (';');
1113 if (tmp.IndexOf ("WITH ") == 0)
1114 tmp = tmp.Substring (4);
1115 tmp = String.Concat ("COMBINING", (tmp [0] != ' ' ? " " : ""), tmp);
1117 diacritical [cp] = (byte) (diacriticWeights [d] - 2);
1121 //Console.Error.WriteLine ("======= {2:X04} : '{0}' / '{1}'", name, tmp, cp);
1123 // Two-step grep required for it.
1124 if (s.IndexOf ("FULL STOP") > 0 &&
1125 (s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0))
1126 diacritical [cp] |= 0xF4;
// SCRIPT letters: 3 for SMALL, 5 for CAPITAL, 4 otherwise. This
// assignment replaces any weight accumulated above.
1127 if (s.StartsWith ("SCRIPT") || s.IndexOf (" SCRIPT ") > 0)
1128 diacritical [cp] = (byte) (s.IndexOf ("SMALL") > 0 ? 3 :
1129 s.IndexOf ("CAPITAL") > 0 ? 5 : 4);
1131 // Arabic letter name
1132 if (0x0621 <= cp && cp <= 0x064A &&
1133 Char.GetUnicodeCategory ((char) cp)
1134 == UnicodeCategory.OtherLetter) {
// Default primary value: derived from how many distinct letter
// names have been registered so far.
1135 byte value = (byte) (arabicNameMap.Count * 4 + 0x0B);
1140 // hamza, waw, yeh ... special cases.
1145 value = 0x77; // special cases.
1148 // Get primary letter name i.e.
1149 // XXX part of ARABIC LETTER XXX yyy
1150 // e.g. that of "TEH MARBUTA" is "TEH".
1153 // 0x0640 is special: it does
1154 // not start with ARABIC LETTER
// 14 is the length of "ARABIC LETTER ".
1156 name.Substring (14);
1157 int tmpIdx = letterName.IndexOf (' ');
1158 letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
1159 //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
// A known letter name reuses the primary value of the code point
// that registered it; a new name is registered with this cp.
1160 if (arabicNameMap.ContainsKey (letterName))
1161 value = (byte) arabicLetterPrimaryValues [arabicNameMap [letterName]];
1163 arabicNameMap [letterName] = cp;
1166 arabicLetterPrimaryValues [cp] = value;
1169 // Japanese square letter
1170 if (0x3300 <= cp && cp <= 0x3357)
1171 if (!ExistsJIS (cp))
1172 nonJisJapanese.Add (new NonJISCharacter (cp, name));
1174 // normalizationType
// values [4] is the decomposition field: an optional "<tag>" followed
// by space-separated hex code points.
1175 string decomp = values [4];
1176 idx = decomp.IndexOf ('<');
// NOTE(review): the length argument IndexOf ('>') - 1 yields the tag
// text only when '<' is at position 0 (idx == 0) - confirm that the
// field always starts with the tag.
1178 switch (decomp.Substring (idx + 1, decomp.IndexOf ('>') - 1)) {
1180 decompType [cp] = DecompositionFull;
1183 decompType [cp] = DecompositionSub;
1186 decompType [cp] = DecompositionSuper;
1189 decompType [cp] = DecompositionSmall;
1192 decompType [cp] = DecompositionIsolated;
1195 decompType [cp] = DecompositionInitial;
1198 decompType [cp] = DecompositionFinal;
1201 decompType [cp] = DecompositionMedial;
1204 decompType [cp] = DecompositionNoBreak;
1207 decompType [cp] = DecompositionCompat;
1210 decompType [cp] = DecompositionFraction;
1213 decompType [cp] = DecompositionFont;
1216 decompType [cp] = DecompositionCircle;
1219 decompType [cp] = DecompositionSquare;
1222 decompType [cp] = DecompositionWide;
1225 decompType [cp] = DecompositionNarrow;
1228 decompType [cp] = DecompositionVertical;
1231 throw new Exception ("Support NFKD type : " + decomp);
1235 decompType [cp] = DecompositionCanonical;
// Skip past "> " when a tag is present, leaving the code point list.
1236 decomp = idx < 0 ? decomp : decomp.Substring (decomp.IndexOf ('>') + 2);
1237 if (decomp.Length > 0) {
// Append this character's decomposition to the shared list and
// remember where it starts and how long it is.
1239 string [] velems = decomp.Split (' ');
1240 int didx = decompValues.Count;
1241 decompIndex [cp] = didx;
1242 foreach (string v in velems)
1243 decompValues.Add (int.Parse (v, NumberStyles.HexNumber));
1244 decompLength [cp] = velems.Length;
1246 // [decmpType] -> this_cp
1247 int targetCP = (int) decompValues [didx];
1248 // for "(x)" it specially maps to 'x' .
1249 // FIXME: check if it is sane
1250 if (velems.Length == 3 &&
1251 (int) decompValues [didx] == '(' &&
1252 (int) decompValues [didx + 2] == ')')
1253 targetCP = (int) decompValues [didx + 1];
1254 // special: 0x215F "1/"
1255 else if (cp == 0x215F)
1257 else if (velems.Length > 1 &&
1258 (targetCP < 0x4C00 || 0x9FBB < targetCP))
1259 // skip them, except for CJK ideograph compat
// Reverse map: base code point -> (decomposition type -> this cp).
1262 if (targetCP != 0) {
1263 Hashtable entry = (Hashtable) nfkdMap [targetCP];
1264 if (entry == null) {
1265 entry = new Hashtable ();
1266 nfkdMap [targetCP] = entry;
1268 entry [(byte) decompType [cp]] = cp;
// Numeric value: the first non-empty of values [5], values [6] and
// values [7] is used. values [7] may be a fraction "a/b", a
// parenthesised number "(n)" or a number with a trailing '.'.
1272 if (values [5].Length > 0)
1273 decimalValue [cp] = decimal.Parse (values [5]);
1274 else if (values [6].Length > 0)
1275 decimalValue [cp] = decimal.Parse (values [6]);
1276 else if (values [7].Length > 0) {
1277 string decstr = values [7];
1278 idx = decstr.IndexOf ('/');
1279 if (cp == 0x215F) // special. "1/"
1280 decimalValue [cp] = 0x1;
1284 decimal.Parse (decstr.Substring (0, idx))
1285 / decimal.Parse (decstr.Substring (idx + 1));
1286 else if (decstr [0] == '(' &&
1287 decstr [decstr.Length - 1] == ')')
1290 decimal.Parse (decstr.Substring (1, decstr.Length - 2));
1291 else if (decstr [decstr.Length - 1] == '.')
1294 decimal.Parse (decstr.Substring (0, decstr.Length - 1));
1296 decimalValue [cp] = decimal.Parse (decstr);
// Reads the file named by "filename" one line at a time and passes each
// line to ProcessDerivedCorePropLine (). "line" is a 1-based counter that
// is only used to report where a failure happened.
// NOTE(review): this listing omits some source lines (for instance the
// "try" that pairs with the catch below and whatever follows the error
// message), so whether the exception is rethrown cannot be confirmed here.
1300 void ParseDerivedCoreProperties (string filename)
1303 using (StreamReader file =
1304 new StreamReader (filename)) {
1305 for (int line = 1; file.Peek () >= 0; line++) {
1307 ProcessDerivedCorePropLine (file.ReadLine ());
1308 } catch (Exception) {
// Report the number of the input line that failed on stderr.
1309 Console.Error.WriteLine ("**** At line " + line);
// Handles one line of the derived-core-properties input. The visible code
// treats a line as "<cp>[..<cpEnd>] ; <value> [# comment]", where <cp> and
// <cpEnd> are hexadecimal code points, and marks every code point of the
// range in isUppercase.
// NOTE(review): the guards around several statements (e.g. the checks on
// "idx" and the test on "value" that selects the property) are not present
// in this listing; presumably only the "Uppercase" property reaches the
// loop at the end -- confirm against the full source.
1316 void ProcessDerivedCorePropLine (string s)
// Drop a trailing "# ..." comment.
1318 int idx = s.IndexOf ('#');
1320 s = s.Substring (0, idx);
// The field before ';' is the code point spec.
1321 idx = s.IndexOf (';');
1324 string cpspec = s.Substring (0, idx);
// ".." separates the first and last code point of a range.
1325 idx = cpspec.IndexOf ("..");
1326 NumberStyles nf = NumberStyles.HexNumber |
1327 NumberStyles.AllowTrailingWhite;
// Without "..", the range consists of the single code point cp.
1328 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1329 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
// Everything after the code point spec and its ';', trimmed.
1330 string value = s.Substring (cpspec.Length + 1).Trim ();
// Code points above U+FFFF get special handling (the action itself is not
// visible here; presumably the line is skipped -- TODO confirm).
1333 if (cp > char.MaxValue)
// Flag the whole inclusive range cp..cpEnd.
1338 for (int x = cp; x <= cpEnd; x++)
1339 isUppercase [x] = true;
// Reads the scripts file named by "filename" and collects the
// non-ignorable characters of four scripts (Gurmukhi, Gujarati, Georgian,
// Thaana) into lists. The lists are then sorted with UCAComparer.Instance
// and stored into orderedGurmukhi / orderedGujarati / orderedGeorgian /
// orderedThaana as char arrays.
// NOTE(review): the tests on "value" that decide which list a range goes
// to, and the guards on "idx", are not present in this listing; the
// mapping from script name to list is inferred from the variable names.
1344 void ParseScripts (string filename)
1346 ArrayList gurmukhi = new ArrayList ();
1347 ArrayList gujarati = new ArrayList ();
1348 ArrayList georgian = new ArrayList ();
1349 ArrayList thaana = new ArrayList ();
1351 using (StreamReader file =
1352 new StreamReader (filename)) {
1353 while (file.Peek () >= 0) {
1354 string s = file.ReadLine ();
// Drop a trailing "# ..." comment.
1355 int idx = s.IndexOf ('#');
1357 s = s.Substring (0, idx);
// The field before ';' is the code point spec.
1358 idx = s.IndexOf (';');
1362 string cpspec = s.Substring (0, idx);
// ".." separates the first and last code point of a range.
1363 idx = cpspec.IndexOf ("..");
1364 NumberStyles nf = NumberStyles.HexNumber |
1365 NumberStyles.AllowTrailingWhite;
// Without "..", the range consists of the single code point cp.
1366 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1367 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
// Everything after the code point spec and its ';', trimmed.
1368 string value = s.Substring (cpspec.Length + 1).Trim ();
// Code points above U+FFFF get special handling (action not visible here;
// presumably the line is skipped -- TODO confirm).
1371 if (cp > char.MaxValue)
// Each loop below adds the inclusive range cp..cpEnd, minus characters for
// which IsIgnorable () is true, to one script list.
1376 for (int x = cp; x <= cpEnd; x++)
1377 if (!IsIgnorable (x))
1378 gurmukhi.Add ((char) x);
1381 for (int x = cp; x <= cpEnd; x++)
1382 if (!IsIgnorable (x))
1383 gujarati.Add ((char) x);
1386 for (int x = cp; x <= cpEnd; x++)
1387 if (!IsIgnorable (x))
1388 georgian.Add ((char) x);
1391 for (int x = cp; x <= cpEnd; x++)
1392 if (!IsIgnorable (x))
1393 thaana.Add ((char) x);
// Order every script list with the UCA comparer, then publish the results
// as char arrays.
1398 gurmukhi.Sort (UCAComparer.Instance);
1399 gujarati.Sort (UCAComparer.Instance);
1400 georgian.Sort (UCAComparer.Instance);
1401 thaana.Sort (UCAComparer.Instance);
1402 orderedGurmukhi = (char []) gurmukhi.ToArray (typeof (char));
1403 orderedGujarati = (char []) gujarati.ToArray (typeof (char));
1404 orderedGeorgian = (char []) georgian.ToArray (typeof (char));
1405 orderedThaana = (char []) thaana.ToArray (typeof (char));
// Reads the JIS ordering file named by "filename" and passes every line
// to ProcessJISOrderLine (). If an exception occurs, the current value of
// "line" is written to stderr.
// NOTE(review): the declaration/initial value of "line", the "try" that
// pairs with the catch below, and whatever follows the error message are
// not present in this listing, so whether the exception is rethrown
// cannot be confirmed from here.
1408 void ParseJISOrder (string filename)
1412 using (StreamReader file =
1413 new StreamReader (filename)) {
1414 for (;file.Peek () >= 0; line++)
1415 ProcessJISOrderLine (file.ReadLine ());
1417 } catch (Exception) {
1418 Console.Error.WriteLine ("---- line {0}", line);
// Field separators (tab and space) used by ProcessJISOrderLine () to find
// the end of the first column.
1423 char [] ws = new char [] {'\t', ' '};
// Handles one line of the JIS ordering file. The visible code expects two
// whitespace-separated hexadecimal columns, each prefixed with "0x": the
// JIS code first, the Unicode code point second. The pair is appended to
// jisJapanese as a JISCharacter.
// NOTE(review): the guards on "idx" (comment-only or empty lines) are not
// present in this listing.
1425 void ProcessJISOrderLine (string s)
// Drop a trailing "# ..." comment and surrounding whitespace.
1427 int idx = s.IndexOf ('#');
1429 s = s.Substring (0, idx).Trim ();
// idx now marks the end of the first column.
1432 idx = s.IndexOfAny (ws);
1435 // They start with "0x" so cut them out.
1436 int jis = int.Parse (s.Substring (2, idx - 2), NumberStyles.HexNumber);
1437 int cp = int.Parse (s.Substring (idx).Trim ().Substring (2), NumberStyles.HexNumber);
1438 jisJapanese.Add (new JISCharacter (cp, jis));
// Builds the culture-specific CJK weight tables.
//  - Chinese Simplified: characters of the 'pinyin' collation rule text
//    receive consecutive values in the order they appear.
//  - Chinese Traditional: same, using the 'stroke' collation rule text.
//  - Japanese: a few fixed entries, then characters taken from jisJapanese
//    (plus characters whose decomposition contains them) receive
//    consecutive values.
//  - Korean: for each "reset" element a combined value is computed from
//    the reset Hangul syllable and assigned to the characters that follow.
// NOTE(review): many source lines are not present in this listing (the
// loading of zhXML / jaXML / koXML into "doc", the declarations and
// initial values of arr, offset, s, v, p and category, and several
// conditions), so which table "arr" refers to in each section cannot be
// confirmed from here.
1441 void ParseCJK (string zhXML, string jaXML, string koXML)
1443 XmlDocument doc = new XmlDocument ();
// No resolver is attached, so external resources referenced by the XML
// are not resolved.
1444 doc.XmlResolver = null;
1451 // Chinese Simplified
1454 offset = 0;//char.MaxValue - arr.Length;
1456 s = doc.SelectSingleNode ("/ldml/collations/collation[@type='pinyin']/rules/pc").InnerText;
1458 foreach (char c in s) {
// Warning path for a character that is left out (its condition is not
// visible in this listing).
1460 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
// Otherwise the character gets the next value.
1462 arr [(int) c - offset] = (ushort) v++;
1468 // Chinese Traditional
1471 offset = 0;//char.MaxValue - arr.Length;
1472 s = doc.SelectSingleNode ("/ldml/collations/collation[@type='stroke']/rules/pc").InnerText;
1474 foreach (char c in s) {
1476 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1478 arr [(int) c - offset] = (ushort) v++;
// Japanese section (its heading comment is not present in this listing).
1487 offset = 0;//char.MaxValue - arr.Length;
1490 arr [0x4EDD] = 0x8002; // Chinese repetition mark?
1491 arr [0x337B] = 0x8004; // Those 4 characters are Gengou
1492 arr [0x337E] = 0x8005;
1493 arr [0x337D] = 0x8006;
1494 arr [0x337C] = 0x8007;
1497 foreach (JISCharacter jc in jisJapanese) {
// Entries with a JIS code below 0x8800 are treated specially (the action
// is not visible here; presumably they are skipped -- TODO confirm).
1498 if (jc.JIS < 0x8800)
1500 char c = (char) jc.CP;
1503 // Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1506 arr [(int) c - offset] = (ushort) v++;
// Special cases keyed on particular ideographs; the trailing comments name
// the related compatibility code points. The statements executed for each
// case are not visible in this listing.
1511 if (c == '\u662D') // U+337C
1513 if (c == '\u5927') // U+337D
1515 if (c == '\u5E73') // U+337B
1517 if (c == '\u660E') // U+337E
1519 if (c == '\u9686') // U+F9DC
1522 // FIXME: there are still remaining
1523 // characters after U+FA0C.
1524 // for (int k = 0; k < char.MaxValue; k++) {
// Scan code points below U+FA0D for characters whose decomposition
// refers to c and give them the next value as well.
1525 for (int k = 0; k < '\uFA0D'; k++) {
1526 if (decompIndex [k] == 0 || IsIgnorable (k))
// Match when the first decomposition value is c, or when the decomposition
// has three values and the middle one is c.
1528 if (decompValues [decompIndex [k]] == c /*&&
1529 decompLength [k] == 1*/ ||
1530 decompLength [k] == 3 &&
1531 decompValues [decompIndex [k] + 1] == c) {
1532 arr [k - offset] = (ushort) v++;
1541 // Korean weight is somewhat complex. It first shifts
1542 // Hangul category from 52-x to 80-x (they are anyways
1543 // computed). CJK ideographs are placed at secondary
1544 // weight, like XX YY 01 zz 01, where XX and YY are
1545 // corresponding "reset" value and zz is 41,43,45...
1547 // Unlike chs,cht and ja, Korean value is a combined
1548 // ushort which is computed as category
1552 offset = 0;//char.MaxValue - arr.Length;
1554 foreach (XmlElement reset in doc.SelectNodes ("/ldml/collations/collation/rules/reset")) {
// The element right after the "reset" element.
1555 XmlElement sc = (XmlElement) reset.NextSibling;
1556 // compute "category" and "level 1" for the
1557 // target "reset" Hangul syllable
1558 char rc = reset.InnerText [0];
// 1-based position of rc counted from U+AC00.
1559 int ri = ((int) rc - 0xAC00) + 1;
// (The beginning of this statement is not present in this listing.)
1561 ((ri / 254) * 256 + (ri % 254) + 2);
1562 // Place the characters after the target.
1565 foreach (char c in s) {
// p goes to the main table, v to the Korean level 2 table.
1566 arr [(int) c - offset] = p;
1567 cjkKOlv2 [(int) c - offset] = (byte) v;
// Fills ignorableFlags for every UTF-16 code unit value (0..char.MaxValue)
// as a bit mask:
//   1 - IsIgnorable (i) is true
//   2 - IsIgnorableSymbol (i) is true
//   4 - IsIgnorableNonSpacing (i) is true
1577 void FillIgnorables ()
1579 for (int i = 0; i <= char.MaxValue; i++) {
// Unassigned code points get special handling (the action is not visible
// in this listing; presumably they are skipped -- TODO confirm).
1580 if (Char.GetUnicodeCategory ((char) i) ==
1581 UnicodeCategory.OtherNotAssigned)
1583 if (IsIgnorable (i))
1584 ignorableFlags [i] |= 1;
1585 if (IsIgnorableSymbol (i))
1586 ignorableFlags [i] |= 2;
1587 if (IsIgnorableNonSpacing (i))
1588 ignorableFlags [i] |= 4;
// Patches the parsed Unicode data tables (decompType, decompIndex,
// decompLength, decompValues, diacritical) with hard-coded overrides.
// NOTE(review): a few source lines are not present in this listing (e.g.
// the first statement of the U+FE31..U+FE34 loop).
1592 void ModifyUnidata ()
1594 // Modify some decomposition equivalence
// Clear the decomposition information of U+FE31..U+FE34 ...
1595 for (int i = 0xFE31; i <= 0xFE34; i++) {
1597 decompIndex [i] = 0;
1598 decompLength [i] = 0;
// ... and of U+037E.
1600 decompType [0x037E] = 0;
1601 decompIndex [0x037E] = 0;
1602 decompLength [0x037E] = 0;
// Fixed diacritical weight for U+3021..U+3029.
1605 for (int i = 0x3021; i <= 0x3029; i++)
1606 diacritical [i] = 0x4E;
1607 // Korean parens numbers
1608 for (int i = 0x3200; i <= 0x321C; i++)
1609 diacritical [i] = 0xA;
1610 for (int i = 0x3260; i <= 0x327B; i++)
1611 diacritical [i] = 0xC;
1613 // LAMESPEC: these remapping should not be done.
1614 // Windows have incorrect CJK compat mappings.
// Each override replaces the first decomposition value of the code point;
// where decompLength is set to 1, the decomposition is also shortened to
// that single value.
1615 decompValues [decompIndex [0x32A9]] = 0x91AB;
1616 decompLength [0x323B] = 1;
1617 decompValues [decompIndex [0x323B]] = 0x5B78;
1618 decompValues [decompIndex [0x32AB]] = 0x5B78;
1619 decompValues [decompIndex [0x32A2]] = 0x5BEB;
1620 decompLength [0x3238] = 1;
1621 decompValues [decompIndex [0x3238]] = 0x52DE;
1622 decompValues [decompIndex [0x3298]] = 0x52DE;
1624 // LAMESPEC: custom remapping (which is not bugs but not fine, non-standard compliant things)
// U+FA0C takes over the decompValues slot that belonged to U+F929 and is
// given the single value 0x5140; U+F929 then loses its decomposition.
1625 decompIndex [0xFA0C] = decompIndex [0xF929]; // borrow U+F929 room (being empty)
1626 decompValues [decompIndex [0xFA0C]] = 0x5140;
1627 decompLength [0xFA0C] = 1;
1628 decompIndex [0xF929] = decompLength [0xF929] = 0;
1630 decompValues [decompIndex [0xF92C]] = 0x90DE;
// Applies hard-coded corrections to values derived from the parsed data:
//  1. diacritical weights of some Cyrillic letters (U+0496..U+04A5),
//  2. diacritical (secondary) weights of number characters, by range,
//  3. the names stored in sortableCharNames (some renamed, some removed).
// NOTE(review): several source lines are not present in this listing (the
// declaration of "weight", the switch header, the case labels of the
// removed entries and the statements following RemoveAt).
1633 void ModifyParsedValues ()
1635 // some cyrillic diacritical weight. They seem to be
1636 // based on old character names, so it's quicker to
1637 // set them directly here.
1638 diacritical [0x0496] = diacritical [0x0497] = 7;
1639 diacritical [0x0498] = diacritical [0x0499] = 0x1A;
1640 diacritical [0x049A] = diacritical [0x049B] = 0x17;
1641 diacritical [0x049C] = diacritical [0x049D] = 9;
1642 diacritical [0x049E] = diacritical [0x049F] = 4;
1643 diacritical [0x04A0] = diacritical [0x04A1] = 0xA;
1644 diacritical [0x04A2] = diacritical [0x04A3] = 7;
1645 diacritical [0x04A4] = diacritical [0x04A5] = 8;
1647 // number, secondary weights
// numberSecondaryWeightBounds is read as consecutive (start, end) pairs;
// "start" is inclusive and "end" is exclusive. Every pair uses the next
// weight value, applied only to characters for which Char.IsNumber () is
// true.
1649 int [] numarr = numberSecondaryWeightBounds;
1650 for (int i = 0; i < numarr.Length; i += 2, weight++)
1651 for (int cp = numarr [i]; cp < numarr [i + 1]; cp++)
1652 if (Char.IsNumber ((char) cp))
1653 diacritical [cp] = weight;
1655 // Update name part of named characters
// Each item of sortableCharNames is a DictionaryEntry whose key is the
// code point.
1656 for (int i = 0; i < sortableCharNames.Count; i++) {
1657 DictionaryEntry de =
1658 (DictionaryEntry) sortableCharNames [i];
1659 int cp = (int) de.Key;
1660 string renamed = null;
1662 case 0x2101: renamed = "A_1"; break;
1663 case 0x33C3: renamed = "A_2"; break;
1664 case 0x2105: renamed = "C_1"; break;
1665 case 0x2106: renamed = "C_2"; break;
1666 case 0x211E: renamed = "R1"; break;
1667 case 0x211F: renamed = "R2"; break;
1668 // Remove some of them!
// NOTE(review): removing while iterating by index needs the index to be
// adjusted afterwards; that adjustment is not visible in this listing.
1679 sortableCharNames.RemoveAt (i);
// Replace the entry with one that carries the new name.
1683 if (renamed != null)
1684 sortableCharNames [i] =
1685 new DictionaryEntry (cp, renamed);
1689 void GenerateCore ()
1693 #region Specially ignored // 01
1694 // This will raise "Defined" flag up.
1695 // FIXME: Check If it is really fine. Actually for
1696 // Japanese voice marks this code does remapping.
1697 foreach (char c in specialIgnore)
1698 map [(int) c] = new CharMapEntry (0, 0, 0);
1701 #region Extenders (FF FF)
1702 fillIndex [0xFF] = 0xFF;
1703 char [] specialBiggest = new char [] {
1704 '\u3005', '\u3031', '\u3032', '\u309D',
1705 '\u309E', '\u30FC', '\u30FD', '\u30FE',
1706 '\uFE7C', '\uFE7D', '\uFF70'};
1707 foreach (char c in specialBiggest)
1708 AddCharMap (c, 0xFF, 0);
1711 #region Variable weights
1712 // Controls : 06 03 - 06 3D
1713 fillIndex [0x6] = 3;
1714 for (int i = 0; i < 65536; i++) {
1715 if (IsIgnorable (i))
1718 uc = Char.GetUnicodeCategory (c);
1719 // NEL is whitespace but not ignored here.
1720 if (uc == UnicodeCategory.Control &&
1721 !Char.IsWhiteSpace (c) || c == '\u0085')
1722 AddCharMap (c, 6, 1);
1726 fillIndex [0x6] = 0x80;
1727 AddCharMap ('\'', 6, 0);
1728 AddCharMap ('\uFF07', 6, 1);
1729 AddCharMap ('\uFE63', 6, 1);
1731 // SPECIAL CASE: fill FE32 here in prior to be added
1732 // at 2013. Windows does not always respect NFKD.
1733 map [0xFE32] = new CharMapEntry (6, 0x90, 0);
1735 // Hyphen/Dash : 06 81 - 06 90
1736 for (int i = 0; i < char.MaxValue; i++) {
1737 if (!IsIgnorable (i) &&
1738 Char.GetUnicodeCategory ((char) i) ==
1739 UnicodeCategory.DashPunctuation) {
1740 AddCharMapGroup2 ((char) i, 6, 1, 0);
1742 // SPECIAL: add 2027 and 2043
1743 // Maybe they are regarded the
1744 // same hyphens in "central"
1746 AddCharMap ('\u2027', 6, 1);
1747 AddCharMap ('\u2043', 6, 1);
1751 // They are regarded as primarily equivalent to '-'
1752 map [0x208B] = new CharMapEntry (6, 0x82, 0);
1753 map [0x207B] = new CharMapEntry (6, 0x82, 0);
1754 map [0xFF0D] = new CharMapEntry (6, 0x82, 0);
1756 // Arabic variable weight chars 06 A0 -
1757 fillIndex [6] = 0xA0;
1759 for (int i = 0x64B; i <= 0x650; i++)
1760 AddArabicCharMap ((char) i);
1762 AddCharMapGroup ('\u0652', 6, 1, 0);
1764 AddCharMapGroup ('\u0651', 6, 1, 0);
1768 #region Nonspacing marks // 01
1769 // FIXME: 01 03 - 01 B6 ... annoyance :(
1771 // Combining diacritical marks: 01 DC -
1773 fillIndex [0x1] = 0x41;
1774 for (int i = 0x030E; i <= 0x0326; i++)
1775 if (!IsIgnorable (i))
1776 AddCharMap ((char) i, 0x1, 1);
1777 for (int i = 0x0329; i <= 0x0334; i++)
1778 if (!IsIgnorable (i))
1779 AddCharMap ((char) i, 0x1, 1);
1781 for (int i = 0x0339; i <= 0x0341; i++)
1782 if (!IsIgnorable (i))
1783 AddCharMap ((char) i, 0x1, 1);
1784 fillIndex [0x1] = 0x74;
1785 for (int i = 0x0346; i <= 0x0348; i++)
1786 if (!IsIgnorable (i))
1787 AddCharMap ((char) i, 0x1, 1);
1788 for (int i = 0x02BE; i <= 0x02BF; i++)
1789 if (!IsIgnorable (i))
1790 AddCharMap ((char) i, 0x1, 1);
1791 for (int i = 0x02C1; i <= 0x02C5; i++)
1792 if (!IsIgnorable (i))
1793 AddCharMap ((char) i, 0x1, 1);
1794 for (int i = 0x02CE; i <= 0x02CF; i++)
1795 if (!IsIgnorable (i))
1796 AddCharMap ((char) i, 0x1, 1);
1798 for (int i = 0x02D1; i <= 0x02D3; i++)
1799 if (!IsIgnorable (i))
1800 AddCharMap ((char) i, 0x1, 1);
1801 AddCharMap ('\u02DE', 0x1, 1);
1802 for (int i = 0x02E4; i <= 0x02E9; i++)
1803 if (!IsIgnorable (i))
1804 AddCharMap ((char) i, 0x1, 1);
1806 // FIXME: needs more love here (it should eliminate
1807 // all the hacky code above).
1808 for (int i = 0x0300; i < 0x0370; i++)
1809 if (!IsIgnorable (i) && diacritical [i] != 0
1810 /* especiall here*/ && !map [i].Defined)
1811 map [i] = new CharMapEntry (
1812 0x1, 0x1, diacritical [i]);
1814 // Cyrillic and Armenian nonspacing mark
1815 fillIndex [0x1] = 0x94;
1816 for (int i = 0x400; i < 0x580; i++)
1817 if (!IsIgnorable (i) &&
1818 Char.GetUnicodeCategory ((char) i) ==
1819 UnicodeCategory.NonSpacingMark)
1820 AddCharMap ((char) i, 1, 1);
1822 fillIndex [0x1] = 0x8D;
1823 // syriac dotted nonspacing marks (1)
1824 AddCharMap ('\u0740', 0x1, 1);
1825 AddCharMap ('\u0741', 0x1, 1);
1826 AddCharMap ('\u0742', 0x1, 1);
1827 // syriac oblique nonspacing marks
1828 AddCharMap ('\u0747', 0x1, 1);
1829 AddCharMap ('\u0748', 0x1, 1);
1830 // syriac dotted nonspacing marks (2)
1831 fillIndex [0x1] = 0x94; // this reset is mandatory
1832 AddCharMap ('\u0732', 0x1, 1);
1833 AddCharMap ('\u0735', 0x1, 1);
1834 AddCharMap ('\u0738', 0x1, 1);
1835 AddCharMap ('\u0739', 0x1, 1);
1836 AddCharMap ('\u073C', 0x1, 1);
1837 // SPECIAL CASES: superscripts
1838 AddCharMap ('\u073F', 0x1, 1);
1839 AddCharMap ('\u0711', 0x1, 1);
1841 for (int i = 0x0743; i <= 0x0746; i++)
1842 AddCharMap ((char) i, 0x1, 1);
1843 for (int i = 0x0730; i <= 0x0780; i++)
1844 if (!map [i].Defined &&
1845 Char.GetUnicodeCategory ((char) i) ==
1846 UnicodeCategory.NonSpacingMark)
1847 AddCharMap ((char) i, 0x1, 1);
1849 // LAMESPEC: It should not stop at '\u20E1'. There are
1850 // a few more characters (that however results in
1851 // overflow of level 2 unless we start before 0xDD).
1852 fillIndex [0x1] = 0xDD;
1853 for (int i = 0x20D0; i <= 0x20DC; i++)
1854 AddCharMap ((char) i, 0x1, 1);
1855 fillIndex [0x1] = 0xEC;
1856 for (int i = 0x20DD; i <= 0x20E1; i++)
1857 AddCharMap ((char) i, 0x1, 1);
1858 fillIndex [0x1] = 0x7;
1859 for (int i = 0x302A; i <= 0x302D; i++)
1860 AddCharMap ((char) i, 0x1, 1);
1861 fillIndex [0x1] = 0x50; // I wonder how they are sorted
1862 for (int i = 0x02D4; i <= 0x02D7; i++)
1863 AddCharMap ((char) i, 0x1, 1);
1865 // They are not part of Nonspacing marks, but have
1866 // only diacritical weight.
1867 for (int i = 0x3099; i <= 0x309C; i++)
1868 map [i] = new CharMapEntry (1, 1, 1);
1869 map [0xFF9E] = new CharMapEntry (1, 1, 1);
1870 map [0xFF9F] = new CharMapEntry (1, 1, 2);
1871 map [0x309D] = new CharMapEntry (0xFF, 0xFF, 1);
1872 map [0x309E] = new CharMapEntry (0xFF, 0xFF, 1);
1873 for (int i = 0x30FC; i <= 0x30FE; i++)
1874 map [i] = new CharMapEntry (0xFF, 0xFF, 1);
1876 fillIndex [0x1] = 0xA;
1877 for (int i = 0x0951; i <= 0x0954; i++)
1878 AddCharMap ((char) i, 0x1, 2);
1883 #region Whitespaces // 07 03 -
1884 fillIndex [0x7] = 0x2;
1885 AddCharMap (' ', 0x7, 2);
1886 AddCharMap ('\u00A0', 0x7, 1);
1887 for (int i = 9; i <= 0xD; i++)
1888 AddCharMap ((char) i, 0x7, 1);
1889 for (int i = 0x2000; i <= 0x200B; i++)
1890 AddCharMap ((char) i, 0x7, 1);
1892 fillIndex [0x7] = 0x17;
1893 AddCharMapGroup ('\u2028', 0x7, 1, 0);
1894 AddCharMapGroup ('\u2029', 0x7, 1, 0);
1896 // Characters which used to represent layout control.
1897 // LAMESPEC: Windows developers seem to have thought
1898 // that those characters are kind of whitespaces,
1899 // while they aren't.
1900 AddCharMap ('\u2422', 0x7, 1, 0); // blank symbol
1901 AddCharMap ('\u2423', 0x7, 1, 0); // open box
1905 // category 09 - continued symbols from 08
1906 fillIndex [0x9] = 2;
1908 for (int cp = 0x2300; cp <= 0x237A; cp++)
1909 AddCharMap ((char) cp, 0x9, 1, 0);
1912 byte [] arrowLv2 = new byte [] {0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
1913 foreach (DictionaryEntry de in arrowValues) {
1914 int idx = (int) de.Value;
1915 int cp = (int) de.Key;
1916 if (map [cp].Defined)
1918 fillIndex [0x9] = (byte) (0xD8 + idx);
1919 AddCharMapGroup ((char) cp, 0x9, 0, arrowLv2 [idx]);
1923 byte [] boxLv2 = new byte [128];
1924 // 0-63 will be used for those offsets are positive,
1925 // and 64-127 are for negative ones.
1926 for (int i = 0; i < boxLv2.Length; i++)
1928 foreach (DictionaryEntry de in boxValues) {
1929 int cp = (int) de.Key;
1930 int off = (int) de.Value;
1931 if (map [cp].Defined)
1934 fillIndex [0x9] = (byte) (0xE5 + off);
1935 AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [128 + off]++);
1938 fillIndex [0x9] = (byte) (0xE5 + off);
1939 AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [off]++);
1942 // Some special characters (slanted)
1943 fillIndex [0x9] = 0xF4;
1944 AddCharMap ('\u2571', 0x9, 3);
1945 AddCharMap ('\u2572', 0x9, 3);
1946 AddCharMap ('\u2573', 0x9, 3);
1948 // FIXME: implement 0A
1950 fillIndex [0xA] = 2;
1951 // byte currency symbols
1952 for (int cp = 0; cp < 0x100; cp++) {
1953 uc = Char.GetUnicodeCategory ((char) cp);
1954 if (!IsIgnorable (cp) &&
1955 uc == UnicodeCategory.CurrencySymbol &&
1957 AddCharMapGroup ((char) cp, 0xA, 1, 0);
1959 // byte other symbols
1960 for (int cp = 0; cp < 0x100; cp++) {
1962 continue; // SPECIAL: skip FIXME: why?
1963 uc = Char.GetUnicodeCategory ((char) cp);
1964 if (!IsIgnorable (cp) &&
1965 uc == UnicodeCategory.OtherSymbol ||
1966 cp == '\u00AC' || cp == '\u00B5' || cp == '\u00B7')
1967 AddCharMapGroup ((char) cp, 0xA, 1, 0);
1970 AddCharMapGroup ('\u30FB', 0xA, 1, 0);
1972 for (int cp = 0x2020; cp <= 0x2031; cp++)
1973 if (Char.IsPunctuation ((char) cp))
1974 AddCharMap ((char) cp, 0xA, 1, 0);
1975 // SPECIAL CASES: why?
1976 AddCharMap ('\u203B', 0xA, 1, 0);
1977 AddCharMap ('\u2040', 0xA, 1, 0);
1978 AddCharMap ('\u2041', 0xA, 1, 0);
1979 AddCharMap ('\u2042', 0xA, 1, 0);
1981 for (int cp = 0x20A0; cp <= 0x20AB; cp++)
1982 AddCharMap ((char) cp, 0xA, 1, 0);
1984 // 3004 is skipped at first...
1985 for (int cp = 0x3010; cp <= 0x3040; cp++)
1986 if (Char.IsSymbol ((char) cp))
1987 AddCharMap ((char) cp, 0xA, 1, 0);
1988 // SPECIAL CASES: added here
1989 AddCharMap ('\u3004', 0xA, 1, 0);
1990 AddCharMap ('\u327F', 0xA, 1, 0);
1992 for (int cp = 0x2600; cp <= 0x2613; cp++)
1993 AddCharMap ((char) cp, 0xA, 1, 0);
1995 for (int cp = 0x2620; cp <= 0x2770; cp++)
1996 if (Char.IsSymbol ((char) cp))
1997 AddCharMap ((char) cp, 0xA, 1, 0);
1999 for (int i = 0x2440; i < 0x2460; i++)
2000 AddCharMap ((char) i, 0xA, 1, 0);
2002 // SPECIAL CASES: why?
2003 AddCharMap ('\u0E3F', 0xA, 1, 0);
2004 AddCharMap ('\u2117', 0xA, 1, 0);
2005 AddCharMap ('\u20AC', 0xA, 1, 0);
2008 #region Numbers // 0C 02 - 0C E1
2009 fillIndex [0xC] = 2;
2011 // 9F8 : Bengali "one less than the denominator"
2012 AddCharMap ('\u09F8', 0xC, 1, 0x3C);
2014 ArrayList numbers = new ArrayList ();
2015 for (int i = 0; i < 65536; i++)
2016 if (!IsIgnorable (i) &&
2017 Char.IsNumber ((char) i) &&
2018 (i < 0x3190 || 0x32C0 < i)) // they are CJK characters
2021 ArrayList numberValues = new ArrayList ();
2022 foreach (int i in numbers)
2023 numberValues.Add (new DictionaryEntry (i, decimalValue [(char) i]));
2024 // SPECIAL CASE: Cyrillic Thousand sign
2025 numberValues.Add (new DictionaryEntry (0x0482, 1000m));
2026 numberValues.Sort (DecimalDictionaryValueComparer.Instance);
2028 //foreach (DictionaryEntry de in numberValues)
2029 //Console.Error.WriteLine ("****** number {0:X04} : {1} {2}", de.Key, de.Value, decompType [(int) de.Key]);
2031 // FIXME: fillIndex adjustment lines are too
2032 // complicated. It must be simpler.
2033 decimal prevValue = -1;
2034 foreach (DictionaryEntry de in numberValues) {
2035 int cp = (int) de.Key;
2036 decimal currValue = (decimal) de.Value;
2037 bool addnew = false;
2038 if (prevValue < currValue &&
2039 prevValue - (int) prevValue == 0 &&
2043 // Process Hangzhou and Roman numbers
2045 // There are some SPECIAL cases.
2046 if (currValue != 4) // no increment for 4
2050 if (currValue <= 13) {
2054 if (currValue == 11)
2055 AddCharMap ('\u0BF0', 0xC, 1);
2056 xcp = (int) prevValue + 0x2160 - 1;
2057 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2058 xcp = (int) prevValue + 0x2170 - 1;
2059 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2064 if (currValue <= 10) {
2065 xcp = (int) prevValue + 0x3021 - 1;
2066 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2070 if (prevValue < currValue)
2071 prevValue = currValue;
2072 if (map [cp].Defined)
2074 // HangZhou and Roman are added later
2076 if (0x3021 <= cp && cp < 0x302A
2077 || 0x2160 <= cp && cp < 0x216C
2078 || 0x2170 <= cp && cp < 0x217C)
2081 if (cp == 0x215B) // FIXME: why?
2082 fillIndex [0xC] += 2;
2083 else if (cp == 0x3021) // FIXME: why?
2085 if (addnew || cp <= '9') {
2086 int mod = (int) currValue - 1;
2088 if (1 <= currValue && currValue <= 11) {
2090 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2092 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2094 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2096 if (1 <= currValue && currValue <= 20) {
2098 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2100 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2102 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
2105 if (addnew && currValue >= 10 && currValue < 13 || cp == 0x09F9)
2107 AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp], true);
2110 // Maybe Bengali digit numbers do not increase
2111 // indexes, but 0x09E6 does.
2112 case 0x09E7: case 0x09E8: case 0x09E9:
2115 case 0x0BF0: case 0x2180: case 0x2181:
2122 if (currValue < 11 || currValue == 1000)
2127 // Add special cases that are not regarded as
2128 // numbers in UnicodeCategory speak.
2131 AddCharMapGroup ('\u01BD', 0xC, 0, 0);
2132 AddCharMapGroup ('\u01BC', 0xC, 1, 0);
2134 else if (cp == '2' || cp == '6') // FIXME: why?
2139 fillIndex [0xC] = 0xFF;
2140 AddCharMap ('\u221E', 0xC, 1);
2143 #region Letters and NonSpacing Marks (general)
2145 // ASCII Latin alphabets
2146 for (int i = 0; i < alphabets.Length; i++)
2147 AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]);
2149 // non-ASCII Latin alphabets
2150 // FIXME: there is no such characters that are placed
2151 // *after* "alphabets" array items. This is nothing
2152 // more than a hack that creates dummy weight for
2153 // primary characters.
2154 for (int i = 0x0080; i < 0x0300; i++) {
2155 if (!Char.IsLetter ((char) i))
2157 // Latin letters which have an NFKD decomposition are
2158 // not added as independent primary characters.
2159 if (decompIndex [i] != 0)
2162 // 1.some alphabets have primarily
2163 // equivalent ASCII alphabets.
2164 // 2.some have independent primary weights,
2165 // but inside a-to-z range.
2166 // 3.there are some expanded characters that
2167 // are not part of Unicode Standard NFKD.
2168 // 4. some characters are letter in IsLetter
2169 // but not in sortkeys (maybe unicode version
2170 // difference caused it).
2172 // 1. skipping them does not make sense
2173 // case 0xD0: case 0xF0: case 0x131: case 0x138:
2174 // case 0x184: case 0x185: case 0x186: case 0x189:
2175 // case 0x18D: case 0x18E: case 0x18F: case 0x190:
2176 // case 0x194: case 0x195: case 0x196: case 0x19A:
2177 // case 0x19B: case 0x19C:
2178 // 2. skipping them does not make sense
2179 // case 0x14A: // Ng
2180 // case 0x14B: // ng
2184 case 0xDE: // Icelandic Thorn
2185 case 0xFE: // Icelandic Thorn
2186 case 0xDF: // German ss
2187 case 0xFF: // German ss
2189 case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3:
2190 // not classified yet
2191 // case 0x1A6: case 0x1A7: case 0x1A8: case 0x1A9:
2192 // case 0x1AA: case 0x1B1: case 0x1B7: case 0x1B8:
2193 // case 0x1B9: case 0x1BA: case 0x1BB: case 0x1BF:
2197 AddCharMapGroup ((char) i, 0xE, 1, 0);
2201 fillIndex [0xF] = 02;
2202 for (int i = 0x0380; i < 0x0390; i++)
2203 if (Char.IsLetter ((char) i))
2204 AddLetterMap ((char) i, 0xF, 1);
2205 fillIndex [0xF] = 02;
2206 for (int i = 0x0391; i < 0x03CF; i++)
2207 if (Char.IsLetter ((char) i))
2208 AddLetterMap ((char) i, 0xF, 1);
2209 fillIndex [0xF] = 0x40;
2210 for (int i = 0x03D0; i < 0x0400; i++)
2211 if (Char.IsLetter ((char) i))
2212 AddLetterMap ((char) i, 0xF, 1);
2215 // Cyrillic letters are sorted like Latin letters i.e.
2216 // containing culture-specific letters between the
2217 // standard Cyrillic sequence.
2219 // We can't use UCA here; it has different sorting.
2220 char [] orderedCyrillic = new char [] {
2221 '\u0430', '\u0431', '\u0432', '\u0433', '\u0434',
2222 '\u0452', // DJE for Serbocroatian
2224 '\u0454', // IE for Ukrainian
2228 '\u0456', // Byelorussian-Ukrainian I
2238 '\u043F', '\u0440', '\u0441', '\u0442',
2239 '\u045B', // TSHE for Serbocroatian
2241 '\u045E', // Short U for Byelorussian
2242 '\u04B1', // Straight U w/ stroke (diacritical!)
2243 '\u0444', '\u0445', '\u0446', '\u0447',
2245 '\u0448', '\u0449', '\u044A', '\u044B', '\u044C',
2246 '\u044D', '\u044E', '\u044F'};
2248 // For some characters here is a map to basic cyrillic
2249 // letters. See UnicodeData.txt character names for
2250 // the sources. Here I simply declare an equiv. array.
2251 // The content characters are map from U+490(,491),
2252 // skipping small letters.
2253 char [] cymap_src = new char [] {
2254 '\u0433', '\u0433', '\u0433', '\u0436',
2255 '\u0437', '\u043A', '\u043A', '\u043A',
2256 '\u043A', '\u043D', '\u043D', '\u043F',
2257 '\u0445', '\u0441', '\u0442', '\u0443',
2258 '\u0443', '\u0445', '\u0446', '\u0447',
2259 '\u0447', '\u0432', '\u0435', '\u0435',
2260 '\u0406', '\u0436', '\u043A', '\u043D',
2261 '\u0447', '\u0435'};
2263 fillIndex [0x10] = 0x8D;
2264 for (int i = 0x0460; i < 0x0481; i++) {
2265 if (Char.IsLetter ((char) i)) {
2267 // U+476/477 have the same
2268 // primary weight as U+474/475.
2269 fillIndex [0x10] -= 3;
2270 AddLetterMap ((char) i, 0x10, 3);
2274 fillIndex [0x10] = 0x6;
2275 for (int i = 0; i < orderedCyrillic.Length; i++) {
2276 char c = Char.ToUpper (orderedCyrillic [i], CultureInfo.InvariantCulture);
2277 if (!IsIgnorable ((int) c) &&
2278 Char.IsLetter (c) &&
2280 AddLetterMap (c, 0x10, 0);
2281 fillIndex [0x10] += 3;
2285 for (int i = 0; i < cymap_src.Length; i++) {
2286 char c = cymap_src [i];
2287 fillIndex [0x10] = map [c].Level1;
2288 int c2 = 0x0490 + i * 2;
2289 AddLetterMapCore ((char) c2, 0x10, 0, diacritical [c2], false);
2293 fillIndex [0x11] = 0x3;
2294 fillIndex [0x1] = 0x98;
2295 for (int i = 0x0531; i < 0x0586; i++) {
2296 if (i == 0x0559 || i == 0x55A)
2297 AddCharMap ((char) i, 1, 1);
2298 if (Char.IsLetter ((char) i))
2299 AddLetterMap ((char) i, 0x11, 1);
2304 fillIndex [0x12] = 0x2;
2305 for (int i = 0x05D0; i < 0x05FF; i++)
2306 if (Char.IsLetter ((char) i))
2307 AddLetterMap ((char) i, 0x12, 1);
2309 fillIndex [0x1] = 0x3;
2310 for (int i = 0x0591; i <= 0x05C2; i++) {
2311 if (i == 0x05A3 || i == 0x05BB)
2314 AddCharMap ((char) i, 0x1, 1);
2318 fillIndex [0x1] = 0x8E;
2319 fillIndex [0x13] = 0x3;
2320 for (int i = 0x0621; i <= 0x064A; i++) {
2322 if (Char.GetUnicodeCategory ((char) i)
2323 != UnicodeCategory.OtherLetter) {
2324 // FIXME: arabic nonspacing marks are
2325 // in different order.
2326 AddCharMap ((char) i, 0x1, 1);
2329 // map [i] = new CharMapEntry (0x13,
2330 // (byte) arabicLetterPrimaryValues [i], 1);
2332 (byte) arabicLetterPrimaryValues [i];
2333 byte formDiacritical = 8; // default
2336 case 0x0622: formDiacritical = 9; break;
2337 case 0x0623: formDiacritical = 0xA; break;
2338 case 0x0624: formDiacritical = 5; break;
2339 case 0x0625: formDiacritical = 0xB; break;
2340 case 0x0626: formDiacritical = 7; break;
2341 case 0x0649: formDiacritical = 5; break;
2342 case 0x064A: formDiacritical = 7; break;
2344 AddLetterMapCore ((char) i, 0x13, 1, formDiacritical, false);
2346 for (int i = 0x0670; i < 0x0673; i++)
2347 map [i] = new CharMapEntry (0x13, 0xB, (byte) (0xC + i - 0x670));
2348 fillIndex [0x13] = 0x84;
2349 for (int i = 0x0674; i < 0x06D6; i++)
2350 if (Char.IsLetter ((char) i))
2351 AddLetterMapCore ((char) i, 0x13, 1, 0, false);
2355 // FIXME: this could be fixed in more decent way
2356 for (int i = 0x0958; i <= 0x095F; i++)
2357 diacritical [i] = 8;
2359 // FIXME: it does seem straight codepoint mapping.
2360 fillIndex [0x14] = 04;
2361 for (int i = 0x0901; i < 0x0905; i++)
2362 if (!IsIgnorable (i))
2363 AddLetterMap ((char) i, 0x14, 2);
2364 fillIndex [0x14] = 0xB;
2365 for (int i = 0x0905; i < 0x093A; i++) {
2367 AddCharMap ('\u0929', 0x14, 0, 8);
2369 AddCharMap ('\u0931', 0x14, 0, 8);
2371 AddCharMap ('\u0934', 0x14, 0, 8);
2372 if (Char.IsLetter ((char) i))
2373 AddLetterMap ((char) i, 0x14, 4);
2375 AddCharMap ('\u0960', 0x14, 4);
2377 AddCharMap ('\u0961', 0x14, 4);
2379 fillIndex [0x14] = 0xDA;
2380 for (int i = 0x093E; i < 0x0945; i++)
2381 if (!IsIgnorable (i))
2382 AddLetterMap ((char) i, 0x14, 2);
2383 fillIndex [0x14] = 0xEC;
2384 for (int i = 0x0945; i < 0x094F; i++)
2385 if (!IsIgnorable (i))
2386 AddLetterMap ((char) i, 0x14, 2);
2390 fillIndex [0x15] = 02;
2391 for (int i = 0x0980; i < 0x9FF; i++) {
2392 if (IsIgnorable (i))
2395 fillIndex [0x15] = 0x3B;
2396 switch (Char.GetUnicodeCategory ((char) i)) {
2397 case UnicodeCategory.NonSpacingMark:
2398 case UnicodeCategory.DecimalDigitNumber:
2399 case UnicodeCategory.OtherNumber:
2402 AddLetterMap ((char) i, 0x15, 1);
2405 fillIndex [0x1] = 0x3;
2406 for (int i = 0x0981; i < 0x0A00; i++)
2407 if (Char.GetUnicodeCategory ((char) i) ==
2408 UnicodeCategory.NonSpacingMark)
2409 AddCharMap ((char) i, 0x1, 1);
2411 // Gurmukhi. orderedGurmukhi is from UCA
2412 // FIXME: it does not look equivalent to UCA.
2413 fillIndex [0x16] = 04;
2414 fillIndex [0x1] = 3;
2415 for (int i = 0; i < orderedGurmukhi.Length; i++) {
2416 char c = orderedGurmukhi [i];
2417 if (IsIgnorable ((int) c))
2419 if (IsIgnorableNonSpacing (c)) {
2420 AddLetterMap (c, 0x1, 1);
2423 if (c == '\u0A3C' || c == '\u0A4D' ||
2424 '\u0A66' <= c && c <= '\u0A71')
2429 case '\u0A33': case '\u0A36': case '\u0A16':
2430 case '\u0A17': case '\u0A5B': case '\u0A5E':
2434 if (c == '\u0A3E') // Skip
2435 fillIndex [0x16] = 0xC0;
2436 AddLetterMap (c, 0x16, shift);
2439 // Gujarati. orderedGujarati is from UCA
2440 fillIndex [0x17] = 0x4;
2442 map [0x0A4D] = new CharMapEntry (1, 0, 0x3);
2443 map [0x0ABD] = new CharMapEntry (1, 0, 0x3);
2444 map [0x0A3C] = new CharMapEntry (1, 0, 0x4);
2445 map [0x0A71] = new CharMapEntry (1, 0, 0x6);
2446 map [0x0ABC] = new CharMapEntry (1, 0, 0xB);
2447 map [0x0A70] = new CharMapEntry (1, 0, 0xE);
2448 // letters go first.
2449 for (int i = 0; i < orderedGujarati.Length; i++) {
2451 char c = orderedGujarati [i];
2452 if (Char.IsLetter (c)) {
2454 if (c == '\u0AB3' || c == '\u0A32')
2456 if (c == '\u0A33') {
2457 AddCharMap ('\u0A32', 0x17, 0);
2458 AddCharMap ('\u0A33', 0x17, 4, 4);
2462 AddCharMap ('\u0AE0', 0x17, 0, 5);
2463 AddCharMap (c, 0x17, 4);
2466 AddCharMap ('\u0AB3', 0x17, 6);
2470 byte gujaratiShift = 4;
2471 fillIndex [0x17] = 0xC0;
2472 for (int i = 0; i < orderedGujarati.Length; i++) {
2473 char c = orderedGujarati [i];
2474 if (fillIndex [0x17] == 0xCC)
2476 if (!Char.IsLetter (c)) {
2479 AddCharMap ('\u0A81', 0x17, 2);
2482 AddLetterMap (c, 0x17, gujaratiShift);
2487 fillIndex [0x1] = 03;
2488 fillIndex [0x18] = 02;
2489 for (int i = 0x0B00; i < 0x0B7F; i++) {
2490 switch (Char.GetUnicodeCategory ((char) i)) {
2491 case UnicodeCategory.NonSpacingMark:
2492 case UnicodeCategory.DecimalDigitNumber:
2493 AddLetterMap ((char) i, 0x1, 1);
2496 AddLetterMap ((char) i, 0x18, 1);
2500 fillIndex [0x19] = 2;
2501 AddCharMap ('\u0BD7', 0x19, 0);
2502 fillIndex [0x19] = 0xA;
2504 for (int i = 0x0B82; i <= 0x0B94; i++)
2505 if (!IsIgnorable ((char) i))
2506 AddCharMap ((char) i, 0x19, 2);
2508 fillIndex [0x19] = 0x28;
2509 // The array for Tamil consonants is a constant.
2510 // Windows have almost similar sequence to TAM from
2511 // tamilnet but a bit different in Grantha.
2512 for (int i = 0; i < orderedTamilConsonants.Length; i++)
2513 AddLetterMap (orderedTamilConsonants [i], 0x19, 4);
2515 fillIndex [0x19] = 0x82;
2516 for (int i = 0x0BBE; i < 0x0BCD; i++)
2517 if (Char.GetUnicodeCategory ((char) i) ==
2518 UnicodeCategory.SpacingCombiningMark
2520 AddLetterMap ((char) i, 0x19, 2);
2523 fillIndex [0x1A] = 0x4;
2524 for (int i = 0x0C00; i < 0x0C62; i++) {
2525 if (i == 0x0C55 || i == 0x0C56)
2527 AddCharMap ((char) i, 0x1A, 3);
2528 char supp = (i == 0x0C0B) ? '\u0C60':
2529 i == 0x0C0C ? '\u0C61' : char.MinValue;
2530 if (supp == char.MinValue)
2532 AddCharMap (supp, 0x1A, 3);
2536 fillIndex [0x1B] = 4;
2537 for (int i = 0x0C80; i < 0x0CE5; i++) {
2538 if (i == 0x0CD5 || i == 0x0CD6)
2540 if (i == 0x0CB1 || i == 0x0CB3 || i == 0x0CDE)
2541 continue; // shift after 0xCB9
2542 AddCharMap ((char) i, 0x1B, 3);
2544 // SPECIAL CASES: but why?
2545 AddCharMap ('\u0CB1', 0x1B, 3); // RRA
2546 AddCharMap ('\u0CB3', 0x1B, 3); // LLA
2547 AddCharMap ('\u0CDE', 0x1B, 3); // FA
2550 AddCharMap ('\u0CE1', 0x1B, 3); // vocalic LL
2554 fillIndex [0x1C] = 2;
2555 fillIndex [0x1] = 3;
2556 for (int i = 0x0D02; i < 0x0D61; i++) {
2557 // FIXME: I avoided MSCompatUnicodeTable usage
2558 // here (it results in recursion). So check if
2559 // using NonSpacingMark makes sense or not.
2560 if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark)
2561 // if (!MSCompatUnicodeTable.IsIgnorable ((char) i))
2562 AddCharMap ((char) i, 0x1C, 1);
2563 else if (!IsIgnorable ((char) i))
2564 AddCharMap ((char) i, 1, 1);
2567 // Thai ... note that it breaks 0x1E wall after E2B!
2568 // Also, all Thai characters have level 2 value 3.
2569 fillIndex [0x1E] = 2;
2570 fillIndex [0x1] = 3;
2571 for (int i = 0xE40; i <= 0xE44; i++)
2572 AddCharMap ((char) i, 0x1E, 1, 3);
2573 for (int i = 0xE01; i < 0xE2B; i++)
2574 AddCharMap ((char) i, 0x1E, 6, 3);
2575 fillIndex [0x1F] = 5;
2576 for (int i = 0xE2B; i < 0xE30; i++)
2577 AddCharMap ((char) i, 0x1F, 6, 3);
2578 fillIndex [0x1F] = 0x1E;
2579 for (int i = 0xE30; i < 0xE3B; i++)
2580 AddCharMap ((char) i, 0x1F, 1, 3);
2581 // some Thai characters remains.
2582 char [] specialThai = new char [] {'\u0E45', '\u0E46',
2583 '\u0E4E', '\u0E4F', '\u0E5A', '\u0E5B'};
2584 foreach (char c in specialThai)
2585 AddCharMap (c, 0x1F, 1, 3);
2587 for (int i = 0xE00; i < 0xE80; i++)
2588 if (Char.GetUnicodeCategory ((char) i) ==
2589 UnicodeCategory.NonSpacingMark)
2590 AddCharMap ((char) i, 1, 1);
2593 fillIndex [0x1F] = 2;
2594 fillIndex [0x1] = 3;
2595 for (int i = 0xE80; i < 0xEDF; i++) {
2596 if (IsIgnorable ((char) i))
2598 else if (Char.IsLetter ((char) i))
2599 AddCharMap ((char) i, 0x1F, 1);
2600 else if (Char.GetUnicodeCategory ((char) i) ==
2601 UnicodeCategory.NonSpacingMark)
2602 AddCharMap ((char) i, 1, 1);
2605 // Georgian. orderedGeorgian is from UCA DUCET.
2606 fillIndex [0x21] = 5;
2607 for (int i = 0; i < orderedGeorgian.Length; i++) {
2608 char c = orderedGeorgian [i];
2609 if (map [(int) c].Defined)
2611 AddCharMap (c, 0x21, 0);
2613 AddCharMap ((char) (c - 0x30), 0x21, 0);
2614 fillIndex [0x21] += 5;
2618 fillIndex [0x22] = 2;
2619 int kanaOffset = 0x3041;
2620 byte [] kanaLines = new byte [] {2, 2, 2, 2, 1, 3, 1, 2, 1};
2622 for (int gyo = 0; gyo < 9; gyo++) {
2623 for (int dan = 0; dan < 5; dan++) {
2624 if (gyo == 7 && dan % 2 == 1) {
2627 kanaOffset -= 2; // There is no space for yi and ye.
2630 int cp = kanaOffset + dan * kanaLines [gyo];
2631 // small lines (a-gyo, ya-gyo)
2632 if (gyo == 0 || gyo == 7) {
2633 AddKanaMap (cp, 1); // small
2634 AddKanaMap (cp + 1, 1);
2637 AddKanaMap (cp, kanaLines [gyo]);
2641 // add small 'ka' (before normal one)
2642 AddKanaMap (0x30F5, 1);
2646 // add small 'ke' (before normal one)
2647 AddKanaMap (0x30F6, 1);
2651 // add small 'Tsu' (before normal one)
2652 AddKanaMap (0x3063, 1);
2656 fillIndex [0x22] += 3;
2657 kanaOffset += 5 * kanaLines [gyo];
2660 // Wa-gyo is almost special, so I just manually add.
2661 AddLetterMap ((char) 0x308E, 0x22, 0);
2662 AddLetterMap ((char) (0x308E + 0x60), 0x22, 0);
2663 AddLetterMap ((char) 0x308F, 0x22, 0);
2664 AddLetterMap ((char) (0x308F + 0x60), 0x22, 0);
2666 AddLetterMap ((char) 0x3090, 0x22, 0);
2667 AddLetterMap ((char) (0x3090 + 0x60), 0x22, 0);
2668 fillIndex [0x22] += 2;
2669 // no "Wu" in Japanese.
2670 AddLetterMap ((char) 0x3091, 0x22, 0);
2671 AddLetterMap ((char) (0x3091 + 0x60), 0x22, 0);
2673 AddLetterMap ((char) 0x3092, 0x22, 0);
2674 AddLetterMap ((char) (0x3092 + 0x60), 0x22, 0);
2676 fillIndex [0x22] = 0x80;
2677 AddLetterMap ((char) 0x3093, 0x22, 0);
2678 AddLetterMap ((char) (0x3093 + 0x60), 0x22, 0);
2680 map [0x3094] = new CharMapEntry (map [0x30A6].Category,
2681 map [0x30A6].Level1, 3);// voiced hiragana U
2682 map [0x30F4] = new CharMapEntry (map [0x30A6].Category,
2683 map [0x30A6].Level1, 3);// voiced katakana U
2685 map [0x30F5] = new CharMapEntry (map [0x30AB].Category,
2686 map [0x30AB].Level1, 0);// small katakana Ka
2687 map [0x30F6] = new CharMapEntry (map [0x30B1].Category,
2688 map [0x30B1].Level1, 0);// small katakana Ke
2690 for (int i = 0x30F7; i < 0x30FB; i++)
2691 map [i] = new CharMapEntry (map [i - 8].Category,
2695 // JIS Japanese square chars.
2696 fillIndex [0x22] = 0x97;
2697 jisJapanese.Sort (JISComparer.Instance);
2698 foreach (JISCharacter j in jisJapanese)
2699 if (0x3300 <= j.CP && j.CP <= 0x3357)
2700 AddCharMap ((char) j.CP, 0x22, 1);
2701 // non-JIS Japanese square chars.
2702 nonJisJapanese.Sort (NonJISComparer.Instance);
2703 foreach (NonJISCharacter j in nonJisJapanese)
2704 AddCharMap ((char) j.CP, 0x22, 1);
2707 fillIndex [0x23] = 0x02;
2708 for (int i = 0x3105; i <= 0x312C; i++)
2709 AddCharMap ((char) i, 0x23, 1);
2711 // Estrangela: ancient Syriac
2712 fillIndex [0x24] = 0x0B;
2713 // FIXME: is 0x71E really alternative form?
2714 ArrayList syriacAlternatives = new ArrayList (
2715 new int [] {0x714, 0x716, 0x71C, 0x71E, 0x724, 0x727});
2716 for (int i = 0x0710; i <= 0x072C; i++) {
2717 if (i == 0x0711) // NonSpacingMark
2719 if (syriacAlternatives.Contains (i))
2721 AddCharMap ((char) i, 0x24, 4);
2726 foreach (int cp in syriacAlternatives)
2727 map [cp] = new CharMapEntry (0x24,
2728 (byte) (map [cp - 1].Level1 + 2),
2730 // FIXME: Syriac NonSpacingMark should go here.
2733 // FIXME: it turned out that it does not look like UCA
2734 fillIndex [0x24] = 0x6E;
2735 fillIndex [0x1] = 0xAC;
2736 for (int i = 0; i < orderedThaana.Length; i++) {
2737 char c = orderedThaana [i];
2738 if (IsIgnorableNonSpacing ((int) c))
2739 AddCharMap (c, 1, 1);
2740 AddCharMap (c, 0x24, 2);
2741 if (c == '\u0782') // SPECIAL CASE: why?
2742 fillIndex [0x24] += 2;
2746 // FIXME: Add more culture-specific letters (that are
2747 // not supported in Windows collation) here.
2749 // Surrogate ... they are computed.
2754 // Unlike UCA Windows Hangul sequence mixes Jongseong
2755 // with Choseong sequence as well as Jungseong,
2756 // adjusted to have the same primary weight for the
2757 // same base character. So it is impossible to compute
2760 // Here I introduce an ordered sequence of mixed
2761 // 'commands' and 'characters' that is similar to
2763 // - ',' increases primary weight.
2764 // - [A B] means a range, increasing index
2765 // - {A B} means a range, without increasing index
2766 // - '=' is no operation (it means the characters
2767 // of both sides have the same weight).
2768 // - '>' inserts a Hangul Syllable block that
2769 // contains 0x251 characters.
2770 // - '<' decreases the index
2771 // - '0'-'9' means skip count
2772 // - whitespaces are ignored
2775 string hangulSequence =
2776 + "\u1100=\u11A8 > \u1101=\u11A9 >"
2777 + "\u11C3, \u11AA, \u11C4, \u1102=\u11AB >"
2778 + "<{\u1113 \u1116}, \u3165,"
2779 + "\u11C5, \u11C6=\u3166,, \u11C7, \u11C8,"
2780 + "\u11AC, \u11C9, \u11AD, \u1103=\u11AE >"
2781 + "<\u1117, \u11CA, \u1104, \u11CB > \u1105=\u11AF >"
2782 + "<{\u1118 \u111B}, \u11B0, [\u11CC \u11D0], \u11B1,"
2783 + "[\u11D1 \u11D2], \u11B2,"
2784 + "[\u11D3 \u11D5], \u11B3,"
2785 + "[\u11D6 \u11D7], \u11B4, \u11B5,"
2786 + "\u11B6=\u11D8, \u3140,, \u11D9, \u1106=\u11B7 >"
2787 + "<{\u111C \u111D}, [\u11DA \u11E2], \u1107=\u11B8 >"
2788 + "<{\u111E \u1120}, \u3172,, \u3173, \u11E3, \u1108 >"
2789 + "<{\u1121 \u112C}, \u3144 \u11B9, \u3174, \u3175,,,, "
2790 + "\u3176,, \u3177, [\u11E4 \u11E6] \u3178,"
2791 + "\u3179, \u1109=\u11BA,,, \u3214=\u3274 <>"
2792 + "<{\u112D \u1133}, \u11E7 \u317A, \u317B, \u317C "
2793 + "[\u11E8 \u11E9],, \u11EA \u317D,, \u110A=\u11BB,,, >"
2794 + "<{\u1134 \u1140}, \u317E,,,,,,, \u11EB,"
2795 + "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >"
2796 + "<{\u1141 \u114C}, \u3180=\u11EE, \u11EC, \u11ED,,,,, "
2797 + "\u11F1,, \u11F2,,,"
2798 + "\u11EF,,, \u3181=\u11F0, \u110C=\u11BD,, >"
2799 + "<\u114D, \u110D,, >"
2800 + "<{\u114E \u1151},, \u110E=\u11BE,, >"
2801 + "<{\u1152 \u1155},,, \u110F=\u11BF >"
2802 + "\u1110=\u11C0 > \u1111=\u11C1 >"
2803 + "<\u1156=\u1157, \u11F3, \u11F4, \u1112=\u11C2 >"
2804 + "<\u1158=\u1159=\u115F, \u3185, \u11F9,"
2808 byte hangulCat = 0x52;
2809 fillIndex [hangulCat] = 0x2;
2811 int syllableBlock = 0;
2812 for (int n = 0; n < hangulSequence.Length; n++) {
2813 char c = hangulSequence [n];
2815 if (Char.IsWhiteSpace (c))
2821 IncrementSequentialIndex (ref hangulCat);
2824 if (fillIndex [hangulCat] == 2)
2825 throw new Exception ("FIXME: handle it correctly (yes it is hacky, it is really unfortunate).");
2826 fillIndex [hangulCat]--;
2829 IncrementSequentialIndex (ref hangulCat);
2830 for (int l = 0; l < 0x15; l++)
2831 for (int v = 0; v < 0x1C; v++) {
2833 (char) (0xAC00 + syllableBlock * 0x1C * 0x15 + l * 0x1C + v), hangulCat, 0);
2834 IncrementSequentialIndex (ref hangulCat);
2839 start = hangulSequence [n + 1];
2840 end = hangulSequence [n + 3];
2841 for (int i = start; i <= end; i++) {
2842 AddCharMap ((char) i, hangulCat, 0);
2844 IncrementSequentialIndex (ref hangulCat);
2846 n += 4; // consumes 5 characters for this operation
2849 start = hangulSequence [n + 1];
2850 end = hangulSequence [n + 3];
2851 for (int i = start; i <= end; i++)
2852 AddCharMap ((char) i, hangulCat, 0);
2853 n += 4; // consumes 5 characters for this operation
2856 AddCharMap (c, hangulCat, 0);
2862 for (int i = 0x3200; i < 0x3300; i++) {
2863 if (IsIgnorable (i) || map [i].Defined)
2867 if (decompLength [i] == 4 &&
2868 decompValues [decompIndex [i]] == '(')
2869 ch = decompIndex [i] + 1;
2871 else if (decompLength [i] == 2 &&
2872 decompValues [decompIndex [i] + 1] == '\u1161')
2873 ch = decompIndex [i];
2874 else if (decompLength [i] == 1)
2875 ch = decompIndex [i];
2878 ch = decompValues [ch];
2879 if (ch < 0x1100 || 0x1200 < ch &&
2880 ch < 0xAC00 || 0xD800 < ch)
2884 int offset = i < 0x3260 ? 1 : 0;
2885 if (0x326E <= i && i <= 0x3273)
2888 map [i] = new CharMapEntry (map [ch].Category,
2889 (byte) (map [ch].Level1 + offset),
2891 // Console.Error.WriteLine ("Jamo {0:X04} -> {1:X04}", i, decompValues [decompIndex [i] + 1]);
2897 // Letterlike characters and CJK compatibility square
2898 sortableCharNames.Sort (StringDictionaryValueComparer.Instance);
2899 int [] counts = new int ['Z' - 'A' + 1];
2900 char [] namedChars = new char [sortableCharNames.Count];
2902 foreach (DictionaryEntry de in sortableCharNames) {
2903 counts [((string) de.Value) [0] - 'A']++;
2904 namedChars [nCharNames++] = (char) ((int) de.Key);
2906 nCharNames = 0; // reset
2907 for (int a = 0; a < counts.Length; a++) {
2908 fillIndex [0xE] = (byte) (alphaWeights [a + 1] - counts [a]);
2909 for (int i = 0; i < counts [a]; i++)
2910 //Console.Error.WriteLine ("---- {0:X04} : {1:x02} / {2} {3}", (int) namedChars [nCharNames], fillIndex [0xE], ((DictionaryEntry) sortableCharNames [nCharNames]).Value, Char.GetUnicodeCategory (namedChars [nCharNames]));
2911 AddCharMap (namedChars [nCharNames++], 0xE, 1);
2914 // CJK unified ideograph.
2916 fillIndex [cjkCat] = 0x2;
2917 for (int cp = 0x4E00; cp <= 0x9FBB; cp++)
2918 if (!IsIgnorable (cp))
2919 AddCharMapGroupCJK ((char) cp, ref cjkCat);
2920 // CJK Extensions goes here.
2921 // LAMESPEC: With this Windows style CJK layout, it is
2922 // impossible to add more CJK ideograph i.e. 0x9FA6-
2923 // 0x9FBB can never be added w/o breaking compat.
2924 for (int cp = 0xF900; cp <= 0xFA2D; cp++)
2925 if (!IsIgnorable (cp))
2926 AddCharMapGroupCJK ((char) cp, ref cjkCat);
2928 // PrivateUse ... computed.
2929 // remaining Surrogate ... computed.
2931 #region 07 - ASCII non-alphanumeric + 3001, 3002 // 07
2932 // non-alphanumeric ASCII except for: + - < = > '
2933 for (int i = 0x21; i < 0x7F; i++) {
2934 // SPECIAL CASE: 02C6 looks regarded as
2935 // equivalent to '^', which does not conform
2936 // to Unicode standard character database.
2938 AddCharMap ('\u2045', 0x7, 0, 0x1C);
2940 AddCharMap ('\u2046', 0x7, 0, 0x1C);
2942 AddCharMap ('\u02C6', 0x7, 0, 3);
2944 AddCharMap ('\u02CB', 0x7, 0, 3);
2946 if (Char.IsLetterOrDigit ((char) i)
2947 || "+-<=>'".IndexOf ((char) i) >= 0)
2948 continue; // they are not added here.
2950 AddCharMapGroup2 ((char) i, 0x7, 1, 0);
2951 // Insert 3001 after ',' and 3002 after '.'
2953 AddCharMapGroup2 ('\u3001', 0x7, 1, 0);
2955 AddCharMapGroup2 ('\u3002', 0x7, 1, 0);
2957 AddCharMap ('\uFE30', 0x7, 1, 0);
2961 #region 07 - Punctuations and something else
2962 for (int i = 0xA0; i < char.MaxValue; i++) {
2963 if (IsIgnorable (i))
2966 // FIXME: actually those reset should not be
2967 // done but here I put for easy goal.
2971 fillIndex [0x7] = 0xE2;
2973 fillIndex [0x7] = 0x77;
2975 fillIndex [0x7] = 0x93;
2977 if (0x02C8 <= i && i <= 0x02CD)
2978 continue; // nonspacing marks
2980 // SPECIAL CASE: maybe they could be allocated
2981 // dummy NFKD mapping and no special processing
2982 // would be required here.
2984 AddCharMap ('\u02C9', 0x7, 0, 3);
2986 AddCharMap ('\u02CA', 0x7, 0, 3);
2988 AddCharMap ('\u02D8', 0x7, 0, 3);
3002 switch (Char.GetUnicodeCategory ((char) i)) {
3003 case UnicodeCategory.OtherPunctuation:
3004 case UnicodeCategory.ClosePunctuation:
3005 case UnicodeCategory.OpenPunctuation:
3006 case UnicodeCategory.ConnectorPunctuation:
3007 case UnicodeCategory.InitialQuotePunctuation:
3008 case UnicodeCategory.FinalQuotePunctuation:
3009 case UnicodeCategory.ModifierSymbol:
3010 // SPECIAL CASES: // 0xA
3011 if (0x2020 <= i && i <= 0x2031)
3013 if (i == 0x3003) // added later
3015 AddCharMapGroup2 ((char) i, 0x7, 1, 0);
3018 if (i == 0xA6 || i == 0x1C3 || i == 0x037A) // SPECIAL CASE. FIXME: why?
3019 goto case UnicodeCategory.OtherPunctuation;
3025 // FIXME: it should not need to reset level 1, but
3026 // it's for easy goal.
3027 fillIndex [0x7] = 0xB6;
3028 for (int i = 0x2400; i <= 0x2424; i++)
3029 AddCharMap ((char) i, 0x7, 1, 0);
3031 // FIXME: what are they?
3032 AddCharMap ('\u3003', 0x7, 1);
3033 AddCharMap ('\u3006', 0x7, 1);
3034 AddCharMap ('\u02D0', 0x7, 1);
3035 AddCharMap ('\u10FB', 0x7, 1);
3036 AddCharMap ('\u0950', 0x7, 1);
3037 AddCharMap ('\u093D', 0x7, 1);
3038 AddCharMap ('\u0964', 0x7, 1);
3039 AddCharMap ('\u0965', 0x7, 1);
3040 AddCharMap ('\u0970', 0x7, 1);
3044 #region category 08 - symbols
3045 fillIndex [0x8] = 2;
3046 // Here Windows mapping is not straightforward. It is
3047 // not based on computation but seems manual sorting.
3048 AddCharMapGroup ('+', 0x8, 1, 0); // plus
3049 AddCharMapGroup ('\u2212', 0x8, 1, 0); // minus
3050 AddCharMapGroup ('\u229D', 0x8, 1, 0); // minus
3051 AddCharMapGroup ('\u2297', 0x8, 1, 0); // mul
3052 AddCharMapGroup ('\u2044', 0x8, 1, 0); // div
3053 AddCharMapGroup ('\u2215', 0x8, 1, 0); // div
3054 AddCharMapGroup ('\u2217', 0x8, 1, 0); // mul
3055 AddCharMapGroup ('\u2218', 0x8, 1, 0); // ring
3056 AddCharMapGroup ('\u2219', 0x8, 1, 0); // bullet
3057 AddCharMapGroup ('\u2213', 0x8, 1, 0); // minus-or-plus
3058 AddCharMapGroup ('\u003C', 0x8, 1, 0); // <
3059 AddCharMapGroup ('\u227A', 0x8, 1, 0); // precedes relation
3060 AddCharMapGroup ('\u22B0', 0x8, 1, 0); // precedes under relation
3062 for (int cp = 0; cp < 0x2300; cp++) {
3063 if (cp == 0xAC) // SPECIAL CASE: skip
3066 cp = 0x2200; // skip to 2200
3067 fillIndex [0x8] = 0x21;
3070 fillIndex [0x8] = 0x3;
3072 fillIndex [0x8] = 0xAB;
3074 fillIndex [0x8] = 0xB9;
3075 if (!map [cp].Defined &&
3076 // Char.GetUnicodeCategory ((char) cp) ==
3077 // UnicodeCategory.MathSymbol)
3078 Char.IsSymbol ((char) cp))
3079 AddCharMapGroup ((char) cp, 0x8, 1, diacritical [cp]);
3080 // SPECIAL CASES: no idea why Windows sorts as such
3083 AddCharMap ('\u227B', 0x8, 1, 0);
3084 AddCharMap ('\u22B1', 0x8, 1, 0);
3087 AddCharMapGroup ('\u00AB', 0x8, 1, 0);
3088 AddCharMapGroup ('\u226A', 0x8, 1, 0);
3089 AddCharMapGroup ('\u00BB', 0x8, 1, 0);
3090 AddCharMapGroup ('\u226B', 0x8, 1, 0);
3093 AddCharMap ('\u01C0', 0x8, 1, 0);
3094 AddCharMap ('\u01C1', 0x8, 1, 0);
3095 AddCharMap ('\u01C2', 0x8, 1, 0);
3103 // Characters w/ diacritical marks (NFKD)
3104 for (int i = 0; i <= char.MaxValue; i++) {
3105 if (map [i].Defined || IsIgnorable (i))
3107 if (decompIndex [i] == 0)
3110 int start = decompIndex [i];
3111 int primaryChar = decompValues [start];
3112 int secondary = diacritical [i];
3114 int length = decompLength [i];
3115 // special processing for parenthesized ones.
3117 decompValues [start] == '(' &&
3118 decompValues [start + 2] == ')') {
3119 primaryChar = decompValues [start + 1];
3123 if (map [primaryChar].Level1 == 0)
3126 for (int l = 1; l < length; l++) {
3127 int c = decompValues [start + l];
3128 if (map [c].Level1 != 0)
3130 secondary += diacritical [c];
3134 map [i] = new CharMapEntry (
3135 map [primaryChar].Category,
3136 map [primaryChar].Level1,
3141 // Diacritical weight adjustment
3144 diacritical [0x624] = 0x5;
3145 diacritical [0x626] = 0x7;
3146 diacritical [0x622] = 0x9;
3147 diacritical [0x623] = 0xA;
3148 diacritical [0x625] = 0xB;
3149 diacritical [0x649] = 0x5; // 'alif maqs.uurah
3150 diacritical [0x64A] = 0x7; // Yaa'
3152 for (int i = 0; i < char.MaxValue; i++) {
3154 byte cat = map [i].Category;
3156 case 0xE: // Latin diacritics
3157 case 0x22: // Japanese: circled characters
3158 mod = diacritical [i];
3160 case 0x13: // Arabic
3161 if (diacritical [i] == 0 && i >= 0xFE8D)
3162 mod = 0x8; // default for arabic
3165 if (0x52 <= cat && cat <= 0x7F) // Hangul
3166 mod = diacritical [i];
3168 map [i] = new CharMapEntry (
3169 cat, map [i].Level1, mod);
3172 // FIXME: this is halfly hack but those NonSpacingMark
3173 // characters and still undefined are likely to
3175 for (int i = 0; i < char.MaxValue; i++) {
3176 if (map [i].Defined ||
3185 if (Char.GetUnicodeCategory ((char) i) !=
3186 UnicodeCategory.NonSpacingMark)
3190 if (diacritical [i] != 0)
3191 map [i] = new CharMapEntry (1, 1, diacritical [i]);
3193 AddCharMap ((char) i, 1, 1);
// Advances the running weight counter of the category named by hangulCat by
// one. fillIndex holds bytes, so a counter that reads 0 right after the
// increment has wrapped around; in that case the counter is restarted at 0x2.
// NOTE(review): hangulCat is passed by ref, so the overflow branch presumably
// also moves on to the next category before restarting the counter -- confirm.
3199 private void IncrementSequentialIndex (ref byte hangulCat)
3201 fillIndex [hangulCat]++;
3202 if (fillIndex [hangulCat] == 0) { // overflown
3204 fillIndex [hangulCat] = 0x2;
3208 // Reset fillIndex to fixed value and call AddLetterMap().
// The running weight of category is set to alphaWeight, c is mapped with an
// updateCount of 0, and then every code point listed for c in latinMap is
// mapped the same way.
// NOTE(review): presumably nothing more is done when latinMap has no list
// for c (the "as" cast yields null in that case) -- confirm.
3209 private void AddAlphaMap (char c, byte category, byte alphaWeight)
3211 fillIndex [category] = alphaWeight;
3212 AddLetterMap (c, category, 0);
3214 ArrayList al = latinMap [c] as ArrayList;
3218 foreach (int cp in al)
3219 AddLetterMap ((char) cp, category, 0);
// Maps "voices" consecutive code points starting at i into category 0x22,
// each one together with the character 0x60 code points above it
// (presumably the Katakana counterpart of a Hiragana code point -- confirm).
// The first character of the run gets level 2 weight 0, the b-th following
// one gets b + 2. updateCount is 0 and deferLevel2 is false for every call.
3222 private void AddKanaMap (int i, byte voices)
3224 for (byte b = 0; b < voices; b++) {
3225 char c = (char) (i + b);
3226 byte arg = (byte) (b > 0 ? b + 2 : 0);
3228 AddLetterMapCore (c, 0x22, 0, arg, false);
3230 AddLetterMapCore ((char) (c + 0x60), 0x22, 0, arg, false);
// Convenience entry point for AddLetterMapCore: maps the letter c into
// category with no explicit level 2 weight (0) and with level 2
// resolution deferred.
private void AddLetterMap (char c, byte category, byte updateCount)
{
	const byte unspecifiedLevel2 = 0;
	const bool deferLevel2 = true;

	AddLetterMapCore (c, category, updateCount, unspecifiedLevel2, deferLevel2);
}
// Maps the letter c into category together with its related forms.
// Visible order of work:
//   1. the character returned by ToSmallForm (c) goes through
//      AddCharMapGroup with updateCount;
//   2. the invariant-culture lowercase of c, when it differs from c and is
//      not mapped yet, is handled recursively with an updateCount of 0;
//   3. c itself goes through AddCharMapGroup with an updateCount of 0;
//   4. fillIndex [category] is advanced by updateCount.
// level2 and deferLevel2 are handed on unchanged.
// NOTE(review): steps 1, 3 and 4 are presumably conditional (on a small
// form existing, on c being neither ignorable nor already mapped, and on
// doUpdate) -- confirm the exact guards.
3239 private void AddLetterMapCore (char c, byte category, byte updateCount, byte level2, bool deferLevel2)
3242 // <small> updates index
3243 c2 = ToSmallForm (c);
3245 AddCharMapGroup (c2, category, updateCount, level2, deferLevel2);
3246 c2 = Char.ToLower (c, CultureInfo.InvariantCulture);
3247 if (c2 != c && !map [(int) c2].Defined)
3248 AddLetterMapCore (c2, category, 0, level2, deferLevel2);
3249 bool doUpdate = true;
3250 if (IsIgnorable ((int) c) || map [(int) c].Defined)
3253 AddCharMapGroup (c, category, 0, level2, deferLevel2);
3255 fillIndex [category] += updateCount;
// Maps c into category with an "alt" value of 0. All the work, and the
// returned flag, come from the four-argument overload.
private bool AddCharMap (char c, byte category, byte increment)
{
	const byte noAlt = 0;

	bool result = AddCharMap (c, category, increment, noAlt);
	return result;
}
// Writes the map entry for c unless c is ignorable or already mapped; in
// that case nothing changes and false is returned.
// For category 1 the running weight fillIndex [1] is stored as the third
// CharMapEntry argument and alt as the second; for every other category the
// two are swapped. Afterwards fillIndex [category] grows by increment.
// NOTE(review): presumably returns true once the entry has been written --
// confirm.
3263 private bool AddCharMap (char c, byte category, byte increment, byte alt)
3265 if (IsIgnorable ((int) c) || map [(int) c].Defined)
3266 return false; // do nothing
3267 map [(int) c] = new CharMapEntry (category,
3268 category == 1 ? alt : fillIndex [category],
3269 category == 1 ? fillIndex [category] : alt);
3270 fillIndex [category] += increment;
3275 // Adds characters to table in the order below
3276 // (+ increases weight):
3280 // <full> | <super> | <sub>
3281 // <circle> | <wide> (| <narrow>)
3285 // level2 is fixed (does not increase).
// Decomposition kinds whose variants AddCharMapGroup adds with an increment
// of 0, i.e. without advancing the running weight of the category.
3286 int [] sameWeightItems = new int [] {
3287 DecompositionFraction,
3291 DecompositionCircle,
3293 DecompositionNarrow,
// Shorthand for the five-argument AddCharMapGroup with level 2
// deferral switched off.
private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2)
{
	const bool deferLevel2 = false;

	AddCharMapGroup (c, category, updateCount, level2, deferLevel2);
}
// Maps c and the variants recorded for it in nfkdMap [c] (a Hashtable
// keyed by decomposition kind) into category.
// Visible order of work:
//   - the <small> variant, if one was found, is added first with an
//     increment of updateCount;
//   - c itself is added with an increment of 0;
//   - variants whose kind is listed in sameWeightItems are added with an
//     increment of 0;
//   - fillIndex [category] is advanced by updateCount;
//   - the <vertical> variant, if one was found, is added with an increment
//     of updateCount.
// When deferLevel2 is set and level2 is still 0, the diacritical weight of
// the <small> / <vertical> variant itself is used as level2.
// NOTE(review): the guards around several statements (what happens when c
// is already mapped, when level2 is loaded from diacritical, and the null
// checks on the nfkd lookups) should be confirmed before relying on details.
3300 private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2, bool deferLevel2)
3302 if (map [(int) c].Defined)
3306 level2 = diacritical [(int) c];
3308 char small = char.MinValue;
3309 char vertical = char.MinValue;
3310 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
3312 object smv = nfkd [(byte) DecompositionSmall];
3314 small = (char) ((int) smv);
3315 object vv = nfkd [(byte) DecompositionVertical];
3317 vertical = (char) ((int) vv);
3320 // <small> updates index
3321 if (small != char.MinValue) {
3322 if (level2 == 0 && deferLevel2)
3323 level2 = diacritical [small];
3324 AddCharMap (small, category, updateCount, level2);
3328 AddCharMap (c, category, 0, level2);
3331 foreach (int weight in sameWeightItems) {
3332 object wv = nfkd [(byte) weight];
3335 level2 = diacritical [(int) wv];
3336 AddCharMap ((char) ((int) wv), category, 0, level2);
3341 // update index here.
3342 fillIndex [category] += updateCount;
3344 if (vertical != char.MinValue) {
3345 if (level2 == 0 && deferLevel2)
3346 level2 = diacritical [vertical];
3347 AddCharMap (vertical, category, updateCount, level2);
// Maps one CJK character at the current running weight of category
// (increment 0, alt 0) and then steps the sequential index through
// IncrementSequentialIndex, which receives category by ref.
// If the index then stands at category 0x9E with weight 0xF9, it is stepped
// once more.
3351 private void AddCharMapCJK (char c, ref byte category)
3353 AddCharMap (c, category, 0, 0);
3354 IncrementSequentialIndex (ref category);
3356 // Special. I wonder why but Windows skips 9E F9.
3357 if (category == 0x9E && fillIndex [category] == 0xF9)
3358 IncrementSequentialIndex (ref category);
// Maps the CJK character c through AddCharMapCJK and then characters
// related to it.
// Six enclosed characters (32A2, 3298, 3238, 32A9, 323B, 32AB) are added by
// hand; the visible tests place them right after U+5B78 and U+52DE among
// others. The switch near the end lists the same six code points,
// presumably so that the generic loop does not add them a second time --
// confirm.
// The generic loop probes nfkdMap [c] for every decomposition kind from 0
// to 0x12 and maps what it finds with AddCharMapCJK; the range test on
// 0xF900..0xFAD9 singles those code points out (per the comment, to ignore
// them in this area).
3361 private void AddCharMapGroupCJK (char c, ref byte category)
3363 AddCharMapCJK (c, ref category);
3365 // LAMESPEC: see below.
3366 if (c == '\u5B78') {
3367 AddCharMapCJK ('\u32AB', ref category);
3368 AddCharMapCJK ('\u323B', ref category);
3370 if (c == '\u52DE') {
3371 AddCharMapCJK ('\u3298', ref category);
3372 AddCharMapCJK ('\u3238', ref category);
3375 AddCharMapCJK ('\u32A2', ref category);
3377 // Especially this mapping order totally does
3378 // not make sense to me.
3379 AddCharMapCJK ('\u32A9', ref category);
3381 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
3384 for (byte weight = 0; weight <= 0x12; weight++) {
3385 object wv = nfkd [weight];
3390 // Special: they are ignored in this area.
3391 // FIXME: check if it is sane
3392 if (0xF900 <= w && w <= 0xFAD9)
3394 // LAMESPEC: on Windows some of CJK characters
3395 // in 3200-32B0 are incorrectly mapped. They
3396 // mix Chinese and Japanese Kanji when
3397 // ordering those characters.
3399 case 0x32A2: case 0x3298: case 0x3238:
3400 case 0x32A9: case 0x323B: case 0x32AB:
3404 AddCharMapCJK ((char) w, ref category);
3408 // For now it is only for 0x7 category.
// Maps c together with the not-yet-mapped characters whose one-character
// decomposition is c. The decomposition tables are scanned three times:
//   1. <small> variants are added first; updateWeight records that one was
//      found, and fillIndex [category] is advanced by updateCount
//      (presumably only when updateWeight is set -- confirm);
//   2. <sub>, <super>, <wide> and <narrow> variants are handled next;
//   3. after c itself has been added with an increment of updateCount, the
//      last scan adds variants with an increment of updateCount. The case
//      labels in that scan name the kinds already covered by scans 1 and 2,
//      presumably to leave them out -- confirm.
// Each scan runs while c2 < char.MaxValue, so U+FFFF itself is never
// examined.
3409 private void AddCharMapGroup2 (char c, byte category, byte updateCount, byte level2)
3411 if (map [(int) c].Defined)
3414 bool updateWeight = false;
3415 // Process in advance (lower primary weight)
3416 for (int c2 = 0; c2 < char.MaxValue; c2++) {
3417 if (!map [c2].Defined &&
3418 decompLength [c2] == 1 &&
3419 (int) (decompValues [decompIndex [c2]]) == (int) c) {
3420 switch (decompType [c2]) {
3421 case DecompositionSmall:
3422 updateWeight = true;
3423 AddCharMap ((char) c2, category,
3430 fillIndex [category] = (byte)
3431 (fillIndex [category] + updateCount);
3434 for (int c2 = 0; c2 < char.MaxValue; c2++) {
3435 if (!map [c2].Defined &&
3436 decompLength [c2] == 1 &&
3437 (int) (decompValues [decompIndex [c2]]) == (int) c) {
3438 switch (decompType [c2]) {
3439 case DecompositionSub:
3440 case DecompositionSuper:
3441 case DecompositionWide:
3442 case DecompositionNarrow:
3443 AddCharMap ((char) c2, category,
3451 AddCharMap (c, category, updateCount, level2);
3453 // Since nfkdMap is problematic to have two or more
3454 // NFKD to an identical character, here I iterate all.
3455 for (int c2 = 0; c2 < char.MaxValue; c2++) {
3456 if (!map [c2].Defined &&
3457 decompLength [c2] == 1 &&
3458 (int) (decompValues [decompIndex [c2]]) == (int) c) {
3459 switch (decompType [c2]) {
3460 case DecompositionWide:
3461 case DecompositionNarrow:
3462 case DecompositionSmall:
3463 case DecompositionSub:
3464 case DecompositionSuper:
3467 AddCharMap ((char) c2, category, updateCount, level2);
// Maps the character c and then the characters whose decomposition ends
// with c.
// c is added first with an increment of 0. The scan then looks at the last
// value of each character's decomposition and hands matches to AddCharMap.
// Finally fillIndex [category] is advanced by updateCount, which is 1.
// NOTE(review): characters with a decomposition length of 0 are presumably
// skipped by the scan; the values of category and level2 used here should
// also be confirmed.
3474 private void AddArabicCharMap (char c)
3477 byte updateCount = 1;
3481 AddCharMap (c, category, 0, level2);
3483 // Since nfkdMap is problematic to have two or more
3484 // NFKD to an identical character, here I iterate all.
3485 for (int c2 = 0; c2 < char.MaxValue; c2++) {
3486 if (decompLength [c2] == 0)
3488 int idx = decompIndex [c2] + decompLength [c2] - 1;
3489 if ((int) (decompValues [idx]) == (int) c)
3490 AddCharMap ((char) c2, category,
3493 fillIndex [category] += updateCount;
// Resolves c through ToDecomposed for the <small> decomposition kind,
// with the "tail" flag switched off.
char ToSmallForm (char c)
{
	const bool fromTail = false;

	char resolved = ToDecomposed (c, DecompositionSmall, fromTail);
	return resolved;
}
// Reads one value out of the decomposition of c, provided the
// decomposition kind of c is d.
// idx starts at the first decomposition value of c; the adjustment below
// moves it to the last one. The value at idx is returned as a char.
// NOTE(review): presumably c itself is returned when its kind differs from
// d, and the move to the last value happens only when tail is set --
// confirm both.
3501 char ToDecomposed (char c, byte d, bool tail)
3503 if (decompType [(int) c] != d)
3505 int idx = decompIndex [(int) c];
3507 idx += decompLength [(int) c] - 1;
3508 return (char) decompValues [idx];
// Walks the jisJapanese list.
// NOTE(review): presumably reports whether the list has an entry whose code
// point equals cp -- confirm against the loop body.
3511 bool ExistsJIS (int cp)
3513 foreach (JISCharacter j in jisJapanese)
3521 #region Level 3 properties (Case/Width)
// Turns the raw level 3 weight of c into its sort key value: a raw weight
// of 0 stays 0, any other raw weight is shifted up by 2.
private byte ComputeLevel3Weight (char c)
{
	byte raw = ComputeLevel3WeightRaw (c);
	if (raw == 0)
		return raw;

	return (byte) (raw + 2);
}
// Computes the raw level 3 (case/width) weight of c; ComputeLevel3Weight
// adds 2 to every non-zero result.
// The function is a chain of code point range tests followed by a generic
// part driven by decompType, isSmallCapital and isUppercase.
// For the two Arabic ranges FEB5..FEEC and FEF1..FEF4 the weight is read
// from arabicTmp, indexed by the code point modulo 4. The rest of
// FE80..FEFF is decided by the decomposition kind
// (isolated / final / medial).
// NOTE(review): the test on 2776..2793 comes after tests on 2776..277F and
// 2780..2789; if those earlier tests return, it only ever decides
// 278A..2793 -- confirm.
3529 private byte ComputeLevel3WeightRaw (char c) // add 2 for sortkey value
3532 if ('\u3192' <= c && c <= '\u319F')
3535 // They have <narrow> NFKD mapping, and on Windows
3536 // those narrow characters are regarded as "normal",
3537 // thus those characters themselves are regarded as
3538 // "wide". grep "<narrow>" and you can pick them up
3539 // (ignoring Kana, Hangul etc.)
3556 if ('\u11A8' <= c && c <= '\u11F9')
3558 if ('\uFFA0' <= c && c <= '\uFFDC')
3560 if ('\u3130' <= c && c <= '\u3164')
3562 if ('\u3165' <= c && c <= '\u318E')
3564 // Georgian Capital letters
3565 if ('\u10A0' <= c && c <= '\u10C5')
3568 if ('\u2776' <= c && c <= '\u277F')
3570 if ('\u2780' <= c && c <= '\u2789')
3572 if ('\u2776' <= c && c <= '\u2793')
3574 if ('\u2160' <= c && c <= '\u216F')
3576 if ('\u2181' <= c && c <= '\u2182')
3579 if ('\u2135' <= c && c <= '\u2138')
3581 byte [] arabicTmp = new byte [] {0x18, 0, 0x8, 0x10};
3582 if ('\uFEB5' <= c && c < '\uFEED' ||
3583 '\uFEF1' <= c && c < '\uFEF5')
3584 return arabicTmp [c % 4];
3585 if ('\uFE80' <= c && c < '\uFF00') {
3586 // 2(Isolated)/8(Final)/0x18(Medial)
3587 switch (decompType [(int) c]) {
3588 case DecompositionIsolated:
3590 case DecompositionFinal:
3592 case DecompositionMedial:
3597 // actually I dunno the reason why they have weights.
3627 switch (decompType [(int) c]) {
3628 case DecompositionWide: // <wide>
3629 case DecompositionSub: // <sub>
3630 case DecompositionSuper: // <super>
3631 ret |= decompType [(int) c];
3634 if (isSmallCapital [(int) c]) // grep "SMALL CAPITAL"
3636 if (isUppercase [(int) c]) // DerivedCoreProperties
3646 static bool IsIgnorable (int i)
3648 if (unicodeAge [i] >= 3.1)
3650 switch (char.GetUnicodeCategory ((char) i)) {
3651 case UnicodeCategory.OtherNotAssigned:
3652 case UnicodeCategory.Format:
3659 // FIXME: In the future use DerivedAge.txt to examine character
3660 // versions and set those ones that have higher version than
3661 // 1.0 as ignorable.
// Decides whether code point i is treated as ignorable by the generator.
// The decision is table driven and has four parts, in this order:
//   - a list of individual code points;
//   - a second list of individual code points, described as exceptions to
//     the ranges that follow;
//   - a long chain of inclusive code point ranges;
//   - a final test on the Unicode category of i.
// NOTE(review): which result each part yields should be confirmed against
// the return statements of each part.
3662 static bool IsIgnorable (int i)
3666 // I guess, those characters are added between
3667 // Unicode 1.0 (LCMapString) and Unicode 3.1
3668 // (UnicodeCategory), so they used to be
3669 // something like OtherNotAssigned as of Unicode 1.1.
3670 case 0x2df: case 0x387:
3671 case 0x3d7: case 0x3d8: case 0x3d9:
3672 case 0x3f3: case 0x3f4: case 0x3f5: case 0x3f6:
3673 case 0x400: case 0x40d: case 0x450: case 0x45d:
3674 case 0x587: case 0x58a: case 0x5c4: case 0x640:
3675 case 0x653: case 0x654: case 0x655: case 0x66d:
3677 case 0x1e9b: case 0x202f: case 0x20ad:
3678 case 0x20ae: case 0x20af:
3679 case 0x20e2: case 0x20e3:
3680 case 0x2139: case 0x213a: case 0x2183:
3681 case 0x2425: case 0x2426: case 0x2619:
3682 case 0x2670: case 0x2671: case 0x3007:
3683 case 0x3190: case 0x3191:
3684 case 0xfffc: case 0xfffd:
3686 // exceptional characters filtered by the
3687 // following conditions. Originally those exceptional
3688 // ranges are incorrect (they should not be ignored)
3689 // and most of those characters are unfortunately in
3691 case 0x4d8: case 0x4d9:
3692 case 0x4e8: case 0x4e9:
3694 case 0x3036: case 0x303f:
3695 case 0x337b: case 0xfb1e:
3700 // The whole Sinhala characters.
3701 0x0D82 <= i && i <= 0x0DF4
3702 // The whole Tibetan characters.
3703 || 0x0F00 <= i && i <= 0x0FD1
3704 // The whole Myanmar characters.
3705 || 0x1000 <= i && i <= 0x1059
3706 // The whole Ethiopic, Cherokee,
3707 // Canadian Syllabics, Ogham, Runic,
3708 // Tagalog, Hanunoo, Philippine,
3709 // Buhid, Tagbanwa, Khmer and Mongolian
3711 || 0x1200 <= i && i <= 0x1DFF
3712 // Greek extension characters.
3713 || 0x1F00 <= i && i <= 0x1FFF
3714 // The whole Braille characters.
3715 || 0x2800 <= i && i <= 0x28FF
3716 // CJK radical characters.
3717 || 0x2E80 <= i && i <= 0x2EF3
3718 // Kangxi radical characters.
3719 || 0x2F00 <= i && i <= 0x2FD5
3720 // Ideographic description characters.
3721 || 0x2FF0 <= i && i <= 0x2FFB
3722 // Bopomofo letter and final
3723 || 0x31A0 <= i && i <= 0x31B7
3724 // White square with quadrant characters.
3725 || 0x25F0 <= i && i <= 0x25F7
3726 // Ideographic telegraph symbols.
3727 || 0x32C0 <= i && i <= 0x32CB
3728 || 0x3358 <= i && i <= 0x3370
3729 || 0x33E0 <= i && i <= 0x33FF
3730 // The whole YI characters.
3731 || 0xA000 <= i && i <= 0xA48C
3732 || 0xA490 <= i && i <= 0xA4C6
3733 // Armenian small ligatures
3734 || 0xFB13 <= i && i <= 0xFB17
3735 // hebrew, arabic, variation selector.
3736 || 0xFB1D <= i && i <= 0xFE2F
3737 // Arabic ligatures.
3738 || 0xFEF5 <= i && i <= 0xFEFC
3739 // FIXME: why are they excluded?
3740 || 0x01F6 <= i && i <= 0x01F9
3741 || 0x0218 <= i && i <= 0x0233
3742 || 0x02A9 <= i && i <= 0x02AD
3743 || 0x02EA <= i && i <= 0x02EE
3744 || 0x0349 <= i && i <= 0x036F
3745 || 0x0488 <= i && i <= 0x048F
3746 || 0x04D0 <= i && i <= 0x04FF
3747 || 0x0500 <= i && i <= 0x050F // actually it matters only for 2.0
3748 || 0x06D6 <= i && i <= 0x06ED
3749 || 0x06FA <= i && i <= 0x06FE
3750 || 0x2048 <= i && i <= 0x204D
3751 || 0x20e4 <= i && i <= 0x20ea
3752 || 0x213C <= i && i <= 0x214B
3753 || 0x21EB <= i && i <= 0x21FF
3754 || 0x22F2 <= i && i <= 0x22FF
3755 || 0x237B <= i && i <= 0x239A
3756 || 0x239B <= i && i <= 0x23CF
3757 || 0x24EB <= i && i <= 0x24FF
3758 || 0x2596 <= i && i <= 0x259F
3759 || 0x25F8 <= i && i <= 0x25FF
3760 || 0x2672 <= i && i <= 0x2689
3761 || 0x2768 <= i && i <= 0x2775
3762 || 0x27d0 <= i && i <= 0x27ff
3763 || 0x2900 <= i && i <= 0x2aff
3764 || 0x3033 <= i && i <= 0x303F
3765 || 0x31F0 <= i && i <= 0x31FF
3766 || 0x3250 <= i && i <= 0x325F
3767 || 0x32B1 <= i && i <= 0x32BF
3768 || 0x3371 <= i && i <= 0x337B
3769 || 0xFA30 <= i && i <= 0xFA6A
3773 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3775 case UnicodeCategory.PrivateUse:
3776 case UnicodeCategory.Surrogate:
3778 // ignored by nature
3779 case UnicodeCategory.Format:
3780 case UnicodeCategory.OtherNotAssigned:
3787 // To check IsIgnorable sanity, try the driver below under MS.NET.
3790 public static void Main ()
3792 for (int i = 0; i <= char.MaxValue; i++)
3793 Dump (i, IsIgnorable (i));
3796 static void Dump (int i, bool ignore)
3798 switch (Char.GetUnicodeCategory ((char) i)) {
3799 case UnicodeCategory.PrivateUse:
3800 case UnicodeCategory.Surrogate:
3801 return; // check nothing
3805 string s2 = new string ((char) i, 10);
3806 int ret = CultureInfo.InvariantCulture.CompareInfo.Compare (s1, s2, CompareOptions.IgnoreCase);
3807 if ((ret == 0) == ignore)
3809 Console.WriteLine ("{0} : {1:x} {2}", ignore ? "o" : "x", i, Char.GetUnicodeCategory ((char) i));
3812 #endregion // IsIgnorable
3814 #region IsIgnorableSymbol
// Decides whether code point i is treated as an ignorable symbol. The sanity
// driver that follows this method checks its result against
// CompareInfo.Compare with CompareOptions.IgnoreSymbols.
// NOTE(review): the switch headers, braces and most return statements of this
// method are not visible in this listing, so the value returned for each
// group of cases below cannot be confirmed from here.
3815 static bool IsIgnorableSymbol (int i)
// Characters already classified by IsIgnorable are tested first.
3817 if (IsIgnorable (i))
// Explicit per-code-point exceptions (first group).
3822 case 0x00b5: case 0x01C0: case 0x01C1:
3823 case 0x01C2: case 0x01C3: case 0x01F6:
3824 case 0x01F7: case 0x01F8: case 0x01F9:
3825 case 0x02D0: case 0x02EE: case 0x037A:
3826 case 0x03D7: case 0x03F3:
3827 case 0x0400: case 0x040d:
3828 case 0x0450: case 0x045d:
3829 case 0x048C: case 0x048D:
3830 case 0x048E: case 0x048F:
3831 case 0x0587: case 0x0640: case 0x06E5:
3832 case 0x06E6: case 0x06FA: case 0x06FB:
3833 case 0x06FC: case 0x093D: case 0x0950:
3834 case 0x1E9B: case 0x2139: case 0x3006:
3835 case 0x3033: case 0x3034: case 0x3035:
3836 case 0xFE7E: case 0xFE7F:
3838 case 0x16EE: case 0x16EF: case 0x16F0:
3840 case 0x2183: // ROMAN NUMERAL REVERSED ONE HUNDRED
3841 case 0x3007: // IDEOGRAPHIC NUMBER ZERO
3842 case 0x3038: // HANGZHOU NUMERAL TEN
3843 case 0x3039: // HANGZHOU NUMERAL TWENTY
3844 case 0x303a: // HANGZHOU NUMERAL THIRTY
// Explicit per-code-point exceptions (second group).
3850 case 0x02B9: case 0x02BA: case 0x02C2:
3851 case 0x02C3: case 0x02C4: case 0x02C5:
3852 case 0x02C8: case 0x02CC: case 0x02CD:
3853 case 0x02CE: case 0x02CF: case 0x02D2:
3854 case 0x02D3: case 0x02D4: case 0x02D5:
3855 case 0x02D6: case 0x02D7: case 0x02DE:
3856 case 0x02E5: case 0x02E6: case 0x02E7:
3857 case 0x02E8: case 0x02E9:
3858 case 0x309B: case 0x309C:
3860 case 0x055A: // ARMENIAN APOSTROPHE
3861 case 0x05C0: // HEBREW PUNCTUATION PASEQ
3862 case 0x0E4F: // Thai FONGMAN
3863 case 0x0E5A: // Thai ANGKHANKHU
3864 case 0x0E5B: // Thai KHOMUT
3866 case 0x09F2: // Bengali Rupee Mark
3867 case 0x09F3: // Bengali Rupee Sign
3869 case 0x221e: // INFINITY
// Range-based exceptions. Note the mix of exclusive (<) and inclusive (<=)
// upper bounds.
3878 if (0xFE70 <= i && i < 0xFE7C // ARABIC LIGATURES B
3880 || 0x0501 <= i && i <= 0x0510 // CYRILLIC KOMI
3881 || 0xFA30 <= i && i < 0xFA70 // CJK COMPAT
// From here on the decision is driven by the Unicode general category.
3886 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3888 case UnicodeCategory.Surrogate:
3889 return false; // inconsistent
3891 case UnicodeCategory.SpacingCombiningMark:
3892 case UnicodeCategory.EnclosingMark:
3893 case UnicodeCategory.NonSpacingMark:
3894 case UnicodeCategory.PrivateUse:
3896 if (0x064B <= i && i <= 0x0652) // Arabic
3900 case UnicodeCategory.Format:
3901 case UnicodeCategory.OtherNotAssigned:
// Ranges in which the flag "use" is derived from the letter-or-digit test
// below (presumably these lines form the condition guarding that
// assignment; the enclosing statement is not visible here).
3908 // latin in a circle
3909 0x249A <= i && i <= 0x24E9
3910 || 0x2100 <= i && i <= 0x2132
3912 || 0x3196 <= i && i <= 0x31A0
3914 || 0x3200 <= i && i <= 0x321C
3916 || 0x322A <= i && i <= 0x3243
3918 || 0x3260 <= i && i <= 0x32B0
3919 || 0x32D0 <= i && i <= 0x3357
3920 || 0x337B <= i && i <= 0x33DD
// "use" becomes true only for characters that are not letters or digits.
3922 use = !Char.IsLetterOrDigit ((char) i);
3926 // The reason for this "Digit" rule is unclear.
3927 // It filters some symbols out.
3928 if (Char.IsLetterOrDigit ((char) i))
3930 if (Char.IsNumber ((char) i))
3932 if (Char.IsControl ((char) i)
3933 || Char.IsSeparator ((char) i)
3934 || Char.IsPunctuation ((char) i))
3936 if (Char.IsSymbol ((char) i))
3939 // FIXME: should check more
3944 // To check IsIgnorableSymbol sanity, try the driver below under MS.NET.
3946 public static void Main ()
3948 CompareInfo ci = CultureInfo.InvariantCulture.CompareInfo;
3949 for (int i = 0; i <= char.MaxValue; i++) {
3950 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3951 if (uc == UnicodeCategory.Surrogate)
3954 bool ret = IsIgnorableSymbol (i);
3956 string s1 = "TEST ";
3957 string s2 = "TEST " + (char) i;
3959 int result = ci.Compare (s1, s2, CompareOptions.IgnoreSymbols);
3961 if (ret != (result == 0))
3962 Console.WriteLine ("{0} : {1:x}[{2}]({3})",
3963 ret ? "should not ignore" :
// Decides whether code point i is treated as an ignorable non-spacing
// character.
// NOTE(review): the switch headers, braces and the return statements that
// follow each group of cases/ranges are not visible in this listing; only
// the final fallback return can be read here.
3972 static bool IsIgnorableNonSpacing (int i)
// Characters already classified by IsIgnorable are tested first.
3974 if (IsIgnorable (i))
// Explicit per-code-point exceptions (first group).
3978 case 0x02C8: case 0x02DE: case 0x0559: case 0x055A:
3979 case 0x05C0: case 0x0ABD: case 0x0CD5: case 0x0CD6:
3980 case 0x309B: case 0x309C: case 0xFF9E: case 0xFF9F:
// Explicit per-code-point exceptions (second group).
3982 case 0x02D0: case 0x0670: case 0x0901: case 0x0902:
3983 case 0x094D: case 0x0962: case 0x0963: case 0x0A41:
3984 case 0x0A42: case 0x0A47: case 0x0A48: case 0x0A4B:
3985 case 0x0A4C: case 0x0A81: case 0x0A82: case 0x0B82:
3986 case 0x0BC0: case 0x0CBF: case 0x0CC6: case 0x0CCC:
3987 case 0x0CCD: case 0x0E4E:
// Range-based exceptions (first group).
3991 if (0x02b9 <= i && i <= 0x02c5
3992 || 0x02cc <= i && i <= 0x02d7
3993 || 0x02e4 <= i && i <= 0x02ef
3994 || 0x20DD <= i && i <= 0x20E0
// Range-based exceptions (second group). The literal 0x00652 has a redundant
// leading zero and equals 0x0652.
3998 if (0x064B <= i && i <= 0x00652
3999 || 0x0941 <= i && i <= 0x0948
4000 || 0x0AC1 <= i && i <= 0x0ACD
4001 || 0x0C3E <= i && i <= 0x0C4F
4002 || 0x0E31 <= i && i <= 0x0E3F
// Fallback: true exactly when the Unicode category is NonSpacingMark.
4006 return Char.GetUnicodeCategory ((char) i) ==
4007 UnicodeCategory.NonSpacingMark;
4010 // We can reuse IsIgnorableSymbol testcode
4011 // for IsIgnorableNonSpacing.
// Per-character map entry holding a category byte and weight bytes.
// NOTE(review): the type header, at least one field (a level1 value is
// accepted by the constructor) and most of the constructor body are not
// visible in this listing.
4017 public byte Category;
4019 public byte Level2; // Always a single byte.
4020 public bool Defined;
// Builds an entry from its category and its level 1 / level 2 values.
4022 public CharMapEntry (byte category, byte level1, byte level2)
4024 Category = category;
// Pairs a character value (CP) with a JIS value; JISComparer orders
// instances by the JIS field.
// NOTE(review): presumably CP is a Unicode code point and JIS the code in a
// JIS character set -- confirm against the code that creates these.
4033 public readonly int CP;
4034 public readonly int JIS;
// NOTE(review): the constructor body is not visible in this listing;
// presumably cp and cpJIS are stored into CP and JIS.
4036 public JISCharacter (int cp, int cpJIS)
// Non-generic IComparer that orders JISCharacter items by ascending JIS value.
4043 class JISComparer : IComparer
// Shared instance (the initializer expression is not visible in this listing).
4045 public static readonly JISComparer Instance =
// Returns a negative, zero or positive number as o1.JIS is less than, equal
// to or greater than o2.JIS. Both arguments are cast to JISCharacter, so any
// other argument type fails at the cast.
4048 public int Compare (object o1, object o2)
4050 JISCharacter j1 = (JISCharacter) o1;
4051 JISCharacter j2 = (JISCharacter) o2;
// Subtraction-based comparison: correct as long as the difference fits in an
// int (NOTE(review): assumes JIS values are small and non-negative).
4052 return j1.JIS - j2.JIS;
// Pairs a character value (CP) with a name; NonJISComparer orders instances
// by the Name field.
4056 class NonJISCharacter
4058 public readonly int CP;
4059 public readonly string Name;
// NOTE(review): the constructor body is not visible in this listing;
// presumably cp and name are stored into CP and Name.
4061 public NonJISCharacter (int cp, string name)
// Non-generic IComparer that orders NonJISCharacter items by Name using an
// ordinal (culture-independent) string comparison.
4068 class NonJISComparer : IComparer
// Shared instance.
4070 public static readonly NonJISComparer Instance =
4071 new NonJISComparer ();
// Both arguments are cast to NonJISCharacter, so any other argument type
// fails at the cast.
4073 public int Compare (object o1, object o2)
4075 NonJISCharacter j1 = (NonJISCharacter) o1;
4076 NonJISCharacter j2 = (NonJISCharacter) o2;
4077 return string.CompareOrdinal (j1.Name, j2.Name);
// Non-generic IComparer for DictionaryEntry items whose Value is a boxed
// decimal and whose Key is a boxed int.
4081 class DecimalDictionaryValueComparer : IComparer
// The only instance: the constructor below is private.
4083 public static readonly DecimalDictionaryValueComparer Instance
4084 = new DecimalDictionaryValueComparer ();
4086 private DecimalDictionaryValueComparer ()
// Compares the entries by their decimal values first.
// NOTE(review): the statements that use ret, i1 and i2 are not visible in
// this listing; presumably the int keys act as a tie-breaker when the values
// compare equal.
4090 public int Compare (object o1, object o2)
4092 DictionaryEntry e1 = (DictionaryEntry) o1;
4093 DictionaryEntry e2 = (DictionaryEntry) o2;
4094 // FIXME: in case of 0, compare decomposition categories
4095 int ret = Decimal.Compare ((decimal) e1.Value, (decimal) e2.Value);
4098 int i1 = (int) e1.Key;
4099 int i2 = (int) e2.Key;
// Non-generic IComparer for DictionaryEntry items whose Value is a string and
// whose Key is a boxed int.
4104 class StringDictionaryValueComparer : IComparer
// The only instance: the constructor below is private.
4106 public static readonly StringDictionaryValueComparer Instance
4107 = new StringDictionaryValueComparer ();
4109 private StringDictionaryValueComparer ()
// Compares the entries by their string values first.
// NOTE(review): the statements that use ret, i1 and i2 are not visible in
// this listing; presumably the int keys act as a tie-breaker when the values
// compare equal.
4113 public int Compare (object o1, object o2)
4115 DictionaryEntry e1 = (DictionaryEntry) o1;
4116 DictionaryEntry e2 = (DictionaryEntry) o2;
// NOTE(review): String.Compare (string, string) is a culture-sensitive
// comparison, so the resulting order can depend on the current culture of
// the process running this generator -- confirm this is intended.
4117 int ret = String.Compare ((string) e1.Value, (string) e2.Value);
4120 int i1 = (int) e1.Key;
4121 int i2 = (int) e2.Key;
// Non-generic IComparer that compares two boxed chars using the sort keys
// that CollationElementTable reports for them.
4126 class UCAComparer : IComparer
// The only instance: the constructor below is private.
4128 public static readonly UCAComparer Instance
4129 = new UCAComparer ();
4131 private UCAComparer ()
// NOTE(review): the statements that test v after each difference, and the
// final return, are not visible in this listing.
4135 public int Compare (object o1, object o2)
4137 char i1 = (char) o1;
4138 char i2 = (char) o2;
4140 int l1 = CollationElementTable.GetSortKeyCount (i1);
4141 int l2 = CollationElementTable.GetSortKeyCount (i2);
// Only the sort keys both characters have (the smaller count) are walked.
4142 int l = l1 > l2 ? l2 : l1;
4144 for (int i = 0; i < l; i++) {
4145 SortKeyValue k1 = CollationElementTable.GetSortKey (i1, i);
4146 SortKeyValue k2 = CollationElementTable.GetSortKey (i2, i);
// Differences are computed level by level: primary, secondary, tertiary,
// then quaternary.
4147 int v = k1.Primary - k2.Primary;
4150 v = k1.Secondary - k2.Secondary;
4153 v = k1.Thirtiary - k2.Thirtiary;
4156 v = k1.Quarternary - k2.Quarternary;
// Tailoring data: an LCID, an alias LCID, a French-sort flag and an ordered
// list of ITailoringMap items.
// NOTE(review): the type header, the lcid/alias/frenchSort field
// declarations, the constructor bodies and the property headers are not
// visible in this listing.
// Holds the map items in the order they were added.
4169 ArrayList items = new ArrayList ();
4171 public Tailoring (int lcid)
4176 public Tailoring (int lcid, int alias)
4183 get { return lcid; }
4187 get { return alias; }
4190 public bool FrenchSort {
4191 get { return frenchSort; }
4192 set { frenchSort = value; }
// Appends a DiacriticalMap item built from target and replace.
4195 public void AddDiacriticalMap (byte target, byte replace)
4197 items.Add (new DiacriticalMap (target, replace));
// Appends a SortKeyMap item built from source and sortkey.
4200 public void AddSortKeyMap (string source, byte [] sortkey)
4202 items.Add (new SortKeyMap (source, sortkey));
// Appends a ReplacementMap item built from source and replace.
4205 public void AddReplacementMap (string source, string replace)
4207 items.Add (new ReplacementMap (source, replace));
// Concatenates the char-array form of every item, in insertion order, into
// a single char array.
4210 public char [] ItemToCharArray ()
4212 ArrayList al = new ArrayList ();
4213 foreach (ITailoringMap m in items)
4214 al.AddRange (m.ToCharArray ());
4215 return al.ToArray (typeof (char)) as char [];
// Common contract of the tailoring items stored by Tailoring.
4218 interface ITailoringMap
// Returns the item encoded as a char array; in the implementations visible
// in this file, element 0 carries a numeric tag identifying the item kind.
4220 char [] ToCharArray ();
// Tailoring item that pairs a target byte with a replacement byte.
4223 class DiacriticalMap : ITailoringMap
4225 public readonly byte Target;
4226 public readonly byte Replace;
// NOTE(review): the constructor body is not visible in this listing;
// presumably target and replace are stored into Target and Replace.
4228 public DiacriticalMap (byte target, byte replace)
// Encodes the item as three chars: kind tag 2, Target, Replace.
4234 public char [] ToCharArray ()
4236 char [] ret = new char [3];
4237 ret [0] = (char) 02; // kind:DiacriticalMap
4238 ret [1] = (char) Target;
4239 ret [2] = (char) Replace;
// Tailoring item that pairs a source string with sort key bytes.
4244 class SortKeyMap : ITailoringMap
4246 public readonly string Source;
4247 public readonly byte [] SortKey;
// NOTE(review): the constructor body is not visible in this listing;
// presumably source and sortkey are stored into Source and SortKey.
4249 public SortKeyMap (string source, byte [] sortkey)
// Encodes the item as: kind tag 1, the chars of Source, then four sort key
// bytes starting at index Source.Length + 2.
4255 public char [] ToCharArray ()
// The array has Source.Length + 7 elements; those not assigned below keep
// the default char value '\0'.
4257 char [] ret = new char [Source.Length + 7];
4258 ret [0] = (char) 01; // kind:SortKeyMap
4259 for (int i = 0; i < Source.Length; i++)
4260 ret [i + 1] = Source [i];
// Index Source.Length + 1 is skipped by the visible statements, leaving a
// '\0' between the source chars and the sort key bytes. Only the first four
// SortKey bytes are copied, so SortKey needs at least four elements.
4262 for (int i = 0; i < 4; i++)
4263 ret [i + Source.Length + 2] = (char) SortKey [i];
4268 class ReplacementMap : ITailoringMap
4270 public readonly string Source;
4271 public readonly string Replace;
4273 public ReplacementMap (string source, string replace)
4279 public char [] ToCharArray ()
4281 char [] ret = new char [Source.Length + Replace.Length + 3];
4282 ret [0] = (char) 03; // kind:ReplaceMap
4284 for (int i = 0; i < Source.Length; i++)
4285 ret [pos++] = Source [i];
4288 for (int i = 0; i < Replace.Length; i++)
4289 ret [pos++] = Replace [i];