3 // There are two kinds of sort keys: those which are computed and those
4 // which are laid out as an indexed array. Computed sort keys are:
9 // Also, for composite characters it should prepare a different index table.
11 // Though it is possible to "compute" level 3 weights, they are still dumped
12 // to an array to avoid execution cost.
16 // * sortkey getter signature
18 // int GetSortKey (string s, int index, SortKeyBuffer buf)
19 // Stores sort key for corresponding character element into buf and
20 // returns the length of the consumed _source_ character element in s.
22 // * character length to consume
24 // If there are characters whose primary weight is 0, they are consumed
25 // and considered as a part of the character element.
31 using System.Collections;
32 using System.Globalization;
36 namespace Mono.Globalization.Unicode
38 internal class MSCompatSortKeyTableGenerator
// Program entry point: creates a generator instance and passes the
// command-line arguments to its Run method.
40 public static void Main (string [] args)
42 new MSCompatSortKeyTableGenerator ().Run (args);
// Numeric tags stored in decompType [cp] for the decomposition kind found
// on each UnicodeData line (assigned by the switch in ProcessUnidataLine).
// NOTE(review): the meaning of the "fixed" markers is not stated in this
// file section; presumably those values are relied upon elsewhere --
// confirm before renumbering.
// Note that DecompositionFull (0xE) is declared before
// DecompositionNarrow (0xD), so declaration order is not numeric order.
45 const int DecompositionWide = 1; // fixed
46 const int DecompositionSub = 2; // fixed
47 const int DecompositionSmall = 3;
48 const int DecompositionIsolated = 4;
49 const int DecompositionInitial = 5;
50 const int DecompositionFinal = 6;
51 const int DecompositionMedial = 7;
52 const int DecompositionNoBreak = 8;
53 const int DecompositionVertical = 9;
54 const int DecompositionFraction = 0xA;
55 const int DecompositionFont = 0xB;
56 const int DecompositionSuper = 0xC; // fixed
57 const int DecompositionFull = 0xE;
58 const int DecompositionNarrow = 0xD;
59 const int DecompositionCircle = 0xF;
60 const int DecompositionSquare = 0x10;
61 const int DecompositionCompat = 0x11;
// Used when the decomposition field carries no <tag> (see ProcessUnidataLine).
62 const int DecompositionCanonical = 0x12;
// Destination for the generated C# table source; the serialization code
// in this file writes every array literal through this writer.
64 TextWriter Result = Console.Out;
66 byte [] fillIndex = new byte [256]; // by category
// One entry per UTF-16 code unit; the Category, Level1 and Level2 members
// are copied into flat arrays when the tables are serialized.
67 CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1];
// NOTE(review): the code that reads specialIgnore is not visible in this
// listing.
69 char [] specialIgnore = new char [] {
70 '\u3099', '\u309A', '\u309B', '\u309C', '\u0BCD',
71 '\u0E47', '\u0E4C', '\uFF9E', '\uFF9F'
74 // FIXME: need more love (as always)
// alphabets and alphaWeights both list 29 entries here; presumably they are
// parallel arrays (letter -> weight) -- confirm against the code that
// reads them.
75 char [] alphabets = new char [] {'A', 'B', 'C', 'D', 'E', 'F',
76 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
77 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
78 '\u0292', '\u01BE', '\u0298'};
79 byte [] alphaWeights = new byte [] {
80 2, 9, 0xA, 0x1A, 0x21,
81 0x23, 0x25, 0x2C, 0x32, 0x35,
82 0x36, 0x48, 0x51, 0x70, 0x7C,
83 0x7E, 0x89, 0x8A, 0x91, 0x99,
84 0x9F, 0xA2, 0xA4, 0xA6, 0xA7,
85 0xA9, 0xAA, 0xB3, 0xB4};
// Per-code-point flags: isSmallCapital is set in ProcessUnidataLine,
// isUppercase in ProcessDerivedCorePropLine.
87 bool [] isSmallCapital = new bool [char.MaxValue + 1];
88 bool [] isUppercase = new bool [char.MaxValue + 1];
// Decomposition data filled by ProcessUnidataLine: the kind tag, the index
// of the first decomposed value in decompValues, and the number of values.
90 byte [] decompType = new byte [char.MaxValue + 1];
91 int [] decompIndex = new int [char.MaxValue + 1];
92 int [] decompLength = new int [char.MaxValue + 1];
// Numeric value parsed from the numeric fields of each UnicodeData line.
94 decimal [] decimalValue = new decimal [char.MaxValue + 1];
// Diacritical weight per code point, derived from character names.
96 byte [] diacritical = new byte [char.MaxValue + 1];
// diacritics holds name fragments that ProcessUnidataLine searches for in
// each UnicodeData line; diacriticWeights holds, at the same index, the
// weight that is OR-ed into diacritical [cp] when the fragment is found.
// ProcessUnidataLine throws when the two arrays differ in length, so any
// entry added to one table needs a matching entry in the other.
// NOTE(review): several lines of both initializers are not visible in this
// listing, so the entries cannot be counted or paired up from here.
98 string [] diacritics = new string [] {
100 "WITH VERTICAL LINE ABOVE;",
101 "WITH ACUTE;", "WITH GRAVE;", "WITH DOT ABOVE;", " MIDDLE DOT;",
102 "WITH CIRCUMFLEX;", "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;",
103 " DIALYTIKA AND TONOS;", "WITH MACRON;", "WITH TILDE;", "WITH RING ABOVE;",
104 "WITH OGONEK;", "WITH CEDILLA;",
106 " DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;",
107 " STROKE;", " CIRCUMFLEX AND ACUTE;",
108 " DIAERESIS AND ACUTE;", "WITH CIRCUMFLEX AND GRAVE;", " L SLASH;",
109 " DIAERESIS AND GRAVE;",
111 " CARON AND DOT ABOVE;", " BREVE AND GRAVE;",
112 " MACRON AND ACUTE;",
113 " MACRON AND GRAVE;",
115 " DIAERESIS AND CARON", " DOT ABOVE AND MACRON", " TILDE AND ACUTE",
116 " RING ABOVE AND ACUTE",
117 " DIAERESIS AND MACRON", " CEDILLA AND ACUTE", " MACRON AND DIAERESIS",
118 " CIRCUMFLEX AND TILDE",
119 " TILDE AND DIAERESIS",
122 " CEDILLA AND BREVE",
123 " OGONEK AND MACRON",
126 "WITH HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;",
129 " PRECEDED BY APOSTROPHE",
131 " LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE",
134 " RETROFLEX;", "DIAERESIS BELOW",
137 " CIRCUMFLEX BELOW", "HORN AND ACUTE",
138 " BREVE BELOW;", " HORN AND GRAVE",
141 " DOT BELOW AND DOT ABOVE",
142 " RIGHT HALF RING", " HORN AND TILDE",
143 " CIRCUMFLEX AND DOT BELOW",
144 " BREVE AND DOT BELOW",
145 " DOT BELOW AND MACRON",
146 " HORN AND HOOK ABOVE",
148 // CIRCLED, PARENTHESIZED and so on
149 "CIRCLED DIGIT", "CIRCLED NUMBER", "CIRCLED LATIN",
150 "CIRCLED KATAKANA", "CIRCLED SANS-SERIF",
151 "PARENTHESIZED DIGIT", "PARENTHESIZED NUMBER", "PARENTHESIZED LATIN",
// Weights, index-aligned with the diacritics table above.
153 byte [] diacriticWeights = new byte [] {
156 0xE, 0xF, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
157 0x17, 0x19, 0x1A, 0x1B, 0x1C,
159 0x1D, 0x1D, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F,
160 0x20, 0x21, 0x22, 0x22, 0x23, 0x24,
162 0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28,
163 0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30,
165 0x40, 0x43, 0x43, 0x43, 0x44, 0x46, 0x48,
166 0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x5A,
168 0x60, 0x60, 0x61, 0x61, 0x63, 0x68, 0x68,
169 0x69, 0x69, 0x6A, 0x6D, 0x6E,
171 // CIRCLED, PARENTHESIZED and so on.
172 0xEE, 0xEE, 0xEE, 0xEE, 0xEE,
// NOTE(review): the code that reads numberSecondaryWeightBounds is not
// visible in this listing; the values look like code point boundaries but
// how they are paired is not established here -- confirm against the reader.
176 int [] numberSecondaryWeightBounds = new int [] {
177 0x660, 0x680, 0x6F0, 0x700, 0x960, 0x970,
178 0x9E0, 0x9F0, 0x9F4, 0xA00, 0xA60, 0xA70,
179 0xAE0, 0xAF0, 0xB60, 0xB70, 0xBE0, 0xC00,
180 0xC60, 0xC70, 0xCE0, 0xCF0, 0xD60, 0xD70,
181 0xE50, 0xE60, 0xED0, 0xEE0
// Per-script character lists; ParseScripts fills each one from a list that
// it sorts with UCAComparer.Instance.
184 char [] orderedCyrillic;
185 char [] orderedGurmukhi;
186 char [] orderedGujarati;
187 char [] orderedGeorgian;
188 char [] orderedThaana;
// Hand-ordered Tamil consonants (see the note inside the initializer).
190 static readonly char [] orderedTamilConsonants = new char [] {
191 // based on traditional Tamil consonants, except for
192 // Grantha (where Microsoft breaks traditionalism).
193 // http://www.angelfire.com/empire/thamizh/padanGaL
194 '\u0B95', '\u0B99', '\u0B9A', '\u0B9E', '\u0B9F',
195 '\u0BA3', '\u0BA4', '\u0BA8', '\u0BAA', '\u0BAE',
196 '\u0BAF', '\u0BB0', '\u0BB2', '\u0BB5', '\u0BB4',
197 '\u0BB3', '\u0BB1', '\u0BA9', '\u0B9C', '\u0BB8',
// The three lists below hold DictionaryEntry items appended by
// ProcessUnidataLine.
200 // cp -> character name (only for some characters)
201 ArrayList sortableCharNames = new ArrayList ();
203 // cp -> arrow value (int)
204 ArrayList arrowValues = new ArrayList ();
206 // cp -> box value (int)
207 ArrayList boxValues = new ArrayList ();
209 // cp -> level1 value
210 Hashtable arabicLetterPrimaryValues = new Hashtable ();
211 Hashtable cyrillicLetterPrimaryValues = new Hashtable ();
// Primary letter name -> code point recorded for that name (written by
// ProcessUnidataLine).
214 Hashtable arabicNameMap = new Hashtable ();
215 Hashtable cyrillicNameMap = new Hashtable ();
217 // cp -> Hashtable [decompType] -> cp
218 Hashtable nfkdMap = new Hashtable ();
220 // Latin letter -> ArrayList [int]
221 Hashtable latinMap = new Hashtable ();
// JISCharacter entries come from the CP932 table (ProcessJISOrderLine);
// NonJISCharacter entries are added by ProcessUnidataLine for U+3300..U+3357
// code points for which ExistsJIS returns false.
223 ArrayList jisJapanese = new ArrayList ();
224 ArrayList nonJisJapanese = new ArrayList ();
// CJK weight tables indexed by code point; each is allocated for the full
// char range (the commented-out subtractions are earlier, smaller sizes).
226 ushort [] cjkJA = new ushort [char.MaxValue +1];// - 0x4E00];
227 ushort [] cjkCHS = new ushort [char.MaxValue +1];// - 0x3100];
228 ushort [] cjkCHT = new ushort [char.MaxValue +1];// - 0x4E00];
229 ushort [] cjkKO = new ushort [char.MaxValue +1];// - 0x4E00];
230 byte [] cjkKOlv2 = new byte [char.MaxValue +1];// - 0x4E00];
232 byte [] ignorableFlags = new byte [char.MaxValue + 1];
// Unicode version ("age") per code point, parsed from DerivedAge.txt.
// This is the only static table among these fields.
234 static double [] unicodeAge = new double [char.MaxValue + 1];
// Tailoring objects; SerializeTailorings enumerates this list.
236 ArrayList tailorings = new ArrayList ();
// Drives the tool: parses the input files found under args [0] (or under
// "downloaded" when no argument is given), calls ModifyParsedValues, and
// then writes a per-code-point diagnostic log to agelog.txt.
// Progress messages go to standard error.
// NOTE(review): the statements between the progress messages, the end of
// the switch, and any Close/Dispose of the log writer are not visible in
// this listing -- confirm against the full file.
238 void Run (string [] args)
240 string dirname = args.Length == 0 ? "downloaded" : args [0];
241 ParseSources (dirname);
242 Console.Error.WriteLine ("parse done.");
244 ModifyParsedValues ();
246 Console.Error.WriteLine ("generation done.");
248 Console.Error.WriteLine ("serialization done.");
250 StreamWriter sw = new StreamWriter ("agelog.txt");
// NOTE(review): the bound is "< char.MaxValue", so U+FFFF itself is never
// logged -- confirm that this is intended.
251 for (int i = 0; i < char.MaxValue; i++) {
252 bool shouldBe = false;
// shouldBe is set for Format and OtherNotAssigned code points.
253 switch (Char.GetUnicodeCategory ((char) i)) {
254 case UnicodeCategory.Format: case UnicodeCategory.OtherNotAssigned:
255 shouldBe = true; break;
257 if (unicodeAge [i] >= 3.1)
259 //if (IsIgnorable (i) != shouldBe)
// Columns written: age, IsIgnorable, IsIgnorableSymbol, code point in hex,
// Unicode category, and '!' when IsIgnorable disagrees with shouldBe.
260 sw.WriteLine ("{1} {2} {3} {0:X04} {4} {5}", i, unicodeAge [i], IsIgnorable (i), IsIgnorableSymbol (i), char.GetUnicodeCategory ((char) i), IsIgnorable (i) != shouldBe ? '!' : ' ');
// Typed wrapper over CodePointIndexer.CompressArray for byte tables:
// passes source, the element type and the indexer through, and casts the
// result back to byte [].
266 byte [] CompressArray (byte [] source, CodePointIndexer i)
268 return (byte []) CodePointIndexer.CompressArray (
269 source, typeof (byte), i);
// Typed wrapper over CodePointIndexer.CompressArray for ushort tables:
// passes source, the element type and the indexer through, and casts the
// result back to ushort [].
272 ushort [] CompressArray (ushort [] source, CodePointIndexer i)
274 return (ushort []) CodePointIndexer.CompressArray (
275 source, typeof (ushort), i);
// NOTE(review): the header of the method that owns the following statements
// is not visible in this listing.
// The body writes the tailoring tables, flattens the per-code-point data
// into arrays, compresses them, and then emits each array twice: as C#
// source text through Result, and as binary data collected in a
// MemoryStream that is finally written to ../collation.core.bin.
281 SerializeTailorings ();
// Flatten the map entries into one array per sort key level.
283 byte [] categories = new byte [map.Length];
284 byte [] level1 = new byte [map.Length];
285 byte [] level2 = new byte [map.Length];
286 byte [] level3 = new byte [map.Length];
287 ushort [] widthCompat = new ushort [map.Length];
288 for (int i = 0; i < map.Length; i++) {
289 categories [i] = map [i].Category;
290 level1 [i] = map [i].Level1;
291 level2 [i] = map [i].Level2;
// Level 3 is computed per character rather than read from the map.
292 level3 [i] = ComputeLevel3Weight ((char) i);
// For narrow/wide/super/sub decompositions, record the first decomposed
// value as the width-compatible counterpart of this code point.
293 switch (decompType [i]) {
294 case DecompositionNarrow:
295 case DecompositionWide:
296 case DecompositionSuper:
297 case DecompositionSub:
298 // they are always 1 char
299 widthCompat [i] = (ushort) decompValues [decompIndex [i]];
// Compress every table with the indexer declared for it in
// MSCompatUnicodeTableUtil.
305 ignorableFlags = CompressArray (ignorableFlags,
306 MSCompatUnicodeTableUtil.Ignorable);
307 categories = CompressArray (categories,
308 MSCompatUnicodeTableUtil.Category);
309 level1 = CompressArray (level1,
310 MSCompatUnicodeTableUtil.Level1);
311 level2 = CompressArray (level2,
312 MSCompatUnicodeTableUtil.Level2);
313 level3 = CompressArray (level3,
314 MSCompatUnicodeTableUtil.Level3);
// Same operation as the ushort CompressArray helper, written out inline.
315 widthCompat = (ushort []) CodePointIndexer.CompressArray (
316 widthCompat, typeof (ushort),
317 MSCompatUnicodeTableUtil.WidthCompat);
// cjkCHS has its own indexer; the remaining CJK tables share Cjk.
318 cjkCHS = CompressArray (cjkCHS,
319 MSCompatUnicodeTableUtil.CjkCHS);
320 cjkCHT = CompressArray (cjkCHT,
321 MSCompatUnicodeTableUtil.Cjk);
322 cjkJA = CompressArray (cjkJA,
323 MSCompatUnicodeTableUtil.Cjk);
324 cjkKO = CompressArray (cjkKO,
325 MSCompatUnicodeTableUtil.Cjk);
326 cjkKOlv2 = CompressArray (cjkKOlv2,
327 MSCompatUnicodeTableUtil.Cjk);
// Each table below follows the same pattern: an array declaration is
// written as text, the element count is written to the binary stream, and
// then every element is written to both outputs. After every 16th element
// the text row is ended with a comment giving the index of the row's first
// element.
// NOTE(review): in each loop two Result.Write calls with different formats
// appear back to back; presumably they are alternatives selected by
// conditional-compilation lines that are not visible here -- confirm.
330 Result.WriteLine ("internal static readonly byte [] ignorableFlags = new byte [] {");
332 MemoryStream ms = new MemoryStream ();
333 BinaryWriter binary = new BinaryWriter (ms);
334 binary.Write (ignorableFlags.Length);
336 for (int i = 0; i < ignorableFlags.Length; i++) {
337 byte value = ignorableFlags [i];
339 Result.Write ("{0},", value);
341 Result.Write ("0x{0:X02},", value);
343 binary.Write (value);
345 if ((i & 0xF) == 0xF)
346 Result.WriteLine ("// {0:X04}", i - 0xF);
348 Result.WriteLine ("};");
352 Result.WriteLine ("internal static readonly byte [] categories = new byte [] {");
354 binary.Write (categories.Length);
356 for (int i = 0; i < categories.Length; i++) {
357 byte value = categories [i];
359 Result.Write ("{0},", value);
361 Result.Write ("0x{0:X02},", value);
363 binary.Write (value);
365 if ((i & 0xF) == 0xF)
366 Result.WriteLine ("// {0:X04}", i - 0xF);
368 Result.WriteLine ("};");
371 // Primary weight value
372 Result.WriteLine ("internal static readonly byte [] level1 = new byte [] {");
374 binary.Write (level1.Length);
376 for (int i = 0; i < level1.Length; i++) {
377 byte value = level1 [i];
379 Result.Write ("{0},", value);
381 Result.Write ("0x{0:X02},", value);
383 binary.Write (value);
385 if ((i & 0xF) == 0xF)
386 Result.WriteLine ("// {0:X04}", i - 0xF);
388 Result.WriteLine ("};");
392 Result.WriteLine ("internal static readonly byte [] level2 = new byte [] {");
394 binary.Write (level2.Length);
396 for (int i = 0; i < level2.Length; i++) {
397 byte value = level2 [i];
399 Result.Write ("{0},", value);
401 Result.Write ("0x{0:X02},", value);
403 binary.Write (value);
405 if ((i & 0xF) == 0xF)
406 Result.WriteLine ("// {0:X04}", i - 0xF);
408 Result.WriteLine ("};");
412 Result.WriteLine ("internal static readonly byte [] level3 = new byte [] {");
414 binary.Write (level3.Length);
416 for (int i = 0; i < level3.Length; i++) {
417 byte value = level3 [i];
419 Result.Write ("{0},", value);
421 Result.Write ("0x{0:X02},", value);
423 binary.Write (value);
425 if ((i & 0xF) == 0xF)
426 Result.WriteLine ("// {0:X04}", i - 0xF);
428 Result.WriteLine ("};");
431 // Width insensitivity mappings
432 // (for now it is more lightweight than dumping the
433 // entire NFKD table).
434 Result.WriteLine ("internal static readonly ushort [] widthCompat = new ushort [] {");
436 binary.Write (widthCompat.Length);
438 for (int i = 0; i < widthCompat.Length; i++) {
439 ushort value = widthCompat [i];
441 Result.Write ("{0},", value);
// NOTE(review): these are ushort values but the hex format is X02, while
// SerializeCJK uses X04 for ushort; the output is still valid, only the
// column width differs -- confirm whether X04 was intended.
443 Result.Write ("0x{0:X02},", value);
445 binary.Write (value);
447 if ((i & 0xF) == 0xF)
448 Result.WriteLine ("// {0:X04}", i - 0xF);
450 Result.WriteLine ("};");
// Dump everything collected in the binary stream to the core table file.
453 using (FileStream fs = File.Create ("../collation.core.bin")) {
454 byte [] array = ms.ToArray ();
455 fs.Write (array, 0, array.Length);
// The CJK tables go to separate files. cjkCHS is given char.MaxValue as
// its limit, all the others 0x9FB0.
460 SerializeCJK ("cjkCHS", cjkCHS, char.MaxValue);
461 SerializeCJK ("cjkCHT", cjkCHT, 0x9FB0);
462 SerializeCJK ("cjkJA", cjkJA, 0x9FB0);
463 SerializeCJK ("cjkKO", cjkKO, 0x9FB0);
464 SerializeCJK ("cjkKOlv2", cjkKOlv2, 0x9FB0);
// Emits one ushort CJK table as a C# array literal named `name` through
// Result and, in binary form, to the file ../collation.<name>.bin.
//   name - identifier used in the generated declaration and the file name
//   cjk  - table to emit
//   max  - index compared against i + offset inside the loop
// NOTE(review): the statement guarded by the "== max" test is not visible
// in this listing; presumably it ends the loop -- confirm.
// NOTE(review): two Result.Write calls with different formats appear in
// sequence; presumably conditional-compilation alternatives -- confirm.
467 void SerializeCJK (string name, ushort [] cjk, int max)
469 int offset = 0;//char.MaxValue - cjk.Length;
470 Result.WriteLine ("static ushort [] {0} = new ushort [] {{", name);
472 MemoryStream ms = new MemoryStream ();
473 BinaryWriter binary = new BinaryWriter (ms);
475 for (int i = 0; i < cjk.Length; i++) {
476 if (i + offset == max)
478 ushort value = cjk [i];
480 Result.Write ("{0},", value);
482 Result.Write ("0x{0:X04},", value);
484 binary.Write (value);
// After every 16th element, end the text row with the index of its first
// element.
486 if ((i & 0xF) == 0xF)
487 Result.WriteLine ("// {0:X04}", i - 0xF + offset);
489 Result.WriteLine ("};");
492 using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) {
493 byte [] array = ms.ToArray ();
494 fs.Write (array, 0, array.Length);
// Byte-table overload of SerializeCJK: emits `cjk` as a C# byte array
// literal named `name` through Result and, in binary form, to the file
// ../collation.<name>.bin.
//   name - identifier used in the generated declaration and the file name
//   cjk  - table to emit
//   max  - index compared against i + offset inside the loop
// NOTE(review): the statement guarded by the "== max" test is not visible
// in this listing; presumably it ends the loop -- confirm.
// NOTE(review): two Result.Write calls with different formats appear in
// sequence; presumably conditional-compilation alternatives -- confirm.
499 void SerializeCJK (string name, byte [] cjk, int max)
501 int offset = 0;//char.MaxValue - cjk.Length;
502 Result.WriteLine ("static byte [] {0} = new byte [] {{", name);
504 MemoryStream ms = new MemoryStream ();
505 BinaryWriter binary = new BinaryWriter (ms);
507 for (int i = 0; i < cjk.Length; i++) {
508 if (i + offset == max)
510 byte value = cjk [i];
512 Result.Write ("{0},", value);
514 Result.Write ("0x{0:X02},", value);
516 binary.Write (value);
// After every 16th element, end the text row with the index of its first
// element.
518 if ((i & 0xF) == 0xF)
519 Result.WriteLine ("// {0:X04}", i - 0xF + offset);
521 Result.WriteLine ("};");
524 using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) {
525 byte [] array = ms.ToArray ();
526 fs.Write (array, 0, array.Length);
// Emits the tailoring data in two parts: a flat char array holding the
// items of every tailoring, followed by one TailoringInfo record per
// tailoring that points into that array. The same data is also written in
// binary form to ../collation.tailoring.bin.
// Throws an Exception when a tailoring's target LCID has no entry in the
// index table built in the first pass.
// NOTE(review): the declaration of `count` and the binary writes of idx and
// cnt are not visible in this listing.
531 void SerializeTailorings ()
// indexes: LCID -> offset of its first char in the flat array.
// counts:  LCID -> number of chars it contributed.
533 Hashtable indexes = new Hashtable ();
534 Hashtable counts = new Hashtable ();
535 Result.WriteLine ("static char [] tailorings = new char [] {");
538 MemoryStream ms = new MemoryStream ();
539 BinaryWriter binary = new BinaryWriter (ms);
541 foreach (Tailoring t in tailorings) {
544 Result.Write ("/*{0}*/", t.LCID);
545 indexes.Add (t.LCID, count);
546 char [] values = t.ItemToCharArray ();
547 counts.Add (t.LCID, values.Length);
548 foreach (char c in values) {
549 Result.Write ("'\\x{0:X}', ", (int) c);
550 if (++count % 16 == 0)
551 Result.WriteLine (" // {0:X04}", count - 16);
// Each char is stored as a 16-bit value in the raw stream.
553 binary.Write ((ushort) c);
557 Result.WriteLine ("};");
559 Result.WriteLine ("static TailoringInfo [] tailoringInfos = new TailoringInfo [] {");
// Keep the raw char data aside and start a fresh stream, so that the info
// records come first in the output file.
561 byte [] rawdata = ms.ToArray ();
562 ms = new MemoryStream ();
563 binary = new BinaryWriter (ms);
564 binary.Write (tailorings.Count);
566 foreach (Tailoring t in tailorings) {
// A non-zero Alias redirects this tailoring to another LCID's data.
567 int target = t.Alias != 0 ? t.Alias : t.LCID;
568 if (!indexes.ContainsKey (target)) {
569 throw new Exception (String.Format ("WARNING: no corresponding definition for tailoring alias. From {0} to {1}", t.LCID, t.Alias));
572 int idx = (int) indexes [target];
573 int cnt = (int) counts [target];
574 bool french = t.FrenchSort;
// NOTE(review): this search compares t2.LCID with t.LCID, which also
// matches t itself; if the intent is to take FrenchSort from the aliased
// tailoring, the comparison would be against t.Alias -- confirm (any guard
// around this loop is not visible here).
576 foreach (Tailoring t2 in tailorings)
577 if (t2.LCID == t.LCID)
578 french = t2.FrenchSort;
579 Result.WriteLine ("new TailoringInfo ({0}, 0x{1:X}, {2}, {3}), ", t.LCID, idx, cnt, french ? "true" : "false");
581 binary.Write (t.LCID);
584 binary.Write (french);
587 Result.WriteLine ("};");
// Two 0xFF bytes are written between the info records and the char data.
589 binary.Write ((byte) 0xFF);
590 binary.Write ((byte) 0xFF);
// Length is expressed in 16-bit units: every char was written as a ushort.
591 binary.Write (rawdata.Length / 2);
592 binary.Write (rawdata, 0, rawdata.Length);
595 using (FileStream fs = File.Create ("../collation.tailoring.bin")) {
596 byte [] array = ms.ToArray ();
597 fs.Write (array, 0, array.Length);
// Builds the paths of the input files under dirname and parses them in a
// fixed order. The tailoring source is the exception: it is read from
// "mono-tailoring-source.txt" without the dirname prefix.
// NOTE(review): several of the declarations that receive the path
// expressions below are not visible in this listing.
604 void ParseSources (string dirname)
607 dirname + "/UnicodeData.txt";
608 string derivedCoreProps =
609 dirname + "/DerivedCoreProperties.txt";
611 dirname + "/Scripts.txt";
613 dirname + "/CP932.TXT";
615 dirname + "/DerivedAge.txt";
616 string chXML = dirname + "/common/collation/zh.xml";
617 string jaXML = dirname + "/common/collation/ja.xml";
618 string koXML = dirname + "/common/collation/ko.xml";
620 ParseDerivedAge (derivedAge);
// The JIS table must be loaded first: ProcessUnidataLine calls ExistsJIS
// while reading UnicodeData.
624 ParseJISOrder (cp932); // in prior to ParseUnidata()
625 ParseUnidata (unidata);
626 ParseDerivedCoreProperties (derivedCoreProps);
627 ParseScripts (scripts);
628 ParseCJK (chXML, jaXML, koXML);
630 ParseTailorings ("mono-tailoring-source.txt");
// Reads the tailoring definition file and feeds each trimmed line to
// ProcessTailoringLine, which updates the current Tailoring through `t`.
// When an exception occurs, the value of `line` is reported on standard
// error.
// NOTE(review): the declarations of `t` and `line`, and what follows the
// error message (rethrow or continue), are not visible in this listing.
633 void ParseTailorings (string filename)
637 using (StreamReader sr = new StreamReader (filename)) {
639 while (sr.Peek () >= 0) {
641 ProcessTailoringLine (ref t,
642 sr.ReadLine ().Trim ());
644 } catch (Exception) {
645 Console.Error.WriteLine ("ERROR at line {0}", line);
651 // For now this is enough.
// Converts a tailoring source token into a string; a "\uXXXX" escape is
// turned into the character with that hexadecimal code.
// Returns the text accumulated in the StringBuilder.
// NOTE(review): the visible test is s.StartsWith ("\\u") and the digits are
// read with s.Substring (2, 4); neither uses the loop index i, so both look
// only at the start of s. Part of the loop body is missing from this
// listing -- confirm how escapes that are not at the start are handled.
652 string ParseTailoringSourceValue (string s)
654 StringBuilder sb = new StringBuilder ();
655 for (int i = 0; i < s.Length; i++) {
656 if (s.StartsWith ("\\u")) {
657 sb.Append ((char) int.Parse (
658 s.Substring (2, 4), NumberStyles.HexNumber),
665 return sb.ToString ();
// Interprets one line of the tailoring source file and applies it to `t`
// (or replaces `t` with a new Tailoring when the line starts a new one).
//   t - tailoring currently being built; passed by ref so it can be replaced
//   s - the line, already trimmed by the caller
// NOTE(review): most of the conditions that select between the forms below
// are not visible in this listing; the descriptions follow the visible
// calls only.
668 void ProcessTailoringLine (ref Tailoring t, string s)
// Drop a trailing '#' comment, then ignore empty and comment-only lines.
670 int idx = s.IndexOf ('#');
672 s = s.Substring (0, idx).Trim ();
673 if (s.Length == 0 || s [0] == '#')
// Tailoring header: the first character is skipped and the rest is parsed
// as one number, or as two numbers separated by '='.
676 idx = s.IndexOf ('=');
679 int.Parse (s.Substring (1, idx - 1)),
680 int.Parse (s.Substring (idx + 1)));
682 t = new Tailoring (int.Parse (s.Substring (1)));
686 if (s.StartsWith ("*FrenchSort")) {
// "*Diacritical XX -> YY": both sides are hexadecimal byte values.
690 string d = "*Diacritical";
691 if (s.StartsWith (d)) {
692 idx = s.IndexOf ("->");
693 t.AddDiacriticalMap (
694 byte.Parse (s.Substring (d.Length, idx - d.Length).Trim (),
695 NumberStyles.HexNumber),
696 byte.Parse (s.Substring (idx + 2).Trim (),
697 NumberStyles.HexNumber));
// "source : b0 b1 b2 b3": a sort key mapping; up to four space-separated
// hexadecimal bytes follow the colon.
700 idx = s.IndexOf (':');
702 string source = s.Substring (0, idx).Trim ();
703 string [] l = s.Substring (idx + 1).Trim ().Split (' ');
704 byte [] b = new byte [4];
705 for (int i = 0; i < 4; i++) {
709 b [i] = byte.Parse (l [i],
710 NumberStyles.HexNumber);
712 t.AddSortKeyMap (ParseTailoringSourceValue (source),
// "source = replacement": both sides go through ParseTailoringSourceValue.
715 idx = s.IndexOf ('=');
717 t.AddReplacementMap (
718 ParseTailoringSourceValue (
719 s.Substring (0, idx).Trim ()),
720 ParseTailoringSourceValue (
721 s.Substring (idx + 1).Trim ()));
// Reads DerivedAge.txt. Each data line has the form "cp ; value" or
// "cp..cpEnd ; value" with an optional '#' comment; the value is parsed as
// a double for the code point range.
// NOTE(review): the loop body that stores the value and the statements
// guarded by the visible tests are not present in this listing.
724 void ParseDerivedAge (string filename)
726 using (StreamReader file =
727 new StreamReader (filename)) {
728 while (file.Peek () >= 0) {
729 string s = file.ReadLine ();
730 int idx = s.IndexOf ('#');
732 s = s.Substring (0, idx);
733 idx = s.IndexOf (';');
// cpspec is either a single hex code point or a "first..last" range.
737 string cpspec = s.Substring (0, idx);
738 idx = cpspec.IndexOf ("..");
739 NumberStyles nf = NumberStyles.HexNumber |
740 NumberStyles.AllowTrailingWhite;
741 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
742 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
743 string value = s.Substring (cpspec.Length + 1).Trim ();
// The tables in this class only cover the char range (0..char.MaxValue).
746 if (cp > char.MaxValue)
749 double v = double.Parse (value);
750 for (int i = cp; i <= cpEnd; i++)
754 unicodeAge [0] = double.MaxValue; // never be supported
// Reads UnicodeData.txt line by line through ProcessUnidataLine, collecting
// decomposition values in a local list that is finally stored in the
// decompValues field as an int array. The number of a failing line is
// reported on standard error.
// NOTE(review): what follows the error message (rethrow or continue) is not
// visible in this listing.
757 void ParseUnidata (string filename)
759 ArrayList decompValues = new ArrayList ();
760 using (StreamReader unidata =
761 new StreamReader (filename)) {
762 for (int line = 1; unidata.Peek () >= 0; line++) {
764 ProcessUnidataLine (unidata.ReadLine (), decompValues);
765 } catch (Exception) {
766 Console.Error.WriteLine ("**** At line " + line);
771 this.decompValues = (int [])
772 decompValues.ToArray (typeof (int));
// Processes one line of UnicodeData.txt and records, for its code point:
// small-capital flag, Latin base letter, arrow and box-drawing values,
// sortable names, diacritical weight, Cyrillic/Arabic primary values,
// non-JIS Japanese square letters, decomposition type and values, and the
// numeric value.
//   s            - the raw line
//   decompValues - shared list that receives the decomposed code points
// NOTE(review): many short lines (conditions, else branches, case labels,
// assignments) are missing from this listing; the comments below describe
// only what the visible statements establish.
775 void ProcessUnidataLine (string s, ArrayList decompValues)
777 int idx = s.IndexOf ('#');
779 s = s.Substring (0, idx);
780 idx = s.IndexOf (';');
// First field: code point in hex. `values` holds the remaining fields, so
// values [0] is the name, values [4] the decomposition, and values [5..7]
// the numeric fields.
783 int cp = int.Parse (s.Substring (0, idx), NumberStyles.HexNumber);
784 string [] values = s.Substring (idx + 1).Split (';');
787 if (cp > char.MaxValue)
789 if (IsIgnorable (cp))
792 string name = values [0];
794 // SPECIAL CASE: rename some characters for diacritical
795 // remapping. FIXME: why are they different?
796 // FIXME: it's still not working.
797 if (cp == 0x018B || cp == 0x018C)
798 name = name.Replace ("TOPBAR", "STROKE");
// Note that the searches below run on the whole line `s`, not on `name`.
801 if (s.IndexOf ("SMALL CAPITAL") > 0)
802 isSmallCapital [cp] = true;
804 // latin mapping by character name
805 if (s.IndexOf ("LATIN") >= 0) {
// offset is meant to point at the first character after the matched
// "LETTER ..." prefix; 15 is the length of "LETTER DOTLESS ".
806 int lidx = s.IndexOf ("LETTER DOTLESS ");
807 int offset = lidx + 15;
809 lidx = s.IndexOf ("LETTER TURNED ");
813 lidx = s.IndexOf ("LETTER ");
// c is the candidate base letter, n the character that follows it.
816 char c = lidx > 0 ? s [offset] : char.MinValue;
817 char n = s [offset + 1];
818 char target = char.MinValue;
// NOTE(review): && binds tighter than ||, so this condition is also true
// whenever n == ';' regardless of c; presumably the intent was
// ('A' <= c && c <= 'Z') && (n == ' ' || n == ';') -- confirm.
819 if ('A' <= c && c <= 'Z' &&
820 (n == ' ') || n == ';')
822 // FIXME: they are still not working fine.
823 if (s.Substring (offset).StartsWith ("OI;")) // 01A2,01A3
825 if (s.Substring (offset).StartsWith ("ALPHA"))
827 if (s.Substring (offset).StartsWith ("SCHWA"))
// Entries are grouped per base letter; the list is created on first use.
829 if (target != char.MinValue) {
830 ArrayList entry = (ArrayList) latinMap [target];
832 entry = new ArrayList ();
833 latinMap [target] = entry;
// Arrow values for code points in U+2000..U+2FFF: a few hard-coded special
// cases first, then a match of the line against the arrowTargets names.
840 if (0x2000 <= cp && cp < 0x3000) {
842 // SPECIAL CASES. FIXME: why?
844 case 0x21C5: value = -1; break; // E2
845 case 0x261D: value = 1; break;
846 case 0x27A6: value = 3; break;
847 case 0x21B0: value = 7; break;
848 case 0x21B1: value = 3; break;
849 case 0x21B2: value = 7; break;
850 case 0x21B4: value = 5; break;
851 case 0x21B5: value = 7; break;
852 case 0x21B9: value = -1; break; // E1
853 case 0x21CF: value = 7; break;
854 case 0x21D0: value = 3; break;
856 string [] arrowTargets = new string [] {
// The search starts at index 1 and only runs while value is still 0; names
// preceded by "BARB " and lines containing " OVER" are excluded.
868 for (int i = 1; value == 0 && i < arrowTargets.Length; i++)
869 if (s.IndexOf (arrowTargets [i]) > 0 &&
870 s.IndexOf ("BARB " + arrowTargets [i]) < 0 &&
871 s.IndexOf (" OVER") < 0
875 arrowValues.Add (new DictionaryEntry (
// Box drawing and block elements, U+2500..U+25AF.
880 if (0x2500 <= cp && cp < 0x25B0) {
883 // up:1 down:2 right:4 left:8 vert:16 horiz:32
886 // [dr] [dl] [ur] [ul]
// flags lists known combinations of the direction bits above; the position
// of a combination in this list selects the entry of `offsets` to use.
890 ArrayList flags = new ArrayList (new int [] {
893 4 + 2, 8 + 2, 4 + 1, 8 + 1,
894 16 + 4, 1 + 2 + 4, 16 + 8, 1 + 2 + 8,
895 32 + 2, 4 + 8 + 2, 32 + 1, 4 + 8 + 1,
896 16 + 32, 1 + 2 + 4 + 8, 4 + 8 + 16, 1 + 2 + 32
898 byte [] offsets = new byte [] {
905 if (s.IndexOf ("BOX DRAWINGS ") > 0) {
907 if (s.IndexOf (" UP") > 0)
909 if (s.IndexOf (" DOWN") > 0)
911 if (s.IndexOf (" RIGHT") > 0)
913 if (s.IndexOf (" LEFT") > 0)
915 if (s.IndexOf (" VERTICAL") > 0)
917 if (s.IndexOf (" HORIZONTAL") > 0)
// An unknown combination keeps the negative index returned by IndexOf.
920 int fidx = flags.IndexOf (flag);
921 value = fidx < 0 ? fidx : offsets [fidx];
922 } else if (s.IndexOf ("BLOCK") > 0) {
923 if (s.IndexOf ("ONE EIGHTH") > 0)
925 else if (s.IndexOf ("ONE QUARTER") > 0)
927 else if (s.IndexOf ("THREE EIGHTHS") > 0)
929 else if (s.IndexOf ("HALF") > 0)
931 else if (s.IndexOf ("FIVE EIGHTHS") > 0)
933 else if (s.IndexOf ("THREE QUARTERS") > 0)
935 else if (s.IndexOf ("SEVEN EIGHTHS") > 0)
941 boxValues.Add (new DictionaryEntry (
945 // For some characters store the name and sort later
946 // to determine sorting.
947 if (0x2100 <= cp && cp <= 0x213F &&
948 Char.IsSymbol ((char) cp))
949 sortableCharNames.Add (
950 new DictionaryEntry (cp, name));
// For U+3380..U+33DD the first seven characters of the name are dropped
// (presumably a common "SQUARE " prefix -- confirm against the data).
951 else if (0x3380 <= cp && cp <= 0x33DD)
952 sortableCharNames.Add (new DictionaryEntry (
953 cp, name.Substring (7)));
955 // diacritical weights by character name
// The two tables are index-aligned, so a length mismatch is a programming
// error in this file.
956 if (diacritics.Length != diacriticWeights.Length)
957 throw new Exception (String.Format ("Should not happen. weights are {0} while labels are {1}", diacriticWeights.Length, diacritics.Length));
958 for (int d = 0; d < diacritics.Length; d++) {
959 if (s.IndexOf (diacritics [d]) > 0) {
960 diacritical [cp] |= diacriticWeights [d];
963 // also process "COMBINING blah" here
964 // For now it is limited to cp < 0x0370
965 // if (cp < 0x0300 || cp >= 0x0370)
// Build the matching "COMBINING xxx" name: Substring (4) keeps the space
// that follows "WITH", so the separator is only added for fragments that
// do not already start with a space.
967 string tmp = diacritics [d].TrimEnd (';');
968 if (tmp.IndexOf ("WITH ") == 0)
969 tmp = tmp.Substring (4);
970 tmp = String.Concat ("COMBINING", (tmp [0] != ' ' ? " " : ""), tmp);
// Here the weight is assigned (not OR-ed) and is two less than the table
// value.
972 diacritical [cp] = (byte) (diacriticWeights [d] - 2);
974 //Console.Error.WriteLine ("======= {2:X04} : '{0}' / '{1}'", name, tmp, cp);
976 // Two-step grep required for it.
977 if (s.IndexOf ("FULL STOP") > 0 &&
978 (s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0))
979 diacritical [cp] |= 0xF4;
981 // Cyrillic letter name
982 if (0x0430 <= cp && cp <= 0x0486 &&
983 Char.IsLetter ((char) cp)) {
// Default value derived from the number of distinct letter names seen so
// far; a letter whose primary name is already known reuses the value
// stored for the code point recorded under that name.
984 byte value = (byte) (cyrillicNameMap.Count * 3 + 0x06);
985 // Get primary letter name i.e.
986 // XXX part of CYRILLIC LETTER XXX yyy
987 // e.g. "IZHITSA" for "IZHITSA DOUBLE GRAVE".
989 name.Substring (name.IndexOf ("LETTER ") + 7);
990 int tmpIdx = letterName.IndexOf (' ');
991 letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
992 //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
993 if (cyrillicNameMap.ContainsKey (letterName))
994 value = (byte) cyrillicLetterPrimaryValues [cyrillicNameMap [letterName]];
996 cyrillicNameMap [letterName] = cp;
998 cyrillicLetterPrimaryValues [cp] = value;
1001 // Arabic letter name
1002 if (0x0621 <= cp && cp <= 0x064A &&
1003 Char.GetUnicodeCategory ((char) cp)
1004 == UnicodeCategory.OtherLetter) {
// Same scheme as for Cyrillic, with a step of 4 and a base of 0x0B.
1005 byte value = (byte) (arabicNameMap.Count * 4 + 0x0B);
1010 // hamza, waw, yeh ... special cases.
1015 value = 0x77; // special cases.
1018 // Get primary letter name i.e.
1019 // XXX part of ARABIC LETTER XXX yyy
1020 // e.g. that of "TEH MARBUTA" is "TEH".
1023 // 0x0640 is special: it does
1024 // not start with ARABIC LETTER
1026 name.Substring (14);
1027 int tmpIdx = letterName.IndexOf (' ');
1028 letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
1029 //Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
1030 if (arabicNameMap.ContainsKey (letterName))
1031 value = (byte) arabicLetterPrimaryValues [arabicNameMap [letterName]];
1033 arabicNameMap [letterName] = cp;
1036 arabicLetterPrimaryValues [cp] = value;
1039 // Japanese square letter
1040 if (0x3300 <= cp && cp <= 0x3357)
1041 if (!ExistsJIS (cp))
1042 nonJisJapanese.Add (new NonJISCharacter (cp, name));
1044 // normalizationType
1045 string decomp = values [4];
1046 idx = decomp.IndexOf ('<');
// NOTE(review): the length argument IndexOf ('>') - 1 equals the tag
// length only when '<' is at index 0 of the field -- confirm that the
// field never has leading characters before '<'.
1048 switch (decomp.Substring (idx + 1, decomp.IndexOf ('>') - 1)) {
1050 decompType [cp] = DecompositionFull;
1053 decompType [cp] = DecompositionSub;
1056 decompType [cp] = DecompositionSuper;
1059 decompType [cp] = DecompositionSmall;
1062 decompType [cp] = DecompositionIsolated;
1065 decompType [cp] = DecompositionInitial;
1068 decompType [cp] = DecompositionFinal;
1071 decompType [cp] = DecompositionMedial;
1074 decompType [cp] = DecompositionNoBreak;
1077 decompType [cp] = DecompositionCompat;
1080 decompType [cp] = DecompositionFraction;
1083 decompType [cp] = DecompositionFont;
1086 decompType [cp] = DecompositionCircle;
1089 decompType [cp] = DecompositionSquare;
1092 decompType [cp] = DecompositionWide;
1095 decompType [cp] = DecompositionNarrow;
1098 decompType [cp] = DecompositionVertical;
1101 throw new Exception ("Support NFKD type : " + decomp);
1105 decompType [cp] = DecompositionCanonical;
// Strip the "<tag> " prefix (when present), leaving only the hex values.
1106 decomp = idx < 0 ? decomp : decomp.Substring (decomp.IndexOf ('>') + 2);
1107 if (decomp.Length > 0) {
// Append the decomposed values to the shared list and remember where this
// code point's run starts and how long it is.
1109 string [] velems = decomp.Split (' ');
1110 int didx = decompValues.Count;
1111 decompIndex [cp] = didx;
1112 foreach (string v in velems)
1113 decompValues.Add (int.Parse (v, NumberStyles.HexNumber));
1114 decompLength [cp] = velems.Length;
1116 // [decmpType] -> this_cp
1117 int targetCP = (int) decompValues [didx];
1118 // for "(x)" it specially maps to 'x' .
1119 // FIXME: check if it is sane
1120 if (velems.Length == 3 &&
1121 (int) decompValues [didx] == '(' &&
1122 (int) decompValues [didx + 2] == ')')
1123 targetCP = (int) decompValues [didx + 1];
1124 // special: 0x215F "1/"
1125 else if (cp == 0x215F)
1127 else if (velems.Length > 1 &&
1128 (targetCP < 0x4C00 || 0x9FBB < targetCP))
1129 // skip them, except for CJK ideograph compat
// Reverse map: decomposition target -> (decomposition type -> source cp).
1132 if (targetCP != 0) {
1133 Hashtable entry = (Hashtable) nfkdMap [targetCP];
1134 if (entry == null) {
1135 entry = new Hashtable ();
1136 nfkdMap [targetCP] = entry;
1138 entry [(byte) decompType [cp]] = cp;
// Numeric value: the first non-empty field among values [5], [6] and [7]
// is used. For values [7], the visible branches handle U+215F, "a/b"
// fractions, "(n)" and "n." forms before the plain decimal.Parse fallback.
1142 if (values [5].Length > 0)
1143 decimalValue [cp] = decimal.Parse (values [5]);
1144 else if (values [6].Length > 0)
1145 decimalValue [cp] = decimal.Parse (values [6]);
1146 else if (values [7].Length > 0) {
1147 string decstr = values [7];
1148 idx = decstr.IndexOf ('/');
1149 if (cp == 0x215F) // special. "1/"
1150 decimalValue [cp] = 0x1;
1154 decimal.Parse (decstr.Substring (0, idx))
1155 / decimal.Parse (decstr.Substring (idx + 1));
1156 else if (decstr [0] == '(' &&
1157 decstr [decstr.Length - 1] == ')')
1160 decimal.Parse (decstr.Substring (1, decstr.Length - 2));
1161 else if (decstr [decstr.Length - 1] == '.')
1164 decimal.Parse (decstr.Substring (0, decstr.Length - 1));
1166 decimalValue [cp] = decimal.Parse (decstr);
// Reads DerivedCoreProperties.txt line by line through
// ProcessDerivedCorePropLine; the number of a failing line is reported on
// standard error.
// NOTE(review): what follows the error message (rethrow or continue) is not
// visible in this listing.
1170 void ParseDerivedCoreProperties (string filename)
1173 using (StreamReader file =
1174 new StreamReader (filename)) {
1175 for (int line = 1; file.Peek () >= 0; line++) {
1177 ProcessDerivedCorePropLine (file.ReadLine ());
1178 } catch (Exception) {
1179 Console.Error.WriteLine ("**** At line " + line);
// Processes one line of DerivedCoreProperties.txt ("cp ; property" or
// "cp..cpEnd ; property", optional '#' comment) and marks the code points
// of the range in isUppercase.
// NOTE(review): the test on `value` that selects which property triggers
// the marking is not visible in this listing.
1186 void ProcessDerivedCorePropLine (string s)
1188 int idx = s.IndexOf ('#');
1190 s = s.Substring (0, idx);
1191 idx = s.IndexOf (';');
// cpspec is either a single hex code point or a "first..last" range.
1194 string cpspec = s.Substring (0, idx);
1195 idx = cpspec.IndexOf ("..");
1196 NumberStyles nf = NumberStyles.HexNumber |
1197 NumberStyles.AllowTrailingWhite;
1198 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1199 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
1200 string value = s.Substring (cpspec.Length + 1).Trim ();
1203 if (cp > char.MaxValue)
1208 for (int x = cp; x <= cpEnd; x++)
1209 isUppercase [x] = true;
// Reads Scripts.txt and collects the non-ignorable characters of five
// scripts into lists, sorts each list with UCAComparer.Instance, and stores
// the results in the orderedXxx fields.
// NOTE(review): the labels that route a line's `value` to one of the five
// lists are not visible in this listing.
1214 void ParseScripts (string filename)
1216 ArrayList cyrillic = new ArrayList ();
1217 ArrayList gurmukhi = new ArrayList ();
1218 ArrayList gujarati = new ArrayList ();
1219 ArrayList georgian = new ArrayList ();
1220 ArrayList thaana = new ArrayList ();
1222 using (StreamReader file =
1223 new StreamReader (filename)) {
1224 while (file.Peek () >= 0) {
1225 string s = file.ReadLine ();
1226 int idx = s.IndexOf ('#');
1228 s = s.Substring (0, idx);
1229 idx = s.IndexOf (';');
// cpspec is either a single hex code point or a "first..last" range.
1233 string cpspec = s.Substring (0, idx);
1234 idx = cpspec.IndexOf ("..");
1235 NumberStyles nf = NumberStyles.HexNumber |
1236 NumberStyles.AllowTrailingWhite;
1237 int cp = int.Parse (idx < 0 ? cpspec : cpspec.Substring (0, idx), nf);
1238 int cpEnd = idx < 0 ? cp : int.Parse (cpspec.Substring (idx + 2), nf);
1239 string value = s.Substring (cpspec.Length + 1).Trim ();
1242 if (cp > char.MaxValue)
// Each loop below adds the range to one script's list, skipping code
// points that IsIgnorable reports as ignorable.
1247 for (int x = cp; x <= cpEnd; x++)
1248 if (!IsIgnorable (x))
1249 cyrillic.Add ((char) x);
1252 for (int x = cp; x <= cpEnd; x++)
1253 if (!IsIgnorable (x))
1254 gurmukhi.Add ((char) x);
1257 for (int x = cp; x <= cpEnd; x++)
1258 if (!IsIgnorable (x))
1259 gujarati.Add ((char) x);
1262 for (int x = cp; x <= cpEnd; x++)
1263 if (!IsIgnorable (x))
1264 georgian.Add ((char) x);
1267 for (int x = cp; x <= cpEnd; x++)
1268 if (!IsIgnorable (x))
1269 thaana.Add ((char) x);
// Order every script's characters with the shared comparer, then freeze
// the lists into arrays.
1274 cyrillic.Sort (UCAComparer.Instance);
1275 gurmukhi.Sort (UCAComparer.Instance);
1276 gujarati.Sort (UCAComparer.Instance);
1277 georgian.Sort (UCAComparer.Instance);
1278 thaana.Sort (UCAComparer.Instance);
1279 orderedCyrillic = (char []) cyrillic.ToArray (typeof (char));
1280 orderedGurmukhi = (char []) gurmukhi.ToArray (typeof (char));
1281 orderedGujarati = (char []) gujarati.ToArray (typeof (char));
1282 orderedGeorgian = (char []) georgian.ToArray (typeof (char));
1283 orderedThaana = (char []) thaana.ToArray (typeof (char));
// Passes every line of `filename` to ProcessJISOrderLine(), advancing the
// `line` counter once per line read. When an exception is caught, the counter
// is written to standard error as "---- line N".
//
// NOTE(review): what happens after the message is written (e.g. whether the
// exception is rethrown) is not visible here -- confirm.
1286 void ParseJISOrder (string filename)
1290 using (StreamReader file =
1291 new StreamReader (filename)) {
1292 for (;file.Peek () >= 0; line++)
1293 ProcessJISOrderLine (file.ReadLine ());
1295 } catch (Exception) {
1296 Console.Error.WriteLine ("---- line {0}", line);
// Field separators (tab and space) handed to IndexOfAny() by
// ProcessJISOrderLine().
1301 char [] ws = new char [] {'\t', ' '};
// Parses one input line and appends the result to jisJapanese.
//
// The line is cut at '#' and trimmed, then split at the first tab or space.
// The first field is parsed into `jis` and the rest into `cp`; both are
// "0x"-prefixed hexadecimal numbers. The pair is stored as
// new JISCharacter (cp, jis).
//
// NOTE(review): any guards for lines without '#' or without a separator are
// not visible here -- confirm before relying on behaviour for such lines.
1303 void ProcessJISOrderLine (string s)
1305 int idx = s.IndexOf ('#');
1307 s = s.Substring (0, idx).Trim ();
1310 idx = s.IndexOfAny (ws);
1313 // They start with "0x" so cut them out.
// First field: skip the two-character "0x"; idx - 2 is the digit count.
1314 int jis = int.Parse (s.Substring (2, idx - 2), NumberStyles.HexNumber);
// Second field: text from the separator on, trimmed, again without "0x".
1315 int cp = int.Parse (s.Substring (idx).Trim ().Substring (2), NumberStyles.HexNumber);
1316 jisJapanese.Add (new JISCharacter (cp, jis));
// Fills CJK weight tables from collation rules held in XML documents.
//
// For the first three sections the text of a `pc` element is walked character
// by character and each character is assigned the next value of a running
// counter (arr [c - offset] = v++). The Korean section instead walks every
// `reset` element and assigns a value derived from the reset character.
//
// NOTE(review): the statements that load `doc` and that set up `arr`, `v`,
// `p`, `category` and the warning conditions are not visible here; presumably
// zhXML, jaXML and koXML are loaded in turn -- confirm.
1319 void ParseCJK (string zhXML, string jaXML, string koXML)
1321 XmlDocument doc = new XmlDocument ();
// No resolver: external resources referenced by the XML are not fetched.
1322 doc.XmlResolver = null;
1329 // Chinese Simplified
1332 offset = 0;//char.MaxValue - arr.Length;
// Characters listed under the 'pinyin' collation, in document order.
1334 s = doc.SelectSingleNode ("/ldml/collations/collation[@type='pinyin']/rules/pc").InnerText;
1336 foreach (char c in s) {
1338 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1340 arr [(int) c - offset] = (ushort) v++;
1346 // Chinese Traditional
1349 offset = 0;//char.MaxValue - arr.Length;
// Characters listed under the 'stroke' collation, in document order.
1350 s = doc.SelectSingleNode ("/ldml/collations/collation[@type='stroke']/rules/pc").InnerText;
1352 foreach (char c in s) {
1354 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1356 arr [(int) c - offset] = (ushort) v++;
// NOTE(review): presumably the Japanese table (jaXML); the section header is
// not visible here. The XPath has no type filter, so SelectSingleNode returns
// the first matching node.
1365 offset = 0;//char.MaxValue - arr.Length;
1367 s = doc.SelectSingleNode ("/ldml/collations/collation/rules/pc").InnerText;
1369 foreach (char c in s) {
1371 Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v);
1373 arr [(int) c - offset] = (ushort) v++;
1380 // Korean weight is somewhat complex. It first shifts
1381 // Hangul category from 52-x to 80-x (they are anyways
1382 // computed). CJK ideographs are placed at secondary
1383 // weight, like XX YY 01 zz 01, where XX and YY are
1384 // corresponding "reset" value and zz is 41,43,45...
1386 // Unlike chs,cht and ja, Korean value is a combined
1387 // ushort which is computed as category
1391 offset = 0;//char.MaxValue - arr.Length;
1393 foreach (XmlElement reset in doc.SelectNodes ("/ldml/collations/collation/rules/reset")) {
// The element that follows each <reset> is taken as `sc`.
1394 XmlElement sc = (XmlElement) reset.NextSibling;
1395 // compute "category" and "level 1" for the
1396 // target "reset" Hangul syllable
1397 char rc = reset.InnerText [0];
// 1-based distance of the reset character from U+AC00.
1398 int ri = ((int) rc - 0xAC00) + 1;
1400 ((ri / 254) * 256 + (ri % 254) + 2);
1401 // Place the characters after the target.
1404 foreach (char c in s) {
// `p` goes to the main table; `v`, truncated to a byte, goes to cjkKOlv2.
1405 arr [(int) c - offset] = p;
1406 cjkKOlv2 [(int) c - offset] = (byte) v;
// Populates ignorableFlags for every value from 0 to char.MaxValue.
// Bits OR-ed into each entry:
//   1 - IsIgnorable (i) is true
//   2 - IsIgnorableSymbol (i) is true
//   4 - IsIgnorableNonSpacing (i) is true
//
// NOTE(review): the statement guarded by the OtherNotAssigned test is not
// visible here; presumably unassigned code points are skipped -- confirm.
1416 void FillIgnorables ()
1418 for (int i = 0; i <= char.MaxValue; i++) {
1419 if (Char.GetUnicodeCategory ((char) i) ==
1420 UnicodeCategory.OtherNotAssigned)
1422 if (IsIgnorable (i))
1423 ignorableFlags [i] |= 1;
1424 if (IsIgnorableSymbol (i))
1425 ignorableFlags [i] |= 2;
1426 if (IsIgnorableNonSpacing (i))
1427 ignorableFlags [i] |= 4;
// Applies hard-coded adjustments to diacritical[], to the decomposition
// tables (decompType / decompIndex / decompLength) and to sortableCharNames.
1431 void ModifyParsedValues ()
1433 // number, secondary weights
// numberSecondaryWeightBounds is read as consecutive [start, end) pairs.
// Each pair uses the next `weight` value, and only characters for which
// Char.IsNumber() is true receive it.
// NOTE(review): the initial value of `weight` is not visible here.
1435 int [] numarr = numberSecondaryWeightBounds;
1436 for (int i = 0; i < numarr.Length; i += 2, weight++)
1437 for (int cp = numarr [i]; cp < numarr [i + 1]; cp++)
1438 if (Char.IsNumber ((char) cp))
1439 diacritical [cp] = weight;
1441 // Modify some decomposition equivalence
// Zero out all three decomposition entries for U+FE31 and U+FE32.
1442 decompType [0xFE31] = 0;
1443 decompIndex [0xFE31] = 0;
1444 decompLength [0xFE31] = 0;
1445 decompType [0xFE32] = 0;
1446 decompIndex [0xFE32] = 0;
1447 decompLength [0xFE32] = 0;
1449 // Korean parens numbers
1450 for (int i = 0x3200; i <= 0x321C; i++)
1451 diacritical [i] = 0xA;
1452 for (int i = 0x3260; i <= 0x327B; i++)
1453 diacritical [i] = 0xC;
1455 // Update name part of named characters
// Entries are DictionaryEntry values keyed by code point; a renamed entry is
// written back as a new DictionaryEntry at the same index.
// NOTE(review): the switch header and the cases that lead to RemoveAt() are
// not visible here. Confirm that the loop index is adjusted after
// RemoveAt (i) so the entry that moves into slot i is not skipped.
1456 for (int i = 0; i < sortableCharNames.Count; i++) {
1457 DictionaryEntry de =
1458 (DictionaryEntry) sortableCharNames [i];
1459 int cp = (int) de.Key;
1460 string renamed = null;
1462 case 0x2101: renamed = "A_1"; break;
1463 case 0x33C3: renamed = "A_2"; break;
1464 case 0x2105: renamed = "C_1"; break;
1465 case 0x2106: renamed = "C_2"; break;
1466 case 0x211E: renamed = "R1"; break;
1467 case 0x211F: renamed = "R2"; break;
1468 // Remove some of them!
1479 sortableCharNames.RemoveAt (i);
1483 if (renamed != null)
1484 sortableCharNames [i] =
1485 new DictionaryEntry (cp, renamed);
1489 void GenerateCore ()
1493 #region Specially ignored // 01
1494 // This will raise "Defined" flag up.
1495 foreach (char c in specialIgnore)
1496 map [(int) c] = new CharMapEntry (0, 0, 0);
1500 #region Variable weights
1501 // Controls : 06 03 - 06 3D
1503 for (int i = 0; i < 65536; i++) {
1504 if (IsIgnorable (i))
1507 uc = Char.GetUnicodeCategory (c);
1508 // NEL is whitespace but not ignored here.
1509 if (uc == UnicodeCategory.Control &&
1510 !Char.IsWhiteSpace (c) || c == '\u0085')
1511 AddCharMap (c, 6, 1);
1515 fillIndex [6] = 0x80;
1516 AddCharMapGroup ('\'', 6, 1, 0);
1517 AddCharMap ('\uFE63', 6, 1);
1519 // Hyphen/Dash : 06 81 - 06 90
1520 for (int i = 0; i < char.MaxValue; i++) {
1521 if (!IsIgnorable (i) &&
1522 Char.GetUnicodeCategory ((char) i) ==
1523 UnicodeCategory.DashPunctuation) {
1524 AddCharMapGroup2 ((char) i, 6, 1, 0);
1526 // SPECIAL: add 2027 and 2043
1527 // Maybe they are regarded the
1528 // same hyphens in "central"
1530 AddCharMap ('\u2027', 6, 1);
1531 AddCharMap ('\u2043', 6, 1);
1536 // Arabic variable weight chars 06 A0 -
1537 fillIndex [6] = 0xA0;
1539 for (int i = 0x64B; i <= 0x650; i++)
1540 AddArabicCharMap ((char) i);
1542 AddCharMapGroup ('\u0652', 6, 1, 0);
1544 AddCharMapGroup ('\u0651', 6, 1, 0);
1548 #region Nonspacing marks // 01
1549 // FIXME: 01 03 - 01 B6 ... annoyance :(
1551 // Combining diacritical marks: 01 DC -
1553 fillIndex [0x1] = 0x41;
1554 for (int i = 0x030E; i <= 0x0326; i++)
1555 if (!IsIgnorable (i))
1556 AddCharMap ((char) i, 0x1, 1);
1557 for (int i = 0x0329; i <= 0x0334; i++)
1558 if (!IsIgnorable (i))
1559 AddCharMap ((char) i, 0x1, 1);
1560 for (int i = 0x0339; i <= 0x0341; i++)
1561 if (!IsIgnorable (i))
1562 AddCharMap ((char) i, 0x1, 1);
1563 fillIndex [0x1] = 0x72;
1564 for (int i = 0x0346; i <= 0x0348; i++)
1565 if (!IsIgnorable (i))
1566 AddCharMap ((char) i, 0x1, 1);
1567 for (int i = 0x02BE; i <= 0x02BF; i++)
1568 if (!IsIgnorable (i))
1569 AddCharMap ((char) i, 0x1, 1);
1570 for (int i = 0x02C1; i <= 0x02C5; i++)
1571 if (!IsIgnorable (i))
1572 AddCharMap ((char) i, 0x1, 1);
1573 for (int i = 0x02CE; i <= 0x02CF; i++)
1574 if (!IsIgnorable (i))
1575 AddCharMap ((char) i, 0x1, 1);
1576 for (int i = 0x02D1; i <= 0x02D3; i++)
1577 if (!IsIgnorable (i))
1578 AddCharMap ((char) i, 0x1, 1);
1579 AddCharMap ('\u02DE', 0x1, 1);
1580 for (int i = 0x02E4; i <= 0x02E9; i++)
1581 if (!IsIgnorable (i))
1582 AddCharMap ((char) i, 0x1, 1);
1584 // FIXME: needs more love here (it should eliminate
1585 // all the hacky code above).
1586 for (int i = 0x0300; i < 0x0370; i++)
1587 if (!IsIgnorable (i) && diacritical [i] != 0
1588 /* especiall here*/ && !map [i].Defined)
1589 map [i] = new CharMapEntry (
1590 0x1, 0x1, diacritical [i]);
1592 // LAMESPEC: It should not stop at '\u20E1'. There are
1593 // a few more characters (that however results in
1594 // overflow of level 2 unless we start before 0xDD).
1595 fillIndex [0x1] = 0xDC;
1596 for (int i = 0x20d0; i <= 0x20e1; i++)
1597 AddCharMap ((char) i, 0x1, 1);
1601 #region Whitespaces // 07 03 -
1602 fillIndex [0x7] = 0x2;
1603 AddCharMap (' ', 0x7, 2);
1604 AddCharMap ('\u00A0', 0x7, 1);
1605 for (int i = 9; i <= 0xD; i++)
1606 AddCharMap ((char) i, 0x7, 1);
1607 for (int i = 0x2000; i <= 0x200B; i++)
1608 AddCharMap ((char) i, 0x7, 1);
1610 fillIndex [0x7] = 0x17;
1611 AddCharMapGroup ('\u2028', 0x7, 1, 0);
1612 AddCharMapGroup ('\u2029', 0x7, 1, 0);
1614 // Characters which used to represent layout control.
1615 // LAMESPEC: Windows developers seem to have thought
1616 // that those characters are kind of whitespaces,
1617 // while they aren't.
1618 AddCharMap ('\u2422', 0x7, 1, 0); // blank symbol
1619 AddCharMap ('\u2423', 0x7, 1, 0); // open box
1622 // category 09 - continued symbols from 08
1623 fillIndex [0x9] = 2;
1625 for (int cp = 0x2300; cp <= 0x237A; cp++)
1626 AddCharMap ((char) cp, 0x9, 1, 0);
1629 byte [] arrowLv2 = new byte [] {0, 3, 3, 3, 3, 3, 3, 3, 3};
1630 foreach (DictionaryEntry de in arrowValues) {
1631 int idx = (int) de.Value;
1632 int cp = (int) de.Key;
1633 if (map [cp].Defined)
1635 fillIndex [0x9] = (byte) (0xD8 + idx);
1636 AddCharMapGroup ((char) cp, 0x9, 0, arrowLv2 [idx]);
1640 byte [] boxLv2 = new byte [128];
1641 for (int i = 0; i < boxLv2.Length; i++)
1643 foreach (DictionaryEntry de in boxValues) {
1644 int cp = (int) de.Key;
1645 int idx = (int) de.Value;
1646 if (map [cp].Defined)
1648 fillIndex [0x9] = (byte) (0xE5 + idx);
1649 AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [idx]);
1652 // Some special characters (slanted)
1653 fillIndex [0x9] = 0xF4;
1654 AddCharMap ('\u2571', 0x9, 3);
1655 AddCharMap ('\u2572', 0x9, 3);
1656 AddCharMap ('\u2573', 0x9, 3);
1658 // FIXME: implement 0A
1660 fillIndex [0xA] = 2;
1661 // byte currency symbols
1662 for (int cp = 0; cp < 0x100; cp++) {
1663 uc = Char.GetUnicodeCategory ((char) cp);
1664 if (!IsIgnorable (cp) &&
1665 uc == UnicodeCategory.CurrencySymbol &&
1667 AddCharMapGroup ((char) cp, 0xA, 1, 0);
1669 // byte other symbols
1670 for (int cp = 0; cp < 0x100; cp++) {
1672 continue; // SPECIAL: skip FIXME: why?
1673 uc = Char.GetUnicodeCategory ((char) cp);
1674 if (!IsIgnorable (cp) &&
1675 uc == UnicodeCategory.OtherSymbol)
1676 AddCharMapGroup ((char) cp, 0xA, 1, 0);
1679 fillIndex [0xA] = 0x2F; // FIXME: it won't be needed
1680 for (int cp = 0x2600; cp <= 0x2613; cp++)
1681 AddCharMap ((char) cp, 0xA, 1, 0);
1683 for (int cp = 0x2620; cp <= 0x2770; cp++)
1684 if (Char.IsSymbol ((char) cp))
1685 AddCharMap ((char) cp, 0xA, 1, 0);
1687 for (int i = 0x2440; i < 0x2460; i++)
1688 AddCharMap ((char) i, 0xA, 1, 0);
1692 #region Numbers // 0C 02 - 0C E1
1693 fillIndex [0xC] = 2;
1695 // 9F8 : Bengali "one less than the denominator"
1696 AddCharMap ('\u09F8', 0xC, 1);
1698 ArrayList numbers = new ArrayList ();
1699 for (int i = 0; i < 65536; i++)
1700 if (!IsIgnorable (i) &&
1701 Char.IsNumber ((char) i) &&
1702 (i < 0x3190 || 0x32C0 < i)) // they are CJK characters
1705 ArrayList numberValues = new ArrayList ();
1706 foreach (int i in numbers)
1707 numberValues.Add (new DictionaryEntry (i, decimalValue [(char) i]));
1708 numberValues.Sort (DecimalDictionaryValueComparer.Instance);
1710 //foreach (DictionaryEntry de in numberValues)
1711 //Console.Error.WriteLine ("****** number {0:X04} : {1} {2}", de.Key, de.Value, decompType [(int) de.Key]);
1713 decimal prevValue = -1;
1714 foreach (DictionaryEntry de in numberValues) {
1715 int cp = (int) de.Key;
1716 decimal currValue = (decimal) de.Value;
1717 bool addnew = false;
1718 if (prevValue < currValue &&
1719 prevValue - (int) prevValue == 0 &&
1723 // Process Hangzhou and Roman numbers
1725 // There are some SPECIAL cases.
1726 if (currValue != 4) // no increment for 4
1730 if (currValue <= 10) {
1731 xcp = (int) prevValue + 0x2170 - 1;
1732 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1733 xcp = (int) prevValue + 0x2160 - 1;
1734 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1735 fillIndex [0xC] += 2;
1736 xcp = (int) prevValue + 0x3021 - 1;
1737 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1740 else if (currValue == 11)
1743 if (prevValue < currValue)
1744 prevValue = currValue;
1745 if (map [cp].Defined)
1747 // HangZhou and Roman are added later
1749 else if (0x3021 <= cp && cp < 0x302A
1750 || 0x2160 <= cp && cp < 0x216A
1751 || 0x2170 <= cp && cp < 0x217A)
1754 if (cp == 0x215B) // FIXME: why?
1755 fillIndex [0xC] += 2;
1756 else if (cp == 0x3021) // FIXME: why?
1758 AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp]);
1759 if (addnew || cp <= '9') {
1760 int mod = (int) currValue - 1;
1762 if (1 <= currValue && currValue <= 10) {
1764 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1766 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1768 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1770 if (1 <= currValue && currValue <= 20) {
1772 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1774 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1776 AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
1780 if (cp != 0x09E7 && cp != 0x09EA)
1783 // Add special cases that are not regarded as
1784 // numbers in UnicodeCategory speak.
1787 AddCharMapGroup ('\u01BD', 0xC, 0, 0);
1788 AddCharMapGroup ('\u01BC', 0xC, 1, 0);
1790 else if (cp == '6') // FIXME: why?
1795 fillIndex [0xC] = 0xFF;
1796 AddCharMap ('\u221E', 0xC, 1);
1799 #region Letters and NonSpacing Marks (general)
1801 // ASCII Latin alphabets
1802 for (int i = 0; i < alphabets.Length; i++)
1803 AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]);
1806 // non-ASCII Latin alphabets
1807 // FIXME: there is no such characters that are placed
1808 // *after* "alphabets" array items. This is nothing
1809 // more than a hack that creates dummy weight for
1810 // primary characters.
1811 for (int i = 0x0080; i < 0x0300; i++) {
1812 if (!Char.IsLetter ((char) i))
1814 // For those Latin Letters which has NFKD are
1815 // not added as independent primary character.
1816 if (decompIndex [i] != 0)
1819 // 1.some alphabets have primarily
1820 // equivalent ASCII alphabets.
1821 // 2.some have independent primary weights,
1822 // but inside a-to-z range.
1823 // 3.there are some expanded characters that
1824 // are not part of Unicode Standard NFKD.
1825 // 4. some characters are letter in IsLetter
1826 // but not in sortkeys (maybe unicode version
1827 // difference caused it).
1829 // 1. skipping them does not make sense
1830 // case 0xD0: case 0xF0: case 0x131: case 0x138:
1831 // case 0x184: case 0x185: case 0x186: case 0x189:
1832 // case 0x18D: case 0x18E: case 0x18F: case 0x190:
1833 // case 0x194: case 0x195: case 0x196: case 0x19A:
1834 // case 0x19B: case 0x19C:
1835 // 2. skipping them does not make sense
1836 // case 0x14A: // Ng
1837 // case 0x14B: // ng
1841 case 0xDE: // Icelandic Thorn
1842 case 0xFE: // Icelandic Thorn
1843 case 0xDF: // German ss
1844 case 0xFF: // German ss
1846 case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3:
1847 // not classified yet
1848 // case 0x1A6: case 0x1A7: case 0x1A8: case 0x1A9:
1849 // case 0x1AA: case 0x1B1: case 0x1B7: case 0x1B8:
1850 // case 0x1B9: case 0x1BA: case 0x1BB: case 0x1BF:
1854 AddCharMapGroup ((char) i, 0xE, 1, 0);
1858 fillIndex [0xF] = 02;
1859 for (int i = 0x0380; i < 0x0390; i++)
1860 if (Char.IsLetter ((char) i))
1861 AddLetterMap ((char) i, 0xF, 1);
1862 fillIndex [0xF] = 02;
1863 for (int i = 0x0391; i < 0x03CF; i++)
1864 if (Char.IsLetter ((char) i))
1865 AddLetterMap ((char) i, 0xF, 1);
1866 fillIndex [0xF] = 0x40;
1867 for (int i = 0x03D0; i < 0x0400; i++)
1868 if (Char.IsLetter ((char) i))
1869 AddLetterMap ((char) i, 0xF, 1);
1871 // Cyrillic - character name order
1872 fillIndex [0x10] = 0x6;
1874 for (int i = 0; i < orderedCyrillic.Length; i++)
1875 Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]);
1877 // table which is mostly from UCA DUCET.
1878 for (int i = 0; i < orderedCyrillic.Length; i++) {
1879 char c = Char.ToUpper (orderedCyrillic [i], CultureInfo.InvariantCulture);
1880 if (!IsIgnorable ((int) c) &&
1882 Char.IsLetter (c)) {
1883 AddLetterMap (c, 0x10, 0);
1884 fillIndex [0x10] += 3;
1888 for (int i = 0x0460; i < 0x0481; i++) {
1889 if (Char.IsLetter ((char) i)) {
1890 AddLetterMap ((char) i, 0x10, 0);
1891 fillIndex [0x10] += 3;
1896 for (int i = 0x0400; i <= 0x0486; i++) {
1897 if (!Char.IsLetter ((char) i)) {
1898 // AddCharMap ((char) i, 0x1, 1);
1901 if (!cyrillicLetterPrimaryValues.ContainsKey (i)) {
1902 Console.Error.WriteLine ("no value for {0:x04}", i);
1906 (byte) cyrillicLetterPrimaryValues [i];
1907 AddLetterMap ((char) i, 0x10, 0);
1912 fillIndex [0x11] = 0x3;
1913 for (int i = 0x0531; i < 0x0586; i++)
1914 if (Char.IsLetter ((char) i))
1915 AddLetterMap ((char) i, 0x11, 1);
1919 fillIndex [0x12] = 0x3;
1920 for (int i = 0x05D0; i < 0x05FF; i++)
1921 if (Char.IsLetter ((char) i))
1922 AddLetterMap ((char) i, 0x12, 1);
1924 fillIndex [0x1] = 0x3;
1925 for (int i = 0x0591; i <= 0x05C2; i++)
1927 AddCharMap ((char) i, 0x1, 1);
1930 fillIndex [0x1] = 0x8E;
1931 fillIndex [0x13] = 0x3;
1932 for (int i = 0x0621; i <= 0x064A; i++) {
1934 if (Char.GetUnicodeCategory ((char) i)
1935 != UnicodeCategory.OtherLetter) {
1936 // FIXME: arabic nonspacing marks are
1937 // in different order.
1938 AddCharMap ((char) i, 0x1, 1);
1941 // map [i] = new CharMapEntry (0x13,
1942 // (byte) arabicLetterPrimaryValues [i], 1);
1944 (byte) arabicLetterPrimaryValues [i];
1945 AddLetterMap ((char) i, 0x13, 0);
1947 fillIndex [0x13] = 0x84;
1948 for (int i = 0x0674; i < 0x06D6; i++)
1949 if (Char.IsLetter ((char) i))
1950 AddLetterMap ((char) i, 0x13, 1);
1953 // FIXME: it does seem straight codepoint mapping.
1954 fillIndex [0x14] = 04;
1955 for (int i = 0x0901; i < 0x0905; i++)
1956 if (!IsIgnorable (i))
1957 AddLetterMap ((char) i, 0x14, 2);
1958 fillIndex [0x14] = 0xB;
1959 for (int i = 0x0905; i < 0x093A; i++) {
1961 AddCharMap ('\u0929', 0x14, 0, 8);
1963 AddCharMap ('\u0931', 0x14, 0, 8);
1965 AddCharMap ('\u0934', 0x14, 0, 8);
1966 if (Char.IsLetter ((char) i))
1967 AddLetterMap ((char) i, 0x14, 4);
1969 AddCharMap ('\u0960', 0x14, 4);
1971 AddCharMap ('\u0961', 0x14, 4);
1973 fillIndex [0x14] = 0xDA;
1974 for (int i = 0x093E; i < 0x0945; i++)
1975 if (!IsIgnorable (i))
1976 AddLetterMap ((char) i, 0x14, 2);
1977 fillIndex [0x14] = 0xEC;
1978 for (int i = 0x0945; i < 0x094F; i++)
1979 if (!IsIgnorable (i))
1980 AddLetterMap ((char) i, 0x14, 2);
1984 fillIndex [0x15] = 02;
1985 for (int i = 0x0980; i < 0x9FF; i++) {
1986 if (IsIgnorable (i))
1989 fillIndex [0x15] = 0x3B;
1990 switch (Char.GetUnicodeCategory ((char) i)) {
1991 case UnicodeCategory.NonSpacingMark:
1992 case UnicodeCategory.DecimalDigitNumber:
1993 case UnicodeCategory.OtherNumber:
1996 AddLetterMap ((char) i, 0x15, 1);
1999 fillIndex [0x1] = 0x3;
2000 for (int i = 0x0981; i < 0x0A00; i++)
2001 if (Char.GetUnicodeCategory ((char) i) ==
2002 UnicodeCategory.NonSpacingMark)
2003 AddCharMap ((char) i, 0x1, 1);
2005 // Gurmukhi. orderedGurmukhi is from UCA
2006 // FIXME: it does not look equivalent to UCA.
2007 fillIndex [0x16] = 04;
2008 fillIndex [0x1] = 3;
2009 for (int i = 0; i < orderedGurmukhi.Length; i++) {
2010 char c = orderedGurmukhi [i];
2011 if (IsIgnorable ((int) c))
2013 if (IsIgnorableNonSpacing (c)) {
2014 AddLetterMap (c, 0x1, 1);
2017 if (c == '\u0A3C' || c == '\u0A4D' ||
2018 '\u0A66' <= c && c <= '\u0A71')
2020 // SPECIAL CASE: U+A38 = U+A36 at primary level (why?)
2022 if (c == '\u0A36' || c == '\u0A16' || c == '\u0A17' || c == '\u0A5B' || c == '\u0A5E')
2024 AddLetterMap (c, 0x16, shift);
2027 // Gujarati. orderedGujarati is from UCA
2028 fillIndex [0x17] = 0x4;
2030 map [0x0A4D] = new CharMapEntry (1, 0, 0x3);
2031 map [0x0ABD] = new CharMapEntry (1, 0, 0x3);
2032 map [0x0A3C] = new CharMapEntry (1, 0, 0x4);
2033 map [0x0A71] = new CharMapEntry (1, 0, 0x6);
2034 map [0x0ABC] = new CharMapEntry (1, 0, 0xB);
2035 map [0x0A70] = new CharMapEntry (1, 0, 0xE);
2036 // letters go first.
2037 for (int i = 0; i < orderedGujarati.Length; i++) {
2039 char c = orderedGujarati [i];
2040 if (Char.IsLetter (c)) {
2042 if (c == '\u0AB3' || c == '\u0A32')
2044 if (c == '\u0A33') {
2045 AddCharMap ('\u0A32', 0x17, 0);
2046 AddCharMap ('\u0A33', 0x17, 4, 4);
2050 AddCharMap ('\u0AE0', 0x17, 0, 5);
2051 AddCharMap (c, 0x17, 4);
2054 AddCharMap ('\u0AB3', 0x17, 6);
2058 byte gujaratiShift = 4;
2059 fillIndex [0x17] = 0xC0;
2060 for (int i = 0; i < orderedGujarati.Length; i++) {
2061 char c = orderedGujarati [i];
2062 if (fillIndex [0x17] == 0xCC)
2064 if (!Char.IsLetter (c)) {
2067 AddCharMap ('\u0A81', 0x17, 2);
2070 AddLetterMap (c, 0x17, gujaratiShift);
2075 fillIndex [0x1] = 03;
2076 fillIndex [0x18] = 02;
2077 for (int i = 0x0B00; i < 0x0B7F; i++) {
2078 switch (Char.GetUnicodeCategory ((char) i)) {
2079 case UnicodeCategory.NonSpacingMark:
2080 case UnicodeCategory.DecimalDigitNumber:
2081 AddLetterMap ((char) i, 0x1, 1);
2084 AddLetterMap ((char) i, 0x18, 1);
2088 fillIndex [0x19] = 2;
2089 AddCharMap ('\u0BD7', 0x19, 0);
2090 fillIndex [0x19] = 0xA;
2092 for (int i = 0x0B82; i <= 0x0B94; i++)
2093 if (!IsIgnorable ((char) i))
2094 AddCharMap ((char) i, 0x19, 2);
2096 fillIndex [0x19] = 0x28;
2097 // The array for Tamil consonants is a constant.
2098 // Windows have almost similar sequence to TAM from
2099 // tamilnet but a bit different in Grantha.
2100 for (int i = 0; i < orderedTamilConsonants.Length; i++)
2101 AddLetterMap (orderedTamilConsonants [i], 0x19, 4);
2103 fillIndex [0x19] = 0x82;
2104 for (int i = 0x0BBE; i < 0x0BCD; i++)
2105 if (Char.GetUnicodeCategory ((char) i) ==
2106 UnicodeCategory.SpacingCombiningMark
2108 AddLetterMap ((char) i, 0x19, 2);
2111 fillIndex [0x1A] = 0x4;
2112 for (int i = 0x0C00; i < 0x0C62; i++) {
2113 if (i == 0x0C55 || i == 0x0C56)
2115 AddCharMap ((char) i, 0x1A, 3);
2116 char supp = (i == 0x0C0B) ? '\u0C60':
2117 i == 0x0C0C ? '\u0C61' : char.MinValue;
2118 if (supp == char.MinValue)
2120 AddCharMap (supp, 0x1A, 3);
2124 fillIndex [0x1B] = 4;
2125 for (int i = 0x0C80; i < 0x0CE5; i++) {
2126 if (i == 0x0CD5 || i == 0x0CD6)
2128 if (i == 0x0CB1 || i == 0x0CB3 || i == 0x0CDE)
2129 continue; // shift after 0xCB9
2130 AddCharMap ((char) i, 0x1B, 3);
2132 // SPECIAL CASES: but why?
2133 AddCharMap ('\u0CB1', 0x1B, 3); // RRA
2134 AddCharMap ('\u0CB3', 0x1B, 3); // LLA
2135 AddCharMap ('\u0CDE', 0x1B, 3); // FA
2138 AddCharMap ('\u0CE1', 0x1B, 3); // vocalic LL
2142 fillIndex [0x1C] = 2;
2143 for (int i = 0x0D02; i < 0x0D61; i++)
2144 // FIXME: I avoided MSCompatUnicodeTable usage
2145 // here (it results in recursion). So check if
2146 // using NonSpacingMark makes sense or not.
2147 if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark)
2148 // if (!MSCompatUnicodeTable.IsIgnorable ((char) i))
2149 AddCharMap ((char) i, 0x1C, 1);
2151 // Thai ... note that it breaks 0x1E wall after E2B!
2152 // Also, all Thai characters have level 2 value 3.
2153 fillIndex [0x1E] = 2;
2154 for (int i = 0xE40; i <= 0xE44; i++)
2155 AddCharMap ((char) i, 0x1E, 1, 3);
2156 for (int i = 0xE01; i < 0xE2B; i++)
2157 AddCharMap ((char) i, 0x1E, 6, 3);
2158 fillIndex [0x1F] = 5;
2159 for (int i = 0xE2B; i < 0xE30; i++)
2160 AddCharMap ((char) i, 0x1F, 6, 3);
2161 fillIndex [0x1F] = 0x1E;
2162 for (int i = 0xE30; i < 0xE3B; i++)
2163 AddCharMap ((char) i, 0x1F, 1, 3);
2164 // some Thai characters remain.
2165 char [] specialThai = new char [] {'\u0E45', '\u0E46',
2166 '\u0E4E', '\u0E4F', '\u0E5A', '\u0E5B'};
2167 foreach (char c in specialThai)
2168 AddCharMap (c, 0x1F, 1);
2171 fillIndex [0x1F] = 2;
2172 for (int i = 0xE80; i < 0xEDF; i++)
2173 if (Char.IsLetter ((char) i))
2174 AddCharMap ((char) i, 0x1F, 1);
2176 // Georgian. orderedGeorgian is from UCA DUCET.
2177 fillIndex [0x21] = 5;
2178 for (int i = 0; i < orderedGeorgian.Length; i++) {
2179 char c = orderedGeorgian [i];
2180 if (map [(int) c].Defined)
2182 AddCharMap (c, 0x21, 0);
2184 AddCharMap ((char) (c - 0x30), 0x21, 0, 0x12);
2185 fillIndex [0x21] += 5;
2189 fillIndex [0x22] = 2;
2190 int kanaOffset = 0x3041;
2191 byte [] kanaLines = new byte [] {2, 2, 2, 2, 1, 3, 1, 2, 1};
2193 for (int gyo = 0; gyo < 9; gyo++) {
2194 for (int dan = 0; dan < 5; dan++) {
2195 if (gyo == 7 && dan % 2 == 1) {
2198 kanaOffset -= 2; // There is no space for yi and ye.
2201 int cp = kanaOffset + dan * kanaLines [gyo];
2202 // small lines (a-gyo, ya-gyo)
2203 if (gyo == 0 || gyo == 7) {
2204 AddKanaMap (cp, 1); // small
2205 AddKanaMap (cp + 1, 1);
2208 AddKanaMap (cp, kanaLines [gyo]);
2212 // add small 'Tsu' (before normal one)
2213 AddKanaMap (0x3063, 1);
2217 fillIndex [0x22] += 3;
2218 kanaOffset += 5 * kanaLines [gyo];
2221 // Wa-gyo is almost special, so I just manually add.
2222 AddLetterMap ((char) 0x308E, 0x22, 0);
2223 AddLetterMap ((char) (0x308E + 0x60), 0x22, 0);
2224 AddLetterMap ((char) 0x308F, 0x22, 0);
2225 AddLetterMap ((char) (0x308F + 0x60), 0x22, 0);
2227 AddLetterMap ((char) 0x3090, 0x22, 0);
2228 AddLetterMap ((char) (0x3090 + 0x60), 0x22, 0);
2229 fillIndex [0x22] += 2;
2230 // no "Wu" in Japanese.
2231 AddLetterMap ((char) 0x3091, 0x22, 0);
2232 AddLetterMap ((char) (0x3091 + 0x60), 0x22, 0);
2234 AddLetterMap ((char) 0x3092, 0x22, 0);
2235 AddLetterMap ((char) (0x3092 + 0x60), 0x22, 0);
2237 fillIndex [0x22] = 0x80;
2238 AddLetterMap ((char) 0x3093, 0x22, 0);
2239 AddLetterMap ((char) (0x3093 + 0x60), 0x22, 0);
2241 // JIS Japanese square chars.
2242 fillIndex [0x22] = 0x97;
2243 jisJapanese.Sort (JISComparer.Instance);
2244 foreach (JISCharacter j in jisJapanese)
2245 if (0x3300 <= j.CP && j.CP <= 0x3357)
2246 AddCharMap ((char) j.CP, 0x22, 1);
2247 // non-JIS Japanese square chars.
2248 nonJisJapanese.Sort (NonJISComparer.Instance);
2249 foreach (NonJISCharacter j in nonJisJapanese)
2250 AddCharMap ((char) j.CP, 0x22, 1);
2253 fillIndex [0x23] = 0x02;
2254 for (int i = 0x3105; i <= 0x312C; i++)
2255 AddCharMap ((char) i, 0x23, 1);
2257 // Estrangela: ancient Syriac
2258 fillIndex [0x24] = 0x0B;
2259 // FIXME: is 0x71E really alternative form?
2260 ArrayList syriacAlternatives = new ArrayList (
2261 new int [] {0x714, 0x716, 0x71C, 0x71E, 0x724, 0x727});
2262 for (int i = 0x0710; i <= 0x072C; i++) {
2263 if (i == 0x0711) // NonSpacingMark
2265 if (syriacAlternatives.Contains (i))
2267 AddCharMap ((char) i, 0x24, 4);
2272 foreach (int cp in syriacAlternatives)
2273 map [cp] = new CharMapEntry (0x24,
2274 (byte) (map [cp - 1].Level1 + 2),
2276 // FIXME: Syriac NonSpacingMark should go here.
2279 // FIXME: it turned out that it does not look like UCA
2280 fillIndex [0x24] = 0x6E;
2281 for (int i = 0; i < orderedThaana.Length; i++) {
2282 char c = orderedThaana [i];
2283 if (IsIgnorableNonSpacing ((int) c))
2285 AddCharMap (c, 0x24, 2);
2286 if (c == '\u0782') // SPECIAL CASE: why?
2287 fillIndex [0x24] += 2;
2291 // FIXME: Add more culture-specific letters (that are
2292 // not supported in Windows collation) here.
2294 // Surrogate ... they are computed.
2299 // Unlike UCA Windows Hangul sequence mixes Jongseong
2300 // with Choseong sequence as well as Jungseong,
2301 // adjusted to have the same primary weight for the
2302 // same base character. So it is impossible to compute
2305 // Here I introduce an ordered sequence of mixed
2306 // 'commands' and 'characters' that is similar to
2308 // - ',' increases primary weight.
2309 // - [A B] means a range, increasing index
2310 // - {A B} means a range, without increasing index
2311 // - '=' is no operation (it means the characters
2312 // of both sides have the same weight).
2313 // - '>' inserts a Hangul Syllable block that
2314 // contains 0x251 characters.
2315 // - '<' decreases the index
2316 // - '0'-'9' means skip count
2317 // - whitespaces are ignored
2320 string hangulSequence =
2321 + "\u1100=\u11A8 > \u1101=\u11A9 >"
2322 + "\u11C3, \u11AA, \u11C4, \u1102=\u11AB >"
2323 + "<{\u1113 \u1116}, \u3165,"
2324 + "\u11C5, \u11C6=\u3166,, \u11C7, \u11C8,"
2325 + "\u11AC, \u11C9, \u11AD, \u1103=\u11AE >"
2326 + "<\u1117, \u11CA, \u1104, \u11CB > \u1105 >"
2327 + "<{\u1118 \u111B}, \u11B0, [\u11CC \u11D0], \u11B1,"
2328 + "[\u11D1 \u11D2], \u11B2,"
2329 + "[\u11D3 \u11D5], \u11B3,"
2330 + "[\u11D6 \u11D7], \u11B4, \u11B5,"
2331 + "\u11B6=\u11D8, \u3140,, \u11D9, \u1106=\u11B7 >"
2332 + "<{\u111C \u111D}, [\u11DA \u11E2], \u1107=\u11B8 >"
2333 + "<{\u111E \u1120}, \u3172,, \u3173, \u11E3, \u1108 >"
2334 + "<{\u1121 \u112C}, \u3144 \u11B9, \u3174, \u3175,,,, "
2335 + "\u3176,, \u3177, [\u11E4 \u11E6] \u3178,"
2336 + "\u3179, \u1109=\u11BA,,, \u3214=\u3274 <>"
2337 + "<{\u112D \u1133}, \u11E7 \u317A, \u317B, \u317C "
2338 + "[\u11E8 \u11E9],, \u11EA \u317D,, \u110A=\u11BB,,, >"
2339 + "<{\u1134 \u1140}, \u317E,,,,,,, \u11EB,"
2340 + "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >"
2341 + "<{\u1141 \u114C}, \u11EE, \u11EC, \u11ED,,,,, "
2342 + "\u11F1,, \u11F2,,,"
2343 + "\u11EF,,, \u11F0, \u110C=\u11BD,, >"
2344 + "<\u114D, \u110D,, >"
2345 + "<{\u114E \u1151},, \u110E=\u11BE,, >"
2346 + "<{\u1152 \u1155},,, \u110F=\u11BF >"
2347 + "\u1110=\u11C0 > \u1111=\u11C1 >"
2348 + "<\u1156=\u1157, \u11F3, \u11F4, \u1112=\u11C2 >"
2349 + "<\u1158=\u1159=\u115F, \u3185, \u11F9,"
2353 byte hangulCat = 0x52;
2354 fillIndex [hangulCat] = 0x2;
2356 int syllableBlock = 0;
2357 for (int n = 0; n < hangulSequence.Length; n++) {
2358 char c = hangulSequence [n];
2360 if (Char.IsWhiteSpace (c))
2366 IncrementSequentialIndex (ref hangulCat);
2369 if (fillIndex [hangulCat] == 2)
2370 throw new Exception ("FIXME: handle it correctly (yes it is hacky, it is really unfortunate).");
2371 fillIndex [hangulCat]--;
2374 IncrementSequentialIndex (ref hangulCat);
2375 for (int l = 0; l < 0x15; l++)
2376 for (int v = 0; v < 0x1C; v++) {
2378 (char) (0xAC00 + syllableBlock * 0x1C * 0x15 + l * 0x1C + v), hangulCat, 0);
2379 IncrementSequentialIndex (ref hangulCat);
2384 start = hangulSequence [n + 1];
2385 end = hangulSequence [n + 3];
2386 for (int i = start; i <= end; i++) {
2387 AddCharMap ((char) i, hangulCat, 0);
2389 IncrementSequentialIndex (ref hangulCat);
2391 n += 4; // consumes 5 characters for this operation
2394 start = hangulSequence [n + 1];
2395 end = hangulSequence [n + 3];
2396 for (int i = start; i <= end; i++)
2397 AddCharMap ((char) i, hangulCat, 0);
2398 n += 4; // consumes 5 characters for this operation
2401 AddCharMap (c, hangulCat, 0);
2407 for (int i = 0x3200; i < 0x3300; i++) {
2408 if (IsIgnorable (i) || map [i].Defined)
2412 if (decompLength [i] == 4 &&
2413 decompValues [decompIndex [i]] == '(')
2414 ch = decompIndex [i] + 1;
2416 else if (decompLength [i] == 2 &&
2417 decompValues [decompIndex [i] + 1] == '\u1161')
2418 ch = decompIndex [i];
2419 else if (decompLength [i] == 1)
2420 ch = decompIndex [i];
2423 ch = decompValues [ch];
2424 if (ch < 0x1100 || 0x1200 < ch &&
2425 ch < 0xAC00 || 0xD800 < ch)
2429 int offset = i < 0x3260 ? 1 : 0;
2430 if (0x326E <= i && i <= 0x3273)
2433 map [i] = new CharMapEntry (map [ch].Category,
2434 (byte) (map [ch].Level1 + offset),
2436 // Console.Error.WriteLine ("Jamo {0:X04} -> {1:X04}", i, decompValues [decompIndex [i] + 1]);
2442 // Letterlike characters and CJK compatibility square
2443 sortableCharNames.Sort (StringDictionaryValueComparer.Instance);
2444 int [] counts = new int ['Z' - 'A' + 1];
2445 char [] namedChars = new char [sortableCharNames.Count];
2447 foreach (DictionaryEntry de in sortableCharNames) {
2448 counts [((string) de.Value) [0] - 'A']++;
2449 namedChars [nCharNames++] = (char) ((int) de.Key);
2451 nCharNames = 0; // reset
2452 for (int a = 0; a < counts.Length; a++) {
2453 fillIndex [0xE] = (byte) (alphaWeights [a + 1] - counts [a]);
2454 for (int i = 0; i < counts [a]; i++)
2455 //Console.Error.WriteLine ("---- {0:X04} : {1:x02} / {2} {3}", (int) namedChars [nCharNames], fillIndex [0xE], ((DictionaryEntry) sortableCharNames [nCharNames]).Value, Char.GetUnicodeCategory (namedChars [nCharNames]));
2456 AddCharMap (namedChars [nCharNames++], 0xE, 1);
2459 // CJK unified ideograph.
2461 fillIndex [cjkCat] = 0x2;
2462 for (int cp = 0x4E00; cp <= 0x9FBB; cp++)
2463 if (!IsIgnorable (cp))
2464 AddCharMapGroupCJK ((char) cp, ref cjkCat);
2465 // CJK Extensions goes here.
2466 // LAMESPEC: With this Windows style CJK layout, it is
2467 // impossible to add more CJK ideograph i.e. 0x9FA6-
2468 // 0x9FBB can never be added w/o breaking compat.
2469 for (int cp = 0xF900; cp <= 0xFA2D; cp++)
2470 if (!IsIgnorable (cp))
2471 AddCharMapGroupCJK ((char) cp, ref cjkCat);
2473 // PrivateUse ... computed.
2474 // remaining Surrogate ... computed.
2476 #region Special "biggest" area (FF FF)
2477 fillIndex [0xFF] = 0xFF;
2478 char [] specialBiggest = new char [] {
2479 '\u3005', '\u3031', '\u3032', '\u309D',
2480 '\u309E', '\u30FC', '\u30FD', '\u30FE',
2481 '\uFE7C', '\uFE7D', '\uFF70'};
2482 foreach (char c in specialBiggest)
2483 AddCharMap (c, 0xFF, 0);
2486 #region 07 - ASCII non-alphanumeric + 3001, 3002 // 07
2487 // non-alphanumeric ASCII except for: + - < = > '
2488 for (int i = 0x21; i < 0x7F; i++) {
2489 if (Char.IsLetterOrDigit ((char) i)
2490 || "+-<=>'".IndexOf ((char) i) >= 0)
2491 continue; // they are not added here.
2492 AddCharMapGroup2 ((char) i, 0x7, 1, 0);
2493 // Insert 3001 after ',' and 3002 after '.'
2495 AddCharMapGroup2 ('\u3001', 0x7, 1, 0);
2496 else if (i == 0x2E) {
2498 AddCharMapGroup2 ('\u3002', 0x7, 1, 0);
2501 AddCharMap ('\uFE30', 0x7, 1, 0);
2505 #region 07 - Punctuations and something else
2506 for (int i = 0xA0; i < char.MaxValue; i++) {
2507 if (IsIgnorable (i))
2510 // FIXME: these resets should not really be
2511 // done; they are here only as a shortcut.
2513 fillIndex [0x7] = 0xE2;
2515 fillIndex [0x7] = 0x77;
2527 switch (Char.GetUnicodeCategory ((char) i)) {
2528 case UnicodeCategory.OtherPunctuation:
2529 case UnicodeCategory.ClosePunctuation:
2530 case UnicodeCategory.OpenPunctuation:
2531 case UnicodeCategory.InitialQuotePunctuation:
2532 case UnicodeCategory.FinalQuotePunctuation:
2533 case UnicodeCategory.ModifierSymbol:
2534 // SPECIAL CASES: // 0xA
2535 if (0x2020 <= i && i <= 0x2042)
2537 AddCharMapGroup ((char) i, 0x7, 1, 0);
2540 if (i == 0xA6) // SPECIAL CASE. FIXME: why?
2541 goto case UnicodeCategory.OtherPunctuation;
2546 for (int i = 0x2400; i <= 0x2421; i++)
2547 AddCharMap ((char) i, 0x7, 1, 0);
2550 // FIXME: for 07 xx we need more love.
2552 // Characters w/ diacritical marks (NFKD)
2553 for (int i = 0; i <= char.MaxValue; i++) {
2554 if (map [i].Defined || IsIgnorable (i))
2556 if (decompIndex [i] == 0)
2559 int start = decompIndex [i];
2560 int primaryChar = decompValues [start];
2563 int length = decompLength [i];
2564 // special processing for parenthesized ones.
2566 decompValues [start] == '(' &&
2567 decompValues [start + 2] == ')') {
2568 primaryChar = decompValues [start + 1];
2572 if (map [primaryChar].Level1 == 0)
2575 for (int l = 1; l < length; l++) {
2576 int c = decompValues [start + l];
2577 if (map [c].Level1 != 0)
2579 secondary += diacritical [c];
2583 map [i] = new CharMapEntry (
2584 map [primaryChar].Category,
2585 map [primaryChar].Level1,
2590 // category 08 - symbols
2591 fillIndex [0x8] = 2;
2592 // Here Windows mapping is not straightforward. It is
2593 // not based on computation but seems manual sorting.
2594 AddCharMapGroup ('+', 0x8, 1, 0); // plus
2595 AddCharMapGroup ('\u2212', 0x8, 1, 0); // minus
2596 AddCharMapGroup ('\u229D', 0x8, 1, 0); // minus
2597 AddCharMapGroup ('\u2297', 0x8, 1, 0); // mul
2598 AddCharMapGroup ('\u2044', 0x8, 1, 0); // div
2599 AddCharMapGroup ('\u2215', 0x8, 1, 0); // div
2600 AddCharMapGroup ('\u2217', 0x8, 1, 0); // mul
2601 AddCharMapGroup ('\u2218', 0x8, 1, 0); // ring
2602 AddCharMapGroup ('\u2219', 0x8, 1, 0); // bullet
2603 AddCharMapGroup ('\u2213', 0x8, 1, 0); // minus-or-plus
2604 AddCharMapGroup ('\u003C', 0x8, 1, 0); // <
2605 AddCharMapGroup ('\u227A', 0x8, 1, 0); // precedes relation
2606 AddCharMapGroup ('\u22B0', 0x8, 1, 0); // precedes under relation
2608 for (int cp = 0; cp < 0x2300; cp++) {
2610 cp = 0x2200; // skip to 2200
2611 if (cp == 0xAC) // SPECIAL CASE: skip
2613 if (!map [cp].Defined &&
2614 // Char.GetUnicodeCategory ((char) cp) ==
2615 // UnicodeCategory.MathSymbol)
2616 Char.IsSymbol ((char) cp))
2617 AddCharMapGroup ((char) cp, 0x8, 1, 0);
2618 // SPECIAL CASES: no idea why Windows sorts as such
2621 AddCharMap ('\u227B', 0x8, 1, 0);
2622 AddCharMap ('\u22B1', 0x8, 1, 0);
2625 AddCharMapGroup ('\u00AB', 0x8, 1, 0);
2626 AddCharMapGroup ('\u226A', 0x8, 1, 0);
2627 AddCharMapGroup ('\u00BB', 0x8, 1, 0);
2628 AddCharMapGroup ('\u226B', 0x8, 1, 0);
2631 AddCharMap ('\u01C0', 0x8, 1, 0);
2632 AddCharMap ('\u01C1', 0x8, 1, 0);
2633 AddCharMap ('\u01C2', 0x8, 1, 0);
2638 #region Level2 adjustment
2640 diacritical [0x624] = 0x5;
2641 diacritical [0x626] = 0x7;
2642 diacritical [0x622] = 0x9;
2643 diacritical [0x623] = 0xA;
2644 diacritical [0x625] = 0xB;
2645 diacritical [0x649] = 0x5; // 'alif maqs.uurah
2646 diacritical [0x64A] = 0x7; // Yaa'
2648 for (int i = 0; i < char.MaxValue; i++) {
2650 byte cat = map [i].Category;
2652 case 0xE: // Latin diacritics
2653 case 0x22: // Japanese: circled characters
2654 mod = diacritical [i];
2656 case 0x13: // Arabic
2657 if (diacritical [i] == 0)
2658 mod = 0x8; // default for arabic
2661 if (0x52 <= cat && cat <= 0x7F) // Hangul
2662 mod = diacritical [i];
2664 map [i] = new CharMapEntry (
2665 cat, map [i].Level1, mod);
2669 // FIXME: this is a hack, but NonSpacingMark
2670 // characters that are still undefined are likely to
2672 for (int i = 0; i < char.MaxValue; i++)
2673 if (!map [i].Defined &&
2675 Char.GetUnicodeCategory ((char) i) ==
2676 UnicodeCategory.NonSpacingMark)
2677 AddCharMap ((char) i, 1, 1);
2679 // FIXME: this is a hack, but those Symbol characters
2680 // are likely to fall into the 0xA category.
2681 for (int i = 0; i < char.MaxValue; i++)
2682 if (!map [i].Defined &&
2684 Char.IsSymbol ((char) i))
2685 AddCharMap ((char) i, 0xA, 1);
// Advances fillIndex [hangulCat] by one. fillIndex is a byte array, so
// the increment wraps to 0 when the index overflows; in that case an
// index is restarted at 0x2.
// NOTE(review): the category is passed by ref, which suggests it is also
// changed on overflow, but no such statement is visible in this listing
// -- confirm against the full source.
2688 private void IncrementSequentialIndex (ref byte hangulCat)
2690 fillIndex [hangulCat]++;
2691 if (fillIndex [hangulCat] == 0) { // overflown
2693 fillIndex [hangulCat] = 0x2;
2697 // Resets fillIndex [category] to the fixed value alphaWeight and calls AddLetterMap().
// c is mapped first with an update count of 0; then every code point
// stored for c in latinMap is mapped the same way.
// NOTE(review): latinMap [c] is converted with "as", so it can be null;
// any guard before the foreach is not visible in this listing -- confirm.
2698 private void AddAlphaMap (char c, byte category, byte alphaWeight)
2700 fillIndex [category] = alphaWeight;
2701 AddLetterMap (c, category, 0);
2703 ArrayList al = latinMap [c] as ArrayList;
2707 foreach (int cp in al)
2708 AddLetterMap ((char) cp, category, 0);
// Maps a run of "voices" consecutive characters starting at code point i
// into category 0x22, each together with the character 0x60 above it.
// NOTE(review): presumably the pair is a hiragana letter and its
// katakana counterpart -- confirm.
2711 private void AddKanaMap (int i, byte voices)
2713 for (byte b = 0; b < voices; b++) {
2714 char c = (char) (i + b);
// level2 argument: 0 for the first character of the run, b + 2 for the rest.
2715 byte arg = (byte) (b > 0 ? b + 2 : 0);
2717 AddLetterMapCore (c, 0x22, 0, arg);
2719 AddLetterMapCore ((char) (c + 0x60), 0x22, 0, arg);
// Convenience wrapper: forwards to AddLetterMapCore() with level2 = 0.
2723 private void AddLetterMap (char c, byte category, byte updateCount)
2725 AddLetterMapCore (c, category, updateCount, 0);
// Maps a letter and its related forms into category, in this order:
//   1. the <small> form of c (ToSmallForm), through AddCharMapGroup()
//      with the caller's updateCount;
//   2. the invariant-culture lowercase of c, recursively with an update
//      count of 0, when it differs from c and is not yet defined;
//   3. c itself, through AddCharMapGroup() with an update count of 0.
// fillIndex [category] is then advanced by updateCount.
// NOTE(review): the declaration of c2 and the conditions guarding steps
// 1 and 3 and the final increment (see doUpdate) are not visible in this
// listing -- confirm against the full source.
2728 private void AddLetterMapCore (char c, byte category, byte updateCount, byte level2)
2731 // <small> updates index
2732 c2 = ToSmallForm (c);
2734 AddCharMapGroup (c2, category, updateCount, level2);
2735 c2 = Char.ToLower (c, CultureInfo.InvariantCulture);
2736 if (c2 != c && !map [(int) c2].Defined)
2737 AddLetterMapCore (c2, category, 0, level2);
2738 bool doUpdate = true;
2739 if (IsIgnorable ((int) c) || map [(int) c].Defined)
2742 AddCharMapGroup (c, category, 0, level2);
2744 fillIndex [category] += updateCount;
// Overload without an "alt" weight: forwards to the four-argument
// AddCharMap() with alt = 0 and returns its result.
2747 private bool AddCharMap (char c, byte category, byte increment)
2749 return AddCharMap (c, category, increment, 0);
// Creates the map entry for c in the given category, using the current
// fillIndex [category], and then advances that index by "increment".
// Returns false without touching anything when c is ignorable or
// already has an entry.
// NOTE(review): the return statement of the success path is not visible
// in this listing -- presumably it returns true; confirm.
2752 private bool AddCharMap (char c, byte category, byte increment, byte alt)
2754 if (IsIgnorable ((int) c) || map [(int) c].Defined)
2755 return false; // do nothing
// CharMapEntry takes (category, level1, level2). For category 1 the
// running index is stored as level2 and alt as level1; for every other
// category the running index is level1 and alt is level2.
2756 map [(int) c] = new CharMapEntry (category,
2757 category == 1 ? alt : fillIndex [category],
2758 category == 1 ? fillIndex [category] : alt);
2759 fillIndex [category] += increment;
// Variant of the group mapping that works on the LAST character of a
// decomposition (the *Tail helpers call ToDecomposed with tail = true).
// Adds the small-form tail, then c itself, both with level2 = 0, and
// then recurses on the full-width tail of c.
// NOTE(review): the conditions guarding the first AddCharMap() and the
// recursive call are not visible in this listing; ToDecomposed() appears
// to return c unchanged when there is no such decomposition, so a
// "c2 != c" style guard is presumably what ends the recursion -- confirm.
2763 private void AddCharMapGroupTail (char c, byte category, byte updateCount)
2765 char c2 = ToSmallFormTail (c);
2767 AddCharMap (c2, category, updateCount, 0);
2769 AddCharMap (c, category, updateCount, 0);
2771 c2 = ToFullWidthTail (c);
2773 AddCharMapGroupTail (c2, category, updateCount);
2777 // Adds characters to table in the order below
2778 // (+ increases weight):
2782 // <full> | <super> | <sub>
2783 // <circle> | <wide> (| <narrow>)
2787 // level2 is fixed (does not increase).
// Decomposition kinds whose variants AddCharMapGroup() adds with an
// increment of 0 right after the base character, i.e. while
// fillIndex [category] still has the value used for the base character.
// NOTE(review): several entries of this initializer are not visible in
// this listing.
2788 int [] sameWeightItems = new int [] {
2789 DecompositionFraction,
2793 DecompositionCircle,
2795 DecompositionNarrow,
// Maps c together with its compatibility variants into category.
// Order of additions visible here:
//   1. the <small> variant, with updateCount (so it advances the index);
//   2. c itself, with increment 0 and the given level2;
//   3. every variant listed in sameWeightItems, with increment 0 and the
//      given level2;
//   4. fillIndex [category] is advanced by updateCount;
//   5. the <vertical> variant, with updateCount and the given level2.
// The variants are looked up in the per-character table taken from
// nfkdMap, keyed by the decomposition kind cast to byte.
// NOTE(review): the outcome of the initial "already defined" test and
// the null checks around nfkd / smv / vv / wv are not visible in this
// listing -- confirm against the full source.
2797 private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2)
2799 if (map [(int) c].Defined)
// char.MinValue is used as the "no such variant" marker below.
2802 char small = char.MinValue;
2803 char vertical = char.MinValue;
2804 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
2806 object smv = nfkd [(byte) DecompositionSmall];
2808 small = (char) ((int) smv);
2809 object vv = nfkd [(byte) DecompositionVertical];
2811 vertical = (char) ((int) vv);
2814 // <small> updates index
2815 if (small != char.MinValue)
2816 AddCharMap (small, category, updateCount);
2819 AddCharMap (c, category, 0, level2);
2822 foreach (int weight in sameWeightItems) {
2823 object wv = nfkd [(byte) weight];
2825 AddCharMap ((char) ((int) wv), category, 0, level2);
2829 // update index here.
2830 fillIndex [category] += updateCount;
2832 if (vertical != char.MinValue)
2833 AddCharMap (vertical, category, updateCount, level2);
// Adds one CJK character at the current index of "category" (increment
// 0, level2 0) and then steps the sequential index through
// IncrementSequentialIndex(), which receives the category by ref.
2836 private void AddCharMapCJK (char c, ref byte category)
2838 AddCharMap (c, category, 0, 0);
2839 IncrementSequentialIndex (ref category);
2841 // Special. I wonder why but Windows skips 9E F9.
// When the position reached is category 0x9E / index 0xF9, step once
// more so that this slot is left unused.
2842 if (category == 0x9E && fillIndex [category] == 0xF9)
2843 IncrementSequentialIndex (ref category);
// Adds the CJK character c through AddCharMapCJK() and then the
// characters related to it: a few hard-coded U+32xx characters after
// specific ideographs, followed by the variants recorded for c in
// nfkdMap for every decomposition kind 0..0x12.
// NOTE(review): several conditions, the declaration of w and the
// continue/break statements are not visible in this listing -- confirm
// against the full source before relying on the exact filtering.
2846 private void AddCharMapGroupCJK (char c, ref byte category)
2848 AddCharMapCJK (c, ref category);
2850 // LAMESPEC: see below.
2851 if (c == '\u5B78') {
2852 AddCharMapCJK ('\u32AB', ref category);
2853 AddCharMapCJK ('\u323B', ref category);
2855 if (c == '\u52DE') {
2856 AddCharMapCJK ('\u3298', ref category);
2857 AddCharMapCJK ('\u3238', ref category);
2860 AddCharMapCJK ('\u32A2', ref category);
2862 // Especially this mapping order totally does
2863 // not make sense to me.
2864 AddCharMapCJK ('\u32A9', ref category);
2866 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
2869 for (byte weight = 0; weight <= 0x12; weight++) {
2870 object wv = nfkd [weight];
2875 // Special: they are ignored in this area.
2876 // FIXME: check if it is sane
2877 if (0xF900 <= w && w <= 0xFAD9)
2879 // LAMESPEC: on Windows some of CJK characters
2880 // in 3200-32B0 are incorrectly mapped. They
2881 // mix Chinese and Japanese Kanji when
2882 // ordering those characters.
// These are the same six code points that are added explicitly above.
2884 case 0x32A2: case 0x3298: case 0x3238:
2885 case 0x32A9: case 0x323B: case 0x32AB:
2889 AddCharMapCJK ((char) w, ref category);
2893 // For now it is only for 0x7 category.
// Variant of AddCharMapGroup() in which the base character and its
// compatibility equivalents each advance the index by updateCount.
// Order of additions visible here: the <small> variant (except U+2024),
// c itself, every code point whose decomposition is the single
// character c and whose decompType matches the visible
// DecompositionCompat case, and finally the <vertical> variant (except
// U+FE33 and U+FE34).
// NOTE(review): the null checks around nfkd / smv / vv and any further
// case labels of the switch are not visible in this listing -- confirm.
2894 private void AddCharMapGroup2 (char c, byte category, byte updateCount, byte level2)
// char.MinValue is used as the "no such variant" marker below.
2896 char small = char.MinValue;
2897 char vertical = char.MinValue;
2898 Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
2900 object smv = nfkd [(byte) DecompositionSmall];
2902 small = (char) ((int) smv);
2903 object vv = nfkd [(byte) DecompositionVertical];
2905 vertical = (char) ((int) vv);
2908 // <small> updates index
2909 if (small != char.MinValue)
2910 // SPECIAL CASE excluded (FIXME: why?)
2911 if (small != '\u2024')
2912 AddCharMap (small, category, updateCount);
2915 AddCharMap (c, category, updateCount, level2);
2917 // Since nfkdMap is problematic to have two or more
2918 // NFKD to an identical character, here I iterate all.
// Note: the loop bound is exclusive, so U+FFFF itself is not examined.
2919 for (int c2 = 0; c2 < char.MaxValue; c2++) {
2920 if (decompLength [c2] == 1 &&
2921 (int) (decompValues [decompIndex [c2]]) == (int) c) {
2922 switch (decompType [c2]) {
2923 case DecompositionCompat:
2924 AddCharMap ((char) c2, category, updateCount, level2);
2930 if (vertical != char.MinValue)
2931 // SPECIAL CASE excluded (FIXME: why?)
2932 if (vertical != '\uFE33' && vertical != '\uFE34')
2933 AddCharMap (vertical, category, updateCount, level2);
// Maps the Arabic character c with an increment of 0, then maps every
// code point whose decomposition ENDS with c (the last decomposed value
// equals c), and finally advances fillIndex [category] by updateCount.
// NOTE(review): the declarations of category and level2 and the
// remaining arguments of the inner AddCharMap() call are not visible in
// this listing -- confirm against the full source.
2936 private void AddArabicCharMap (char c)
2939 byte updateCount = 1;
2943 AddCharMap (c, category, 0, level2);
2945 // Since nfkdMap is problematic to have two or more
2946 // NFKD to an identical character, here I iterate all.
// Note: the loop bound is exclusive, so U+FFFF itself is not examined.
2947 for (int c2 = 0; c2 < char.MaxValue; c2++) {
2948 if (decompLength [c2] == 0)
// idx is the position of the last value of c2's decomposition.
2950 int idx = decompIndex [c2] + decompLength [c2] - 1;
2951 if ((int) (decompValues [idx]) == (int) c)
2952 AddCharMap ((char) c2, category,
2955 fillIndex [category] += updateCount;
// Returns ToDecomposed (c, DecompositionFull, false): the lookup uses
// the first value of c's decomposition.
2958 char ToFullWidth (char c)
2960 return ToDecomposed (c, DecompositionFull, false);
// Returns ToDecomposed (c, DecompositionFull, true): same as
// ToFullWidth() but requesting the tail of the decomposition.
2963 char ToFullWidthTail (char c)
2965 return ToDecomposed (c, DecompositionFull, true);
// Returns ToDecomposed (c, DecompositionSmall, false): the lookup uses
// the first value of c's decomposition.
2968 char ToSmallForm (char c)
2970 return ToDecomposed (c, DecompositionSmall, false);
// Returns ToDecomposed (c, DecompositionSmall, true): same as
// ToSmallForm() but requesting the tail of the decomposition.
2973 char ToSmallFormTail (char c)
2975 return ToDecomposed (c, DecompositionSmall, true);
// Looks up a value of c's decomposition, provided that the
// decomposition kind recorded for c (decompType) equals d.
// The returned value is decompValues at decompIndex [c], moved forward
// by decompLength [c] - 1 (i.e. to the last value) on the "tail" path.
// NOTE(review): the statement taken when the kind does not match and the
// condition guarding the "idx +=" line are not visible in this listing;
// presumably the former returns c unchanged and the latter is
// "if (tail)" -- confirm.
2978 char ToDecomposed (char c, byte d, bool tail)
2980 if (decompType [(int) c] != d)
2982 int idx = decompIndex [(int) c];
2984 idx += decompLength [(int) c] - 1;
2985 return (char) decompValues [idx];
// Scans the jisJapanese collection of JISCharacter entries.
// NOTE(review): the loop body and return statements are not visible in
// this listing; judging by the name it presumably reports whether some
// entry's CP equals cp -- confirm.
2988 bool ExistsJIS (int cp)
2990 foreach (JISCharacter j in jisJapanese)
2998 #region Level 3 properties (Case/Width)
// Returns the level 3 weight of c as stored in a sort key: the raw
// weight from ComputeLevel3WeightRaw() plus 2 when the raw weight is
// non-zero, and 0 when it is 0.
3000 private byte ComputeLevel3Weight (char c)
3002 byte b = ComputeLevel3WeightRaw (c);
3003 return b > 0 ? (byte) (b + 2) : b;
// Computes the raw (not yet offset by 2) level 3 weight of c.
// The method first tests a series of fixed code point ranges, then
// handles U+FE80..U+FEFF by decomposition kind, and finally builds a
// value from the decomposition kind and the isSmallCapital /
// isUppercase tables.
// NOTE(review): the value returned for each range, the declaration of
// "ret" and original lines 3047-3068 are not visible in this listing,
// so the concrete weights cannot be read from here.
3006 private byte ComputeLevel3WeightRaw (char c) // add 2 for sortkey value
3009 if ('\u3192' <= c && c <= '\u319F')
3012 if ('\u11A8' <= c && c <= '\u11F9')
3014 if ('\uFFA0' <= c && c <= '\uFFDC')
3016 if ('\u3130' <= c && c <= '\u3164')
3018 if ('\u3165' <= c && c <= '\u318E')
3021 if ('\u2776' <= c && c <= '\u277F')
3023 if ('\u2780' <= c && c <= '\u2789')
// This range overlaps the two tests above; assuming those return, only
// U+278A..U+2793 can actually be decided here.
3025 if ('\u2776' <= c && c <= '\u2793')
3027 if ('\u2160' <= c && c <= '\u216F')
3029 if ('\u2181' <= c && c <= '\u2182')
3032 if ('\u2135' <= c && c <= '\u2138')
3034 if ('\uFE80' <= c && c < '\uFF00') {
3035 // 2(Isolated)/8(Final)/0x18(Medial)
3036 switch (decompType [(int) c]) {
3037 case DecompositionIsolated:
3039 case DecompositionFinal:
3041 case DecompositionMedial:
3046 // actually I dunno the reason why they have weights.
3069 switch (decompType [(int) c]) {
// For these three kinds the numeric value of the decomposition constant
// itself is OR-ed into the result.
3070 case DecompositionWide: // <wide>
3071 case DecompositionSub: // <sub>
3072 case DecompositionSuper: // <super>
3073 ret |= decompType [(int) c];
3076 if (isSmallCapital [(int) c]) // grep "SMALL CAPITAL"
3078 if (isUppercase [(int) c]) // DerivedCoreProperties
// First of two IsIgnorable (int) definitions visible in this file.
// It consults the unicodeAge table (threshold 3.1) and the Unicode
// category of the code point (OtherNotAssigned / Format).
// NOTE(review): two methods with this exact signature cannot both be
// compiled, so one of them is presumably excluded by a preprocessor
// directive or comment that is not visible in this listing; the return
// statements are not visible either -- confirm.
3088 static bool IsIgnorable (int i)
3090 if (unicodeAge [i] >= 3.1)
3092 switch (char.GetUnicodeCategory ((char) i)) {
3093 case UnicodeCategory.OtherNotAssigned:
3094 case UnicodeCategory.Format:
3101 // FIXME: In the future use DerivedAge.txt to examine character
3102 // versions and set those ones that have higher version than
3103 // 1.0 as ignorable.
// Table-driven test deciding whether code point i is "ignorable".
// It is built from three parts, in this order: explicit code point
// lists, a long list of code point ranges, and finally the Unicode
// category of the character.
// NOTE(review): the switch headers and every return statement are
// absent from this listing, so which result each group produces cannot
// be read from here -- confirm against the full source.
3104 static bool IsIgnorable (int i)
3108 // I guess, those characters are added between
3109 // Unicode 1.0 (LCMapString) and Unicode 3.1
3110 // (UnicodeCategory), so they used to be
3111 // something like OtherNotAssigned as of Unicode 1.1.
3112 case 0x2df: case 0x387:
3113 case 0x3d7: case 0x3d8: case 0x3d9:
3114 case 0x3f3: case 0x3f4: case 0x3f5: case 0x3f6:
3115 case 0x400: case 0x40d: case 0x450: case 0x45d:
3116 case 0x587: case 0x58a: case 0x5c4: case 0x640:
3117 case 0x653: case 0x654: case 0x655: case 0x66d:
3119 case 0x1e9b: case 0x202f: case 0x20ad:
3120 case 0x20ae: case 0x20af:
3121 case 0x20e2: case 0x20e3:
3122 case 0x2139: case 0x213a: case 0x2183:
3123 case 0x2425: case 0x2426: case 0x2619:
3124 case 0x2670: case 0x2671: case 0x3007:
3125 case 0x3190: case 0x3191:
3126 case 0xfffc: case 0xfffd:
3128 // exceptional characters filtered by the
3129 // following conditions. Originally those exceptional
3130 // ranges are incorrect (they should not be ignored)
3131 // and most of those characters are unfortunately in
// The code points below fall inside ranges listed further down
// (0x04D0-0x04FF, 0x3033-0x303F, 0x3371-0x337B, 0xFB1D-0xFE2F).
3133 case 0x4d8: case 0x4d9:
3134 case 0x4e8: case 0x4e9:
3136 case 0x3036: case 0x303f:
3137 case 0x337b: case 0xfb1e:
3142 // The whole Sinhala characters.
3143 0x0D82 <= i && i <= 0x0DF4
3144 // The whole Tibetan characters.
3145 || 0x0F00 <= i && i <= 0x0FD1
3146 // The whole Myanmar characters.
3147 || 0x1000 <= i && i <= 0x1059
3148 // The whole Ethiopic, Cherokee,
3149 // Canadian Syllabic, Ogham, Runic,
3150 // Tagalog, Hanunoo, Philippine,
3151 // Buhid, Tagbanwa, Khmer and Mongolian
3153 || 0x1200 <= i && i <= 0x1DFF
3154 // Greek extension characters.
3155 || 0x1F00 <= i && i <= 0x1FFF
3156 // The whole Braille characters.
3157 || 0x2800 <= i && i <= 0x28FF
3158 // CJK radical characters.
3159 || 0x2E80 <= i && i <= 0x2EF3
3160 // Kangxi radical characters.
3161 || 0x2F00 <= i && i <= 0x2FD5
3162 // Ideographic description characters.
3163 || 0x2FF0 <= i && i <= 0x2FFB
3164 // Bopomofo letter and final
3165 || 0x31A0 <= i && i <= 0x31B7
3166 // White square with quadrant characters.
3167 || 0x25F0 <= i && i <= 0x25F7
3168 // Ideographic telegraph symbols.
3169 || 0x32C0 <= i && i <= 0x32CB
3170 || 0x3358 <= i && i <= 0x3370
3171 || 0x33E0 <= i && i <= 0x33FF
3172 // The whole YI characters.
3173 || 0xA000 <= i && i <= 0xA48C
3174 || 0xA490 <= i && i <= 0xA4C6
3175 // Armenian small ligatures
3176 || 0xFB13 <= i && i <= 0xFB17
3177 // hebrew, arabic, variation selector.
3178 || 0xFB1D <= i && i <= 0xFE2F
3179 // Arabic ligatures.
3180 || 0xFEF5 <= i && i <= 0xFEFC
3181 // FIXME: why are they excluded?
3182 || 0x01F6 <= i && i <= 0x01F9
3183 || 0x0218 <= i && i <= 0x0233
3184 || 0x02A9 <= i && i <= 0x02AD
3185 || 0x02EA <= i && i <= 0x02EE
3186 || 0x0349 <= i && i <= 0x036F
3187 || 0x0488 <= i && i <= 0x048F
3188 || 0x04D0 <= i && i <= 0x04FF
3189 || 0x0500 <= i && i <= 0x050F // actually it matters only for 2.0
3190 || 0x06D6 <= i && i <= 0x06ED
3191 || 0x06FA <= i && i <= 0x06FE
3192 || 0x2048 <= i && i <= 0x204D
3193 || 0x20e4 <= i && i <= 0x20ea
3194 || 0x213C <= i && i <= 0x214B
3195 || 0x21EB <= i && i <= 0x21FF
3196 || 0x22F2 <= i && i <= 0x22FF
3197 || 0x237B <= i && i <= 0x239A
3198 || 0x239B <= i && i <= 0x23CF
3199 || 0x24EB <= i && i <= 0x24FF
3200 || 0x2596 <= i && i <= 0x259F
3201 || 0x25F8 <= i && i <= 0x25FF
3202 || 0x2672 <= i && i <= 0x2689
3203 || 0x2768 <= i && i <= 0x2775
3204 || 0x27d0 <= i && i <= 0x27ff
3205 || 0x2900 <= i && i <= 0x2aff
3206 || 0x3033 <= i && i <= 0x303F
3207 || 0x31F0 <= i && i <= 0x31FF
3208 || 0x3250 <= i && i <= 0x325F
3209 || 0x32B1 <= i && i <= 0x32BF
3210 || 0x3371 <= i && i <= 0x337B
3211 || 0xFA30 <= i && i <= 0xFA6A
// Last stage: decide by Unicode category.
3215 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3217 case UnicodeCategory.PrivateUse:
3218 case UnicodeCategory.Surrogate:
3220 // ignored by nature
3221 case UnicodeCategory.Format:
3222 case UnicodeCategory.OtherNotAssigned:
3229 // To check IsIgnorable sanity, try the driver below under MS.NET.
3232 public static void Main ()
3234 for (int i = 0; i <= char.MaxValue; i++)
3235 Dump (i, IsIgnorable (i));
3238 static void Dump (int i, bool ignore)
3240 switch (Char.GetUnicodeCategory ((char) i)) {
3241 case UnicodeCategory.PrivateUse:
3242 case UnicodeCategory.Surrogate:
3243 return; // check nothing
3247 string s2 = new string ((char) i, 10);
3248 int ret = CultureInfo.InvariantCulture.CompareInfo.Compare (s1, s2, CompareOptions.IgnoreCase);
3249 if ((ret == 0) == ignore)
3251 Console.WriteLine ("{0} : {1:x} {2}", ignore ? "o" : "x", i, Char.GetUnicodeCategory ((char) i));
3254 #endregion // IsIgnorable
3256 #region IsIgnorableSymbol
// Decides whether code point i is treated as an ignorable symbol.
// The method consults IsIgnorable (i) first and then goes through
// explicit code point lists, a few code point ranges, the Unicode
// category of the character, and finally a chain of Char.Is* tests.
// NOTE(review): apart from the Surrogate case ("return false"), the
// switch headers and return statements are absent from this listing, so
// the outcome of each group cannot be read from here -- confirm against
// the full source.
3257 static bool IsIgnorableSymbol (int i)
3259 if (IsIgnorable (i))
3264 case 0x00b5: case 0x01C0: case 0x01C1:
3265 case 0x01C2: case 0x01C3: case 0x01F6:
3266 case 0x01F7: case 0x01F8: case 0x01F9:
3267 case 0x02D0: case 0x02EE: case 0x037A:
3268 case 0x03D7: case 0x03F3:
3269 case 0x0400: case 0x040d:
3270 case 0x0450: case 0x045d:
3271 case 0x048C: case 0x048D:
3272 case 0x048E: case 0x048F:
3273 case 0x0587: case 0x0640: case 0x06E5:
3274 case 0x06E6: case 0x06FA: case 0x06FB:
3275 case 0x06FC: case 0x093D: case 0x0950:
3276 case 0x1E9B: case 0x2139: case 0x3006:
3277 case 0x3033: case 0x3034: case 0x3035:
3278 case 0xFE7E: case 0xFE7F:
3280 case 0x16EE: case 0x16EF: case 0x16F0:
3282 case 0x2183: // ROMAN NUMERAL REVERSED ONE HUNDRED
3283 case 0x3007: // IDEOGRAPHIC NUMBER ZERO
3284 case 0x3038: // HANGZHOU NUMERAL TEN
3285 case 0x3039: // HANGZHOU NUMERAL TWENTY
3286 case 0x303a: // HANGZHOU NUMERAL THIRTY
3292 case 0x02B9: case 0x02BA: case 0x02C2:
3293 case 0x02C3: case 0x02C4: case 0x02C5:
3294 case 0x02C8: case 0x02CC: case 0x02CD:
3295 case 0x02CE: case 0x02CF: case 0x02D2:
3296 case 0x02D3: case 0x02D4: case 0x02D5:
3297 case 0x02D6: case 0x02D7: case 0x02DE:
3298 case 0x02E5: case 0x02E6: case 0x02E7:
3299 case 0x02E8: case 0x02E9:
3300 case 0x309B: case 0x309C:
3302 case 0x055A: // Armenian Apos
3303 case 0x05C0: // Hebrew Punct
3304 case 0x0E4F: // Thai FONGMAN
3305 case 0x0E5A: // Thai ANGKHANKHU
3306 case 0x0E5B: // Thai KHOMUT
3308 case 0x09F2: // Bengali Rupee Mark
3309 case 0x09F3: // Bengali Rupee Sign
3311 case 0x221e: // INF.
3320 if (0xFE70 <= i && i < 0xFE7C // ARABIC LIGATURES B
3322 || 0x0501 <= i && i <= 0x0510 // CYRILLIC KOMI
3323 || 0xFA30 <= i && i < 0xFA70 // CJK COMPAT
3328 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3330 case UnicodeCategory.Surrogate:
3331 return false; // inconsistent
3333 case UnicodeCategory.SpacingCombiningMark:
3334 case UnicodeCategory.EnclosingMark:
3335 case UnicodeCategory.NonSpacingMark:
3336 case UnicodeCategory.PrivateUse:
3338 if (0x064B <= i && i <= 0x0652) // Arabic
3342 case UnicodeCategory.Format:
3343 case UnicodeCategory.OtherNotAssigned:
3350 // latin in a circle
3351 0x249A <= i && i <= 0x24E9
3352 || 0x2100 <= i && i <= 0x2132
3354 || 0x3196 <= i && i <= 0x31A0
3356 || 0x3200 <= i && i <= 0x321C
3358 || 0x322A <= i && i <= 0x3243
3360 || 0x3260 <= i && i <= 0x32B0
3361 || 0x32D0 <= i && i <= 0x3357
3362 || 0x337B <= i && i <= 0x33DD
// Within the ranges above the flag is the negation of IsLetterOrDigit.
3364 use = !Char.IsLetterOrDigit ((char) i);
3368 // This "Digit" rule is a mystery.
3369 // It filters some symbols out.
3370 if (Char.IsLetterOrDigit ((char) i))
3372 if (Char.IsNumber ((char) i))
3374 if (Char.IsControl ((char) i)
3375 || Char.IsSeparator ((char) i)
3376 || Char.IsPunctuation ((char) i))
3378 if (Char.IsSymbol ((char) i))
3381 // FIXME: should check more
3386 // To check IsIgnorableSymbol sanity, try the driver below under MS.NET.
3388 public static void Main ()
3390 CompareInfo ci = CultureInfo.InvariantCulture.CompareInfo;
3391 for (int i = 0; i <= char.MaxValue; i++) {
3392 UnicodeCategory uc = Char.GetUnicodeCategory ((char) i);
3393 if (uc == UnicodeCategory.Surrogate)
3396 bool ret = IsIgnorableSymbol (i);
3398 string s1 = "TEST ";
3399 string s2 = "TEST " + (char) i;
3401 int result = ci.Compare (s1, s2, CompareOptions.IgnoreSymbols);
3403 if (ret != (result == 0))
3404 Console.WriteLine ("{0} : {1:x}[{2}]({3})",
3405 ret ? "should not ignore" :
// Decides whether code point i is treated as an ignorable non-spacing
// character. IsIgnorable (i) is consulted first, then two explicit code
// point lists and two groups of ranges; any code point not decided by
// those falls through to the final statement, which reports whether its
// Unicode category is NonSpacingMark.
// NOTE(review): the switch header and the return statements of the
// earlier groups are absent from this listing, so their outcomes cannot
// be read from here -- confirm against the full source.
3414 static bool IsIgnorableNonSpacing (int i)
3416 if (IsIgnorable (i))
3420 case 0x02C8: case 0x02DE: case 0x0559: case 0x055A:
3421 case 0x05C0: case 0x0ABD: case 0x0CD5: case 0x0CD6:
3422 case 0x309B: case 0x309C: case 0xFF9E: case 0xFF9F:
3424 case 0x02D0: case 0x0670: case 0x0901: case 0x0902:
3425 case 0x094D: case 0x0962: case 0x0963: case 0x0A41:
3426 case 0x0A42: case 0x0A47: case 0x0A48: case 0x0A4B:
3427 case 0x0A4C: case 0x0A81: case 0x0A82: case 0x0B82:
3428 case 0x0BC0: case 0x0CBF: case 0x0CC6: case 0x0CCC:
3429 case 0x0CCD: case 0x0E4E:
3433 if (0x02b9 <= i && i <= 0x02c5
3434 || 0x02cc <= i && i <= 0x02d7
3435 || 0x02e4 <= i && i <= 0x02ef
3436 || 0x20DD <= i && i <= 0x20E0
// 0x00652 below is simply 0x0652 written with an extra leading zero.
3440 if (0x064B <= i && i <= 0x00652
3441 || 0x0941 <= i && i <= 0x0948
3442 || 0x0AC1 <= i && i <= 0x0ACD
3443 || 0x0C3E <= i && i <= 0x0C4F
3444 || 0x0E31 <= i && i <= 0x0E3F
3448 return Char.GetUnicodeCategory ((char) i) ==
3449 UnicodeCategory.NonSpacingMark;
3452 // We can reuse IsIgnorableSymbol testcode
3453 // for IsIgnorableNonSpacing.
// One entry of the character map: the sort key category plus level 1
// and level 2 weights of a character.
// NOTE(review): the type header, the Level1 field and most of the
// constructor body are not visible in this listing. Callers read
// map [i].Defined on freshly allocated array slots, so this is
// presumably a struct whose constructor sets Defined -- confirm.
3459 public byte Category;
3461 public byte Level2; // It is always single byte.
// Tested by callers to tell whether a character already has an entry.
3462 public bool Defined;
3464 public CharMapEntry (byte category, byte level1, byte level2)
3466 Category = category;
// Pairs a code point (CP) with a JIS value (JIS); JISComparer orders
// instances by the JIS field.
// NOTE(review): the type header and the constructor body are not visible
// in this listing; presumably cp and cpJIS are stored in CP and JIS --
// confirm.
3475 public readonly int CP;
3476 public readonly int JIS;
3478 public JISCharacter (int cp, int cpJIS)
// IComparer that orders JISCharacter objects by ascending JIS value.
3485 class JISComparer : IComparer
// Shared instance (the initializer expression is not visible in this listing).
3487 public static readonly JISComparer Instance =
// Both arguments must be JISCharacter instances; the casts throw
// otherwise. The result is the difference of the two JIS values.
// NOTE(review): assumes JIS values are small enough for the subtraction
// not to overflow -- confirm.
3490 public int Compare (object o1, object o2)
3492 JISCharacter j1 = (JISCharacter) o1;
3493 JISCharacter j2 = (JISCharacter) o2;
3494 return j1.JIS - j2.JIS;
// Pairs a code point (CP) with a name (Name); NonJISComparer orders
// instances by the Name field.
// NOTE(review): the constructor body is not visible in this listing;
// presumably it stores cp and name in CP and Name -- confirm.
3498 class NonJISCharacter
3500 public readonly int CP;
3501 public readonly string Name;
3503 public NonJISCharacter (int cp, string name)
// IComparer that orders NonJISCharacter objects by Name using an
// ordinal (culture-independent) string comparison.
3510 class NonJISComparer : IComparer
// Shared instance.
3512 public static readonly NonJISComparer Instance =
3513 new NonJISComparer ();
// Both arguments must be NonJISCharacter instances; the casts throw
// otherwise.
3515 public int Compare (object o1, object o2)
3517 NonJISCharacter j1 = (NonJISCharacter) o1;
3518 NonJISCharacter j2 = (NonJISCharacter) o2;
3519 return string.CompareOrdinal (j1.Name, j2.Name);
// IComparer for DictionaryEntry items whose Value is a boxed decimal and
// whose Key is a boxed int. Entries are compared by Value first.
// NOTE(review): the statements following the value comparison are not
// visible in this listing; the keys are unboxed afterwards, so they
// presumably serve as the tie-breaker when the values are equal --
// confirm.
3523 class DecimalDictionaryValueComparer : IComparer
// Shared instance; the constructor is private, so this is the only one.
3525 public static readonly DecimalDictionaryValueComparer Instance
3526 = new DecimalDictionaryValueComparer ();
3528 private DecimalDictionaryValueComparer ()
// Both arguments must be DictionaryEntry values; the casts throw
// otherwise.
3532 public int Compare (object o1, object o2)
3534 DictionaryEntry e1 = (DictionaryEntry) o1;
3535 DictionaryEntry e2 = (DictionaryEntry) o2;
3536 // FIXME: in case of 0, compare decomposition categories
3537 int ret = Decimal.Compare ((decimal) e1.Value, (decimal) e2.Value);
3540 int i1 = (int) e1.Key;
3541 int i2 = (int) e2.Key;
// IComparer for DictionaryEntry items whose Value is a string and whose
// Key is a boxed int. Entries are compared by Value first.
// NOTE(review): the statements following the string comparison are not
// visible in this listing; the keys are unboxed afterwards, so they
// presumably serve as the tie-breaker when the values are equal --
// confirm.
3546 class StringDictionaryValueComparer : IComparer
// Shared instance; the constructor is private, so this is the only one.
3548 public static readonly StringDictionaryValueComparer Instance
3549 = new StringDictionaryValueComparer ();
3551 private StringDictionaryValueComparer ()
// Both arguments must be DictionaryEntry values; the casts throw
// otherwise.
3555 public int Compare (object o1, object o2)
3557 DictionaryEntry e1 = (DictionaryEntry) o1;
3558 DictionaryEntry e2 = (DictionaryEntry) o2;
// NOTE(review): String.Compare (string, string) is a culture-sensitive
// comparison using the current culture, unlike the ordinal comparison in
// NonJISComparer, so the resulting order can depend on the culture the
// generator runs under -- confirm that this is intended.
3559 int ret = String.Compare ((string) e1.Value, (string) e2.Value);
3562 int i1 = (int) e1.Key;
3563 int i2 = (int) e2.Key;
// IComparer for boxed char values that compares the sort keys which
// CollationElementTable reports for the two characters.
3568 class UCAComparer : IComparer
// Shared instance; the constructor is private, so this is the only one.
3570 public static readonly UCAComparer Instance
3571 = new UCAComparer ();
3573 private UCAComparer ()
// Both arguments must be boxed chars; the unboxing casts throw
// otherwise. Only the first min (l1, l2) sort keys of the two characters
// are walked; for each pair the Primary, Secondary, Thirtiary and
// Quarternary members are subtracted in that order.
// NOTE(review): the tests on v between the subtractions and the value
// returned after the loop are not visible in this listing; presumably
// the first non-zero difference is returned -- confirm.
3577 public int Compare (object o1, object o2)
3579 char i1 = (char) o1;
3580 char i2 = (char) o2;
3582 int l1 = CollationElementTable.GetSortKeyCount (i1);
3583 int l2 = CollationElementTable.GetSortKeyCount (i2);
3584 int l = l1 > l2 ? l2 : l1;
3586 for (int i = 0; i < l; i++) {
3587 SortKeyValue k1 = CollationElementTable.GetSortKey (i1, i);
3588 SortKeyValue k2 = CollationElementTable.GetSortKey (i2, i);
3589 int v = k1.Primary - k2.Primary;
3592 v = k1.Secondary - k2.Secondary;
3595 v = k1.Thirtiary - k2.Thirtiary;
3598 v = k1.Quarternary - k2.Quarternary;
// Members of the Tailoring type: a list of tailoring maps plus the
// lcid / alias / frenchSort values exposed through properties.
// NOTE(review): the type header, the backing field declarations, the
// constructor bodies and the property headers for lcid and alias are
// not visible in this listing.
// Holds the ITailoringMap items in the order they were added.
3611 ArrayList items = new ArrayList ();
3613 public Tailoring (int lcid)
3618 public Tailoring (int lcid, int alias)
3625 get { return lcid; }
3629 get { return alias; }
3632 public bool FrenchSort {
3633 get { return frenchSort; }
3634 set { frenchSort = value; }
// Appends a DiacriticalMap built from the two bytes.
3637 public void AddDiacriticalMap (byte target, byte replace)
3639 items.Add (new DiacriticalMap (target, replace));
// Appends a SortKeyMap for the source string and its sort key bytes.
3642 public void AddSortKeyMap (string source, byte [] sortkey)
3644 items.Add (new SortKeyMap (source, sortkey));
// Appends a ReplacementMap from source to replace.
3647 public void AddReplacementMap (string source, string replace)
3649 items.Add (new ReplacementMap (source, replace));
// Concatenates the ToCharArray() output of every item, in insertion
// order, into a single char array.
3652 public char [] ItemToCharArray ()
3654 ArrayList al = new ArrayList ();
3655 foreach (ITailoringMap m in items)
3656 al.AddRange (m.ToCharArray ());
3657 return al.ToArray (typeof (char)) as char [];
// Implemented by every tailoring item; Tailoring.ItemToCharArray()
// concatenates the arrays returned by ToCharArray().
3660 interface ITailoringMap
// Returns the item encoded as chars; in the implementations visible in
// this file the first char is a kind code.
3662 char [] ToCharArray ();
// Tailoring item made of two bytes, Target and Replace.
// NOTE(review): the constructor body is not visible in this listing;
// presumably it stores its arguments in Target and Replace -- confirm.
3665 class DiacriticalMap : ITailoringMap
3667 public readonly byte Target;
3668 public readonly byte Replace;
3670 public DiacriticalMap (byte target, byte replace)
// Encodes the item as three chars: kind code 2, Target, Replace.
3676 public char [] ToCharArray ()
3678 char [] ret = new char [3];
3679 ret [0] = (char) 02; // kind:DiacriticalMap
3680 ret [1] = (char) Target;
3681 ret [2] = (char) Replace;
// Tailoring item that associates a source string with sort key bytes.
// NOTE(review): the constructor body is not visible in this listing;
// presumably it stores its arguments in Source and SortKey -- confirm.
3686 class SortKeyMap : ITailoringMap
3688 public readonly string Source;
3689 public readonly byte [] SortKey;
3691 public SortKeyMap (string source, byte [] sortkey)
// Encodes the item into Source.Length + 7 chars:
//   [0]                    kind code 1
//   [1 .. Length]          the chars of Source
//   [Length+2 .. Length+5] SortKey [0] .. SortKey [3]
// Exactly four sort key bytes are read, so SortKey must hold at least
// four elements. The slots at Length+1 and Length+6 are not written by
// the statements visible in this listing (a new array starts zeroed);
// original line 3703 and the lines after 3705 are not visible -- confirm.
3697 public char [] ToCharArray ()
3699 char [] ret = new char [Source.Length + 7];
3700 ret [0] = (char) 01; // kind:SortKeyMap
3701 for (int i = 0; i < Source.Length; i++)
3702 ret [i + 1] = Source [i];
3704 for (int i = 0; i < 4; i++)
3705 ret [i + Source.Length + 2] = (char) SortKey [i];
// Tailoring item that maps a source string to a replacement string.
// NOTE(review): the constructor body is not visible in this listing;
// presumably it stores its arguments in Source and Replace -- confirm.
3710 class ReplacementMap : ITailoringMap
3712 public readonly string Source;
3713 public readonly string Replace;
3715 public ReplacementMap (string source, string replace)
// Encodes the item into Source.Length + Replace.Length + 3 chars:
// kind code 3 first, then the chars of Source, then the chars of
// Replace, written through the running position "pos".
// NOTE(review): the declaration of pos and the statements between and
// after the two loops are not visible in this listing; the two spare
// slots are presumably separators/terminators -- confirm.
3721 public char [] ToCharArray ()
3723 char [] ret = new char [Source.Length + Replace.Length + 3];
3724 ret [0] = (char) 03; // kind:ReplaceMap
3726 for (int i = 0; i < Source.Length; i++)
3727 ret [pos++] = Source [i];
3730 for (int i = 0; i < Replace.Length; i++)
3731 ret [pos++] = Replace [i];