3 // There are two kinds of sort keys: those that are computed and those that are laid out
4 // as an indexed array. Computed sort keys are:
6 // - CJK, which largely vary depending on LCID (namely kr,jp,zh-CHS,zh-TW)
10 // Also, a different index table should be prepared for composite characters.
12 // Everything else should use the precomputed index array.
16 // * sortkey getter signature
18 // int GetSortKey (string s, int index, byte [] buf)
19 // Stores the sort key for the corresponding character element into buf and
20 // returns the length of the consumed _source_ character element in s.
22 // * character length to consume; default implementation
24 // If diacritics follow the base character, they are consumed as well
25 // and are considered part of the character element.
29 using System.Collections;
30 using System.Globalization;
32 namespace Mono.Globalization.Unicode
34 internal class MSCompatSortKeyTableGenerator
/// <summary>
/// Program entry point: creates a generator instance and invokes its Run method.
/// </summary>
public static void Main ()
{
    MSCompatSortKeyTableGenerator generator = new MSCompatSortKeyTableGenerator ();
    generator.Run ();
}
// Running fill index per category. It is indexed by the category byte, so it
// needs one slot for every possible byte value (0x00-0xFF); 255 slots would
// leave category 0xFF out of range.
byte [] fillIndex = new byte [byte.MaxValue + 1]; // by category
// One entry per UTF-16 code unit.
CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1];

// Characters that are registered with an all-zero entry.
char [] specialIgnore = new char [] {
    '\u3099', '\u309A', '\u309B', '\u309C', '\u0BCD',
    '\u0E47', '\u0E4C', '\uFF9E', '\uFF9F'
};
// FIXME: need more love (as always)
// Letters that receive hand-assigned weights. alphaWeights [i] is the weight
// passed to AddAlphaMap for alphabets [i], so both arrays must have the same
// number of elements (29).
char [] alphabets = new char [] {'A', 'B', 'C', 'D', 'E', 'F',
    'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
    'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    '\u0292', '\u01BE', '\u0298'};
// The weight for 'Y' (0xA7) was missing, which left this table one element
// short of alphabets and shifted every weight after 'X'.
byte [] alphaWeights = new byte [] {
    2, 9, 0xA, 0x1A, 0x21,          // A B C D E
    0x23, 0x25, 0x2C, 0x32, 0x35,   // F G H I J
    0x36, 0x48, 0x51, 0x70, 0x7C,   // K L M N O
    0x7E, 0x89, 0x8A, 0x91, 0x99,   // P Q R S T
    0x9F, 0xA2, 0xA4, 0xA6, 0xA7,   // U V W X Y
    0xA9, 0xAA, 0xB3, 0xB4};        // Z U+0292 U+01BE U+0298
#region Specially ignored // 01
// Give every specially ignored character an all-zero entry; per the original
// note, assigning an entry is what raises its "Defined" flag.
for (int n = 0; n < specialIgnore.Length; n++) {
    int codeUnit = specialIgnore [n];
    map [codeUnit] = new CharMapEntry (0, 0, 0);
}
71 #region Variable weights
72 // Controls : 06 03 - 06 3D
// Scans every UTF-16 code unit and registers each control character that is
// not white space under category 6, letting AddCharMap advance the fill index.
// NOTE(review): `c` and `uc` are not declared in the visible lines and the
// loop's closing brace is not shown; confirm against the complete file.
74 for (int i = 0; i < 65536; i++) {
76 uc = Char.GetUnicodeCategory (c);
77 if (uc == UnicodeCategory.Control &&
78 !Char.IsWhiteSpace (c))
79 AddCharMap (c, 6, true);
// Apostrophe : 06 80, the slot right before the hyphen/dash range
// (06 81 - 06 90). The weight is written in hex like every other weight in
// this table; decimal 80 would have been 0x50.
map ['\''] = new CharMapEntry (6, 0x80, 1);
// U+FF07 is FULLWIDTH APOSTROPHE (U+FF63 is HALFWIDTH RIGHT CORNER BRACKET).
map ['\uFF07'] = new CharMapEntry (6, 0x80, 1); // full
// Hyphen/Dash : 06 81 - 06 90
// Every code unit classified as dash punctuation is passed to
// AddCharMapGroup for category 6.
for (int code = 0; code < 65536; code++) {
    char candidate = (char) code;
    UnicodeCategory kind = Char.GetUnicodeCategory (candidate);
    if (kind != UnicodeCategory.DashPunctuation)
        continue;
    AddCharMapGroup (candidate, 6, true, true);
}
// Arabic variable weight chars 06 A0 -
// U+064B..U+0650 first (tail forms), then U+0652 and U+0651 in that order.
for (char mark = '\u064B'; mark <= '\u0650'; mark++) {
    AddCharMapGroup (mark, 6, true, true);
}
char [] trailingArabicMarks = new char [] {'\u0652', '\u0651'};
foreach (char mark in trailingArabicMarks) {
    AddCharMapGroup (mark, 6, false, true);
}
#region Nonspacing marks // 01
// FIXME: 01 03 - 01 B6 ... annoyance :(

// Combining diacritical marks: 01 DC -

// LAMESPEC: It should not stop at '\u20E1'. There are
// a few more characters (that however results in
// overflow of level 2 unless we start before 0xDD).
fillIndex [1] = 0xDC;
for (char mark = '\u20D0'; mark <= '\u20E1'; mark++) {
    AddCharMap (mark, 1, true);
}
#region Whitespaces // 07 03 -
// Registration order matters: each call consumes the next fill index of
// category 7.
AddCharMapGroup (' ', 7, false, true);
AddCharMap ('\u00A0', 7, true);
// U+0009 .. U+000D
for (char ctl = '\u0009'; ctl <= '\u000D'; ctl++) {
    AddCharMap (ctl, 7, true);
}
// U+2000 .. U+200B
for (char space = '\u2000'; space <= '\u200B'; space++) {
    AddCharMap (space, 7, true);
}
foreach (char separator in new char [] {'\u2028', '\u2029'}) {
    AddCharMapGroup (separator, 7, false, true);
}

// LAMESPEC: Windows developers seem to have thought
// that those characters are kind of whitespaces,
// while they aren't.
// U+2422 is the blank symbol, U+2423 the open box.
foreach (char pseudoSpace in new char [] {'\u2422', '\u2423'}) {
    AddCharMapGroup (pseudoSpace, 7, false, true);
}
#region ASCII non-alphanumeric // 07
// non-alphanumeric ASCII except for: + - < = > '
for (int code = 0x21; code < 0x7F; code++) {
    char ascii = (char) code;
    // Letters, digits and the six excluded symbols are not added here.
    bool skipped = Char.IsLetterOrDigit (ascii)
        || "+-<=>'".IndexOf (ascii) >= 0;
    if (!skipped)
        AddCharMapGroup (ascii, 7, false, true);
}

// FIXME: for 07 xx we need more love.
153 #region Numbers // 0C 02 - 0C E1
// NOTE(review): the region header mentions 0C, but every call below passes
// category 9; confirm which category is intended.
156 // 9F8 : Bengali "one less than the denominator"
157 AddCharMap ('\u09F8', 9, true);
// Looks for the code units accepted by Char.IsNumber.
// NOTE(review): the statement that stores them into `numbers` is not visible
// in this listing.
159 ArrayList numbers = new ArrayList ();
160 for (int i = 0; i < 65536; i++)
161 if (Char.IsNumber ((char) i))
// Pairs each collected code unit with a numeric value and sorts by that value.
// NOTE(review): the comparer and the loop below unbox the value as decimal, so
// GetDecimalValue must yield a boxed decimal; verify its return type.
164 ArrayList numberValues = new ArrayList ();
165 foreach (int i in numbers)
166 numberValues.Add (new DictionaryEntry (i, CharUnicodeInfo.GetDecimalValue ((char) i)));
167 numberValues.Sort (DictionaryValueComparer.Instance);
// Walks the sorted entries; prevValue holds the largest value seen so far.
// NOTE(review): what happens when a larger value is reached, and the closing
// braces, are not visible in this listing.
168 decimal prevValue = -1;
169 foreach (DictionaryEntry de in numberValues) {
170 decimal currValue = (decimal) de.Value;
171 if (prevValue < currValue) {
172 prevValue = currValue;
175 AddCharMap ((char) ((int) de.Key), 9, false);
// U+221E is registered at fill index 0xFF of category 9.
179 fillIndex [9] = 0xFF;
180 AddCharMap ('\u221E', 9, true);
#region Latin alphabets
// Pairs each letter in alphabets with the weight stored at the same position
// in alphaWeights and registers it under category 0x0E.
int latinLetterCount = alphabets.Length;
for (int pos = 0; pos < latinLetterCount; pos++) {
    char letter = alphabets [pos];
    byte weight = alphaWeights [pos];
    AddAlphaMap (letter, 0xE, weight);
}
// Category 0x0F, first run: letters in U+0380..U+03CE, fill index reset to 2.
fillIndex [0xF] = 2;
for (char letter = '\u0380'; letter < '\u03CF'; letter++) {
    if (!Char.IsLetter (letter))
        continue;
    AddLetterMap (letter, 0xF, true);
}
// Second run: letters in U+03D0..U+03FF, fill index reset to 0x40.
fillIndex [0xF] = 0x40;
for (char letter = '\u03D0'; letter < '\u0400'; letter++) {
    if (!Char.IsLetter (letter))
        continue;
    AddLetterMap (letter, 0xF, true);
}
// Cyrillic - UCA order w/ some modification
fillIndex [0x10] = 0x3;
// FIXME: For \u0400-\u045F we need "ordered Cyrillic"
// table which is mostly from UCA DUCET.
// Each accepted letter is registered without auto-increment; the fill index
// is then advanced by 3 manually.
for (int pos = 0; pos < orderedCyrillic.Length; pos++) {
    char letter = orderedCyrillic [pos];
    if (!Char.IsLetter (letter))
        continue;
    AddLetterMap (letter, 0x10, false);
    fillIndex [0x10] += 3;
}
// U+0460..U+0480 follow in code point order with the same step.
for (char letter = '\u0460'; letter < '\u0481'; letter++) {
    if (!Char.IsLetter (letter))
        continue;
    AddLetterMap (letter, 0x10, false);
    fillIndex [0x10] += 3;
}
// Category 0x11: letters in U+0531..U+0585, fill index starting at 3.
fillIndex [0x11] = 0x3;
for (char letter = '\u0531'; letter < '\u0586'; letter++) {
    if (!Char.IsLetter (letter))
        continue;
    AddLetterMap (letter, 0x11, true);
}
// Category 0x12: letters in U+05D0..U+05FE, fill index starting at 3.
fillIndex [0x12] = 0x3;
for (char letter = '\u05D0'; letter < '\u05FF'; letter++) {
    if (!Char.IsLetter (letter))
        continue;
    AddLetterMap (letter, 0x12, true);
}
// Category 0x13. The fill index is first set to 3 and then moved to 0x84
// before the letters in U+0674..U+06D5 are added.
fillIndex [0x13] = 0x3;
// FIXME: I still need more love on presentation form B
// (The note above used to sit outside any comment, which is a syntax error.)
fillIndex [0x13] = 0x84;
for (int i = 0x0674; i < 0x06D6; i++)
    if (Char.IsLetter ((char) i))
        AddLetterMap ((char) i, 0x13, true);
// Category 0x14 is filled in three runs: U+0901..U+0904 (step 2),
// U+0905..U+0939 (step 4) and U+093E..U+094E (step 2). Only code units
// accepted by Char.IsLetter are added; the fill index is advanced manually.
// NOTE(review): most code units in the first and third runs are combining
// signs, which Char.IsLetter is not expected to accept; confirm that these
// loops add what is intended.
for (char sign = '\u0901'; sign < '\u0905'; sign++) {
    if (!Char.IsLetter (sign))
        continue;
    AddLetterMap (sign, 0x14, false);
    fillIndex [0x14] += 2;
}
for (char letter = '\u0905'; letter < '\u093A'; letter++) {
    if (!Char.IsLetter (letter))
        continue;
    AddLetterMap (letter, 0x14, false);
    fillIndex [0x14] += 4;
}
for (char vowelSign = '\u093E'; vowelSign < '\u094F'; vowelSign++) {
    if (!Char.IsLetter (vowelSign))
        continue;
    AddLetterMap (vowelSign, 0x14, false);
    fillIndex [0x14] += 2;
}
// Category 0x15: walks U+0980..U+09FE and dispatches on each code unit's
// Unicode category before calling AddLetterMap.
// NOTE(review): lines appear to be missing from this loop (most case labels
// and the closing braces). As shown, fillIndex [0x15] is reset to 0x3B on
// every iteration and `DecimalDigitNumber` lacks its `UnicodeCategory.`
// qualifier; confirm against the complete file.
263 fillIndex [0x15] = 02;
264 for (int i = 0x0980; i < 0x9FF; i++) {
266 fillIndex [0x15] = 0x3B;
267 switch (Char.GetUnicodeCategory ((char) i)) {
269 case DecimalDigitNumber:
273 AddLetterMap ((char) i, 0x15, true);
// Category 0x16: entries of orderedGurmukhi are registered in table order,
// four weights apart.
fillIndex [0x16] = 02;
// FIXME: orderedGurmukhi needed from UCA
for (int i = 0; i < orderedGurmukhi.Length; i++) {
    char c = orderedGurmukhi [i];
    // U+0A3C, U+0A4D and U+0A66..U+0A71 are excluded. Without the `continue`
    // the test was inverted: only these characters were registered.
    if (c == '\u0A3C' || c == '\u0A4D' ||
        '\u0A66' <= c && c <= '\u0A71')
        continue;
    AddLetterMap (c, 0x16, false);
    fillIndex [0x16] += 4;
}
// Category 0x17: every entry of orderedGujarati is registered in table
// order, four weights apart.
fillIndex [0x17] = 2;
// FIXME: orderedGujarati needed from UCA
for (int pos = 0; pos < orderedGujarati.Length; pos++) {
    char letter = orderedGujarati [pos];
    AddLetterMap (letter, 0x17, false);
    fillIndex [0x17] += 4;
}
// Category 0x18: walks U+0B00..U+0B7E and dispatches on each code unit's
// Unicode category before calling AddLetterMap.
// NOTE(review): most case labels and the closing braces are not visible, and
// `DecimalDigitNumber` lacks its `UnicodeCategory.` qualifier; confirm
// against the complete file.
298 fillIndex [0x18] = 02;
299 for (int i = 0x0B00; i < 0x0B7F; i++) {
300 switch (Char.GetUnicodeCategory ((char) i)) {
302 case DecimalDigitNumber:
305 AddLetterMap ((char) i, 0x18, true);
// Category 0x19 (Tamil). U+0BD7 takes fill index 2, then the letters below
// U+0B94 are laid out from 0x0A in steps of 2.
fillIndex [0x19] = 2;
AddCharMap ('\u0BD7', 0x19, false);
fillIndex [0x19] = 0xA;
// The previous bounds (from 0x0BD7 up to 0x0B94) described an empty range, so
// this loop never ran. Start at U+0B82, the first assigned code point of the
// block; Char.IsLetter filters out the non-letters.
for (int i = 0x0B82; i < 0x0B94; i++) {
    if (Char.IsLetter ((char) i)) {
        AddCharMap ((char) i, 0x19, false);
        fillIndex [0x19] += 2;
    }
}
// U+0B94 is pinned at 0x24; the ordered table continues from 0x26.
fillIndex [0x19] = 0x24;
AddCharMap ('\u0B94', 0x19, false);
fillIndex [0x19] = 0x26;
// FIXME: we need to have constant array for Tamil
// consonants. Windows have almost similar sequence
// to TAM from tamilnet but a bit different in Grantha
for (int i = 0; i < orderedTamil.Length; i++) {
    // Read from the table that bounds the loop (this used to index
    // orderedGujarati by mistake).
    char c = orderedTamil [i];
    AddLetterMap (c, 0x19, false);
    fillIndex [0x19] += 4;
}
// Category 0x1A: U+0C00..U+0C61 in code point order, three weights apart.
fillIndex [0x1A] = 0x4;
for (int i = 0x0C00; i < 0x0C62; i++) {
    // U+0C55 and U+0C56 are skipped.
    if (i == 0x0C55 || i == 0x0C56)
        continue;
    AddCharMap ((char) i, 0x1A, false);
    fillIndex [0x1A] += 3;
    // U+0C0B and U+0C0C are immediately followed by U+0C60 and U+0C61.
    char supp = (i == 0x0C0B) ? '\u0C60':
        i == 0x0C0C ? '\u0C61' : char.MinValue;
    // No companion for this code unit. Without this `continue`, U+0000 was
    // being mapped instead of the companion.
    if (supp == char.MinValue)
        continue;
    AddCharMap (supp, 0x1A, false);
    fillIndex [0x1A] += 3;
    // NOTE(review): the loop later reaches U+0C60/U+0C61 themselves and
    // overwrites the entries written here; confirm whether that is intended.
}
// Category 0x1B: U+0C80..U+0CE4 in code point order, three weights apart.
fillIndex [0x1B] = 4;
for (int i = 0x0C80; i < 0x0CE5; i++) {
    // U+0CD5 and U+0CD6 are skipped. Without the `continue` the test was
    // inverted: only these two code units were registered.
    if (i == 0x0CD5 || i == 0x0CD6)
        continue;
    AddCharMap ((char) i, 0x1B, false);
    fillIndex [0x1B] += 3;
}
// Category 0x1C: every code unit in U+0D02..U+0D60 that IsIgnorable rejects
// is registered with consecutive weights starting at 2.
fillIndex [0x1C] = 2;
for (char candidate = '\u0D02'; candidate < '\u0D61'; candidate++) {
    if (IsIgnorable (candidate))
        continue;
    AddCharMap (candidate, 0x1C, true);
}
// Thai ... note that it breaks 0x1E wall after E2B!
// Also, all Thai characters have level 2 value 3.
fillIndex [0x1E] = 2;
// U+0E44..U+0E47 come first, with consecutive weights.
for (char lead = '\u0E44'; lead < '\u0E48'; lead++) {
    AddThaiCharMap (lead, 0x1E, true);
}
// U+0E01..U+0E2A, six weights apart.
for (char consonant = '\u0E01'; consonant < '\u0E2B'; consonant++) {
    AddThaiCharMap (consonant, 0x1E, false);
    fillIndex [0x1E] += 6;
}
// The remainder continues in category 0x1F.
fillIndex [0x1F] = 5;
for (char consonant = '\u0E2B'; consonant < '\u0E30'; consonant++) {
    AddThaiCharMap (consonant, 0x1F, false);
    fillIndex [0x1F] += 6;
}
for (char vowel = '\u0E30'; vowel < '\u0E3B'; vowel++) {
    AddThaiCharMap (vowel, 0x1F, true);
}
// some Thai characters remains.
char [] specialThai = new char [] {'\u0E45', '\u0E46',
    '\u0E4E', '\u0E4F', '\u0E5A', '\u0E5B'};
for (int pos = 0; pos < specialThai.Length; pos++) {
    AddThaiCharMap (specialThai [pos], 0x1F, true);
}
// Letters in U+0E80..U+0EDE are registered under category 0x1F with
// consecutive weights starting at 2.
fillIndex [0x1F] = 2;
for (char letter = '\u0E80'; letter < '\u0EDF'; letter++) {
    if (!Char.IsLetter (letter))
        continue;
    AddCharMap (letter, 0x1F, true);
}
// FIXME: we need an array in UCA order.
// Category 0x21: every entry of orderedGeorgian is registered in table
// order, five weights apart, starting at 5.
fillIndex [0x21] = 5;
for (int pos = 0; pos < orderedGeorgian.Length; pos++) {
    char letter = orderedGeorgian [pos];
    AddLetterMap (letter, 0x21, false);
    fillIndex [0x21] += 5;
}
// Not implemented yet: every call throws NotImplementedException.
// Presumably meant to register c in `category` using `alphaWeight`; confirm
// once the body is written.
402 private void AddAlphaMap (char c, byte category, byte alphaWeight)
404 throw new NotImplementedException ();
/// <summary>
/// Orders <see cref="DictionaryEntry"/> items by their Value, which must be a
/// boxed decimal. Use the shared <see cref="Instance"/>; the constructor is
/// private.
/// </summary>
class DictionaryValueComparer : IComparer
{
    public static readonly DictionaryValueComparer Instance
        = new DictionaryValueComparer ();

    private DictionaryValueComparer ()
    {
    }

    /// <summary>Compares the decimal values of two dictionary entries.</summary>
    /// <param name="o1">A boxed DictionaryEntry whose Value is a decimal.</param>
    /// <param name="o2">A boxed DictionaryEntry whose Value is a decimal.</param>
    /// <returns>The result of Decimal.Compare on the two values.</returns>
    public /*static*/ int Compare (object o1, object o2)
    {
        DictionaryEntry left = (DictionaryEntry) o1;
        DictionaryEntry right = (DictionaryEntry) o2;
        // FIXME: in case of 0, compare decomposition categories
        decimal leftValue = (decimal) left.Value;
        decimal rightValue = (decimal) right.Value;
        return Decimal.Compare (leftValue, rightValue);
    }
}
/// <summary>
/// Registers <paramref name="c"/> together with its small-form and full-width
/// variants in <paramref name="category"/>. The order is: small form, the
/// character itself, then the full-width form (recursively, as a group).
/// </summary>
/// <param name="c">The character to register.</param>
/// <param name="category">Category whose fill index is used.</param>
/// <param name="tail">Selects the "Tail" variant lookups of MSCompatGenerated.</param>
/// <param name="updateIndexForSelf">Passed to AddCharMap as the increment flag for <paramref name="c"/> itself.</param>
private void AddCharMapGroup (char c, byte category, bool tail, bool updateIndexForSelf)
{
    // <small> update index
    // A result of char.MinValue is treated as "no such variant".
    char c2 = tail ?
        MSCompatGenerated.ToSmallFormTail (c) :
        MSCompatGenerated.ToSmallForm (c);
    if (c2 > char.MinValue)
        AddCharMap (c2, category, true);
    // <self>
    AddCharMap (c, category, updateIndexForSelf);
    // <full>
    c2 = tail ?
        MSCompatGenerated.ToFullWidthTail (c) :
        MSCompatGenerated.ToFullWidth (c);
    if (c2 > char.MinValue)
        AddCharMapGroup (c2, category, tail, false);
}
/// <summary>
/// Stores the map entry for <paramref name="c"/> using the current fill index
/// of <paramref name="category"/>.
/// </summary>
/// <param name="c">The character to register.</param>
/// <param name="category">Category byte; also the index into fillIndex.</param>
/// <param name="increment">
/// When true the category's fill index is advanced by one afterwards. Callers
/// that step the index themselves pass false.
/// </param>
private void AddCharMap (char c, byte category, bool increment)
{
    byte index = fillIndex [category];
    // Category 1 keeps level 1 fixed at 1 and puts the running index into
    // level 2; every other category does the opposite (compare the explicit
    // entry CharMapEntry (6, 80, 1)). The level 2 selector used to be inverted.
    bool indexIsLevel2 = category == 1;
    map [(int) c] = new CharMapEntry (category,
        indexIsLevel2 ? (byte) 1 : index,
        indexIsLevel2 ? index : (byte) 1);
    // Honor the flag: advancing unconditionally broke callers that pass
    // false and then step the fill index manually.
    if (increment)
        fillIndex [category] += 1;
}
452 #region Level 3 properties (Case/Width)
// Returns the raw level 3 weight of c; per the trailing note, the caller adds
// 2 to obtain the sort key value.
// NOTE(review): most of this method's lines (notably the return statement
// after each range test) are not visible in this listing.
454 public static byte GetLevel3WeightRaw (char c) // add 2 for sortkey value
// NOTE(review): the next three character literals lack their closing quote
// ('\u11F9', '\uFFDC', '\u3164') and will not compile as written.
457 if ('\u1100' <= c && c <= '\u11F9)
459 if ('\uFFA0' <= c && c <= '\uFFDC)
461 if ('\u3130' <= c && c <= '\u3164)
464 if ('\u2776' <= c && c <= '\u277F')
466 if ('\u2780' <= c && c <= '\u2789')
// NOTE(review): this range starts at U+2776 again and overlaps the two tests
// above; if those return first, only U+278A..U+2793 can reach this branch.
468 if ('\u2776' <= c && c <= '\u2793')
470 if ('\u2160' <= c && c <= '\u216F')
472 if ('\u2181' <= c && c <= '\u2182')
475 if ('\u2135' <= c && c <= '\u2138')
// U+FE80..U+FE8E delegate to the generated presentation form B lookup.
477 if ('\uFE80' <= c && c <= '\uFE8E')
478 return MSCompatGenerated.GetArabicFormInPresentationB (c);
480 // actually I dunno the reason why they have weights.
503 switch (MSCompatGenerated.GetNormalizationType (c)) {
514 if (MSCompatGenerated.IsSmallCapital (c)) // grep "SMALL CAPITAL"
516 if (MSCompatGenerated.IsUppercase (c)) // DerivedCoreProperties
522 // TODO: implement GetArabicFormInPresentationB(),
523 // GetNormalizationType(), IsSmallCapital() and IsUppercase().
524 // (They can easily be generated.)
530 internal struct CharMapEntry
532 public readonly byte Category;
533 public readonly byte Level1;
534 public readonly byte Level2; // It is always single byte.
535 public readonly bool Defined;
537 public CharMapEntry (byte category, byte level1, byte level2)