2005-05-26 Atsushi Enomoto <atsushi@ximian.com>
[mono.git] / mcs / class / corlib / Mono.Globalization.Unicode / create-mscompat-collation-table.cs
1 //
2 //
3 // There are two kinds of sort keys: those which are computed and those
4 // which are laid out as an indexed array. Computed sort keys are:
5 //
6 //      - CJK, which largely vary depending on LCID (namely kr,jp,zh-CHS,zh-TW)
7 //      - Surrogate
8 //      - PrivateUse
9 //
10 // Also, a different index table should be prepared for composite characters.
11 //
12 // Except for them, it should use precomputed index array.
13 //
14
15 //
16 // * sortkey getter signature
17 //
18 //      int GetSortKey (string s, int index, byte [] buf)
19 //      Stores sort key for corresponding character element into buf and
20 //      returns the length of the consumed _source_ character element in s.
21 //
22 // * character length to consume; default implementation
23 //
24 //      If there is a diacritic after the base character, they are consumed
25 //      and they are considered as a part of the character element.
26 //
27
28 using System;
29 using System.Collections;
30 using System.Globalization;
31
32 namespace Mono.Globalization.Unicode
33 {
34         internal class MSCompatSortKeyTableGenerator
35         {
36                 public static void Main ()
37                 {
38                         new MSCompatSortKeyTableGenerator ().Run ();
39                 }
40
41                 byte [] fillIndex = new byte [255]; // by category
42                 CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1];
43
44                 char [] specialIgnore = new char [] {
45                         '\u3099', '\u309A', '\u309B', '\u309C', '\u0BCD',
46                         '\u0E47', '\u0E4C', '\uFF9E', '\uFF9F'
47                         };
48
49                 // FIXME: need more love (as always)
50                 char [] alphabets = new char [] {'A', 'B', 'C', 'D', 'E', 'F',
51                         'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
52                         'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
53                         '\u0292', '\u01BE', '\u0298'};
54                 byte [] alphaWeights = new byte [] {2, 9, 0xA, 0x1A, 0x21,
55                         0x23, 0x25, 0x2C, 0x32, 0x35, 0x36, 0x48, 0x51, 0x70,
56                         0x7C, 0x7E, 0x89, 0x8A, 0x91, 0x99, 0x9F, 0xA2, 0xA4,
57                         0xA6, 0xA9, 0xAA, 0xB3, 0xB4};
58
59
60                 public void Run ()
61                 {
62                         UnicodeCategory uc;
63
64                         #region Specially ignored // 01
65                         // This will raise "Defined" flag up.
66                         foreach (char c in specialIgnore)
67                                 map [(int) c] = new CharMapEntry (0, 0, 0);
68                         #endregion
69
70
71                         #region Variable weights
72                         // Controls : 06 03 - 06 3D
73                         fillIndex [6] = 3;
74                         for (int i = 0; i < 65536; i++) {
75                                 char c = (char) i;
76                                 uc = Char.GetUnicodeCategory (c);
77                                 if (uc == UnicodeCategory.Control &&
78                                         !Char.IsWhiteSpace (c))
79                                         AddCharMap (c, 6, true);
80                         }
81
82                         // Apostrophe 06 80
83                         map ['\''] = new CharMapEntry (6, 80, 1);
84                         map ['\uFF63'] = new CharMapEntry (6, 80, 1); // full
85
86                         // Hyphen/Dash : 06 81 - 06 90
87                         fillIndex [6] = 0x81;
88                         for (int i = 0; i < 65536; i++) {
89                                 if (Char.GetUnicodeCategory ((char) i)
90                                         == UnicodeCategory.DashPunctuation)
91                                         AddCharMapGroup ((char) i, 6, true, true);
92                         }
93
94                         // Arabic variable weight chars 06 A0 -
95                         fillIndex [6] = 0xA0;
96                         // vowels
97                         for (int i = 0x64B; i <= 0x650; i++)
98                                 AddCharMapGroup ((char) i, 6, true, true);
99                         // sukun
100                         AddCharMapGroup ('\u0652', 6, false, true);
101                         // shadda
102                         AddCharMapGroup ('\u0651', 6, false, true);
103                         #endregion
104
105
106                         #region Nonspacing marks // 01
107                         // FIXME: 01 03 - 01 B6 ... annoyance :(
108
109                         // Combining diacritical marks: 01 DC -
110
111                         // LAMESPEC: It should not stop at '\u20E1'. There are
112                         // a few more characters (that however results in 
113                         // overflow of level 2 unless we start before 0xDD).
114                         fillIndex [1] = 0xDC;
115                         for (int i = 0x20d0; i <= 0x20e1; i++)
116                                 AddCharMap ((char) i, 1, true);
117                         #endregion
118
119
120                         #region Whitespaces // 07 03 -
121                         fillIndex [7] = 0x3;
122                         AddCharMapGroup (' ', 7, false, true);
123                         AddCharMap ('\u00A0', 7, true);
124                         for (int i = 9; i <= 0xD; i++)
125                                 AddCharMap ((char) i, 7, true);
126                         for (int i = 0x2000; i <= 0x200B; i++)
127                                 AddCharMap ((char) i, 7, true);
128                         AddCharMapGroup ('\u2028', 7, false, true);
129                         AddCharMapGroup ('\u2029', 7, false, true);
130
131                         // LAMESPEC: Windows developers seem to have thought 
132                         // that those characters are kind of whitespaces,
133                         // while they aren't.
134                         AddCharMapGroup ('\u2422', 7, false, true); // blank symbol
135                         AddCharMapGroup ('\u2423', 7, false, true); // open box
136                         #endregion
137
138
139                         #region ASCII non-alphanumeric // 07
140                         // non-alphanumeric ASCII except for: + - < = > '
141                         for (int i = 0x21; i < 0x7F; i++) {
142                                 if (Char.IsLetterOrDigit ((char) i)
143                                         || "+-<=>'".IndexOf ((char) i) >= 0)
144                                         continue; // they are not added here.
145                                 AddCharMapGroup ((char) i, 7, false, true);
146                         }
147                         #endregion
148
149
150                         // FIXME: for 07 xx we need more love.
151
152
153                         #region Numbers // 0C 02 - 0C E1
154                         fillIndex [9] = 2;
155
156                         // 9F8 : Bengali "one less than the denominator"
157                         AddCharMap ('\u09F8', 9, true);
158
159                         ArrayList numbers = new ArrayList ();
160                         for (int i = 0; i < 65536; i++)
161                                 if (Char.IsNumber ((char) i))
162                                         numbers.Add (i);
163
164                         ArrayList numberValues = new ArrayList ();
165                         foreach (int i in numbers)
166                                 numberValues.Add (new DictionaryEntry (i, CharUnicodeInfo.GetDecimalValue ((char) i)));
167                         numberValues.Sort (DictionaryValueComparer.Instance);
168                         decimal prevValue = -1;
169                         foreach (DictionaryEntry de in numberValues) {
170                                 decimal currValue = (decimal) de.Value;
171                                 if (prevValue < currValue) {
172                                         prevValue = currValue;
173                                         fillIndex [9] += 1;
174                                 }
175                                 AddCharMap ((char) ((int) de.Key), 9, false);
176                         }
177
178                         // 221E: infinity
179                         fillIndex [9] = 0xFF;
180                         AddCharMap ('\u221E', 9, true);
181                         #endregion
182
183
184                         #region Latin alphabets
185                         for (int i = 0; i < alphabets.Length; i++) {
186                                 AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]);
187                         }
188                         #endregion
189
190                         #region Letters
191
192                         // Greek and Coptic
193                         fillIndex [0xF] = 02;
194                         for (int i = 0x0380; i < 0x03CF; i++)
195                                 if (Char.IsLetter ((char) i))
196                                         AddLetterMap ((char) i, 0xF, true);
197                         fillIndex [0xF] = 0x40;
198                         for (int i = 0x03D0; i < 0x0400; i++)
199                                 if (Char.IsLetter ((char) i))
200                                         AddLetterMap ((char) i, 0xF, true);
201
202                         // Cyrillic - UCA order w/ some modification
203                         fillIndex [0x10] = 0x3;
204                         // FIXME: For \u0400-\u045F we need "ordered Cyrillic"
205                         // table which is moslty from UCA DUCET.
206                         for (int i = 0; i < orderedCyrillic.Length; i++) {
207                                 char c = orderedCyrillic [i];
208                                 if (Char.IsLetter (c)) {
209                                         AddLetterMap (c, 0x10, false);
210                                         fillIndex [0x10] += 3;
211                                 }
212                         }
213                         for (int i = 0x0460; i < 0x0481; i++) {
214                                 if (Char.IsLetter ((char) i)) {
215                                         AddLetterMap ((char) i, 0x10, false);
216                                         fillIndex [0x10] += 3;
217                                 }
218                         }
219
220                         // Armenian
221                         fillIndex [0x11] = 0x3;
222                         for (int i = 0x0531; i < 0x0586; i++)
223                                 if (Char.IsLetter ((char) i))
224                                         AddLetterMap ((char) i, 0x11, true);
225
226                         // Hebrew
227                         fillIndex [0x12] = 0x3;
228                         for (int i = 0x05D0; i < 0x05FF; i++)
229                                 if (Char.IsLetter ((char) i))
230                                         AddLetterMap ((char) i, 0x12, true);
231
232                         // Arabic
233                         fillIndex [0x13] = 0x3;
234                         /*
235                         FIXME: I still need more love on presentation form B
236                         */
237                         fillIndex [0x13] = 0x84;
238                         for (int i = 0x0674; i < 0x06D6; i++)
239                                 if (Char.IsLetter ((char) i))
240                                         AddLetterMap ((char) i, 0x13, true);
241
242                         // Devanagari
243                         for (int i = 0x0901; i < 0x0905; i++) {
244                                 if (Char.IsLetter ((char) i)) {
245                                         AddLetterMap ((char) i, 0x14, false);
246                                         fillIndex [0x14] += 2;
247                                 }
248                         }
249                         for (int i = 0x0905; i < 0x093A; i++) {
250                                 if (Char.IsLetter ((char) i)) {
251                                         AddLetterMap ((char) i, 0x14, false);
252                                         fillIndex [0x14] += 4;
253                                 }
254                         }
255                         for (int i = 0x093E; i < 0x094F; i++) {
256                                 if (Char.IsLetter ((char) i)) {
257                                         AddLetterMap ((char) i, 0x14, false);
258                                         fillIndex [0x14] += 2;
259                                 }
260                         }
261
262                         // Bengali
263                         fillIndex [0x15] = 02;
264                         for (int i = 0x0980; i < 0x9FF; i++) {
265                                 if (i == 0x09E0)
266                                         fillIndex [0x15] = 0x3B;
267                                 switch (Char.GetUnicodeCategory ((char) i)) {
268                                 case NonSpacingMark:
269                                 case DecimalDigitNumber:
270                                 case OtherNumber:
271                                         continue;
272                                 }
273                                 AddLetterMap ((char) i, 0x15, true);
274                         }
275
276                         // Gurmukhi
277                         fillIndex [0x16] = 02;
278                         // FIXME: orderedGurmukhi needed from UCA
279                         for (int i = 0; i < orderedGurmukhi.Length; i++) {
280                                 char c = orderedGurmukhi [i];
281                                 if (c == '\u0A3C' || c == '\u0A4D' ||
282                                         '\u0A66' <= c && c <= '\u0A71')
283                                         continue;
284                                 AddLetterMap (c, 0x16, false);
285                                 fillIndex [0x16] += 4;
286                         }
287
288                         // Gujarati
289                         fillIndex [0x17] = 02;
290                         // FIXME: orderedGujarati needed from UCA
291                         for (int i = 0; i < orderedGujarati.Length; i++) {
292                                 char c = orderedGujarati [i];
293                                 AddLetterMap (c, 0x17, false);
294                                 fillIndex [0x17] += 4;
295                         }
296
297                         // Oriya
298                         fillIndex [0x18] = 02;
299                         for (int i = 0x0B00; i < 0x0B7F; i++) {
300                                 switch (Char.GetUnicodeCategory ((char) i)) {
301                                 case NonSpacingMark:
302                                 case DecimalDigitNumber:
303                                         continue;
304                                 }
305                                 AddLetterMap ((char) i, 0x18, true);
306                         }
307
308                         // Tamil
309                         fillIndex [0x19] = 2;
310                         AddCharMap ('\u0BD7', 0x19, false);
311                         fillIndex [0x19] = 0xA;
312                         // vowels
313                         for (int i = 0x0BD7; i < 0x0B94; i++) {
314                                 if (Char.IsLetter ((char) i) {
315                                         AddCharMap ((char) i, 0x19, false);
316                                         fillIndex [0x19] += 2;
317                                 }
318                         }
319                         // special vowel
320                         fillIndex [0x19] = 0x24;
321                         AddCharMap ('\u0B94', 0x19, false);
322                         fillIndex [0x19] = 0x26;
323                         // FIXME: we need to have constant array for Tamil
324                         // consonants. Windows have almost similar sequence
325                         // to TAM from tamilnet but a bit different in Grantha
326                         for (int i = 0; i < orderedTamil.Length; i++) {
327                                 char c = orderedGujarati [i];
328                                 AddLetterMap (c, 0x19, false);
329                                 fillIndex [0x19] += 4;
330                         }
331
332                         // Telugu
333                         fillIndex [0x1A] = 0x4;
334                         for (int i = 0x0C00; i < 0x0C62; i++) {
335                                 if (i == 0x0C55 || i == 0x0C56)
336                                         continue; // skip
337                                 AddCharMap ((char) i, 0x1A, false);
338                                 fillIndex [0x1A] += 3;
339                                 char supp = (i == 0x0C0B) ? '\u0C60':
340                                         i == 0x0C0C ? '\u0C61' : char.MinValue;
341                                 if (supp == char.MinValue)
342                                         continue;
343                                 AddCharMap (supp, 0x1A, false);
344                                 fillIndex [0x1A] += 3;
345                         }
346
347                         // Kannada
348                         fillIndex [0x1B] = 4;
349                         for (int i = 0x0C80; i < 0x0CE5; i++) {
350                                 if (i == 0x0CD5 || i == 0x0CD6)
351                                         continue; // ignore
352                                 AddCharMap ((char) i, 0x1B, false);
353                                 fillIndex [0x1B] += 3;
354                         }
355                         
356                         // Malayalam
357                         fillIndex [0x1C] = 2;
358                         for (int i = 0x0D02; i < 0x0D61; i++)
359                                 if (!IsIgnorable ((char) i))
360                                         AddCharMap ((char) i, 0x1C, true);
361
362                         // Thai ... note that it breaks 0x1E wall after E2B!
363                         // Also, all Thai characters have level 2 value 3.
364                         fillIndex [0x1E] = 2;
365                         for (int i = 0xE44; i < 0xE48; i++)
366                                 AddThaiCharMap ((char) i, 0x1E, true);
367                         for (int i = 0xE01; i < 0xE2B; i++) {
368                                 AddThaiCharMap ((char) i, 0x1E, false);
369                                 fillIndex [0x1E] += 6;
370                         }
371                         fillIndex [0x1F] = 5;
372                         for (int i = 0xE2B; i < 0xE30; i++) {
373                                 AddThaiCharMap ((char) i, 0x1F, false);
374                                 fillIndex [0x1F] += 6;
375                         }
376                         for (int i = 0xE30; i < 0xE3B; i++)
377                                 AddThaiCharMap ((char) i, 0x1F, true);
378                         // some Thai characters remains.
379                         char [] specialThai = new char [] {'\u0E45', '\u0E46',
380                                 '\u0E4E', '\u0E4F', '\u0E5A', '\u0E5B'};
381                         foreach (char c in specialThai)
382                                 AddThaiCharMap (c, 0x1F, true);
383
384                         // Lao
385                         fillIndex [0x1F] = 2;
386                         for (int i = 0xE80; i < 0xEDF; i++)
387                                 if (Char.IsLetter ((char) i))
388                                         AddCharMap ((char) i, 0x1F, true);
389
390                         // Georgian
391                         // FIXME: we need an array in UCA order.
392                         fillIndex [0x21] = 5;
393                         for (int i = 0; i < orderedGeorgian.Length; i++) {
394                                 char c = orderedGeorgian [i];
395                                 AddLetterMap (c, 0x21, false);
396                                 fillIndex [0x21] += 5;
397                         }
398
399                         #endregion
400                 }
401
402                 private void AddAlphaMap (char c, byte category, byte alphaWeight)
403                 {
404                         throw new NotImplementedException ();
405                 }
406
407                 class DictionaryValueComparer : IComparer
408                 {
409                         public static readonly DictionaryValueComparer Instance
410                                 = new DictionaryValueComparer ();
411
412                         private DictionaryValueComparer ()
413                         {
414                         }
415
416                         public /*static*/ int Compare (object o1, object o2)
417                         {
418                                 DictionaryEntry e1 = (DictionaryEntry) o1;
419                                 DictionaryEntry e2 = (DictionaryEntry) o2;
420                                 // FIXME: in case of 0, compare decomposition categories
421                                 return Decimal.Compare ((decimal) e1.Value, (decimal) e2.Value);
422                         }
423                 }
424
425                 private void AddCharMapGroup (char c, byte category, bool tail, bool updateIndexForSelf)
426                 {
427                         // <small> update index
428                         char c2 = tail ?
429                                 MSCompatGenerated.ToSmallFormTail (c) :
430                                 MSCompatGenerated.ToSmallForm (c);
431                         if (c2 > char.MinValue)
432                                 AddCharMap (c2, category, true);
433                         // itself
434                         AddCharMap (c, category, updateIndexForSelf);
435                         // <full>
436                         c2 = tail ?
437                                 MSCompatGenerated.ToFullWidthTail (c) :
438                                 MSCompatGenerated.ToFullWidth (c);
439                         if (c2 > char.MinValue)
440                                 AddCharMapGroup (c2, category, tail, false);
441                 }
442
443                 private void AddCharMap (char c, byte category, bool increment)
444                 {
445                         map [(int) c] = new CharMapEntry (category,
446                                 category == 1 ? (byte) 1 : fillIndex [category],
447                                 category != 1 ? fillIndex [category] : (byte) 1);
448                         if (increment)
449                                 fillIndex [category] += 1;
450                 }
451
452                 #region Level 3 properties (Case/Width)
453
454                 public static byte GetLevel3WeightRaw (char c) // add 2 for sortkey value
455                 {
456                         // Korean
457                         if ('\u1100' <= c && c <= '\u11F9)
458                                 return 2;
459                         if ('\uFFA0' <= c && c <= '\uFFDC)
460                                 return 4;
461                         if ('\u3130' <= c && c <= '\u3164)
462                                 return 5;
463                         // numbers
464                         if ('\u2776' <= c && c <= '\u277F')
465                                 return 4;
466                         if ('\u2780' <= c && c <= '\u2789')
467                                 return 8;
468                         if ('\u2776' <= c && c <= '\u2793')
469                                 return 0xC;
470                         if ('\u2160' <= c && c <= '\u216F')
471                                 return 0x18;
472                         if ('\u2181' <= c && c <= '\u2182')
473                                 return 0x18;
474                         // Arabic
475                         if ('\u2135' <= c && c <= '\u2138')
476                                 return 4;
477                         if ('\uFE80' <= c && c <= '\uFE8E')
478                                 return MSCompatGenerated.GetArabicFormInPresentationB (c);
479
480                         // actually I dunno the reason why they have weights.
481                         switch (c) {
482                         case '\u01BC':
483                                 return 0x10;
484                         case '\u06A9':
485                                 return 0x20;
486                         case '\u06AA':
487                                 return 0x28;
488                         }
489
490                         byte ret = 0;
491                         switch (c) {
492                         case '\u03C2':
493                         case '\u2104':
494                         case '\u212B':
495                                 ret |= 8;
496                                 break;
497                         case '\uFE42':
498                                 ret |= 0xC;
499                                 break;
500                         }
501
502                         // misc
503                         switch (MSCompatGenerated.GetNormalizationType (c)) {
504                         case 1: // <full>
505                                 ret |= 1;
506                                 break;
507                         case 2: // <sub>
508                                 ret |= 2;
509                                 break;
510                         case 3: // <super>
511                                 ret |= 0xE;
512                                 break;
513                         }
514                         if (MSCompatGenerated.IsSmallCapital (c)) // grep "SMALL CAPITAL"
515                                 ret |= 8;
516                         if (MSCompatGenerated.IsUppercase (c)) // DerivedCoreProperties
517                                 ret |= 0x10;
518
519                         return ret;
520                 }
521
522                 // TODO: implement GetArabicFormInPresentationB(),
523                 // GetNormalizationType(), IsSmallCapital() and IsUppercase().
524                 // (They can easily be generated.)
525
526                 #endregion
527
528         }
529
530         internal struct CharMapEntry
531         {
532                 public readonly byte Category;
533                 public readonly byte Level1;
534                 public readonly byte Level2; // It is always single byte.
535                 public readonly bool Defined;
536
537                 public CharMapEntry (byte category, byte level1, byte level2)
538                 {
539                         Category = category;
540                         Level1 = level1;
541                         Level2 = level2;
542                         Defined = true;
543                 }
544         }
545 }