[runtime] Updates comments.
[mono.git] / mcs / class / corlib / Mono.Globalization.Unicode / Normalization.cs
1 using System;
2 using System.Globalization;
3 using System.Text;
4 using System.Runtime.CompilerServices;
5
6 using NUtil = Mono.Globalization.Unicode.NormalizationTableUtil;
7
8 namespace Mono.Globalization.Unicode
9 {
10         internal enum NormalizationCheck {
11                 Yes,
12                 No,
13                 Maybe
14         }
15
16         internal unsafe class Normalization
17         {
18                 public const int NoNfd = 1;
19                 public const int NoNfkd = 2;
20                 public const int NoNfc = 4;
21                 public const int MaybeNfc = 8;
22                 public const int NoNfkc = 16;
23                 public const int MaybeNfkc = 32;
24                 public const int FullCompositionExclusion = 64;
25                 public const int IsUnsafe = 128;
26 //              public const int ExpandOnNfd = 256;
27 //              public const int ExpandOnNfc = 512;
28 //              public const int ExpandOnNfkd = 1024;
29 //              public const int ExpandOnNfkc = 2048;
30
31                 static uint PropValue (int cp)
32                 {
33                         return props [NUtil.PropIdx (cp)];
34                 }
35
36                 static int CharMapIdx (int cp)
37                 {
38                         return charMapIndex [NUtil.MapIdx (cp)];
39                 }
40
41                 static byte GetCombiningClass (int c)
42                 {
43                         return combiningClass [NUtil.Combining.ToIndex (c)];
44                 }
45
46                 static int GetPrimaryCompositeFromMapIndex (int src)
47                 {
48                         return mapIdxToComposite [NUtil.Composite.ToIndex (src)];
49                 }
50
51                 static int GetPrimaryCompositeHelperIndex (int cp)
52                 {
53                         return helperIndex [NUtil.Helper.ToIndex (cp)];
54                 }
55
56                 private static string Compose (string source, int checkType)
57                 {
58                         StringBuilder sb = null;
59                         // Decompose to NFD or NKFD depending on our target
60                         Decompose (source, ref sb, checkType == 2 ? 3 : 1);
61                         if (sb == null)
62                                 sb = Combine (source, 0, checkType);
63                         else
64                                 Combine (sb, 0, checkType);
65
66                         return sb != null ? sb.ToString () : source;
67                 }
68
69                 private static StringBuilder Combine (string source, int start, int checkType)
70                 {
71                         for (int i = 0; i < source.Length; i++) {
72                                 if (QuickCheck (source [i], checkType) == NormalizationCheck.Yes)
73                                         continue;
74                                 StringBuilder sb = new StringBuilder (source.Length + source.Length / 10);
75                                 sb.Append (source);
76                                 Combine (sb, i, checkType);
77                                 return sb;
78                         }
79                         return null;
80                 }
81
82 /*
83                 private static bool CanBePrimaryComposite (int i)
84                 {
85                         if (i >= 0x3400 && i <= 0x9FBB)
86                                 return GetPrimaryCompositeHelperIndex (i) != 0;
87                         return (PropValue (i) & IsUnsafe) != 0;
88                 }
89 */
90                 private static void Combine (StringBuilder sb, int i, int checkType)
91                 {
92                         // Back off one character as we may be looking at a V or T jamo.
93                         CombineHangul (sb, null, i > 0 ? i - 1 : i);
94
95                         while (i < sb.Length) {
96                                 if (QuickCheck (sb [i], checkType) == NormalizationCheck.Yes) {
97                                         i++;
98                                         continue;
99                                 }
100
101                                 i = TryComposeWithPreviousStarter (sb, null, i);
102                         }
103                 }
104
105                 private static int CombineHangul (StringBuilder sb, string s, int current)
106                 {
107                         int length = sb != null ? sb.Length : s.Length;
108                         int last = Fetch (sb, s, current);
109
110                         for (int i = current + 1; i < length; ++i) {
111                                 int ch = Fetch (sb, s, i);
112
113                                 // 1. check to see if two current characters are L and V
114
115                                 int LIndex = last - HangulLBase;
116                                 if (0 <= LIndex && LIndex < HangulLCount) {
117                                         int VIndex = ch - HangulVBase;
118                                         if (0 <= VIndex && VIndex < HangulVCount) {
119                                                 if (sb == null)
120                                                         return -1;
121
122                                                 // make syllable of form LV
123
124                                                 last = HangulSBase + (LIndex * HangulVCount + VIndex) * HangulTCount;
125
126                                                 sb [i - 1] = (char) last; // reset last
127                                                 sb.Remove (i, 1);
128                                                 i--; length--;
129                                                 continue; // discard ch
130                                         }
131                                 }
132
133
134                                 // 2. check to see if two current characters are LV and T
135
136                                 int SIndex = last - HangulSBase;
137                                 if (0 <= SIndex && SIndex < HangulSCount && (SIndex % HangulTCount) == 0) {
138                                         int TIndex = ch - HangulTBase;
139                                         if (0 < TIndex && TIndex < HangulTCount) {
140                                                 if (sb == null)
141                                                         return -1;
142
143                                                 // make syllable of form LVT
144
145                                                 last += TIndex;
146
147                                                 sb [i - 1] = (char) last; // reset last
148                                                 sb.Remove (i, 1);
149                                                 i--; length--;
150                                                 continue; // discard ch
151                                         }
152                                 }
153                                 // if neither case was true, just add the character
154                                 last = ch;
155                         }
156
157                         return length;
158                 }
159
160                 static int Fetch (StringBuilder sb, string s, int i)
161                 {
162                         return (int) (sb != null ? sb [i] : s [i]);
163                 }
164
165                 // Cf. figure 7, section 1.3 of http://unicode.org/reports/tr15/.
166                 static int TryComposeWithPreviousStarter (StringBuilder sb, string s, int current)
167                 {
168                         // Backtrack to previous starter.
169                         int i = current - 1;
170                         if (GetCombiningClass (Fetch (sb, s, current)) == 0) {
171                                 if (i < 0 || GetCombiningClass (Fetch (sb, s, i)) != 0)
172                                         return current + 1;
173                         } else {
174                                 while (i >= 0 && GetCombiningClass (Fetch (sb, s, i)) != 0)
175                                         i--;
176                                 if (i < 0)
177                                         return current + 1;
178                         }
179
180                         int starter = Fetch (sb, s, i);
181
182                         // The various decompositions involving starter follow this index.
183                         int comp_idx = GetPrimaryCompositeHelperIndex (starter);
184                         if (comp_idx == 0)
185                                 return current + 1;
186
187                         int length = (sb != null ? sb.Length : s.Length);
188                         int prevCombiningClass = -1;
189                         for (int j = i + 1; j < length; j++) {
190                                 int candidate = Fetch (sb, s, j);
191
192                                 int combiningClass = GetCombiningClass (candidate);
193                                 if (combiningClass == prevCombiningClass)
194                                         // We skipped over a guy with the same class, without
195                                         // combining.  Skip this one, too.
196                                         continue;
197
198                                 int composed = TryCompose (comp_idx, starter, candidate);
199                                 if (composed != 0) {
200                                         if (sb == null)
201                                                 // Not normalized, and we are only checking.
202                                                 return -1;
203
204                                         // Full Unicode warning: This will break when the underlying
205                                         // tables are extended.
206                                         sb [i] = (char) composed;
207                                         sb.Remove (j, 1);
208
209                                         return current;
210                                 }
211
212                                 // Gray box.  We're done.
213                                 if (combiningClass == 0)
214                                         return j + 1;
215
216                                 prevCombiningClass = combiningClass;
217                         }
218
219                         return length;
220                 }
221
222                 static int TryCompose (int i, int starter, int candidate)
223                 {
224                         while (mappedChars [i] == starter) {
225                                 if (mappedChars [i + 1] == candidate &&
226                                     mappedChars [i + 2] == 0) {
227                                         int composed = GetPrimaryCompositeFromMapIndex (i);
228
229                                         if ((PropValue (composed) & FullCompositionExclusion) == 0)
230                                                 return composed;
231                                 }
232
233                                 // Skip this entry.
234                                 while (mappedChars [i] != 0)
235                                         i++;
236                                 i++;
237                         }
238
239                         return 0;
240                 }
241
242                 static string Decompose (string source, int checkType)
243                 {
244                         StringBuilder sb = null;
245                         Decompose (source, ref sb, checkType);
246                         return sb != null ? sb.ToString () : source;
247                 }
248
249                 static void Decompose (string source,
250                         ref StringBuilder sb, int checkType)
251                 {
252                         int [] buf = null;
253                         int start = 0;
254                         for (int i = 0; i < source.Length; i++)
255                                 if (QuickCheck (source [i], checkType) == NormalizationCheck.No)
256                                         DecomposeChar (ref sb, ref buf, source,
257                                                 i, checkType, ref start);
258                         if (sb != null)
259                                 sb.Append (source, start, source.Length - start);
260                         ReorderCanonical (source, ref sb, 1);
261                 }
262
263                 static void ReorderCanonical (string src, ref StringBuilder sb, int start)
264                 {
265                         if (sb == null) {
266                                 // check only with src.
267                                 for (int i = 1; i < src.Length; i++) {
268                                         int level = GetCombiningClass (src [i]);
269                                         if (level == 0)
270                                                 continue;
271                                         if (GetCombiningClass (src [i - 1]) > level) {
272                                                 sb = new StringBuilder (src.Length);
273                                                 sb.Append (src, 0, src.Length);
274                                                 ReorderCanonical (src, ref sb, i);
275                                                 return;
276                                         }
277                                 }
278                                 return;
279                         }
280                         // check only with sb
281                         for (int i = start; i < sb.Length; ) {
282                                 int level = GetCombiningClass (sb [i]);
283                                 if (level == 0 || GetCombiningClass (sb [i - 1]) <= level) {
284                                         i++;
285                                         continue;
286                                 }
287
288                                 char c = sb [i - 1];
289                                 sb [i - 1] = sb [i];
290                                 sb [i] = c;
291                                 // Apply recursively.
292                                 if (i > 1)
293                                         i--;
294                         }
295                 }
296
297                 static void DecomposeChar (ref StringBuilder sb,
298                         ref int [] buf, string s, int i, int checkType, ref int start)
299                 {
300                         if (sb == null)
301                                 sb = new StringBuilder (s.Length + 100);
302                         sb.Append (s, start, i - start);
303                         if (buf == null)
304                                 buf = new int [19];
305                         int n = GetCanonical (s [i], buf, 0, checkType);
306                         for (int x = 0; x < n; x++) {
307                                 if (buf [x] < char.MaxValue)
308                                         sb.Append ((char) buf [x]);
309                                 else { // surrogate
310                                         sb.Append ((char) (buf [x] >> 10 + 0xD800));
311                                         sb.Append ((char) ((buf [x] & 0x0FFF) + 0xDC00));
312                                 }
313                         }
314                         start = i + 1;
315                 }
316
317                 public static NormalizationCheck QuickCheck (char c, int type)
318                 {
319                         uint v;
320                         switch (type) {
321                         default: // NFC
322                                 v = PropValue ((int) c);
323                                 return (v & NoNfc) == 0 ?
324                                         (v & MaybeNfc) == 0 ?
325                                         NormalizationCheck.Yes :
326                                         NormalizationCheck.Maybe :
327                                         NormalizationCheck.No;
328                         case 1: // NFD
329                                 if ('\uAC00' <= c && c <= '\uD7A3')
330                                         return NormalizationCheck.No;
331                                 return (PropValue ((int) c) & NoNfd) != 0 ?
332                                         NormalizationCheck.No : NormalizationCheck.Yes;
333                         case 2: // NFKC
334                                 v = PropValue ((int) c);
335                                 return (v & NoNfkc) != 0 ? NormalizationCheck.No :
336                                         (v & MaybeNfkc) != 0 ?
337                                         NormalizationCheck.Maybe :
338                                         NormalizationCheck.Yes;
339                         case 3: // NFKD
340                                 if ('\uAC00' <= c && c <= '\uD7A3')
341                                         return NormalizationCheck.No;
342                                 return (PropValue ((int) c) & NoNfkd) != 0 ?
343                                         NormalizationCheck.No : NormalizationCheck.Yes;
344                         }
345                 }
346
347                 /* for now we don't use FC_NFKC closure
348                 public static bool IsMultiForm (char c)
349                 {
350                         return (PropValue ((int) c) & 0xF0000000) != 0;
351                 }
352
353                 public static char SingleForm (char c)
354                 {
355                         uint v = PropValue ((int) c);
356                         int idx = (int) ((v & 0x7FFF0000) >> 16);
357                         return (char) singleNorm [idx];
358                 }
359
360                 public static void MultiForm (char c, char [] buf, int index)
361                 {
362                         // FIXME: handle surrogate
363                         uint v = PropValue ((int) c);
364                         int midx = (int) ((v & 0x7FFF0000) >> 16);
365                         buf [index] = (char) multiNorm [midx];
366                         buf [index + 1] = (char) multiNorm [midx + 1];
367                         buf [index + 2] = (char) multiNorm [midx + 2];
368                         buf [index + 3] = (char) multiNorm [midx + 3];
369                         if (buf [index + 3] != 0)
370                                 buf [index + 4] = (char) 0; // zero termination
371                 }
372                 */
373
374                 const int HangulSBase = 0xAC00, HangulLBase = 0x1100,
375                                   HangulVBase = 0x1161, HangulTBase = 0x11A7,
376                                   HangulLCount = 19, HangulVCount = 21, HangulTCount = 28,
377                                   HangulNCount = HangulVCount * HangulTCount,   // 588
378                                   HangulSCount = HangulLCount * HangulNCount;   // 11172
379
380                 private static int GetCanonicalHangul (int s, int [] buf, int bufIdx)
381                 {
382                         int idx = s - HangulSBase;
383                         if (idx < 0 || idx >= HangulSCount) {
384                                 return bufIdx;
385                         }
386
387                         int L = HangulLBase + idx / HangulNCount;
388                         int V = HangulVBase + (idx % HangulNCount) / HangulTCount;
389                         int T = HangulTBase + idx % HangulTCount;
390
391                         buf [bufIdx++] = L;
392                         buf [bufIdx++] = V;
393                         if (T != HangulTBase) {
394                                 buf [bufIdx++] = T;
395                         }
396                         buf [bufIdx] = (char) 0;
397                         return bufIdx;
398                 }
399
400                 static int GetCanonical (int c, int [] buf, int bufIdx, int checkType)
401                 {
402                         int newBufIdx = GetCanonicalHangul (c, buf, bufIdx);
403                         if (newBufIdx > bufIdx)
404                                 return newBufIdx;
405  
406                         int i = CharMapIdx (c);
407                         if (i == 0 || mappedChars [i] == c)
408                                 buf [bufIdx++] = c;
409                         else {
410                                 // Character c maps to one or more decomposed chars.
411                                 for (; mappedChars [i] != 0; i++) {
412                                         int nth = mappedChars [i];
413
414                                         // http://www.unicode.org/reports/tr15/tr15-31.html, 1.3:
415                                         // Full decomposition involves recursive application of the
416                                         // Decomposition_Mapping values.  Note that QuickCheck does
417                                         // not currently support astral plane codepoints.
418                                         if (nth <= 0xffff && QuickCheck ((char)nth, checkType) == NormalizationCheck.Yes)
419                                                 buf [bufIdx++] = nth;
420                                         else
421                                                 bufIdx = GetCanonical (nth, buf, bufIdx, checkType);
422                                 }
423                         }
424
425                         return bufIdx;
426                 }
427
428                 public static bool IsNormalized (string source, int type)
429                 {
430                         int prevCC = -1;
431                         for (int i = 0; i < source.Length; ) {
432                                 int cc = GetCombiningClass (source [i]);
433                                 if (cc != 0 && cc < prevCC)
434                                         return false;
435                                 prevCC = cc;
436
437                                 switch (QuickCheck (source [i], type)) {
438                                 case NormalizationCheck.Yes:
439                                         i++;
440                                         break;
441                                 case NormalizationCheck.No:
442                                         return false;
443                                 case NormalizationCheck.Maybe:
444                                         // for those forms with composition, it cannot be checked here
445                                         switch (type) {
446                                         case 0: // NFC
447                                         case 2: // NFKC
448                                                 return source == Normalize (source, type);
449                                         }
450                                         // go on...
451
452                                         i = CombineHangul (null, source, i > 0 ? i - 1 : i);
453                                         if (i < 0)
454                                                 return false;
455
456                                         i = TryComposeWithPreviousStarter (null, source, i);
457                                         if (i < 0)
458                                                 return false;
459                                         break;
460                                 }
461                         }
462                         return true;
463                 }
464
465                 public static string Normalize (string source, int type)
466                 {
467                         switch (type) {
468                         default:
469                         case 2:
470                                 return Compose (source, type);
471                         case 1:
472                         case 3:
473                                 return Decompose (source, type);
474                         }
475                 }
476
477                 static byte* props;
478                 static int* mappedChars;
479                 static short* charMapIndex;
480                 static short* helperIndex;
481                 static ushort* mapIdxToComposite;
482                 static byte* combiningClass;
483
484 #if GENERATE_TABLE
485
486                 public static readonly bool IsReady = true; // always
487
488                 static Normalization ()
489                 {
490                         fixed (byte* tmp = propsArr) {
491                                 props = tmp;
492                         }
493                         fixed (int* tmp = mappedCharsArr) {
494                                 mappedChars = tmp;
495                         }
496                         fixed (short* tmp = charMapIndexArr) {
497                                 charMapIndex = tmp;
498                         }
499                         fixed (short* tmp = helperIndexArr) {
500                                 helperIndex = tmp;
501                         }
502                         fixed (ushort* tmp = mapIdxToCompositeArr) {
503                                 mapIdxToComposite = tmp;
504                         }
505                         fixed (byte* tmp = combiningClassArr) {
506                                 combiningClass = tmp;
507                         }
508                 }
509 #else
510
511                 static object forLock = new object ();
512                 public static readonly bool isReady;
513
514                 public static bool IsReady {
515                         get { return isReady; }
516                 }
517
518                 [MethodImpl (MethodImplOptions.InternalCall)]
519                 static extern void load_normalization_resource (
520                         out IntPtr props, out IntPtr mappedChars,
521                         out IntPtr charMapIndex, out IntPtr helperIndex,
522                         out IntPtr mapIdxToComposite, out IntPtr combiningClass);
523
524                 static Normalization ()
525                 {
526                         IntPtr p1, p2, p3, p4, p5, p6;
527                         lock (forLock) {
528                                 load_normalization_resource (out p1, out p2, out p3, out p4, out p5, out p6);
529                                 props = (byte*) p1;
530                                 mappedChars = (int*) p2;
531                                 charMapIndex = (short*) p3;
532                                 helperIndex = (short*) p4;
533                                 mapIdxToComposite = (ushort*) p5;
534                                 combiningClass = (byte*) p6;
535                         }
536
537                         isReady = true;
538                 }
539         }
540 }
541 #endif
542
543                 //
544                 // autogenerated code or icall to fill array runs here
545                 //
546