Merge pull request #1659 from alexanderkyte/stringbuilder-referencesource
[mono.git] / mcs / class / corlib / Mono.Globalization.Unicode / Normalization.cs
1 using System;
2 using System.Globalization;
3 using System.Text;
4 using System.Runtime.CompilerServices;
5
6 using NUtil = Mono.Globalization.Unicode.NormalizationTableUtil;
7
8 namespace System.Text
9 {
10         internal enum NormalizationCheck {
11                 Yes,
12                 No,
13                 Maybe
14         }
15
16         internal unsafe class Normalization
17         {
18                 public const int NoNfd = 1;
19                 public const int NoNfkd = 2;
20                 public const int NoNfc = 4;
21                 public const int MaybeNfc = 8;
22                 public const int NoNfkc = 16;
23                 public const int MaybeNfkc = 32;
24                 public const int FullCompositionExclusion = 64;
25                 public const int IsUnsafe = 128;
26 //              public const int ExpandOnNfd = 256;
27 //              public const int ExpandOnNfc = 512;
28 //              public const int ExpandOnNfkd = 1024;
29 //              public const int ExpandOnNfkc = 2048;
30
31                 static uint PropValue (int cp)
32                 {
33                         return props [NUtil.PropIdx (cp)];
34                 }
35
36                 static int CharMapIdx (int cp)
37                 {
38                         return charMapIndex [NUtil.MapIdx (cp)];
39                 }
40
41                 static byte GetCombiningClass (int c)
42                 {
43                         return combiningClass [NUtil.Combining.ToIndex (c)];
44                 }
45
46                 static int GetPrimaryCompositeFromMapIndex (int src)
47                 {
48                         return mapIdxToComposite [NUtil.Composite.ToIndex (src)];
49                 }
50
51                 static int GetPrimaryCompositeHelperIndex (int cp)
52                 {
53                         return helperIndex [NUtil.Helper.ToIndex (cp)];
54                 }
55
56                 private static string Compose (string source, int checkType)
57                 {
58                         StringBuilder sb = null;
59                         // Decompose to NFD or NKFD depending on our target
60                         Decompose (source, ref sb, checkType == 2 ? 3 : 1);
61                         if (sb == null)
62                                 sb = Combine (source, 0, checkType);
63                         else
64                                 Combine (sb, 0, checkType);
65
66                         return sb != null ? sb.ToString () : source;
67                 }
68
69                 private static StringBuilder Combine (string source, int start, int checkType)
70                 {
71                         for (int i = 0; i < source.Length; i++) {
72                                 if (QuickCheck (source [i], checkType) == NormalizationCheck.Yes)
73                                         continue;
74                                 StringBuilder sb = new StringBuilder (source.Length + source.Length / 10);
75                                 sb.Append (source);
76                                 Combine (sb, i, checkType);
77                                 return sb;
78                         }
79                         return null;
80                 }
81
82 /*
83                 private static bool CanBePrimaryComposite (int i)
84                 {
85                         if (i >= 0x3400 && i <= 0x9FBB)
86                                 return GetPrimaryCompositeHelperIndex (i) != 0;
87                         return (PropValue (i) & IsUnsafe) != 0;
88                 }
89 */
90                 private static void Combine (StringBuilder sb, int i, int checkType)
91                 {
92                         // Back off one character as we may be looking at a V or T jamo.
93                         CombineHangul (sb, null, i > 0 ? i - 1 : i);
94
95                         while (i < sb.Length) {
96                                 if (QuickCheck (sb [i], checkType) == NormalizationCheck.Yes) {
97                                         i++;
98                                         continue;
99                                 }
100
101                                 i = TryComposeWithPreviousStarter (sb, null, i);
102                         }
103                 }
104
105                 private static int CombineHangul (StringBuilder sb, string s, int current)
106                 {
107                         int length = sb != null ? sb.Length : s.Length;
108                         int last = Fetch (sb, s, current);
109
110                         for (int i = current + 1; i < length; ++i) {
111                                 int ch = Fetch (sb, s, i);
112
113                                 // 1. check to see if two current characters are L and V
114
115                                 int LIndex = last - HangulLBase;
116                                 if (0 <= LIndex && LIndex < HangulLCount) {
117                                         int VIndex = ch - HangulVBase;
118                                         if (0 <= VIndex && VIndex < HangulVCount) {
119                                                 if (sb == null)
120                                                         return -1;
121
122                                                 // make syllable of form LV
123
124                                                 last = HangulSBase + (LIndex * HangulVCount + VIndex) * HangulTCount;
125
126                                                 sb [i - 1] = (char) last; // reset last
127                                                 sb.Remove (i, 1);
128                                                 i--; length--;
129                                                 continue; // discard ch
130                                         }
131                                 }
132
133
134                                 // 2. check to see if two current characters are LV and T
135
136                                 int SIndex = last - HangulSBase;
137                                 if (0 <= SIndex && SIndex < HangulSCount && (SIndex % HangulTCount) == 0) {
138                                         int TIndex = ch - HangulTBase;
139                                         if (0 < TIndex && TIndex < HangulTCount) {
140                                                 if (sb == null)
141                                                         return -1;
142
143                                                 // make syllable of form LVT
144
145                                                 last += TIndex;
146
147                                                 sb [i - 1] = (char) last; // reset last
148                                                 sb.Remove (i, 1);
149                                                 i--; length--;
150                                                 continue; // discard ch
151                                         }
152                                 }
153                                 // if neither case was true, just add the character
154                                 last = ch;
155                         }
156
157                         return length;
158                 }
159
160                 static int Fetch (StringBuilder sb, string s, int i)
161                 {
162                         return (int) (sb != null ? sb [i] : s [i]);
163                 }
164
165                 // Cf. figure 7, section 1.3 of http://unicode.org/reports/tr15/.
166                 static int TryComposeWithPreviousStarter (StringBuilder sb, string s, int current)
167                 {
168                         // Backtrack to previous starter.
169                         int i = current - 1;
170                         if (GetCombiningClass (Fetch (sb, s, current)) == 0) {
171                                 if (i < 0 || GetCombiningClass (Fetch (sb, s, i)) != 0)
172                                         return current + 1;
173                         } else {
174                                 while (i >= 0 && GetCombiningClass (Fetch (sb, s, i)) != 0)
175                                         i--;
176                                 if (i < 0)
177                                         return current + 1;
178                         }
179
180                         int starter = Fetch (sb, s, i);
181
182                         // The various decompositions involving starter follow this index.
183                         int comp_idx = GetPrimaryCompositeHelperIndex (starter);
184                         if (comp_idx == 0)
185                                 return current + 1;
186
187                         int length = (sb != null ? sb.Length : s.Length);
188                         int prevCombiningClass = -1;
189                         for (int j = i + 1; j < length; j++) {
190                                 int candidate = Fetch (sb, s, j);
191
192                                 int combiningClass = GetCombiningClass (candidate);
193                                 if (combiningClass == prevCombiningClass)
194                                         // We skipped over a guy with the same class, without
195                                         // combining.  Skip this one, too.
196                                         continue;
197
198                                 int composed = TryCompose (comp_idx, starter, candidate);
199                                 if (composed != 0) {
200                                         if (sb == null)
201                                                 // Not normalized, and we are only checking.
202                                                 return -1;
203
204                                         // Full Unicode warning: This will break when the underlying
205                                         // tables are extended.
206                                         sb [i] = (char) composed;
207                                         sb.Remove (j, 1);
208
209                                         return current;
210                                 }
211
212                                 // Gray box.  We're done.
213                                 if (combiningClass == 0)
214                                         return j + 1;
215
216                                 prevCombiningClass = combiningClass;
217                         }
218
219                         return length;
220                 }
221
222                 static int TryCompose (int i, int starter, int candidate)
223                 {
224                         while (mappedChars [i] == starter) {
225                                 if (mappedChars [i + 1] == candidate &&
226                                     mappedChars [i + 2] == 0) {
227                                         int composed = GetPrimaryCompositeFromMapIndex (i);
228
229                                         if ((PropValue (composed) & FullCompositionExclusion) == 0)
230                                                 return composed;
231                                 }
232
233                                 // Skip this entry.
234                                 while (mappedChars [i] != 0)
235                                         i++;
236                                 i++;
237                         }
238
239                         return 0;
240                 }
241
242                 static string Decompose (string source, int checkType)
243                 {
244                         StringBuilder sb = null;
245                         Decompose (source, ref sb, checkType);
246                         return sb != null ? sb.ToString () : source;
247                 }
248
249                 static void Decompose (string source,
250                         ref StringBuilder sb, int checkType)
251                 {
252                         int [] buf = null;
253                         int start = 0;
254                         for (int i = 0; i < source.Length; i++)
255                                 if (QuickCheck (source [i], checkType) == NormalizationCheck.No)
256                                         DecomposeChar (ref sb, ref buf, source,
257                                                 i, checkType, ref start);
258                         if (sb != null)
259                                 sb.Append (source, start, source.Length - start);
260                         ReorderCanonical (source, ref sb, 1);
261                 }
262
263                 static void ReorderCanonical (string src, ref StringBuilder sb, int start)
264                 {
265                         if (sb == null) {
266                                 // check only with src.
267                                 for (int i = 1; i < src.Length; i++) {
268                                         int level = GetCombiningClass (src [i]);
269                                         if (level == 0)
270                                                 continue;
271                                         if (GetCombiningClass (src [i - 1]) > level) {
272                                                 sb = new StringBuilder (src.Length);
273                                                 sb.Append (src, 0, src.Length);
274                                                 ReorderCanonical (src, ref sb, i);
275                                                 return;
276                                         }
277                                 }
278                                 return;
279                         }
280                         // check only with sb
281                         for (int i = start; i < sb.Length; ) {
282                                 int level = GetCombiningClass (sb [i]);
283                                 if (level == 0 || GetCombiningClass (sb [i - 1]) <= level) {
284                                         i++;
285                                         continue;
286                                 }
287
288                                 char c = sb [i - 1];
289                                 sb [i - 1] = sb [i];
290                                 sb [i] = c;
291                                 // Apply recursively.
292                                 if (i > 1)
293                                         i--;
294                         }
295                 }
296
297                 static void DecomposeChar (ref StringBuilder sb,
298                         ref int [] buf, string s, int i, int checkType, ref int start)
299                 {
300                         if (sb == null)
301                                 sb = new StringBuilder (s.Length + 100);
302                         sb.Append (s, start, i - start);
303                         if (buf == null)
304                                 buf = new int [19];
305                         int n = GetCanonical (s [i], buf, 0, checkType);
306                         for (int x = 0; x < n; x++) {
307                                 if (buf [x] < char.MaxValue)
308                                         sb.Append ((char) buf [x]);
309                                 else { // surrogate
310                                         sb.Append ((char) (buf [x] >> 10 + 0xD800));
311                                         sb.Append ((char) ((buf [x] & 0x0FFF) + 0xDC00));
312                                 }
313                         }
314                         start = i + 1;
315                 }
316
317                 public static NormalizationCheck QuickCheck (char c, int type)
318                 {
319                         uint v;
320                         switch (type) {
321                         default: // NFC
322                                 v = PropValue ((int) c);
323                                 return (v & NoNfc) == 0 ?
324                                         (v & MaybeNfc) == 0 ?
325                                         NormalizationCheck.Yes :
326                                         NormalizationCheck.Maybe :
327                                         NormalizationCheck.No;
328                         case 1: // NFD
329                                 if ('\uAC00' <= c && c <= '\uD7A3')
330                                         return NormalizationCheck.No;
331                                 return (PropValue ((int) c) & NoNfd) != 0 ?
332                                         NormalizationCheck.No : NormalizationCheck.Yes;
333                         case 2: // NFKC
334                                 v = PropValue ((int) c);
335                                 return (v & NoNfkc) != 0 ? NormalizationCheck.No :
336                                         (v & MaybeNfkc) != 0 ?
337                                         NormalizationCheck.Maybe :
338                                         NormalizationCheck.Yes;
339                         case 3: // NFKD
340                                 if ('\uAC00' <= c && c <= '\uD7A3')
341                                         return NormalizationCheck.No;
342                                 return (PropValue ((int) c) & NoNfkd) != 0 ?
343                                         NormalizationCheck.No : NormalizationCheck.Yes;
344                         }
345                 }
346
347                 /* for now we don't use FC_NFKC closure
348                 public static bool IsMultiForm (char c)
349                 {
350                         return (PropValue ((int) c) & 0xF0000000) != 0;
351                 }
352
353                 public static char SingleForm (char c)
354                 {
355                         uint v = PropValue ((int) c);
356                         int idx = (int) ((v & 0x7FFF0000) >> 16);
357                         return (char) singleNorm [idx];
358                 }
359
360                 public static void MultiForm (char c, char [] buf, int index)
361                 {
362                         // FIXME: handle surrogate
363                         uint v = PropValue ((int) c);
364                         int midx = (int) ((v & 0x7FFF0000) >> 16);
365                         buf [index] = (char) multiNorm [midx];
366                         buf [index + 1] = (char) multiNorm [midx + 1];
367                         buf [index + 2] = (char) multiNorm [midx + 2];
368                         buf [index + 3] = (char) multiNorm [midx + 3];
369                         if (buf [index + 3] != 0)
370                                 buf [index + 4] = (char) 0; // zero termination
371                 }
372                 */
373
374                 const int HangulSBase = 0xAC00, HangulLBase = 0x1100,
375                                   HangulVBase = 0x1161, HangulTBase = 0x11A7,
376                                   HangulLCount = 19, HangulVCount = 21, HangulTCount = 28,
377                                   HangulNCount = HangulVCount * HangulTCount,   // 588
378                                   HangulSCount = HangulLCount * HangulNCount;   // 11172
379
380                 private static int GetCanonicalHangul (int s, int [] buf, int bufIdx)
381                 {
382                         int idx = s - HangulSBase;
383                         if (idx < 0 || idx >= HangulSCount) {
384                                 return bufIdx;
385                         }
386
387                         int L = HangulLBase + idx / HangulNCount;
388                         int V = HangulVBase + (idx % HangulNCount) / HangulTCount;
389                         int T = HangulTBase + idx % HangulTCount;
390
391                         buf [bufIdx++] = L;
392                         buf [bufIdx++] = V;
393                         if (T != HangulTBase) {
394                                 buf [bufIdx++] = T;
395                         }
396                         buf [bufIdx] = (char) 0;
397                         return bufIdx;
398                 }
399
400                 static int GetCanonical (int c, int [] buf, int bufIdx, int checkType)
401                 {
402                         int newBufIdx = GetCanonicalHangul (c, buf, bufIdx);
403                         if (newBufIdx > bufIdx)
404                                 return newBufIdx;
405  
406                         int i = CharMapIdx (c);
407                         if (i == 0 || mappedChars [i] == c)
408                                 buf [bufIdx++] = c;
409                         else {
410                                 // Character c maps to one or more decomposed chars.
411                                 for (; mappedChars [i] != 0; i++) {
412                                         int nth = mappedChars [i];
413
414                                         // http://www.unicode.org/reports/tr15/tr15-31.html, 1.3:
415                                         // Full decomposition involves recursive application of the
416                                         // Decomposition_Mapping values.  Note that QuickCheck does
417                                         // not currently support astral plane codepoints.
418                                         if (nth <= 0xffff && QuickCheck ((char)nth, checkType) == NormalizationCheck.Yes)
419                                                 buf [bufIdx++] = nth;
420                                         else
421                                                 bufIdx = GetCanonical (nth, buf, bufIdx, checkType);
422                                 }
423                         }
424
425                         return bufIdx;
426                 }
427
428                 public static bool IsNormalized (string source, NormalizationForm normalizationForm)
429                 {
430                         switch (normalizationForm) {
431                         default:
432                                 return IsNormalized (source, 0);
433                         case NormalizationForm.FormD:
434                                 return IsNormalized (source, 1);
435                         case NormalizationForm.FormKC:
436                                 return IsNormalized (source, 2);
437                         case NormalizationForm.FormKD:
438                                 return IsNormalized (source, 3);
439                         }
440                 }
441
442                 public static bool IsNormalized (string source, int type)
443                 {
444                         int prevCC = -1;
445                         for (int i = 0; i < source.Length; ) {
446                                 int cc = GetCombiningClass (source [i]);
447                                 if (cc != 0 && cc < prevCC)
448                                         return false;
449                                 prevCC = cc;
450
451                                 switch (QuickCheck (source [i], type)) {
452                                 case NormalizationCheck.Yes:
453                                         i++;
454                                         break;
455                                 case NormalizationCheck.No:
456                                         return false;
457                                 case NormalizationCheck.Maybe:
458                                         // for those forms with composition, it cannot be checked here
459                                         switch (type) {
460                                         case 0: // NFC
461                                         case 2: // NFKC
462                                                 return source == Normalize (source, type);
463                                         }
464                                         // go on...
465
466                                         i = CombineHangul (null, source, i > 0 ? i - 1 : i);
467                                         if (i < 0)
468                                                 return false;
469
470                                         i = TryComposeWithPreviousStarter (null, source, i);
471                                         if (i < 0)
472                                                 return false;
473                                         break;
474                                 }
475                         }
476                         return true;
477                 }
478
479                 public static string Normalize (string source, NormalizationForm normalizationForm)
480                 {
481                         switch (normalizationForm) {
482                         default:
483                                 return Normalization.Normalize (source, 0);
484                         case NormalizationForm.FormD:
485                                 return Normalization.Normalize (source, 1);
486                         case NormalizationForm.FormKC:
487                                 return Normalization.Normalize (source, 2);
488                         case NormalizationForm.FormKD:
489                                 return Normalization.Normalize (source, 3);
490                         }
491                 }
492
493                 public static string Normalize (string source, int type)
494                 {
495                         switch (type) {
496                         default:
497                         case 2:
498                                 return Compose (source, type);
499                         case 1:
500                         case 3:
501                                 return Decompose (source, type);
502                         }
503                 }
504
505                 static byte* props;
506                 static int* mappedChars;
507                 static short* charMapIndex;
508                 static short* helperIndex;
509                 static ushort* mapIdxToComposite;
510                 static byte* combiningClass;
511
512 #if GENERATE_TABLE
513
514                 public static readonly bool IsReady = true; // always
515
516                 static Normalization ()
517                 {
518                         fixed (byte* tmp = propsArr) {
519                                 props = tmp;
520                         }
521                         fixed (int* tmp = mappedCharsArr) {
522                                 mappedChars = tmp;
523                         }
524                         fixed (short* tmp = charMapIndexArr) {
525                                 charMapIndex = tmp;
526                         }
527                         fixed (short* tmp = helperIndexArr) {
528                                 helperIndex = tmp;
529                         }
530                         fixed (ushort* tmp = mapIdxToCompositeArr) {
531                                 mapIdxToComposite = tmp;
532                         }
533                         fixed (byte* tmp = combiningClassArr) {
534                                 combiningClass = tmp;
535                         }
536                 }
537 #else
538
539                 static object forLock = new object ();
540                 public static readonly bool isReady;
541
542                 public static bool IsReady {
543                         get { return isReady; }
544                 }
545
546                 [MethodImpl (MethodImplOptions.InternalCall)]
547                 static extern void load_normalization_resource (
548                         out IntPtr props, out IntPtr mappedChars,
549                         out IntPtr charMapIndex, out IntPtr helperIndex,
550                         out IntPtr mapIdxToComposite, out IntPtr combiningClass);
551
552                 static Normalization ()
553                 {
554                         IntPtr p1, p2, p3, p4, p5, p6;
555                         lock (forLock) {
556                                 load_normalization_resource (out p1, out p2, out p3, out p4, out p5, out p6);
557                                 props = (byte*) p1;
558                                 mappedChars = (int*) p2;
559                                 charMapIndex = (short*) p3;
560                                 helperIndex = (short*) p4;
561                                 mapIdxToComposite = (ushort*) p5;
562                                 combiningClass = (byte*) p6;
563                         }
564
565                         isReady = true;
566                 }
567         }
568 }
569 #endif
570
571                 //
572                 // autogenerated code or icall to fill array runs here
573                 //
574