8d7fcfc84d4c8191737ecd132094d8136bec1af7
[mono.git] / mcs / class / corlib / Mono.Globalization.Unicode / Normalization.cs
1 using System;
2 using System.Globalization;
3 using System.Text;
4 using System.Runtime.CompilerServices;
5
6 using NUtil = Mono.Globalization.Unicode.NormalizationTableUtil;
7
8 namespace Mono.Globalization.Unicode
9 {
// Result of a per-character normalization quick check
// (see Normalization.QuickCheck).
internal enum NormalizationCheck {
        Yes = 0,
        No = 1,
        Maybe = 2
}
15
16         internal unsafe class Normalization
17         {
// Bit flags tested against the value returned by PropValue.
// Each flag occupies its own bit.
public const int NoNfd = 1 << 0;
public const int NoNfkd = 1 << 1;
public const int NoNfc = 1 << 2;
public const int MaybeNfc = 1 << 3;
public const int NoNfkc = 1 << 4;
public const int MaybeNfkc = 1 << 5;
public const int FullCompositionExclusion = 1 << 6;
public const int IsUnsafe = 1 << 7;
// Currently disabled flags:
//      public const int ExpandOnNfd = 256;
//      public const int ExpandOnNfc = 512;
//      public const int ExpandOnNfkd = 1024;
//      public const int ExpandOnNfkc = 2048;
30
// Returns the props table entry found at the index that
// NUtil.PropIdx computes for cp.
static uint PropValue (int cp)
{
        return *(props + NUtil.PropIdx (cp));
}
35
// Returns the charMapIndex table entry found at the index that
// NUtil.MapIdx computes for cp.  Callers use the result as a
// position in mappedChars, with 0 meaning "no entry".
static int CharMapIdx (int cp)
{
        return *(charMapIndex + NUtil.MapIdx (cp));
}
40
// Looks up the combining class of c in the combiningClass table,
// using the index NUtil.Combining.ToIndex computes for it.
static byte GetCombiningClass (int c)
{
        return *(combiningClass + NUtil.Combining.ToIndex (c));
}
45
// Maps a mappedChars position (src) to the composite stored for it
// in mapIdxToComposite, via NUtil.Composite.ToIndex.
static int GetPrimaryCompositeFromMapIndex (int src)
{
        return *(mapIdxToComposite + NUtil.Composite.ToIndex (src));
}
50
// Returns the helperIndex table entry for cp (index computed by
// NUtil.Helper.ToIndex).  TryComposeWithPreviousStarter treats 0 as
// "nothing composes with this starter".
static int GetPrimaryCompositeHelperIndex (int cp)
{
        return *(helperIndex + NUtil.Helper.ToIndex (cp));
}
55
// Produces the composed form of source: decompose first, then
// recombine.  Returns source itself when neither step had to
// allocate a working buffer.
private static string Compose (string source, int checkType)
{
        StringBuilder work = null;
        Decompose (source, ref work, checkType);

        if (work != null) {
                // Decomposition changed something; recombine in place.
                Combine (work, 0, checkType);
                return work.ToString ();
        }

        // Nothing was decomposed; Combine allocates only if needed.
        work = Combine (source, 0, checkType);
        if (work == null)
                return source;
        return work.ToString ();
}
67
// Scans source for the first character whose quick check is not Yes.
// If there is none, returns null (no buffer needed).  Otherwise copies
// source into a new StringBuilder, combines it starting at that
// character and returns the builder.  The start argument is not
// consulted; the scan always begins at index 0.
private static StringBuilder Combine (string source, int start, int checkType)
{
        int first = 0;
        while (first < source.Length &&
               QuickCheck (source [first], checkType) == NormalizationCheck.Yes)
                first++;

        if (first == source.Length)
                return null;

        StringBuilder buffer = new StringBuilder (source.Length + source.Length / 10);
        buffer.Append (source);
        Combine (buffer, first, checkType);
        return buffer;
}
80
81 /*
82                 private static bool CanBePrimaryComposite (int i)
83                 {
84                         if (i >= 0x3400 && i <= 0x9FBB)
85                                 return GetPrimaryCompositeHelperIndex (i) != 0;
86                         return (PropValue (i) & IsUnsafe) != 0;
87                 }
88 */
// Recombines sb in place, starting at index i: first the Hangul jamo
// pass, then table-driven composition of every character whose quick
// check is not Yes.
private static void Combine (StringBuilder sb, int i, int checkType)
{
        // Start the Hangul pass one character earlier when possible,
        // since position i may hold a V or T jamo.
        int hangulStart = i;
        if (hangulStart > 0)
                hangulStart--;
        CombineHangul (sb, null, hangulStart);

        for (int pos = i; pos < sb.Length; ) {
                if (QuickCheck (sb [pos], checkType) == NormalizationCheck.Yes)
                        pos++;
                else
                        pos = TryComposeWithPreviousStarter (sb, null, pos);
        }
}
103
// Hangul composition pass over the text starting at index current.
// Exactly one of sb / s is used (sb when it is non-null).
//
// With sb: every L+V pair becomes an LV syllable and every LV+T pair
// becomes an LVT syllable, in place; returns the resulting length.
// With s only (check mode): returns -1 as soon as a composable pair
// is found, otherwise the length of s.
private static int CombineHangul (StringBuilder sb, string s, int current)
{
        int total = sb != null ? sb.Length : s.Length;
        int prev = Fetch (sb, s, current);
        int pos = current + 1;

        while (pos < total) {
                int next = Fetch (sb, s, pos);
                int merged = -1; // stays -1 when prev/next do not compose

                int lIndex = prev - HangulLBase;
                int vIndex = next - HangulVBase;
                if (lIndex >= 0 && lIndex < HangulLCount &&
                    vIndex >= 0 && vIndex < HangulVCount) {
                        // L + V -> LV syllable
                        merged = HangulSBase + (lIndex * HangulVCount + vIndex) * HangulTCount;
                } else {
                        int sIndex = prev - HangulSBase;
                        int tIndex = next - HangulTBase;
                        if (sIndex >= 0 && sIndex < HangulSCount &&
                            (sIndex % HangulTCount) == 0 &&
                            tIndex > 0 && tIndex < HangulTCount) {
                                // LV + T -> LVT syllable
                                merged = prev + tIndex;
                        }
                }

                if (merged < 0) {
                        // Nothing to compose; move on.
                        prev = next;
                        pos++;
                        continue;
                }

                if (sb == null)
                        return -1;

                // Overwrite prev with the syllable and drop next.  pos is
                // left alone so the following character is compared with
                // the new syllable.
                sb [pos - 1] = (char) merged;
                sb.Remove (pos, 1);
                total--;
                prev = merged;
        }

        return total;
}
158
// Reads the UTF-16 code unit at index i from sb when it is non-null,
// otherwise from s.
static int Fetch (StringBuilder sb, string s, int i)
{
        if (sb == null)
                return s [i];
        return sb [i];
}
163
// Cf. figure 7, section 1.3 of http://unicode.org/reports/tr15/.
//
// Tries to compose a character with the starter (combining class 0)
// that precedes position current.  Exactly one of sb / s is used.
//
// Returns the index to continue scanning from.  In check mode
// (sb == null) returns -1 as soon as a composition would apply.
static int TryComposeWithPreviousStarter (StringBuilder sb, string s, int current)
{
        // Locate the starter to compose with.
        int starterPos = current - 1;
        bool currentIsStarter = GetCombiningClass (Fetch (sb, s, current)) == 0;
        if (currentIsStarter) {
                // A starter may only combine with a directly adjacent starter.
                if (starterPos < 0 || GetCombiningClass (Fetch (sb, s, starterPos)) != 0)
                        return current + 1;
        } else {
                // Walk back over combining marks.
                for (; starterPos >= 0; starterPos--) {
                        if (GetCombiningClass (Fetch (sb, s, starterPos)) == 0)
                                break;
                }
                if (starterPos < 0)
                        return current + 1;
        }

        int starter = Fetch (sb, s, starterPos);

        // Position in mappedChars after which the decompositions that
        // begin with this starter are listed; 0 means there are none.
        int tableStart = GetPrimaryCompositeHelperIndex (starter);
        if (tableStart == 0)
                return current + 1;

        int total = sb != null ? sb.Length : s.Length;
        int skippedClass = -1;
        int scan = starterPos + 1;
        while (scan < total) {
                int candidate = Fetch (sb, s, scan);
                int cc = GetCombiningClass (candidate);

                // A character with the same class as one we already
                // passed over without combining is passed over as well.
                if (cc != skippedClass) {
                        int composite = TryCompose (tableStart, starter, candidate);
                        if (composite != 0) {
                                // Not normalized, and we are only checking.
                                if (sb == null)
                                        return -1;

                                // Full Unicode warning: This will break when the
                                // underlying tables are extended.
                                sb [starterPos] = (char) composite;
                                sb.Remove (scan, 1);
                                return current;
                        }

                        // Gray box: reached the next starter.  We're done.
                        if (cc == 0)
                                return scan + 1;

                        skippedClass = cc;
                }
                scan++;
        }

        return total;
}
220
// Walks the zero-terminated mappedChars entries beginning at i for as
// long as they start with starter, looking for the two-element entry
// (starter, candidate).  Returns the composite for that entry unless
// it carries the FullCompositionExclusion flag; returns 0 when no
// usable entry is found.
static int TryCompose (int i, int starter, int candidate)
{
        int entry = i;
        while (mappedChars [entry] == starter) {
                bool exactPair = mappedChars [entry + 1] == candidate &&
                                 mappedChars [entry + 2] == 0;
                if (exactPair) {
                        int composite = GetPrimaryCompositeFromMapIndex (entry);
                        bool excluded = (PropValue (composite) & FullCompositionExclusion) != 0;
                        if (!excluded)
                                return composite;
                }

                // Advance to the terminator of this entry, then step past it.
                for (; mappedChars [entry] != 0; entry++) {
                }
                entry++;
        }

        return 0;
}
240
// Returns the decomposed form of source, or source itself when the
// decomposition pass did not need to build a new string.
static string Decompose (string source, int checkType)
{
        StringBuilder result = null;
        Decompose (source, ref result, checkType);
        if (result == null)
                return source;
        return result.ToString ();
}
247
// Decomposes every character of source whose quick check is No,
// writing into sb (created on demand by DecomposeChar), then applies
// canonical reordering.  sb is left untouched when no character
// needed decomposing and no reordering was required.
static void Decompose (string source,
        ref StringBuilder sb, int checkType)
{
        int [] scratch = null;  // decomposition buffer, allocated lazily
        int copiedUpTo = 0;     // source chars before this index are already in sb

        int pos = 0;
        while (pos < source.Length) {
                if (QuickCheck (source [pos], checkType) == NormalizationCheck.No)
                        DecomposeChar (ref sb, ref scratch, source,
                                pos, checkType, ref copiedUpTo);
                pos++;
        }

        // Flush the unmodified tail.
        if (sb != null)
                sb.Append (source, copiedUpTo, source.Length - copiedUpTo);

        ReorderCanonical (source, ref sb, 1);
}
261
// Puts runs of combining marks (non-zero combining class) into
// non-decreasing combining-class order.
//
// When sb is null the text is read from src, and a StringBuilder copy
// is created only if an out-of-order pair is found; otherwise sb is
// reordered in place beginning at index start.
//
// Fix: after swapping a pair the scan now steps back one position so
// the moved mark is compared with its new predecessor.  Previously the
// decrement was cancelled by the loop increment, so a mark could move
// back by only one slot (classes 230,230,220 ended as 230,220,230).
static void ReorderCanonical (string src, ref StringBuilder sb, int start)
{
        if (sb == null) {
                // check only with src.
                for (int i = 1; i < src.Length; i++) {
                        int level = GetCombiningClass (src [i]);
                        if (level == 0)
                                continue;
                        if (GetCombiningClass (src [i - 1]) > level) {
                                sb = new StringBuilder (src.Length);
                                sb.Append (src, 0, src.Length);
                                ReorderCanonical (src, ref sb, i);
                                return;
                        }
                }
                return;
        }

        // check only with sb.  Index 0 has no predecessor, so never
        // start below 1.
        int pos = start < 1 ? 1 : start;
        while (pos < sb.Length) {
                int level = GetCombiningClass (sb [pos]);
                if (level == 0 || GetCombiningClass (sb [pos - 1]) <= level) {
                        pos++;
                        continue;
                }

                char c = sb [pos - 1];
                sb [pos - 1] = sb [pos];
                sb [pos] = c;

                // Re-examine the moved mark against its new predecessor.
                if (pos > 1)
                        pos--;
        }
}
292
// Copies the not-yet-copied run s[start..i) into sb (creating sb on
// first use), appends the decomposition of s[i] as UTF-16, and sets
// start to i + 1.
//
// buf is scratch space for GetCanonical and is allocated on first use.
//
// Fixes in the surrogate path: the code point is reduced by 0x10000
// before splitting, the shift is parenthesized ("x >> 10 + 0xD800"
// parsed as "x >> (10 + 0xD800)"), the low half is masked with 0x3FF
// (10 bits), and U+FFFF is now emitted as a single char.
static void DecomposeChar (ref StringBuilder sb,
        ref int [] buf, string s, int i, int checkType, ref int start)
{
        if (sb == null)
                sb = new StringBuilder (s.Length + 100);
        sb.Append (s, start, i - start);
        if (buf == null)
                buf = new int [19];
        int n = GetCanonical (s [i], buf, 0, checkType);
        for (int x = 0; x < n; x++) {
                int cp = buf [x];
                if (cp <= char.MaxValue) {
                        sb.Append ((char) cp);
                } else { // supplementary plane: emit a surrogate pair
                        int offset = cp - 0x10000;
                        sb.Append ((char) ((offset >> 10) + 0xD800));
                        sb.Append ((char) ((offset & 0x3FF) + 0xDC00));
                }
        }
        start = i + 1;
}
312
// Per-character quick check for a normalization form.
// type: 1 = NFD, 2 = NFKC, 3 = NFKD, anything else = NFC.
// The decomposed forms (1 and 3) only ever answer Yes or No; the
// composed forms may also answer Maybe.
public static NormalizationCheck QuickCheck (char c, int type)
{
        if (type == 1 || type == 3) {
                // U+AC00..U+D7A3 (HangulSBase .. HangulSBase + HangulSCount - 1)
                // is rejected without consulting the table.
                if (c >= '\uAC00' && c <= '\uD7A3')
                        return NormalizationCheck.No;

                uint noBit = type == 1 ? (uint) NoNfd : (uint) NoNfkd;
                if ((PropValue ((int) c) & noBit) != 0)
                        return NormalizationCheck.No;
                return NormalizationCheck.Yes;
        }

        uint noFlag = type == 2 ? (uint) NoNfkc : (uint) NoNfc;
        uint maybeFlag = type == 2 ? (uint) MaybeNfkc : (uint) MaybeNfc;

        uint flags = PropValue ((int) c);
        if ((flags & noFlag) != 0)
                return NormalizationCheck.No;
        if ((flags & maybeFlag) != 0)
                return NormalizationCheck.Maybe;
        return NormalizationCheck.Yes;
}
342
343                 /* for now we don't use FC_NFKC closure
344                 public static bool IsMultiForm (char c)
345                 {
346                         return (PropValue ((int) c) & 0xF0000000) != 0;
347                 }
348
349                 public static char SingleForm (char c)
350                 {
351                         uint v = PropValue ((int) c);
352                         int idx = (int) ((v & 0x7FFF0000) >> 16);
353                         return (char) singleNorm [idx];
354                 }
355
356                 public static void MultiForm (char c, char [] buf, int index)
357                 {
358                         // FIXME: handle surrogate
359                         uint v = PropValue ((int) c);
360                         int midx = (int) ((v & 0x7FFF0000) >> 16);
361                         buf [index] = (char) multiNorm [midx];
362                         buf [index + 1] = (char) multiNorm [midx + 1];
363                         buf [index + 2] = (char) multiNorm [midx + 2];
364                         buf [index + 3] = (char) multiNorm [midx + 3];
365                         if (buf [index + 3] != 0)
366                                 buf [index + 4] = (char) 0; // zero termination
367                 }
368                 */
369
// Constants for arithmetic Hangul syllable composition/decomposition
// (used by CombineHangul and GetCanonicalHangul).
const int HangulSBase = 0xAC00;
const int HangulLBase = 0x1100;
const int HangulVBase = 0x1161;
const int HangulTBase = 0x11A7;
const int HangulLCount = 19;
const int HangulVCount = 21;
const int HangulTCount = 28;
const int HangulNCount = HangulVCount * HangulTCount;   // 588
const int HangulSCount = HangulLCount * HangulNCount;   // 11172
375
// If s lies in the range HangulSBase .. HangulSBase + HangulSCount - 1,
// writes its L, V and (when present) T jamo into buf starting at
// bufIdx, followed by a 0 terminator, and returns the index of that
// terminator.  Otherwise writes nothing and returns bufIdx unchanged.
private static int GetCanonicalHangul (int s, int [] buf, int bufIdx)
{
        int offset = s - HangulSBase;
        if (offset >= 0 && offset < HangulSCount) {
                int tIndex = offset % HangulTCount;

                buf [bufIdx++] = HangulLBase + offset / HangulNCount;
                buf [bufIdx++] = HangulVBase + (offset % HangulNCount) / HangulTCount;
                // tIndex 0 means the syllable has no trailing consonant.
                if (tIndex != 0)
                        buf [bufIdx++] = HangulTBase + tIndex;
                buf [bufIdx] = 0;
        }
        return bufIdx;
}
395
// Writes the full decomposition of c into buf starting at bufIdx and
// returns the index just past the last element written.
static int GetCanonical (int c, int [] buf, int bufIdx, int checkType)
{
        // Hangul syllables are decomposed arithmetically.
        int afterHangul = GetCanonicalHangul (c, buf, bufIdx);
        if (afterHangul > bufIdx)
                return afterHangul;

        int mapPos = CharMapIdx (c);
        if (mapPos == 0 || mappedChars [mapPos] == c) {
                // No mapping entry, or the entry starts with c itself:
                // the character is emitted as is.
                buf [bufIdx] = c;
                return bufIdx + 1;
        }

        // Character c maps to one or more decomposed chars.
        // http://www.unicode.org/reports/tr15/tr15-31.html, 1.3:
        // Full decomposition involves recursive application of the
        // Decomposition_Mapping values.  Note that QuickCheck does
        // not currently support astral plane codepoints.
        while (mappedChars [mapPos] != 0) {
                int part = mappedChars [mapPos];
                bool isFinal = part <= 0xffff &&
                        QuickCheck ((char) part, checkType) == NormalizationCheck.Yes;
                if (isFinal)
                        buf [bufIdx++] = part;
                else
                        bufIdx = GetCanonical (part, buf, bufIdx, checkType);
                mapPos++;
        }

        return bufIdx;
}
423
// Reports whether source is already in the normalization form selected
// by type (1 = NFD, 3 = NFKD; 2 = NFKC; any other value is handled as
// NFC by QuickCheck and Normalize).
//
// Returns false as soon as a character fails the quick check or a
// combining mark has a lower combining class than the character before
// it.  QuickCheck answers Maybe only for the composed forms; for those
// the answer is obtained by normalizing and comparing.
//
// Fix: the old fallback after the Maybe case was dead code for types 1
// and 3, and for type values outside 0..3 it advanced the index to the
// end of the string and then read source [source.Length].  Every Maybe
// is now resolved through Normalize, matching how Normalize and
// QuickCheck treat unknown type values.
public static bool IsNormalized (string source, int type)
{
        int prevCC = -1;
        for (int i = 0; i < source.Length; i++) {
                int cc = GetCombiningClass (source [i]);
                if (cc != 0 && cc < prevCC)
                        return false;
                prevCC = cc;

                switch (QuickCheck (source [i], type)) {
                case NormalizationCheck.No:
                        return false;
                case NormalizationCheck.Maybe:
                        // for those forms with composition, it cannot be checked here
                        return source == Normalize (source, type);
                }
        }
        return true;
}
460
// Normalizes source to the form selected by type: 1 and 3 take the
// decomposition path, every other value takes the composition path.
public static string Normalize (string source, int type)
{
        bool decomposeOnly = type == 1 || type == 3;
        if (decomposeOnly)
                return Decompose (source, type);
        return Compose (source, type);
}
472
// Raw pointers to the normalization tables; assigned once by the
// static constructor.

// Read by PropValue and GetCombiningClass respectively.
static byte* props, combiningClass;
// Zero-terminated decomposition entries (see TryCompose, GetCanonical).
static int* mappedChars;
// Read by CharMapIdx and GetPrimaryCompositeHelperIndex respectively.
static short* charMapIndex, helperIndex;
// Read by GetPrimaryCompositeFromMapIndex.
static ushort* mapIdxToComposite;
479
480 #if GENERATE_TABLE
481
public static readonly bool IsReady = true; // always

// Points the table pointers at the arrays compiled into this build.
//
// Fix: the pointers used to be taken inside "fixed" blocks and kept
// after those blocks ended, at which point the arrays are no longer
// pinned and the stored addresses are not guaranteed to stay valid.
// The arrays are now pinned through GCHandle; the handles are
// deliberately never freed because the pointers are used for the
// lifetime of the type.
static Normalization ()
{
        props = (byte*) PinTable (propsArr);
        mappedChars = (int*) PinTable (mappedCharsArr);
        charMapIndex = (short*) PinTable (charMapIndexArr);
        helperIndex = (short*) PinTable (helperIndexArr);
        mapIdxToComposite = (ushort*) PinTable (mapIdxToCompositeArr);
        combiningClass = (byte*) PinTable (combiningClassArr);
}

// Pins table for good and returns the address of its first element.
static IntPtr PinTable (Array table)
{
        System.Runtime.InteropServices.GCHandle handle =
                System.Runtime.InteropServices.GCHandle.Alloc (table,
                        System.Runtime.InteropServices.GCHandleType.Pinned);
        return handle.AddrOfPinnedObject ();
}
505 #else
506
// Guards the table loading done by the static constructor.
static object forLock = new object ();

// Set to true once the static constructor has stored all table pointers.
public static readonly bool isReady;

public static bool IsReady {
        get {
                return isReady;
        }
}

// Runtime-provided call that hands back the addresses of the six
// normalization tables.
[MethodImpl (MethodImplOptions.InternalCall)]
static extern void load_normalization_resource (
        out IntPtr props, out IntPtr mappedChars,
        out IntPtr charMapIndex, out IntPtr helperIndex,
        out IntPtr mapIdxToComposite, out IntPtr combiningClass);

// Asks the runtime for the table addresses and stores them in the
// typed pointer fields.
static Normalization ()
{
        IntPtr propsPtr;
        IntPtr mappedCharsPtr;
        IntPtr charMapIndexPtr;
        IntPtr helperIndexPtr;
        IntPtr compositePtr;
        IntPtr combiningClassPtr;

        lock (forLock) {
                load_normalization_resource (
                        out propsPtr, out mappedCharsPtr,
                        out charMapIndexPtr, out helperIndexPtr,
                        out compositePtr, out combiningClassPtr);

                props = (byte*) propsPtr;
                mappedChars = (int*) mappedCharsPtr;
                charMapIndex = (short*) charMapIndexPtr;
                helperIndex = (short*) helperIndexPtr;
                mapIdxToComposite = (ushort*) compositePtr;
                combiningClass = (byte*) combiningClassPtr;
        }

        isReady = true;
}
535         }
536 }
537 #endif
538
539                 //
540                 // autogenerated code or icall to fill array runs here
541                 //
542