648ba04dfaea9d8fe65492308791ff92d4366005
[mono.git] / mcs / class / corlib / Mono.Globalization.Unicode / Normalization.cs
1 using System;
2 using System.Globalization;
3 using System.Text;
4 using System.Runtime.CompilerServices;
5
6 using NUtil = Mono.Globalization.Unicode.NormalizationTableUtil;
7
8 namespace Mono.Globalization.Unicode
9 {
10         internal enum NormalizationCheck {
11                 Yes,
12                 No,
13                 Maybe
14         }
15
16         internal unsafe class Normalization
17         {
18                 public const int NoNfd = 1;
19                 public const int NoNfkd = 2;
20                 public const int NoNfc = 4;
21                 public const int MaybeNfc = 8;
22                 public const int NoNfkc = 16;
23                 public const int MaybeNfkc = 32;
24                 public const int FullCompositionExclusion = 64;
25                 public const int IsUnsafe = 128;
26 //              public const int ExpandOnNfd = 256;
27 //              public const int ExpandOnNfc = 512;
28 //              public const int ExpandOnNfkd = 1024;
29 //              public const int ExpandOnNfkc = 2048;
30
31                 static uint PropValue (int cp)
32                 {
33                         return props [NUtil.PropIdx (cp)];
34                 }
35
36                 static int CharMapIdx (int cp)
37                 {
38                         return charMapIndex [NUtil.MapIdx (cp)];
39                 }
40
41                 static int GetComposedStringLength (int ch)
42                 {
43                         int start = charMapIndex [NUtil.MapIdx (ch)];
44                         int i = start;
45                         while (mappedChars [i] != 0)
46                                 i++;
47                         return i - start;
48                 }
49
50                 static byte GetCombiningClass (int c)
51                 {
52                         return combiningClass [NUtil.Combining.ToIndex (c)];
53                 }
54
55                 static int GetPrimaryCompositeFromMapIndex (int src)
56                 {
57                         return mapIdxToComposite [NUtil.Composite.ToIndex (src)];
58                 }
59
60                 static int GetPrimaryCompositeHelperIndex (int cp)
61                 {
62                         int originalMapIndex = helperIndex [NUtil.Helper.ToIndex (cp)]; // it returns an index at uncompressed state.
63                         return NUtil.Map.ToIndex (originalMapIndex);
64                 }
65
66                 static int GetPrimaryCompositeCharIndex (object chars, int start)
67                 {
68                         string s = chars as string;
69                         StringBuilder sb = chars as StringBuilder;
70                         char startCh = s != null ? s [start] : sb [start];
71                         int charsLength = sb != null ? sb.Length : s.Length;
72
73                         int idx = GetPrimaryCompositeHelperIndex ((int) startCh);
74                         if (idx == 0)
75                                 return 0;
76                         while (mappedChars [idx] == startCh) {
77                                 for (int i = 1, j = 1; ; i++, j++) {
78                                         if (mappedChars [idx + i] == 0)
79                                                 // matched
80                                                 return idx;
81                                         if (start + i >= charsLength)
82                                                 return 0; // didn't match
83
84                                         // handle blocked characters here.
85                                         char curCh;
86                                         int combiningClass;
87                                         int nextCB = 0;
88                                         do {
89                                                 curCh = s != null ?
90                                                         s [start + j] :
91                                                         sb [start + j];
92                                                 combiningClass = GetCombiningClass (curCh);
93                                                 if (++j + start >= charsLength ||
94                                                         combiningClass == 0)
95                                                         break;
96                                                 nextCB = GetCombiningClass (
97                                                         s != null ?
98                                                         s [start + j] :
99                                                         sb [start + j]);
100                                         } while (nextCB > 0 && combiningClass >= nextCB);
101                                         j--;
102                                         if (mappedChars [idx + i] == curCh)
103                                                 continue;
104                                         if (mappedChars [idx + i] > curCh)
105                                                 return 0; // no match
106                                         // otherwise move idx to next item
107                                         while (mappedChars [i] != 0)
108                                                 i++;
109                                         idx += i + 1;
110                                         break;
111                                 }
112                         }
113                         // reached to end of entries
114                         return 0;
115                 }
116
117                 private static string Compose (string source, int checkType)
118                 {
119                         StringBuilder sb = null;
120                         Decompose (source, ref sb, checkType);
121                         if (sb == null)
122                                 sb = Combine (source, 0, checkType);
123                         else
124                                 Combine (sb, 0, checkType);
125
126                         return sb != null ? sb.ToString () : source;
127                 }
128
129                 private static StringBuilder Combine (string source, int start, int checkType)
130                 {
131                         for (int i = 0; i < source.Length; i++) {
132                                 if (QuickCheck (source [i], checkType) == NormalizationCheck.Yes)
133                                         continue;
134                                 StringBuilder sb = new StringBuilder (source.Length + source.Length / 10);
135                                 sb.Append (source);
136                                 Combine (sb, i, checkType);
137                                 return sb;
138                         }
139                         return null;
140                 }
141
142                 private static bool CanBePrimaryComposite (int i)
143                 {
144                         if (i >= 0x3400 && i <= 0x9FBB)
145                                 return GetPrimaryCompositeHelperIndex (i) != 0;
146                         return (PropValue (i) & IsUnsafe) != 0;
147                 }
148
149                 private static void Combine (StringBuilder sb, int start, int checkType)
150                 {
151                         for (int i = start; i < sb.Length; i++) {
152                                 switch (QuickCheck (sb [i], checkType)) {
153                                 case NormalizationCheck.Yes:
154                                         continue;
155                                 case NormalizationCheck.No:
156                                         break;
157                                 case NormalizationCheck.Maybe:
158                                         if (i == 0)
159                                                 continue;
160                                         else
161                                                 break;
162                                 }
163
164                                 int cur = i;
165                                 // FIXME: It should check "blocked" too
166                                 for (;i >= 0; i--)
167                                         if (!CanBePrimaryComposite ((int) sb [i]))
168                                                 break;
169                                 i++;
170                                 int idx = 0;
171                                 for (; i < cur; i++) {
172                                         idx = GetPrimaryCompositeMapIndex (sb, (int) sb [i], i);
173                                         if (idx > 0)
174                                                 break;
175                                 }
176                                 if (idx == 0) {
177                                         i = cur;
178                                         continue;
179                                 }
180                                 int ch = GetPrimaryCompositeFromMapIndex (idx);
181                                 int len = GetComposedStringLength (ch);
182                                 if (ch == 0 || len == 0)
183                                         throw new SystemException ("Internal error: should not happen.");
184                                 int removed = 0;
185                                 sb.Insert (i++, (char) ch); // always single character
186
187                                 // handle blocked characters here.
188                                 while (removed < len) {
189                                         if (i + 1 < sb.Length) {
190                                                 int cb = GetCombiningClass (sb [i]);
191                                                 if (cb > 0) {
192                                                         int next = GetCombiningClass (sb [i + 1]);
193                                                         if (next != 0 && cb >= next) {
194                                                                 i++;
195                                                                 continue;
196                                                         }
197                                                 }
198                                         }
199                                         sb.Remove (i, 1);
200                                         removed++;
201                                 }
202                                 i = cur - 1; // apply recursively
203                         }
204                 }
205
206                 static int GetPrimaryCompositeMapIndex (object o, int cur, int bufferPos)
207                 {
208                         if ((PropValue (cur) & FullCompositionExclusion) != 0)
209                                 return 0;
210                         if (GetCombiningClass (cur) != 0)
211                                 return 0; // not a starter
212                         return GetPrimaryCompositeCharIndex (o, bufferPos);
213                 }
214
215                 static string Decompose (string source, int checkType)
216                 {
217                         StringBuilder sb = null;
218                         Decompose (source, ref sb, checkType);
219                         return sb != null ? sb.ToString () : source;
220                 }
221
222                 static void Decompose (string source,
223                         ref StringBuilder sb, int checkType)
224                 {
225                         int [] buf = null;
226                         int start = 0;
227                         for (int i = 0; i < source.Length; i++)
228                                 if (QuickCheck (source [i], checkType) == NormalizationCheck.No)
229                                         DecomposeChar (ref sb, ref buf, source,
230                                                 i, ref start);
231                         if (sb != null)
232                                 sb.Append (source, start, source.Length - start);
233                         ReorderCanonical (source, ref sb, 1);
234                 }
235
236                 static void ReorderCanonical (string src, ref StringBuilder sb, int start)
237                 {
238                         if (sb == null) {
239                                 // check only with src.
240                                 for (int i = 1; i < src.Length; i++) {
241                                         int level = GetCombiningClass (src [i]);
242                                         if (level == 0)
243                                                 continue;
244                                         if (GetCombiningClass (src [i - 1]) > level) {
245                                                 sb = new StringBuilder (src.Length);
246                                                 sb.Append (src, 0, src.Length);
247                                                 ReorderCanonical (src, ref sb, i);
248                                                 return;
249                                         }
250                                 }
251                                 return;
252                         }
253                         // check only with sb
254                         for (int i = start; i < sb.Length; i++) {
255                                 int level = GetCombiningClass (sb [i]);
256                                 if (level == 0)
257                                         continue;
258                                 if (GetCombiningClass (sb [i - 1]) > level) {
259                                         char c = sb [i - 1];
260                                         sb [i - 1] = sb [i];
261                                         sb [i] = c;
262                                         i--; // apply recursively
263                                 }
264                         }
265                 }
266
267                 static void DecomposeChar (ref StringBuilder sb,
268                         ref int [] buf, string s, int i, ref int start)
269                 {
270                         if (sb == null)
271                                 sb = new StringBuilder (s.Length + 100);
272                         sb.Append (s, start, i - start);
273                         if (buf == null)
274                                 buf = new int [19];
275                         GetCanonical (s [i], buf, 0);
276                         for (int x = 0; ; x++) {
277                                 if (buf [x] == 0)
278                                         break;
279                                 if (buf [x] < char.MaxValue)
280                                         sb.Append ((char) buf [x]);
281                                 else { // surrogate
282                                         sb.Append ((char) (buf [x] >> 10 + 0xD800));
283                                         sb.Append ((char) ((buf [x] & 0x0FFF) + 0xDC00));
284                                 }
285                         }
286                         start = i + 1;
287                 }
288
289                 public static NormalizationCheck QuickCheck (char c, int type)
290                 {
291                         uint v;
292                         switch (type) {
293                         default: // NFC
294                                 v = PropValue ((int) c);
295                                 return (v & NoNfc) == 0 ?
296                                         (v & MaybeNfc) == 0 ?
297                                         NormalizationCheck.Yes :
298                                         NormalizationCheck.Maybe :
299                                         NormalizationCheck.No;
300                         case 1: // NFD
301                                 if ('\uAC00' <= c && c <= '\uD7A3')
302                                         return NormalizationCheck.No;
303                                 return (PropValue ((int) c) & NoNfd) != 0 ?
304                                         NormalizationCheck.No : NormalizationCheck.Yes;
305                         case 2: // NFKC
306                                 v = PropValue ((int) c);
307                                 return (v & NoNfkc) != 0 ? NormalizationCheck.No :
308                                         (v & MaybeNfkc) != 0 ?
309                                         NormalizationCheck.Maybe :
310                                         NormalizationCheck.Yes;
311                         case 3: // NFKD
312                                 if ('\uAC00' <= c && c <= '\uD7A3')
313                                         return NormalizationCheck.No;
314                                 return (PropValue ((int) c) & NoNfkd) != 0 ?
315                                         NormalizationCheck.No : NormalizationCheck.Yes;
316                         }
317                 }
318
319                 /* for now we don't use FC_NFKC closure
320                 public static bool IsMultiForm (char c)
321                 {
322                         return (PropValue ((int) c) & 0xF0000000) != 0;
323                 }
324
325                 public static char SingleForm (char c)
326                 {
327                         uint v = PropValue ((int) c);
328                         int idx = (int) ((v & 0x7FFF0000) >> 16);
329                         return (char) singleNorm [idx];
330                 }
331
332                 public static void MultiForm (char c, char [] buf, int index)
333                 {
334                         // FIXME: handle surrogate
335                         uint v = PropValue ((int) c);
336                         int midx = (int) ((v & 0x7FFF0000) >> 16);
337                         buf [index] = (char) multiNorm [midx];
338                         buf [index + 1] = (char) multiNorm [midx + 1];
339                         buf [index + 2] = (char) multiNorm [midx + 2];
340                         buf [index + 3] = (char) multiNorm [midx + 3];
341                         if (buf [index + 3] != 0)
342                                 buf [index + 4] = (char) 0; // zero termination
343                 }
344                 */
345
346                 public static void GetCanonical (int c, int [] buf, int bufIdx)
347                 {
348                         for (int i = CharMapIdx (c); mappedChars [i] != 0; i++)
349                                 buf [bufIdx++] = mappedChars [i];
350                         buf [bufIdx] = (char) 0;
351                 }
352
353                 public static bool IsNormalized (string source, int type)
354                 {
355                         int prevCC = -1;
356                         for (int i = 0; i < source.Length; i++) {
357                                 int cc = GetCombiningClass (source [i]);
358                                 if (cc != 0 && cc < prevCC)
359                                         return false;
360                                 prevCC = cc;
361                                 switch (QuickCheck (source [i], type)) {
362                                 case NormalizationCheck.Yes:
363                                         break;
364                                 case NormalizationCheck.No:
365                                         return false;
366                                 case NormalizationCheck.Maybe:
367                                         // for those forms with composition, it cannot be checked here
368                                         switch (type) {
369                                         case 0: // NFC
370                                         case 2: // NFKC
371                                                 return source == Normalize (source, type);
372                                         }
373                                         // go on...
374                                         
375                                         // partly copied from Combine()
376                                         int cur = i;
377                                         // FIXME: It should check "blocked" too
378                                         for (;i >= 0; i--)
379                                                 if (!CanBePrimaryComposite ((int) source [i]))
380                                                         break;
381                                         i++;
382                                         // Now i is the "starter"
383                                         for (; i < cur; i++) {
384                                                 if (GetPrimaryCompositeCharIndex (source, i) != 0)
385                                                         return false;
386                                         }
387                                         break;
388                                 }
389                         }
390                         return true;
391                 }
392
393                 public static string Normalize (string source, int type)
394                 {
395                         switch (type) {
396                         default:
397                         case 2:
398                                 return Compose (source, type);
399                         case 1:
400                                 return Decompose (source, type);
401                         }
402                 }
403
404                 static byte* props;
405                 static int* mappedChars;
406                 static short* charMapIndex;
407                 static short* helperIndex;
408                 static ushort* mapIdxToComposite;
409                 static byte* combiningClass;
410
411 #if GENERATE_TABLE
412
413                 public static readonly bool IsReady = true; // always
414
415                 static Normalization ()
416                 {
417                         fixed (byte* tmp = propsArr) {
418                                 props = tmp;
419                         }
420                         fixed (int* tmp = mappedCharsArr) {
421                                 mappedChars = tmp;
422                         }
423                         fixed (short* tmp = charMapIndexArr) {
424                                 charMapIndex = tmp;
425                         }
426                         fixed (short* tmp = helperIndexArr) {
427                                 helperIndex = tmp;
428                         }
429                         fixed (ushort* tmp = mapIdxToCompositeArr) {
430                                 mapIdxToComposite = tmp;
431                         }
432                         fixed (byte* tmp = combiningClassArr) {
433                                 combiningClass = tmp;
434                         }
435                 }
436 #else
437
438                 static object forLock = new object ();
439                 public static readonly bool isReady;
440
441                 public static bool IsReady {
442                         get { return isReady; }
443                 }
444
445                 [MethodImpl (MethodImplOptions.InternalCall)]
446                 static extern void load_normalization_resource (
447                         out IntPtr props, out IntPtr mappedChars,
448                         out IntPtr charMapIndex, out IntPtr helperIndex,
449                         out IntPtr mapIdxToComposite, out IntPtr combiningClass);
450
451                 static Normalization ()
452                 {
453                         IntPtr p1, p2, p3, p4, p5, p6;
454                         lock (forLock) {
455                                 load_normalization_resource (out p1, out p2, out p3, out p4, out p5, out p6);
456                                 props = (byte*) p1;
457                                 mappedChars = (int*) p2;
458                                 charMapIndex = (short*) p3;
459                                 helperIndex = (short*) p4;
460                                 mapIdxToComposite = (ushort*) p5;
461                                 combiningClass = (byte*) p6;
462                         }
463
464                         isReady = true;
465                 }
466         }
467 }
468 #endif
469
470                 //
471                 // autogenerated code or icall to fill array runs here
472                 //
473