2010-03-12 Jb Evain <jbevain@novell.com>
[mono.git] / mcs / class / corlib / Mono.Globalization.Unicode / Normalization.cs
1 using System;
2 using System.Globalization;
3 using System.Text;
4 using System.Runtime.CompilerServices;
5
6 using NUtil = Mono.Globalization.Unicode.NormalizationTableUtil;
7
8 namespace Mono.Globalization.Unicode
9 {
10         internal enum NormalizationCheck {
11                 Yes,
12                 No,
13                 Maybe
14         }
15
16         internal unsafe class Normalization
17         {
                // Bit flags that are tested against the value returned by
                // PropValue () for a code point.

                // Set when the character is not allowed in NFD / NFKD
                // (QuickCheck returns No for those forms).
                public const int NoNfd = 1;
                public const int NoNfkd = 2;
                // NFC: "No" and "Maybe" quick-check answers.
                public const int NoNfc = 4;
                public const int MaybeNfc = 8;
                // NFKC: "No" and "Maybe" quick-check answers.
                public const int NoNfkc = 16;
                public const int MaybeNfkc = 32;
                // When set, GetPrimaryCompositeMapIndex () refuses to compose
                // starting from this character.
                public const int FullCompositionExclusion = 64;
                // Only referenced from commented-out code in this file.
                public const int IsUnsafe = 128;
//              public const int ExpandOnNfd = 256;
//              public const int ExpandOnNfc = 512;
//              public const int ExpandOnNfkd = 1024;
//              public const int ExpandOnNfkc = 2048;
30
31                 static uint PropValue (int cp)
32                 {
33                         return props [NUtil.PropIdx (cp)];
34                 }
35
36                 static int CharMapIdx (int cp)
37                 {
38                         return charMapIndex [NUtil.MapIdx (cp)];
39                 }
40
41                 static int GetNormalizedStringLength (int ch)
42                 {
43                         int start = charMapIndex [NUtil.MapIdx (ch)];
44                         int i = start;
45                         while (mappedChars [i] != 0)
46                                 i++;
47                         return i - start;
48                 }
49
50                 static byte GetCombiningClass (int c)
51                 {
52                         return combiningClass [NUtil.Combining.ToIndex (c)];
53                 }
54
55                 static int GetPrimaryCompositeFromMapIndex (int src)
56                 {
57                         return mapIdxToComposite [NUtil.Composite.ToIndex (src)];
58                 }
59
60                 static int GetPrimaryCompositeHelperIndex (int cp)
61                 {
62                         return helperIndex [NUtil.Helper.ToIndex (cp)];
63                 }
64
65                 static int GetPrimaryCompositeCharIndex (object chars, int start)
66                 {
67                         string s = chars as string;
68                         StringBuilder sb = chars as StringBuilder;
69                         char startCh = s != null ? s [start] : sb [start];
70                         int charsLength = sb != null ? sb.Length : s.Length;
71
72                         int idx = GetPrimaryCompositeHelperIndex ((int) startCh);
73                         if (idx == 0)
74                                 return 0;
75                         while (mappedChars [idx] == startCh) {
76                                 int prevCB = 0;
77                                 int combiningClass = 0;
78                                 for (int i = 1, j = 1; ; i++, j++) {
79                                         prevCB = combiningClass;
80
81                                         if (mappedChars [idx + i] == 0)
82                                                 // matched
83                                                 return idx;
84                                         if (start + i >= charsLength)
85                                                 return 0; // didn't match
86
87                                         // handle blocked characters here.
88                                         char curCh;
89                                         bool match = false;
90                                         do {
91                                                 curCh = s != null ?
92                                                         s [start + j] :
93                                                         sb [start + j];
94                                                 combiningClass = GetCombiningClass (curCh);
95                                                 if (mappedChars [idx + i] == curCh) {
96                                                         match = true;
97                                                         break;
98                                                 }
99                                                 if (combiningClass < prevCB) // blocked. Give up this map entry.
100                                                         break;
101                                                 if (++j + start >= charsLength || combiningClass == 0)
102                                                         break;
103                                         } while (true);
104
105                                         if (match)
106                                                 continue; // check next character in the current map entry string.
107                                         if (prevCB < combiningClass) {
108                                                 j--;
109                                                 if (mappedChars [idx + i] == curCh)
110                                                         continue;
111                                                 //if (mappedChars [idx + i] > curCh)
112                                                 //      return 0; // no match
113                                         }
114                                         // otherwise move idx to next item
115                                         while (mappedChars [i] != 0)
116                                                 i++;
117                                         idx += i + 1;
118                                         break;
119                                 }
120                         }
121                         // reached to end of entries
122                         return 0;
123                 }
124
125                 private static string Compose (string source, int checkType)
126                 {
127                         StringBuilder sb = null;
128                         Decompose (source, ref sb, checkType);
129                         if (sb == null)
130                                 sb = Combine (source, 0, checkType);
131                         else
132                                 Combine (sb, 0, checkType);
133
134                         return sb != null ? sb.ToString () : source;
135                 }
136
137                 private static StringBuilder Combine (string source, int start, int checkType)
138                 {
139                         for (int i = 0; i < source.Length; i++) {
140                                 if (QuickCheck (source [i], checkType) == NormalizationCheck.Yes)
141                                         continue;
142                                 StringBuilder sb = new StringBuilder (source.Length + source.Length / 10);
143                                 sb.Append (source);
144                                 Combine (sb, i, checkType);
145                                 return sb;
146                         }
147                         return null;
148                 }
149
150 /*
151                 private static bool CanBePrimaryComposite (int i)
152                 {
153                         if (i >= 0x3400 && i <= 0x9FBB)
154                                 return GetPrimaryCompositeHelperIndex (i) != 0;
155                         return (PropValue (i) & IsUnsafe) != 0;
156                 }
157 */
                // Recombines characters of sb in place, starting at index
                // start. For every character whose quick check is not Yes it
                // looks backwards for a combining-class-0 character, tries to
                // find a mappedChars entry matching the text from there, and
                // if found replaces the matched characters with the single
                // composite character.
                //
                // Throws SystemException when the tables yield no composite
                // (or an empty mapping) for a matched entry.
                private static void Combine (StringBuilder sb, int start, int checkType)
                {
                        for (int i = start; i < sb.Length; i++) {
                                if (QuickCheck (sb [i], checkType) == NormalizationCheck.Yes)
                                        continue;

                                // Remember where the scan was so it can be resumed.
                                int cur = i;
                                // FIXME: It should check "blocked" too
                                for (;i > 0; i--) // this loop does not check sb[0], but regardless of the condition below it should not go under 0.
                                        if (GetCombiningClass ((int) sb [i]) == 0)
                                                break;

                                // Try each position from the one found above up to
                                // (but excluding) cur as the start of a composition.
                                int idx = 0; // index to mappedChars
                                for (; i < cur; i++) {
                                        idx = GetPrimaryCompositeMapIndex (sb, (int) sb [i], i);
                                        if (idx > 0)
                                                break;
                                }
                                if (idx == 0) {
                                        // Nothing composes here; resume after cur.
                                        i = cur;
                                        continue;
                                }

                                int prim = GetPrimaryCompositeFromMapIndex (idx);
                                int len = GetNormalizedStringLength (prim);
                                if (prim == 0 || len == 0)
                                        throw new SystemException ("Internal error: should not happen. Input: " + sb);
                                int removed = 0;
                                sb.Insert (i++, (char) prim); // always single character

                                // handle blocked characters here.
                                // Delete the len characters of the matched entry that
                                // follow the inserted composite, stepping over any
                                // character that is not the next expected one.
                                // NOTE(review): there is no bound check on i here;
                                // presumably the earlier match guarantees all expected
                                // characters are present - confirm.
                                while (removed < len) {
                                        if (sb [i] == mappedChars [idx + removed]) {
                                                sb.Remove (i, 1);
                                                removed++;
                                                // otherwise, skip it.
                                        }
                                        else
                                                i++;
                                }
                                // Step back so that, after the loop increment, the
                                // scan continues at index cur again.
                                i = cur - 1;
                        }
                }
201
202                 static int GetPrimaryCompositeMapIndex (object o, int cur, int bufferPos)
203                 {
204                         if ((PropValue (cur) & FullCompositionExclusion) != 0)
205                                 return 0;
206                         if (GetCombiningClass (cur) != 0)
207                                 return 0; // not a starter
208                         return GetPrimaryCompositeCharIndex (o, bufferPos);
209                 }
210
211                 static string Decompose (string source, int checkType)
212                 {
213                         StringBuilder sb = null;
214                         Decompose (source, ref sb, checkType);
215                         return sb != null ? sb.ToString () : source;
216                 }
217
218                 static void Decompose (string source,
219                         ref StringBuilder sb, int checkType)
220                 {
221                         int [] buf = null;
222                         int start = 0;
223                         for (int i = 0; i < source.Length; i++)
224                                 if (QuickCheck (source [i], checkType) == NormalizationCheck.No)
225                                         DecomposeChar (ref sb, ref buf, source,
226                                                 i, ref start);
227                         if (sb != null)
228                                 sb.Append (source, start, source.Length - start);
229                         ReorderCanonical (source, ref sb, 1);
230                 }
231
232                 static void ReorderCanonical (string src, ref StringBuilder sb, int start)
233                 {
234                         if (sb == null) {
235                                 // check only with src.
236                                 for (int i = 1; i < src.Length; i++) {
237                                         int level = GetCombiningClass (src [i]);
238                                         if (level == 0)
239                                                 continue;
240                                         if (GetCombiningClass (src [i - 1]) > level) {
241                                                 sb = new StringBuilder (src.Length);
242                                                 sb.Append (src, 0, src.Length);
243                                                 ReorderCanonical (src, ref sb, i);
244                                                 return;
245                                         }
246                                 }
247                                 return;
248                         }
249                         // check only with sb
250                         for (int i = start; i < sb.Length; i++) {
251                                 int level = GetCombiningClass (sb [i]);
252                                 if (level == 0)
253                                         continue;
254                                 if (GetCombiningClass (sb [i - 1]) > level) {
255                                         char c = sb [i - 1];
256                                         sb [i - 1] = sb [i];
257                                         sb [i] = c;
258                                         i--; // apply recursively
259                                 }
260                         }
261                 }
262
263                 static void DecomposeChar (ref StringBuilder sb,
264                         ref int [] buf, string s, int i, ref int start)
265                 {
266                         if (sb == null)
267                                 sb = new StringBuilder (s.Length + 100);
268                         sb.Append (s, start, i - start);
269                         if (buf == null)
270                                 buf = new int [19];
271                         GetCanonical (s [i], buf, 0);
272                         for (int x = 0; ; x++) {
273                                 if (buf [x] == 0)
274                                         break;
275                                 if (buf [x] < char.MaxValue)
276                                         sb.Append ((char) buf [x]);
277                                 else { // surrogate
278                                         sb.Append ((char) (buf [x] >> 10 + 0xD800));
279                                         sb.Append ((char) ((buf [x] & 0x0FFF) + 0xDC00));
280                                 }
281                         }
282                         start = i + 1;
283                 }
284
285                 public static NormalizationCheck QuickCheck (char c, int type)
286                 {
287                         uint v;
288                         switch (type) {
289                         default: // NFC
290                                 v = PropValue ((int) c);
291                                 return (v & NoNfc) == 0 ?
292                                         (v & MaybeNfc) == 0 ?
293                                         NormalizationCheck.Yes :
294                                         NormalizationCheck.Maybe :
295                                         NormalizationCheck.No;
296                         case 1: // NFD
297                                 if ('\uAC00' <= c && c <= '\uD7A3')
298                                         return NormalizationCheck.No;
299                                 return (PropValue ((int) c) & NoNfd) != 0 ?
300                                         NormalizationCheck.No : NormalizationCheck.Yes;
301                         case 2: // NFKC
302                                 v = PropValue ((int) c);
303                                 return (v & NoNfkc) != 0 ? NormalizationCheck.No :
304                                         (v & MaybeNfkc) != 0 ?
305                                         NormalizationCheck.Maybe :
306                                         NormalizationCheck.Yes;
307                         case 3: // NFKD
308                                 if ('\uAC00' <= c && c <= '\uD7A3')
309                                         return NormalizationCheck.No;
310                                 return (PropValue ((int) c) & NoNfkd) != 0 ?
311                                         NormalizationCheck.No : NormalizationCheck.Yes;
312                         }
313                 }
314
315                 /* for now we don't use FC_NFKC closure
316                 public static bool IsMultiForm (char c)
317                 {
318                         return (PropValue ((int) c) & 0xF0000000) != 0;
319                 }
320
321                 public static char SingleForm (char c)
322                 {
323                         uint v = PropValue ((int) c);
324                         int idx = (int) ((v & 0x7FFF0000) >> 16);
325                         return (char) singleNorm [idx];
326                 }
327
328                 public static void MultiForm (char c, char [] buf, int index)
329                 {
330                         // FIXME: handle surrogate
331                         uint v = PropValue ((int) c);
332                         int midx = (int) ((v & 0x7FFF0000) >> 16);
333                         buf [index] = (char) multiNorm [midx];
334                         buf [index + 1] = (char) multiNorm [midx + 1];
335                         buf [index + 2] = (char) multiNorm [midx + 2];
336                         buf [index + 3] = (char) multiNorm [midx + 3];
337                         if (buf [index + 3] != 0)
338                                 buf [index + 4] = (char) 0; // zero termination
339                 }
340                 */
341
                // Constants used by GetCanonicalHangul () to split a code point
                // in the range [HangulSBase, HangulSBase + HangulSCount) into
                // L, V and optional T components by plain arithmetic.
                const int HangulSBase = 0xAC00, HangulLBase = 0x1100,
                                  HangulVBase = 0x1161, HangulTBase = 0x11A7,
                                  HangulLCount = 19, HangulVCount = 21, HangulTCount = 28,
                                  HangulNCount = HangulVCount * HangulTCount,   // 588
                                  HangulSCount = HangulLCount * HangulNCount;   // 11172
347
348                 private static bool GetCanonicalHangul (int s, int [] buf, int bufIdx)
349                 {
350                         int idx = s - HangulSBase;
351                         if (idx < 0 || idx >= HangulSCount) {
352                                 return false;
353                         }
354
355                         int L = HangulLBase + idx / HangulNCount;
356                         int V = HangulVBase + (idx % HangulNCount) / HangulTCount;
357                         int T = HangulTBase + idx % HangulTCount;
358
359                         buf [bufIdx++] = L;
360                         buf [bufIdx++] = V;
361                         if (T != HangulTBase) {
362                                 buf [bufIdx++] = T;
363                         }
364                         buf [bufIdx] = (char) 0;
365                         return true;
366                 }
367
368                 public static void GetCanonical (int c, int [] buf, int bufIdx)
369                 {
370                         if (!GetCanonicalHangul (c, buf, bufIdx)) {
371                                 for (int i = CharMapIdx (c); mappedChars [i] != 0; i++)
372                                         buf [bufIdx++] = mappedChars [i];
373                                 buf [bufIdx] = (char) 0;
374                         }
375                 }
376
377                 public static bool IsNormalized (string source, int type)
378                 {
379                         int prevCC = -1;
380                         for (int i = 0; i < source.Length; i++) {
381                                 int cc = GetCombiningClass (source [i]);
382                                 if (cc != 0 && cc < prevCC)
383                                         return false;
384                                 prevCC = cc;
385                                 switch (QuickCheck (source [i], type)) {
386                                 case NormalizationCheck.Yes:
387                                         break;
388                                 case NormalizationCheck.No:
389                                         return false;
390                                 case NormalizationCheck.Maybe:
391                                         // for those forms with composition, it cannot be checked here
392                                         switch (type) {
393                                         case 0: // NFC
394                                         case 2: // NFKC
395                                                 return source == Normalize (source, type);
396                                         }
397                                         // go on...
398                                         
399                                         // partly copied from Combine()
400                                         int cur = i;
401                                         for (;i > 0; i--) // this loop does not check sb[0], but regardless of the condition below it should not go under 0.
402                                                 if (GetCombiningClass ((int) source [i]) == 0)
403                                                         break;
404                                         //i++;
405                                         // Now i is the "starter"
406                                         for (; i < cur; i++) {
407                                                 if (GetPrimaryCompositeCharIndex (source, i) != 0)
408                                                         return false;
409                                         }
410                                         break;
411                                 }
412                         }
413                         return true;
414                 }
415
416                 public static string Normalize (string source, int type)
417                 {
418                         switch (type) {
419                         default:
420                         case 2:
421                                 return Compose (source, type);
422                         case 1:
423                         case 3:
424                                 return Decompose (source, type);
425                         }
426                 }
427
                // Raw pointers to the lookup tables; all of them are assigned
                // by the static constructor.

                // Property flags, read by PropValue ().
                static byte* props;
                // Zero-terminated mapping sequences.
                static int* mappedChars;
                // Start positions in mappedChars, read by CharMapIdx ().
                static short* charMapIndex;
                // Read by GetPrimaryCompositeHelperIndex ().
                static short* helperIndex;
                // Read by GetPrimaryCompositeFromMapIndex ().
                static ushort* mapIdxToComposite;
                // Combining classes, read by GetCombiningClass ().
                static byte* combiningClass;
434
435 #if GENERATE_TABLE
436
                // In the GENERATE_TABLE build the tables are always available.
                public static readonly bool IsReady = true; // always

                // Points the table pointers at the managed *Arr arrays
                // (presumably emitted by the table generator, see the comment
                // at the end of the file).
                // NOTE(review): each pointer is kept after its 'fixed' statement
                // has ended, but the array is only guaranteed to be pinned
                // inside that statement - confirm the runtime never relocates
                // these arrays.
                static Normalization ()
                {
                        fixed (byte* tmp = propsArr) {
                                props = tmp;
                        }
                        fixed (int* tmp = mappedCharsArr) {
                                mappedChars = tmp;
                        }
                        fixed (short* tmp = charMapIndexArr) {
                                charMapIndex = tmp;
                        }
                        fixed (short* tmp = helperIndexArr) {
                                helperIndex = tmp;
                        }
                        fixed (ushort* tmp = mapIdxToCompositeArr) {
                                mapIdxToComposite = tmp;
                        }
                        fixed (byte* tmp = combiningClassArr) {
                                combiningClass = tmp;
                        }
                }
460 #else
461
                // Lock object taken by the static constructor while the table
                // pointers are being assigned.
                static object forLock = new object ();
                // Set to true at the end of the static constructor.
                public static readonly bool isReady;

                // Read-only accessor for isReady.
                public static bool IsReady {
                        get { return isReady; }
                }
468
                // Runtime-internal call that hands back one address per
                // lookup table; the static constructor casts each to the
                // matching pointer field.
                [MethodImpl (MethodImplOptions.InternalCall)]
                static extern void load_normalization_resource (
                        out IntPtr props, out IntPtr mappedChars,
                        out IntPtr charMapIndex, out IntPtr helperIndex,
                        out IntPtr mapIdxToComposite, out IntPtr combiningClass);
474
475                 static Normalization ()
476                 {
477                         IntPtr p1, p2, p3, p4, p5, p6;
478                         lock (forLock) {
479                                 load_normalization_resource (out p1, out p2, out p3, out p4, out p5, out p6);
480                                 props = (byte*) p1;
481                                 mappedChars = (int*) p2;
482                                 charMapIndex = (short*) p3;
483                                 helperIndex = (short*) p4;
484                                 mapIdxToComposite = (ushort*) p5;
485                                 combiningClass = (byte*) p6;
486                         }
487
488                         isReady = true;
489                 }
490         }
491 }
492 #endif
493
494                 //
495                 // autogenerated code or icall to fill array runs here
496                 //
497