2009-09-18 Atsushi Enomoto <atsushi@ximian.com>
[mono.git] / mcs / class / corlib / Mono.Globalization.Unicode / Normalization.cs
1 using System;
2 using System.Globalization;
3 using System.Text;
4 using System.Runtime.CompilerServices;
5
6 using NUtil = Mono.Globalization.Unicode.NormalizationTableUtil;
7
8 namespace Mono.Globalization.Unicode
9 {
// Result of a per-character quick check, as returned by
// Normalization.QuickCheck ().
internal enum NormalizationCheck
{
        Yes = 0,
        No = 1,
        Maybe = 2
}
15
16         internal unsafe class Normalization
17         {
// Bit flags that are tested against the value returned by PropValue ().
// QuickCheck () reads the No*/Maybe* flags; FullCompositionExclusion and
// IsUnsafe are read by the composition helpers.
public const int NoNfd = 1 << 0;
public const int NoNfkd = 1 << 1;
public const int NoNfc = 1 << 2;
public const int MaybeNfc = 1 << 3;
public const int NoNfkc = 1 << 4;
public const int MaybeNfkc = 1 << 5;
public const int FullCompositionExclusion = 1 << 6;
public const int IsUnsafe = 1 << 7;
// Currently disabled flags:
//public const int ExpandOnNfd = 256;
//public const int ExpandOnNfc = 512;
//public const int ExpandOnNfkd = 1024;
//public const int ExpandOnNfkc = 2048;
30
// Returns the flag byte stored in the props table for code point cp.
// The table slot is computed by NUtil.PropIdx ().
static uint PropValue (int cp)
{
        int slot = NUtil.PropIdx (cp);
        return props [slot];
}
35
// Returns the charMapIndex table value for code point cp; callers use it
// as an index into mappedChars. The table slot is computed by
// NUtil.MapIdx ().
static int CharMapIdx (int cp)
{
        int slot = NUtil.MapIdx (cp);
        return charMapIndex [slot];
}
40
// Counts the non-zero values in mappedChars that start at the mapping
// index of ch, i.e. the length of the 0-terminated mapped sequence.
static int GetNormalizedStringLength (int ch)
{
        int first = CharMapIdx (ch);
        int length = 0;
        while (mappedChars [first + length] != 0)
                length++;
        return length;
}
49
// Returns the combiningClass table value for code point c. The table slot
// is computed by NUtil.Combining.ToIndex ().
static byte GetCombiningClass (int c)
{
        int slot = NUtil.Combining.ToIndex (c);
        return combiningClass [slot];
}
54
// Returns the mapIdxToComposite table value for src, where src is an
// index into mappedChars. The table slot is computed by
// NUtil.Composite.ToIndex ().
static int GetPrimaryCompositeFromMapIndex (int src)
{
        int slot = NUtil.Composite.ToIndex (src);
        return mapIdxToComposite [slot];
}
59
// Returns the helperIndex table value for code point cp. Callers treat 0
// as "no entry" and any other value as an index into mappedChars. The
// table slot is computed by NUtil.Helper.ToIndex ().
static int GetPrimaryCompositeHelperIndex (int cp)
{
        int slot = NUtil.Helper.ToIndex (cp);
        return helperIndex [slot];
}
64
// Searches the 0-terminated entries in mappedChars that begin with the
// character at chars [start] for one whose remaining characters can be
// matched against the characters that follow chars [start].
//
// chars: the input; must be either a string or a StringBuilder.
// start: index of the first character of the candidate sequence.
//
// Returns the index into mappedChars of the matching entry, or 0 when the
// start character has no helper index, the input ends before an entry is
// completed, or no entry matches.
static int GetPrimaryCompositeCharIndex (object chars, int start)
{
        string s = chars as string;
        StringBuilder sb = chars as StringBuilder;
        char startCh = s != null ? s [start] : sb [start];
        int charsLength = sb != null ? sb.Length : s.Length;

        int idx = GetPrimaryCompositeHelperIndex ((int) startCh);
        if (idx == 0)
                return 0;

        // Consecutive entries sharing the same first character are tried in turn.
        while (mappedChars [idx] == startCh) {
                int prevCC = 0;
                int curCC = 0;
                // i walks the current entry, j walks the input. j can run
                // ahead of i when input characters are skipped below.
                for (int i = 1, j = 1; ; i++, j++) {
                        prevCC = curCC;

                        if (mappedChars [idx + i] == 0)
                                return idx; // whole entry matched
                        // j >= i always holds, so testing j also covers the
                        // "entry is longer than the remaining input" case and
                        // keeps the reads below inside the input.
                        if (start + j >= charsLength)
                                return 0; // didn't match

                        // Look for the next entry character, skipping input
                        // characters as long as they do not block the match.
                        bool match = false;
                        do {
                                char curCh = s != null ? s [start + j] : sb [start + j];
                                curCC = GetCombiningClass (curCh);
                                if (mappedChars [idx + i] == curCh) {
                                        match = true;
                                        break;
                                }
                                if (curCC < prevCC) // blocked. Give up this map entry.
                                        break;
                                if (++j + start >= charsLength || curCC == 0)
                                        break;
                        } while (true);

                        if (match)
                                continue; // check next character in the current map entry string.

                        // No match: skip the rest of this entry (up to its 0
                        // terminator) and move idx to the start of the next one.
                        while (mappedChars [idx + i] != 0)
                                i++;
                        idx += i + 1;
                        break;
                }
        }
        // reached the end of the entries for this start character
        return 0;
}
124
// Produces the composed form of source: decomposes first, then combines.
// Returns source itself when neither step had to allocate a buffer.
private static string Compose (string source, int checkType)
{
        StringBuilder buffer = null;
        Decompose (source, ref buffer, checkType);
        if (buffer != null)
                Combine (buffer, 0, checkType);
        else
                buffer = Combine (source, 0, checkType);

        if (buffer == null)
                return source;
        return buffer.ToString ();
}
136
// Finds the first character of source whose quick check is not Yes. If
// there is one, copies source into a new StringBuilder, runs the
// StringBuilder overload of Combine () from that position and returns the
// builder; otherwise returns null.
//
// Note: the start argument is not used; scanning always begins at index 0.
private static StringBuilder Combine (string source, int start, int checkType)
{
        int pos = 0;
        while (pos < source.Length &&
               QuickCheck (source [pos], checkType) == NormalizationCheck.Yes)
                pos++;
        if (pos == source.Length)
                return null;

        StringBuilder sb = new StringBuilder (source.Length + source.Length / 10);
        sb.Append (source);
        Combine (sb, pos, checkType);
        return sb;
}
149
// For code points in 0x3400..0x9FBB the answer is whether a helper index
// exists; for every other code point it is whether the IsUnsafe flag is set.
private static bool CanBePrimaryComposite (int i)
{
        bool inHelperRange = 0x3400 <= i && i <= 0x9FBB;
        if (!inHelperRange)
                return (PropValue (i) & IsUnsafe) != 0;
        return GetPrimaryCompositeHelperIndex (i) != 0;
}
156
// Composes sb in place, scanning from index start.
//
// For every character whose quick check is not Yes, the scan walks back to
// the nearest character with combining class 0 (or to index 0), then looks
// forward from there for a composition entry. When one is found, the
// composite character is inserted and the characters of the matched entry
// are removed from the buffer.
//
// Throws SystemException if the tables yield no composite, or a composite
// with an empty mapping, for a matched entry.
private static void Combine (StringBuilder sb, int start, int checkType)
{
        int pos = start;
        while (pos < sb.Length) {
                if (QuickCheck (sb [pos], checkType) == NormalizationCheck.Yes) {
                        pos++;
                        continue;
                }

                int suspect = pos;
                // FIXME: It should check "blocked" too
                // Walk back to the closest character with combining class 0.
                // sb [0] itself is not tested, but pos never goes below 0.
                while (pos > 0 && GetCombiningClass ((int) sb [pos]) != 0)
                        pos--;

                int mapIdx = 0; // index to mappedChars
                while (pos < suspect) {
                        mapIdx = GetPrimaryCompositeMapIndex (sb, (int) sb [pos], pos);
                        if (mapIdx > 0)
                                break;
                        pos++;
                }
                if (mapIdx == 0) {
                        // nothing to compose here; resume after the suspect
                        pos = suspect + 1;
                        continue;
                }

                int composite = GetPrimaryCompositeFromMapIndex (mapIdx);
                int mappedLength = GetNormalizedStringLength (composite);
                if (composite == 0 || mappedLength == 0)
                        throw new SystemException ("Internal error: should not happen. Input: " + sb);

                sb.Insert (pos++, (char) composite); // always single character

                // Remove the characters of the matched entry; characters that
                // are not part of the entry are skipped and stay in place.
                int consumed = 0;
                while (consumed < mappedLength) {
                        if (sb [pos] == mappedChars [mapIdx + consumed]) {
                                sb.Remove (pos, 1);
                                consumed++;
                        } else {
                                pos++;
                        }
                }
                // re-examine the buffer from the suspect position
                pos = suspect;
        }
}
200
// Returns 0 when cur has the FullCompositionExclusion flag or a non-zero
// combining class (not a starter); otherwise returns the result of
// GetPrimaryCompositeCharIndex () for the buffer o at bufferPos.
static int GetPrimaryCompositeMapIndex (object o, int cur, int bufferPos)
{
        bool excluded = (PropValue (cur) & FullCompositionExclusion) != 0;
        if (excluded || GetCombiningClass (cur) != 0)
                return 0;
        return GetPrimaryCompositeCharIndex (o, bufferPos);
}
209
// Returns the decomposed form of source, or source itself when the
// decomposition did not need to allocate a buffer.
static string Decompose (string source, int checkType)
{
        StringBuilder result = null;
        Decompose (source, ref result, checkType);
        if (result == null)
                return source;
        return result.ToString ();
}
216
// Decomposes every character of source whose quick check is No, writing
// the result into sb (created on demand by DecomposeChar ()), then applies
// ReorderCanonical (). sb stays null when nothing had to be changed.
static void Decompose (string source,
        ref StringBuilder sb, int checkType)
{
        int [] scratch = null;
        int copiedUpTo = 0; // first index of source not yet copied into sb
        int pos = 0;
        while (pos < source.Length) {
                if (QuickCheck (source [pos], checkType) == NormalizationCheck.No)
                        DecomposeChar (ref sb, ref scratch, source,
                                pos, ref copiedUpTo);
                pos++;
        }
        // flush the unchanged tail
        if (sb != null)
                sb.Append (source, copiedUpTo, source.Length - copiedUpTo);
        ReorderCanonical (source, ref sb, 1);
}
230
// Puts runs of characters with non-zero combining class into
// non-decreasing combining-class order.
//
// src:   the original text; only read while sb is null.
// sb:    working buffer. When null, src is scanned and a buffer is only
//        allocated if an out-of-order pair is found.
// start: index in sb at which reordering begins (values below 1 are
//        treated as 1).
//
// The reordering is stable: a character is moved left only past characters
// whose combining class is strictly greater, so it never crosses a
// character with combining class 0.
static void ReorderCanonical (string src, ref StringBuilder sb, int start)
{
        if (sb == null) {
                // check only with src.
                for (int i = 1; i < src.Length; i++) {
                        int level = GetCombiningClass (src [i]);
                        if (level == 0)
                                continue;
                        if (GetCombiningClass (src [i - 1]) > level) {
                                sb = new StringBuilder (src.Length);
                                sb.Append (src, 0, src.Length);
                                ReorderCanonical (src, ref sb, i);
                                return;
                        }
                }
                return;
        }
        // check only with sb
        for (int i = start < 1 ? 1 : start; i < sb.Length; i++) {
                int level = GetCombiningClass (sb [i]);
                if (level == 0)
                        continue;
                // Move sb [i] left until its predecessor no longer has a
                // greater combining class. A single swap is not enough:
                // e.g. classes 230, 230, 220 need the last mark to move
                // two positions.
                for (int j = i; j > 0 && GetCombiningClass (sb [j - 1]) > level; j--) {
                        char c = sb [j - 1];
                        sb [j - 1] = sb [j];
                        sb [j] = c;
                }
        }
}
261
// Appends to sb the still-uncopied part of s before index i, followed by
// the mapped form of s [i] as returned by GetCanonical ().
//
// sb:    output buffer; created here when null.
// buf:   scratch buffer for the mapped code points; created here when null.
// s:     source text.
// i:     index of the character to decompose.
// start: on entry, the first index of s not yet copied to sb; set to
//        i + 1 on return.
static void DecomposeChar (ref StringBuilder sb,
        ref int [] buf, string s, int i, ref int start)
{
        if (sb == null)
                sb = new StringBuilder (s.Length + 100);
        sb.Append (s, start, i - start);
        if (buf == null)
                buf = new int [19]; // mapped values plus the 0 terminator
        GetCanonical (s [i], buf, 0);
        for (int x = 0; buf [x] != 0; x++) {
                int cp = buf [x];
                if (cp <= char.MaxValue)
                        sb.Append ((char) cp);
                else {
                        // Encode as a UTF-16 surrogate pair: subtract 0x10000,
                        // then the upper 10 bits go to the high surrogate and
                        // the lower 10 bits to the low surrogate.
                        int offset = cp - 0x10000;
                        sb.Append ((char) ((offset >> 10) + 0xD800));
                        sb.Append ((char) ((offset & 0x3FF) + 0xDC00));
                }
        }
        start = i + 1;
}
283
// Classifies a single character for the requested form.
//
// type: 1 = NFD, 2 = NFKC, 3 = NFKD; any other value is handled as NFC.
//
// For the decomposed forms (1 and 3) the result is only ever Yes or No,
// and characters in U+AC00..U+D7A3 are always No. For the composed forms
// the result can also be Maybe.
public static NormalizationCheck QuickCheck (char c, int type)
{
        if (type == 1 || type == 3) {
                // NFD / NFKD
                if ('\uAC00' <= c && c <= '\uD7A3')
                        return NormalizationCheck.No;
                uint noDecomposed = type == 1 ? (uint) NoNfd : (uint) NoNfkd;
                if ((PropValue ((int) c) & noDecomposed) != 0)
                        return NormalizationCheck.No;
                return NormalizationCheck.Yes;
        }

        // NFKC when type is 2, NFC otherwise
        uint noFlag = type == 2 ? (uint) NoNfkc : (uint) NoNfc;
        uint maybeFlag = type == 2 ? (uint) MaybeNfkc : (uint) MaybeNfc;
        uint v = PropValue ((int) c);
        if ((v & noFlag) != 0)
                return NormalizationCheck.No;
        if ((v & maybeFlag) != 0)
                return NormalizationCheck.Maybe;
        return NormalizationCheck.Yes;
}
313
314                 /* for now we don't use FC_NFKC closure
315                 public static bool IsMultiForm (char c)
316                 {
317                         return (PropValue ((int) c) & 0xF0000000) != 0;
318                 }
319
320                 public static char SingleForm (char c)
321                 {
322                         uint v = PropValue ((int) c);
323                         int idx = (int) ((v & 0x7FFF0000) >> 16);
324                         return (char) singleNorm [idx];
325                 }
326
327                 public static void MultiForm (char c, char [] buf, int index)
328                 {
329                         // FIXME: handle surrogate
330                         uint v = PropValue ((int) c);
331                         int midx = (int) ((v & 0x7FFF0000) >> 16);
332                         buf [index] = (char) multiNorm [midx];
333                         buf [index + 1] = (char) multiNorm [midx + 1];
334                         buf [index + 2] = (char) multiNorm [midx + 2];
335                         buf [index + 3] = (char) multiNorm [midx + 3];
336                         if (buf [index + 3] != 0)
337                                 buf [index + 4] = (char) 0; // zero termination
338                 }
339                 */
340
// Copies the 0-terminated run of mappedChars values for code point c into
// buf, beginning at buf [bufIdx], and writes a 0 terminator after the last
// copied value. buf must have room for the values plus the terminator.
public static void GetCanonical (int c, int [] buf, int bufIdx)
{
        int src = CharMapIdx (c);
        int dst = bufIdx;
        while (mappedChars [src] != 0) {
                buf [dst] = mappedChars [src];
                dst++;
                src++;
        }
        buf [dst] = 0;
}
347
// Reports whether source is already in the form selected by type (see
// QuickCheck () for the meaning of type).
//
// Returns false as soon as a character has a non-zero combining class lower
// than that of the character before it, or a quick check answers No. When a
// quick check answers Maybe for type 0 or 2, the answer is obtained by
// normalizing source and comparing the result with it.
public static bool IsNormalized (string source, int type)
{
        int previousClass = -1;
        for (int pos = 0; pos < source.Length; pos++) {
                int currentClass = GetCombiningClass (source [pos]);
                if (currentClass != 0 && currentClass < previousClass)
                        return false;
                previousClass = currentClass;

                NormalizationCheck check = QuickCheck (source [pos], type);
                if (check == NormalizationCheck.No)
                        return false;
                if (check != NormalizationCheck.Maybe)
                        continue;

                // for those forms with composition, it cannot be checked here
                if (type == 0 || type == 2) // NFC, NFKC
                        return source == Normalize (source, type);

                // partly copied from Combine (): walk back to the closest
                // character with combining class 0 (source [0] itself is not
                // tested, but pos never goes below 0) ...
                int suspect = pos;
                while (pos > 0 && GetCombiningClass ((int) source [pos]) != 0)
                        pos--;
                // ... then fail if a composition entry matches anywhere from
                // there up to the suspect position.
                while (pos < suspect) {
                        if (GetPrimaryCompositeCharIndex (source, pos) != 0)
                                return false;
                        pos++;
                }
        }
        return true;
}
386
// Normalizes source. Types 1 and 3 go through Decompose (); type 2 and
// every other value go through Compose ().
public static string Normalize (string source, int type)
{
        if (type == 1 || type == 3)
                return Decompose (source, type);
        return Compose (source, type);
}
398
// Raw pointers to the lookup tables. They are assigned once by the static
// constructor and only read afterwards.

// Flag byte per character; read by PropValue ().
static byte* props;
// 0-terminated runs of mapped code points.
static int* mappedChars;
// Per character: index into mappedChars; read by CharMapIdx ().
static short* charMapIndex;
// Per character: index into mappedChars, 0 meaning none; read by
// GetPrimaryCompositeHelperIndex ().
static short* helperIndex;
// Per mappedChars index: composite value; read by
// GetPrimaryCompositeFromMapIndex ().
static ushort* mapIdxToComposite;
// Combining class per character; read by GetCombiningClass ().
static byte* combiningClass;
405
406 #if GENERATE_TABLE
407
public static readonly bool IsReady = true; // always

// Points the table pointers at the managed table arrays.
//
// The arrays are pinned for the lifetime of the process. A pointer taken
// inside a fixed statement is only valid within that statement, so it must
// not be stored in a static field and used after the statement ends.
static Normalization ()
{
        props = (byte*) PinTable (propsArr);
        mappedChars = (int*) PinTable (mappedCharsArr);
        charMapIndex = (short*) PinTable (charMapIndexArr);
        helperIndex = (short*) PinTable (helperIndexArr);
        mapIdxToComposite = (ushort*) PinTable (mapIdxToCompositeArr);
        combiningClass = (byte*) PinTable (combiningClassArr);
}

// Pins table and returns the address of its first element. The handle is
// intentionally never freed: the tables are used until the process exits.
static IntPtr PinTable (Array table)
{
        System.Runtime.InteropServices.GCHandle handle =
                System.Runtime.InteropServices.GCHandle.Alloc (table,
                        System.Runtime.InteropServices.GCHandleType.Pinned);
        return handle.AddrOfPinnedObject ();
}
431 #else
432
// Lock object taken by the static constructor while the tables are loaded.
static object forLock = new object ();
// Set to true at the end of the static constructor.
public static readonly bool isReady;

// Exposes isReady.
public static bool IsReady {
        get {
                return isReady;
        }
}
439
// Runtime internal call that hands back the six table pointers. The
// static constructor stores them in the matching static fields.
[MethodImpl (MethodImplOptions.InternalCall)]
static extern void load_normalization_resource (
        out IntPtr props,
        out IntPtr mappedChars,
        out IntPtr charMapIndex,
        out IntPtr helperIndex,
        out IntPtr mapIdxToComposite,
        out IntPtr combiningClass);
445
// Obtains the table pointers through load_normalization_resource (),
// stores them in the static table fields while holding forLock, and then
// marks the class as ready.
static Normalization ()
{
        IntPtr propsPtr, mappedCharsPtr, charMapIndexPtr;
        IntPtr helperIndexPtr, mapIdxToCompositePtr, combiningClassPtr;
        lock (forLock) {
                load_normalization_resource (
                        out propsPtr, out mappedCharsPtr,
                        out charMapIndexPtr, out helperIndexPtr,
                        out mapIdxToCompositePtr, out combiningClassPtr);
                props = (byte*) propsPtr;
                mappedChars = (int*) mappedCharsPtr;
                charMapIndex = (short*) charMapIndexPtr;
                helperIndex = (short*) helperIndexPtr;
                mapIdxToComposite = (ushort*) mapIdxToCompositePtr;
                combiningClass = (byte*) combiningClassPtr;
        }

        isReady = true;
}
461         }
462 }
463 #endif
464
465                 //
466                 // autogenerated code or icall to fill array runs here
467                 //
468