2009-07-11 Michael Barker <mike@middlesoft.co.uk>
[mono.git] / mcs / class / corlib / Mono.Globalization.Unicode / Normalization.cs
1 using System;
2 using System.Globalization;
3 using System.Text;
4 using System.Runtime.CompilerServices;
5
6 using NUtil = Mono.Globalization.Unicode.NormalizationTableUtil;
7
8 namespace Mono.Globalization.Unicode
9 {
// Result of the per-character quick check performed by
// Normalization.QuickCheck ().
internal enum NormalizationCheck {
        // Neither the "no" nor the "maybe" flag is set for the character.
        Yes = 0,
        // The character is flagged as not allowed in the requested form.
        No = 1,
        // The character is flagged as "maybe"; QuickCheck () only reports
        // this for the composed forms.
        Maybe = 2
}
15
16         internal unsafe class Normalization
17         {
// Bit flags that are tested against the value returned by PropValue ().
public const int NoNfd = 0x01;
public const int NoNfkd = 0x02;
public const int NoNfc = 0x04;
public const int MaybeNfc = 0x08;
public const int NoNfkc = 0x10;
public const int MaybeNfkc = 0x20;
public const int FullCompositionExclusion = 0x40;
public const int IsUnsafe = 0x80;
// Currently disabled flags:
//              public const int ExpandOnNfd = 256;
//              public const int ExpandOnNfc = 512;
//              public const int ExpandOnNfkd = 1024;
//              public const int ExpandOnNfkc = 2048;
30
// Returns the entry of the props table selected by NUtil.PropIdx (cp),
// widened to uint. Callers mask it with the flag constants of this class.
static uint PropValue (int cp)
{
        byte flags = props [NUtil.PropIdx (cp)];
        return flags;
}
35
// Returns the charMapIndex entry selected by NUtil.MapIdx (cp); callers
// use it as a position inside mappedChars.
static int CharMapIdx (int cp)
{
        short entry = charMapIndex [NUtil.MapIdx (cp)];
        return entry;
}
40
// Counts the non-zero mappedChars entries that follow the position
// recorded for ch in charMapIndex, i.e. the length of the zero-terminated
// sequence stored for ch.
static int GetComposedStringLength (int ch)
{
        int first = CharMapIdx (ch);
        int length = 0;
        while (mappedChars [first + length] != 0)
                length++;
        return length;
}
49
// Returns the combiningClass table entry selected by
// NUtil.Combining.ToIndex (c). Callers treat 0 as "starter".
static byte GetCombiningClass (int c)
{
        byte cls = combiningClass [NUtil.Combining.ToIndex (c)];
        return cls;
}
54
// Translates a mappedChars index (as returned by
// GetPrimaryCompositeCharIndex ()) into the mapIdxToComposite entry
// selected by NUtil.Composite.ToIndex (src).
static int GetPrimaryCompositeFromMapIndex (int src)
{
        ushort composite = mapIdxToComposite [NUtil.Composite.ToIndex (src)];
        return composite;
}
59
// Returns the helperIndex entry selected by NUtil.Helper.ToIndex (cp).
// Callers treat 0 as "no composition starts with cp".
static int GetPrimaryCompositeHelperIndex (int cp)
{
        short entry = helperIndex [NUtil.Helper.ToIndex (cp)];
        return entry;
}
64
// Searches mappedChars for a zero-terminated entry that matches the text
// beginning at chars [start].
//
// chars: either a string or a StringBuilder (any other type ends up
//        dereferencing a null reference).
// start: position of the first character to match.
//
// Returns the mappedChars index of the matching entry, or 0 when there is
// none. Candidate entries are the consecutive ones, beginning at the helper
// index of chars [start], whose first element equals that character.
static int GetPrimaryCompositeCharIndex (object chars, int start)
{
        string s = chars as string;
        StringBuilder sb = chars as StringBuilder;
        char startCh = s != null ? s [start] : sb [start];
        int charsLength = sb != null ? sb.Length : s.Length;

        int idx = GetPrimaryCompositeHelperIndex ((int) startCh);
        if (idx == 0)
                return 0;
        while (mappedChars [idx] == startCh) {
                // i walks the table entry, j walks the input text.
                for (int i = 1, j = 1; ; i++, j++) {
                        if (mappedChars [idx + i] == 0)
                                // matched
                                return idx;
                        if (start + i >= charsLength)
                                return 0; // didn't match

                        // handle blocked characters here.
                        char curCh;
                        int curCB;
                        int nextCB = 0;
                        do {
                                curCh = s != null ?
                                        s [start + j] :
                                        sb [start + j];
                                curCB = GetCombiningClass (curCh);
                                if (++j + start >= charsLength ||
                                        curCB == 0)
                                        break;
                                nextCB = GetCombiningClass (
                                        s != null ?
                                        s [start + j] :
                                        sb [start + j]);
                        } while (nextCB > 0 && curCB >= nextCB);
                        j--;
                        if (mappedChars [idx + i] == curCh)
                                continue;
                        // The comparison relies on entries sharing a first
                        // character being stored in ascending order.
                        if (mappedChars [idx + i] > curCh)
                                return 0; // no match
                        // otherwise move idx to next item: advance i to the
                        // terminator of the *current* entry (offset from idx,
                        // not an absolute table position) and step over it.
                        while (mappedChars [idx + i] != 0)
                                i++;
                        idx += i + 1;
                        break;
                }
        }
        // reached to end of entries
        return 0;
}
115
// Produces the composed form of source: decomposes first, then runs the
// combining pass on whichever buffer exists. Returns source itself when
// neither step needed to allocate a buffer.
private static string Compose (string source, int checkType)
{
        StringBuilder buffer = null;
        Decompose (source, ref buffer, checkType);
        if (buffer != null)
                Combine (buffer, 0, checkType);
        else
                buffer = Combine (source, 0, checkType);

        if (buffer == null)
                return source;
        return buffer.ToString ();
}
127
// String front end of the combining pass. Scans source from index 0 (the
// start argument is accepted but not consulted) for the first character
// whose quick check is not Yes; if there is one, copies source into a
// StringBuilder, combines it from that position and returns it.
// Returns null when every character passes the quick check.
private static StringBuilder Combine (string source, int start, int checkType)
{
        int pos = 0;
        while (pos < source.Length &&
                QuickCheck (source [pos], checkType) == NormalizationCheck.Yes)
                pos++;
        if (pos == source.Length)
                return null;

        StringBuilder sb = new StringBuilder (source.Length + source.Length / 10);
        sb.Append (source);
        Combine (sb, pos, checkType);
        return sb;
}
140
// Tells whether code point i may take part in a primary composite.
// Code points in 0x3400..0x9FBB are answered from the helper index;
// everything else from the IsUnsafe property flag.
private static bool CanBePrimaryComposite (int i)
{
        bool inHelperRange = 0x3400 <= i && i <= 0x9FBB;
        return inHelperRange ?
                GetPrimaryCompositeHelperIndex (i) != 0 :
                (PropValue (i) & IsUnsafe) != 0;
}
147
// In-place combining pass over sb, beginning at index start.
// For every character whose quick check is not Yes, the method walks back
// over preceding composable characters, looks for a table entry matching
// the text from there, and if one is found replaces the matched characters
// with the single composite character.
// Throws SystemException when the tables yield a zero composite or an
// empty source sequence for a matched entry.
private static void Combine (StringBuilder sb, int start, int checkType)
{
        for (int i = start; i < sb.Length; i++) {
                if (QuickCheck (sb [i], checkType) == NormalizationCheck.Yes)
                        continue;

                int origin = i;
                // FIXME: It should check "blocked" too
                // Walk backwards while the character can be part of a
                // composite. sb [0] is never tested and i never drops
                // below 0.
                while (i > 0 && CanBePrimaryComposite ((int) sb [i]))
                        i--;

                // Try each position up to (not including) origin as the
                // beginning of a composable sequence.
                int mapIdx = 0;
                while (i < origin) {
                        mapIdx = GetPrimaryCompositeMapIndex (sb, (int) sb [i], i);
                        if (mapIdx > 0)
                                break;
                        i++;
                }
                if (mapIdx == 0) {
                        i = origin;
                        continue;
                }

                int composite = GetPrimaryCompositeFromMapIndex (mapIdx);
                int sourceLength = GetComposedStringLength (composite);
                if (composite == 0 || sourceLength == 0)
                        throw new SystemException ("Internal error: should not happen.");

                // always single character
                sb.Insert (i, (char) composite);
                i++;

                // handle blocked characters here: a character whose
                // combining class is non-zero and not lower than that of
                // its non-starter successor is stepped over, not removed.
                int removed = 0;
                while (removed < sourceLength) {
                        bool stepOver = false;
                        if (i + 1 < sb.Length) {
                                int cb = GetCombiningClass (sb [i]);
                                if (cb > 0) {
                                        int next = GetCombiningClass (sb [i + 1]);
                                        stepOver = next != 0 && cb >= next;
                                }
                        }
                        if (stepOver) {
                                i++;
                                continue;
                        }
                        sb.Remove (i, 1);
                        removed++;
                }
                // apply recursively: the loop increment brings i back to
                // origin so the shortened text is examined again.
                i = origin - 1;
        }
}
195
// Returns the mappedChars index of the composition entry matching the text
// of o (a string or StringBuilder) at bufferPos, or 0 when cur carries the
// FullCompositionExclusion flag, is not a starter, or nothing matches.
static int GetPrimaryCompositeMapIndex (object o, int cur, int bufferPos)
{
        bool excluded = (PropValue (cur) & FullCompositionExclusion) != 0;
        // A non-zero combining class means cur is not a starter.
        if (excluded || GetCombiningClass (cur) != 0)
                return 0;
        return GetPrimaryCompositeCharIndex (o, bufferPos);
}
204
// Returns the decomposed form of source, or source itself when the
// decomposition pass did not have to allocate a buffer.
static string Decompose (string source, int checkType)
{
        StringBuilder decomposed = null;
        Decompose (source, ref decomposed, checkType);
        if (decomposed == null)
                return source;
        return decomposed.ToString ();
}
211
// Decomposition pass. Every character whose quick check is No is expanded
// through DecomposeChar (), which creates sb on first use; the untouched
// tail of source is appended afterwards and the result is put into
// canonical order. sb stays null when nothing had to change.
static void Decompose (string source,
        ref StringBuilder sb, int checkType)
{
        int [] scratch = null;
        // Index of the first source character not yet copied into sb.
        int pending = 0;
        int pos = 0;
        while (pos < source.Length) {
                if (QuickCheck (source [pos], checkType) == NormalizationCheck.No)
                        DecomposeChar (ref sb, ref scratch, source,
                                pos, ref pending);
                pos++;
        }
        if (sb != null)
                sb.Append (source, pending, source.Length - pending);
        ReorderCanonical (source, ref sb, 1);
}
225
// Puts combining marks into canonical order: within a run of characters
// with non-zero combining class, classes must be non-decreasing.
//
// src:   the original text; only read while sb is null.
// sb:    working buffer. When null, src is scanned and a buffer is created
//        only if an out-of-order pair is found.
// start: first index of sb to examine (values below 1 are treated as 1,
//        because each step compares a character with its predecessor).
static void ReorderCanonical (string src, ref StringBuilder sb, int start)
{
        if (sb == null) {
                // check only with src.
                for (int i = 1; i < src.Length; i++) {
                        int level = GetCombiningClass (src [i]);
                        if (level == 0)
                                continue;
                        if (GetCombiningClass (src [i - 1]) > level) {
                                sb = new StringBuilder (src.Length);
                                sb.Append (src, 0, src.Length);
                                ReorderCanonical (src, ref sb, i);
                                return;
                        }
                }
                return;
        }
        // check only with sb
        for (int i = start < 1 ? 1 : start; i < sb.Length; ) {
                int level = GetCombiningClass (sb [i]);
                if (level == 0 || GetCombiningClass (sb [i - 1]) <= level) {
                        i++;
                        continue;
                }

                char c = sb [i - 1];
                sb [i - 1] = sb [i];
                sb [i] = c;
                // apply recursively: the mark that just moved to i - 1 may
                // still be out of order with respect to i - 2, so step back
                // instead of re-testing the pair that was just fixed.
                if (i > 1)
                        i--;
        }
}
256
// Appends to sb the still-uncopied part of s (from start up to i) followed
// by the mapping that GetCanonical () returns for s [i].
//
// sb:    output buffer, created on first use.
// buf:   scratch array for the mapping, created on first use with 19
//        slots (presumably enough for the longest table entry plus the
//        terminator - not verifiable from this file).
// s, i:  source text and the index of the character to expand.
// start: in: first index of s not yet copied; out: i + 1.
static void DecomposeChar (ref StringBuilder sb,
        ref int [] buf, string s, int i, ref int start)
{
        if (sb == null)
                sb = new StringBuilder (s.Length + 100);
        sb.Append (s, start, i - start);
        if (buf == null)
                buf = new int [19];
        GetCanonical (s [i], buf, 0);
        for (int x = 0; buf [x] != 0; x++) {
                int cp = buf [x];
                if (cp <= char.MaxValue)
                        sb.Append ((char) cp);
                else { // surrogate
                        // UTF-16: subtract 0x10000, then the upper 10 bits
                        // go to the high surrogate and the lower 10 bits
                        // to the low surrogate.
                        int offset = cp - 0x10000;
                        sb.Append ((char) ((offset >> 10) + 0xD800));
                        sb.Append ((char) ((offset & 0x3FF) + 0xDC00));
                }
        }
        start = i + 1;
}
278
// Per-character quick check.
// type: 1 = NFD, 2 = NFKC, 3 = NFKD; every other value is handled as NFC.
// The decomposed forms answer No for U+AC00..U+D7A3 and otherwise only
// Yes/No from the property flags; the composed forms may also answer Maybe.
public static NormalizationCheck QuickCheck (char c, int type)
{
        if (type == 1 || type == 3) {
                // NFD / NFKD
                if ('\uAC00' <= c && c <= '\uD7A3')
                        return NormalizationCheck.No;
                uint noFlag = (uint) (type == 1 ? NoNfd : NoNfkd);
                if ((PropValue ((int) c) & noFlag) != 0)
                        return NormalizationCheck.No;
                return NormalizationCheck.Yes;
        }

        // NFKC for type 2, NFC for everything else.
        uint noMask = (uint) (type == 2 ? NoNfkc : NoNfc);
        uint maybeMask = (uint) (type == 2 ? MaybeNfkc : MaybeNfc);
        uint v = PropValue ((int) c);
        if ((v & noMask) != 0)
                return NormalizationCheck.No;
        if ((v & maybeMask) != 0)
                return NormalizationCheck.Maybe;
        return NormalizationCheck.Yes;
}
308
309                 /* for now we don't use FC_NFKC closure
310                 public static bool IsMultiForm (char c)
311                 {
312                         return (PropValue ((int) c) & 0xF0000000) != 0;
313                 }
314
315                 public static char SingleForm (char c)
316                 {
317                         uint v = PropValue ((int) c);
318                         int idx = (int) ((v & 0x7FFF0000) >> 16);
319                         return (char) singleNorm [idx];
320                 }
321
322                 public static void MultiForm (char c, char [] buf, int index)
323                 {
324                         // FIXME: handle surrogate
325                         uint v = PropValue ((int) c);
326                         int midx = (int) ((v & 0x7FFF0000) >> 16);
327                         buf [index] = (char) multiNorm [midx];
328                         buf [index + 1] = (char) multiNorm [midx + 1];
329                         buf [index + 2] = (char) multiNorm [midx + 2];
330                         buf [index + 3] = (char) multiNorm [midx + 3];
331                         if (buf [index + 3] != 0)
332                                 buf [index + 4] = (char) 0; // zero termination
333                 }
334                 */
335
// Copies the zero-terminated mappedChars sequence recorded for c into buf,
// starting at bufIdx, and writes a terminating 0 after it. buf must have
// room for the sequence plus the terminator.
public static void GetCanonical (int c, int [] buf, int bufIdx)
{
        int src = CharMapIdx (c);
        while (mappedChars [src] != 0) {
                buf [bufIdx] = mappedChars [src];
                bufIdx++;
                src++;
        }
        buf [bufIdx] = 0;
}
342
// Tells whether source is already in the form selected by type (same
// numbering as QuickCheck ()). Returns false as soon as a combining class
// is lower than the non-zero class before it, or a character's quick check
// is No. A Maybe answer for type 0 or 2 is settled by normalizing the whole
// string and comparing.
public static bool IsNormalized (string source, int type)
{
        int prevCC = -1;
        for (int i = 0; i < source.Length; i++) {
                int cc = GetCombiningClass (source [i]);
                if (cc != 0 && cc < prevCC)
                        return false;
                prevCC = cc;

                NormalizationCheck check = QuickCheck (source [i], type);
                if (check == NormalizationCheck.No)
                        return false;
                if (check != NormalizationCheck.Maybe)
                        continue;

                // for those forms with composition, it cannot be checked here
                if (type == 0 || type == 2)
                        return source == Normalize (source, type);
                // go on...

                // partly copied from Combine()
                // NOTE(review): QuickCheck () only answers Maybe for the
                // composed forms, so this part is reached only for type
                // values outside 0..3. It can also leave i one past
                // `origin` before the outer loop increments it - confirm
                // whether that is intended.
                int origin = i;
                // FIXME: It should check "blocked" too
                while (i >= 0 && CanBePrimaryComposite ((int) source [i]))
                        i--;
                i++;
                // Now i is the "starter"
                while (i < origin) {
                        if (GetPrimaryCompositeCharIndex (source, i) != 0)
                                return false;
                        i++;
                }
        }
        return true;
}
382
// Normalizes source to the form selected by type: 1 and 3 take the
// decomposition path, every other value (including 2) the composition path.
public static string Normalize (string source, int type)
{
        if (type == 1 || type == 3)
                return Decompose (source, type);
        return Compose (source, type);
}
394
// Raw lookup tables, assigned once by the static constructor.
// props: per-code-point flag bytes; combiningClass: combining classes.
static byte* props, combiningClass;
// Zero-terminated code point sequences.
static int* mappedChars;
// Indexes into mappedChars: per code point, and per first character of
// a composable sequence.
static short* charMapIndex, helperIndex;
// Composite character for a given mappedChars index.
static ushort* mapIdxToComposite;
401
402 #if GENERATE_TABLE
403
public static readonly bool IsReady = true; // always

// Pins a generated table for the lifetime of the process and returns the
// address of its first element. The handle is deliberately never freed:
// the static pointers below are used long after this constructor returns,
// so a `fixed` statement (which pins only for its own scope) is not
// sufficient to keep them valid.
static IntPtr PinTable (Array table)
{
        return System.Runtime.InteropServices.GCHandle.Alloc (
                table,
                System.Runtime.InteropServices.GCHandleType.Pinned)
                .AddrOfPinnedObject ();
}

// Points the table pointers at the generated arrays.
static Normalization ()
{
        props = (byte*) PinTable (propsArr);
        mappedChars = (int*) PinTable (mappedCharsArr);
        charMapIndex = (short*) PinTable (charMapIndexArr);
        helperIndex = (short*) PinTable (helperIndexArr);
        mapIdxToComposite = (ushort*) PinTable (mapIdxToCompositeArr);
        combiningClass = (byte*) PinTable (combiningClassArr);
}
427 #else
428
// Taken by the static constructor while it loads the tables.
static object forLock = new object ();
// Set to true as the last step of the static constructor.
public static readonly bool isReady;

// Property view of isReady.
public static bool IsReady {
        get {
                return isReady;
        }
}
435
// Internal call implemented by the runtime; hands back the addresses of
// the six normalization tables.
[MethodImpl (MethodImplOptions.InternalCall)]
static extern void load_normalization_resource (
        out IntPtr props,
        out IntPtr mappedChars,
        out IntPtr charMapIndex,
        out IntPtr helperIndex,
        out IntPtr mapIdxToComposite,
        out IntPtr combiningClass);
441
// Asks the runtime for the table addresses, stores them in the typed
// static pointers and finally marks the class as ready.
static Normalization ()
{
        IntPtr propsPtr, mappedCharsPtr, charMapIndexPtr;
        IntPtr helperIndexPtr, compositePtr, combiningClassPtr;
        lock (forLock) {
                load_normalization_resource (
                        out propsPtr, out mappedCharsPtr,
                        out charMapIndexPtr, out helperIndexPtr,
                        out compositePtr, out combiningClassPtr);
                props = (byte*) propsPtr;
                mappedChars = (int*) mappedCharsPtr;
                charMapIndex = (short*) charMapIndexPtr;
                helperIndex = (short*) helperIndexPtr;
                mapIdxToComposite = (ushort*) compositePtr;
                combiningClass = (byte*) combiningClassPtr;
        }

        isReady = true;
}
457         }
458 }
459 #endif
460
461                 //
462                 // autogenerated code or icall to fill array runs here
463                 //
464