2 using System.Globalization;
4 using System.Runtime.CompilerServices;
6 using NUtil = Mono.Globalization.Unicode.NormalizationTableUtil;
8 namespace Mono.Globalization.Unicode
// Result type of Normalization.QuickCheck. The member list is not visible in
// this excerpt; the code in this file uses the members Yes, No and Maybe.
10 internal enum NormalizationCheck {
16 internal unsafe class Normalization
// Bit flags that are tested against the value returned by PropValue.
// Each flag occupies its own bit, so the values are spelled as shifts.
public const int NoNfd = 1 << 0;
public const int NoNfkd = 1 << 1;
public const int NoNfc = 1 << 2;
public const int MaybeNfc = 1 << 3;
public const int NoNfkc = 1 << 4;
public const int MaybeNfkc = 1 << 5;
public const int FullCompositionExclusion = 1 << 6;
public const int IsUnsafe = 1 << 7;
// Disabled flags, kept for reference (bits 8 to 11):
// public const int ExpandOnNfd = 256;
// public const int ExpandOnNfc = 512;
// public const int ExpandOnNfkd = 1024;
// public const int ExpandOnNfkc = 2048;
// Reads the props table entry for code point cp; NUtil.PropIdx supplies
// the table index.
static uint PropValue (int cp)
{
	int slot = NUtil.PropIdx (cp);
	return props [slot];
}
// Reads the charMapIndex table entry for code point cp; NUtil.MapIdx
// supplies the table index. GetCanonical uses the result as an index into
// mappedChars and treats 0 as "no mapping".
static int CharMapIdx (int cp)
{
	int slot = NUtil.MapIdx (cp);
	return charMapIndex [slot];
}
// Reads the combiningClass table entry for code point c;
// NUtil.Combining.ToIndex supplies the table index.
static byte GetCombiningClass (int c)
{
	int slot = NUtil.Combining.ToIndex (c);
	return combiningClass [slot];
}
// Translates src, an index into mappedChars (see TryCompose), into the
// corresponding mapIdxToComposite entry; NUtil.Composite.ToIndex supplies
// the table index.
static int GetPrimaryCompositeFromMapIndex (int src)
{
	int slot = NUtil.Composite.ToIndex (src);
	return mapIdxToComposite [slot];
}
// Reads the helperIndex table entry for code point cp;
// NUtil.Helper.ToIndex supplies the table index. Callers in this file use
// the result as a starting index into mappedChars or compare it with 0.
static int GetPrimaryCompositeHelperIndex (int cp)
{
	int slot = NUtil.Helper.ToIndex (cp);
	return helperIndex [slot];
}
// Composition entry point called from Normalize: runs Decompose on source,
// then Combine, and returns the builder's text, or source itself when no
// StringBuilder was produced.
56 private static string Compose (string source, int checkType)
58 StringBuilder sb = null;
// Decompose receives sb by ref and may leave it null.
59 Decompose (source, ref sb, checkType);
// NOTE(review): the statements that choose between the two Combine calls
// below are not visible in this excerpt; presumably the string overload runs
// when sb is still null and the StringBuilder overload otherwise -- confirm.
61 sb = Combine (source, 0, checkType);
63 Combine (sb, 0, checkType);
65 return sb != null ? sb.ToString () : source;
// String overload of Combine: walks source with QuickCheck and hands the work
// to the StringBuilder overload once a builder has been created.
// NOTE(review): several statements (what happens on a Yes result, how the
// builder is filled, and the return statements) are not visible in this
// excerpt. No visible line reads the start parameter; the loop begins at 0.
68 private static StringBuilder Combine (string source, int start, int checkType)
70 for (int i = 0; i < source.Length; i++) {
71 if (QuickCheck (source [i], checkType) == NormalizationCheck.Yes)
// Capacity is the source length plus one tenth of it.
73 StringBuilder sb = new StringBuilder (source.Length + source.Length / 10);
// Continue in the StringBuilder overload from the current index.
75 Combine (sb, i, checkType);
// Two-way test on code point i:
//  - inside 0x3400..0x9FBB the answer is whether the helper index is non-zero;
//  - anywhere else the answer is whether the IsUnsafe bit is set in PropValue.
private static bool CanBePrimaryComposite (int i)
{
	bool inHelperRange = 0x3400 <= i && i <= 0x9FBB;
	if (!inHelperRange)
		return (PropValue (i) & IsUnsafe) != 0;
	return GetPrimaryCompositeHelperIndex (i) != 0;
}
// StringBuilder overload of Combine: a Hangul pass followed by a loop over sb
// from index i that calls TryComposeWithPreviousStarter.
// NOTE(review): the body of the Yes branch and the loop's closing statements
// are not visible in this excerpt.
89 private static void Combine (StringBuilder sb, int i, int checkType)
91 // Back off one character as we may be looking at a V or T jamo.
// null is passed for the string argument, so CombineHangul reads from sb.
92 CombineHangul (sb, null, i > 0 ? i - 1 : i);
94 while (i < sb.Length) {
95 if (QuickCheck (sb [i], checkType) == NormalizationCheck.Yes) {
// The callee returns the index at which scanning resumes.
100 i = TryComposeWithPreviousStarter (sb, null, i);
// Hangul composition pass starting at index current. Characters are read
// through Fetch, so the text comes from sb when sb is non-null and from s
// otherwise. The two visible writes go to sb.
// NOTE(review): the statements for the sb == null case, the update of `last`
// in the LVT case, the fall-through handling and the return value are not
// visible in this excerpt -- confirm against the full file.
104 private static int CombineHangul (StringBuilder sb, string s, int current)
106 int length = sb != null ? sb.Length : s.Length;
// `last` holds the previous character, or the syllable built so far.
107 int last = Fetch (sb, s, current);
109 for (int i = current + 1; i < length; ++i) {
110 int ch = Fetch (sb, s, i);
112 // 1. check to see if two current characters are L and V
// LIndex / VIndex are offsets from the respective jamo base; each must lie
// in [0, count) for the pair to qualify.
114 int LIndex = last - HangulLBase;
115 if (0 <= LIndex && LIndex < HangulLCount) {
116 int VIndex = ch - HangulVBase;
117 if (0 <= VIndex && VIndex < HangulVCount) {
121 // make syllable of form LV
123 last = HangulSBase + (LIndex * HangulVCount + VIndex) * HangulTCount;
125 sb [i - 1] = (char) last; // reset last
128 continue; // discard ch
133 // 2. check to see if two current characters are LV and T
// SIndex % HangulTCount == 0 selects syllables with no trailing consonant yet.
135 int SIndex = last - HangulSBase;
136 if (0 <= SIndex && SIndex < HangulSCount && (SIndex % HangulTCount) == 0) {
137 int TIndex = ch - HangulTBase;
// Strict lower bound: ch == HangulTBase itself (TIndex 0) is not accepted.
138 if (0 < TIndex && TIndex < HangulTCount) {
142 // make syllable of form LVT
146 sb [i - 1] = (char) last; // reset last
149 continue; // discard ch
152 // if neither case was true, just add the character
// Returns the UTF-16 code unit at index i, taken from sb when sb is
// non-null and from s otherwise. This lets the composition helpers run
// against either a mutable builder or a read-only string.
static int Fetch (StringBuilder sb, string s, int i)
{
	if (sb != null)
		return sb [i];
	return s [i];
}
164 // Cf. figure 7, section 1.3 of http://unicode.org/reports/tr15/.
// Looks backwards from `current` for a character whose combining class is 0
// (a starter), then scans forward trying to compose later characters with it
// via TryCompose. Text is read through Fetch (sb if non-null, else s); the
// only visible write is to sb.
// NOTE(review): the declaration of i, the early exits and the return values
// are not visible in this excerpt. Callers assign the result to their index.
165 static int TryComposeWithPreviousStarter (StringBuilder sb, string s, int current)
167 // Backtrack to previous starter.
// Case 1: the current character is itself a starter.
169 if (GetCombiningClass (Fetch (sb, s, current)) == 0) {
170 if (i < 0 || GetCombiningClass (Fetch (sb, s, i)) != 0)
// Case 2: step back over characters with a non-zero combining class.
173 while (i >= 0 && GetCombiningClass (Fetch (sb, s, i)) != 0)
179 int starter = Fetch (sb, s, i);
181 // The various decompositions involving starter follow this index.
182 int comp_idx = GetPrimaryCompositeHelperIndex (starter);
186 int length = (sb != null ? sb.Length : s.Length);
// -1 can never equal a real combining class, so the first candidate is
// always considered.
187 int prevCombiningClass = -1;
188 for (int j = i + 1; j < length; j++) {
189 int candidate = Fetch (sb, s, j);
// This local hides the static combiningClass table for the rest of the loop.
191 int combiningClass = GetCombiningClass (candidate);
192 if (combiningClass == prevCombiningClass)
193 // We skipped over a guy with the same class, without
194 // combining. Skip this one, too.
197 int composed = TryCompose (comp_idx, starter, candidate);
200 // Not normalized, and we are only checking.
203 // Full Unicode warning: This will break when the underlying
204 // tables are extended.
// The cast keeps only the low 16 bits of composed.
205 sb [i] = (char) composed;
211 // Gray box. We're done.
212 if (combiningClass == 0)
215 prevCombiningClass = combiningClass;
// Searches mappedChars from index i for an entry of the exact form
// { starter, candidate, 0 } and, when found, looks up the composite for that
// entry's index.
// NOTE(review): the return statements are not visible in this excerpt.
221 static int TryCompose (int i, int starter, int candidate)
// Only entries whose first element is starter are examined.
223 while (mappedChars [i] == starter) {
224 if (mappedChars [i + 1] == candidate &&
225 mappedChars [i + 2] == 0) {
226 int composed = GetPrimaryCompositeFromMapIndex (i);
// Only a composite whose FullCompositionExclusion bit is clear passes.
228 if ((PropValue (composed) & FullCompositionExclusion) == 0)
// Advance to the 0 that ends the current entry.
233 while (mappedChars [i] != 0)
// Convenience overload: decomposes source and returns the result as a
// string. When the worker overload never allocates a builder, the
// original source instance is returned unchanged.
static string Decompose (string source, int checkType)
{
	StringBuilder builder = null;
	Decompose (source, ref builder, checkType);
	if (builder == null)
		return source;
	return builder.ToString ();
}
// Worker overload: calls DecomposeChar for every character of source whose
// QuickCheck result is No, appends the remainder of source to sb, and then
// calls ReorderCanonical.
// NOTE(review): the declarations of buf and start, and any guard around the
// Append call, are not visible in this excerpt.
248 static void Decompose (string source,
249 ref StringBuilder sb, int checkType)
253 for (int i = 0; i < source.Length; i++)
254 if (QuickCheck (source [i], checkType) == NormalizationCheck.No)
// sb, buf and start are passed by ref so DecomposeChar can update them.
255 DecomposeChar (ref sb, ref buf, source,
256 i, checkType, ref start);
// Copy source from index start through the end.
258 sb.Append (source, start, source.Length - start);
// Start at 1 because ReorderCanonical compares each element with its predecessor.
259 ReorderCanonical (source, ref sb, 1);
// Compares the combining classes of adjacent characters. Two loops are
// visible: one over src, which on finding a descending pair copies src into a
// fresh sb and calls itself with that index, and one over sb from `start`.
// NOTE(review): the condition that selects between the two loops, the tests
// on `level`, and the statements that move characters are not visible in this
// excerpt.
262 static void ReorderCanonical (string src, ref StringBuilder sb, int start)
265 // check only with src.
266 for (int i = 1; i < src.Length; i++) {
267 int level = GetCombiningClass (src [i]);
// Predecessor has a higher class than the current character.
270 if (GetCombiningClass (src [i - 1]) > level) {
271 sb = new StringBuilder (src.Length);
272 sb.Append (src, 0, src.Length);
273 ReorderCanonical (src, ref sb, i);
279 // check only with sb
// sb [i - 1] below requires start >= 1; the visible call sites pass 1 or a
// loop index that begins at 1.
280 for (int i = start; i < sb.Length; i++) {
281 int level = GetCombiningClass (sb [i]);
284 if (GetCombiningClass (sb [i - 1]) > level) {
288 i--; // apply recursively
// Appends the decomposition of s [i] to sb.
//
//   sb        output builder; created on first use when null.
//   buf       scratch buffer for GetCanonical; created on first use when null.
//   s, i      source string and the index of the character to decompose.
//   checkType normalization form selector, forwarded to GetCanonical.
//   start     index of the first character of s not yet copied to sb; on
//             return it is i + 1.
static void DecomposeChar (ref StringBuilder sb,
	ref int [] buf, string s, int i, int checkType, ref int start)
{
	if (sb == null)
		sb = new StringBuilder (s.Length + 100);
	// Flush the characters in [start, i) that needed no decomposition.
	sb.Append (s, start, i - start);
	// NOTE(review): capacity 19 assumes the longest full decomposition is
	// 18 code points (U+FDFA under NFKD) plus one terminator slot --
	// confirm against the table generator.
	if (buf == null)
		buf = new int [19];

	int n = GetCanonical (s [i], buf, 0, checkType);
	for (int x = 0; x < n; x++) {
		int cp = buf [x];
		// U+FFFF is still a single UTF-16 code unit, hence <=.
		if (cp <= char.MaxValue)
			sb.Append ((char) cp);
		else {
			// UTF-16 surrogate pair: remove the 0x10000 bias, then split
			// the remaining 20 bits into two 10-bit halves. The shift is
			// parenthesized because + binds tighter than >>.
			int offset = cp - 0x10000;
			sb.Append ((char) ((offset >> 10) + 0xD800));
			sb.Append ((char) ((offset & 0x3FF) + 0xDC00));
		}
	}
	start = i + 1;
}
// Classifies a single UTF-16 code unit c for one normalization form using the
// flag bits returned by PropValue. Four groups of returns are visible; in
// order they test NoNfc/MaybeNfc, NoNfd, NoNfkc/MaybeNfkc and NoNfkd.
// NOTE(review): the construct that maps `type` to one of the groups, and the
// declaration of v, are not visible in this excerpt -- confirm the accepted
// type values against the full file.
313 public static NormalizationCheck QuickCheck (char c, int type)
// NFC bits: No when NoNfc is set, otherwise Maybe when MaybeNfc is set,
// otherwise Yes.
318 v = PropValue ((int) c);
319 return (v & NoNfc) == 0 ?
320 (v & MaybeNfc) == 0 ?
321 NormalizationCheck.Yes :
322 NormalizationCheck.Maybe :
323 NormalizationCheck.No;
// U+AC00..U+D7A3 is HangulSBase .. HangulSBase + HangulSCount - 1; these are
// reported as No without consulting the table.
325 if ('\uAC00' <= c && c <= '\uD7A3')
326 return NormalizationCheck.No;
327 return (PropValue ((int) c) & NoNfd) != 0 ?
328 NormalizationCheck.No : NormalizationCheck.Yes;
// NFKC bits: same three-way result as the NFC group.
330 v = PropValue ((int) c);
331 return (v & NoNfkc) != 0 ? NormalizationCheck.No :
332 (v & MaybeNfkc) != 0 ?
333 NormalizationCheck.Maybe :
334 NormalizationCheck.Yes;
// NFKD bits: same shape as the NoNfd group, including the Hangul range test.
336 if ('\uAC00' <= c && c <= '\uD7A3')
337 return NormalizationCheck.No;
338 return (PropValue ((int) c) & NoNfkd) != 0 ?
339 NormalizationCheck.No : NormalizationCheck.Yes;
343 /* for now we don't use FC_NFKC closure
344 public static bool IsMultiForm (char c)
346 return (PropValue ((int) c) & 0xF0000000) != 0;
349 public static char SingleForm (char c)
351 uint v = PropValue ((int) c);
352 int idx = (int) ((v & 0x7FFF0000) >> 16);
353 return (char) singleNorm [idx];
356 public static void MultiForm (char c, char [] buf, int index)
358 // FIXME: handle surrogate
359 uint v = PropValue ((int) c);
360 int midx = (int) ((v & 0x7FFF0000) >> 16);
361 buf [index] = (char) multiNorm [midx];
362 buf [index + 1] = (char) multiNorm [midx + 1];
363 buf [index + 2] = (char) multiNorm [midx + 2];
364 buf [index + 3] = (char) multiNorm [midx + 3];
365 if (buf [index + 3] != 0)
366 buf [index + 4] = (char) 0; // zero termination
// Constants for arithmetic Hangul syllable composition and decomposition.
// Bases are the first code point of each block; counts are block sizes.
const int HangulSBase = 0xAC00;
const int HangulLBase = 0x1100;
const int HangulVBase = 0x1161;
const int HangulTBase = 0x11A7;
const int HangulLCount = 19;
const int HangulVCount = 21;
const int HangulTCount = 28;
const int HangulNCount = HangulVCount * HangulTCount; // 588
const int HangulSCount = HangulLCount * HangulNCount; // 11172
// Computes the L, V and T jamo code points for a precomposed Hangul syllable
// s by arithmetic on its offset from HangulSBase.
// NOTE(review): the stores of L, V and T into buf and the return statements
// are not visible in this excerpt. GetCanonical treats a result greater than
// the incoming bufIdx as "handled".
376 private static int GetCanonicalHangul (int s, int [] buf, int bufIdx)
378 int idx = s - HangulSBase;
// Offset outside [0, HangulSCount): s is not in the syllable block.
379 if (idx < 0 || idx >= HangulSCount) {
383 int L = HangulLBase + idx / HangulNCount;
384 int V = HangulVBase + (idx % HangulNCount) / HangulTCount;
385 int T = HangulTBase + idx % HangulTCount;
// T == HangulTBase means idx % HangulTCount is 0.
389 if (T != HangulTBase) {
// Writes a zero at the current position.
392 buf [bufIdx] = (char) 0;
// Writes the full decomposition of code point c into buf starting at bufIdx.
// Hangul syllables are tried first; otherwise the zero-terminated entry at
// mappedChars [CharMapIdx (c)] is expanded, recursing on elements that are
// not already in the requested form.
// NOTE(review): the return statements and the else keyword before the
// recursive call are not visible in this excerpt. The recursive call assigns
// its result to bufIdx, so the result is presumably the next free index.
396 static int GetCanonical (int c, int [] buf, int bufIdx, int checkType)
398 int newBufIdx = GetCanonicalHangul (c, buf, bufIdx);
// A larger index means the Hangul path produced output.
399 if (newBufIdx > bufIdx)
402 int i = CharMapIdx (c);
// Index 0, or an entry that begins with c itself, is the no-expansion case.
403 if (i == 0 || mappedChars [i] == c)
406 // Character c maps to one or more decomposed chars.
407 for (; mappedChars [i] != 0; i++) {
408 int nth = mappedChars [i];
410 // http://www.unicode.org/reports/tr15/tr15-31.html, 1.3:
411 // Full decomposition involves recursive application of the
412 // Decomposition_Mapping values. Note that QuickCheck does
413 // not currently support astral plane codepoints.
// The nth <= 0xffff test keeps the cast to char lossless.
414 if (nth <= 0xffff && QuickCheck ((char)nth, checkType) == NormalizationCheck.Yes)
415 buf [bufIdx++] = nth;
417 bufIdx = GetCanonical (nth, buf, bufIdx, checkType);
// Reports whether source is already in the normalization form selected by
// type. Each character is checked for combining-class order and classified
// with QuickCheck; one visible path falls back to comparing source with
// Normalize (source, type).
// NOTE(review): the declaration of prevCC, the bodies of the Yes and No
// cases, the conditions around the last three statements and the final return
// are not visible in this excerpt.
424 public static bool IsNormalized (string source, int type)
// No increment clause: i is advanced inside the body.
427 for (int i = 0; i < source.Length; ) {
428 int cc = GetCombiningClass (source [i]);
// A non-zero class lower than the previous one.
429 if (cc != 0 && cc < prevCC)
433 switch (QuickCheck (source [i], type)) {
434 case NormalizationCheck.Yes:
437 case NormalizationCheck.No:
439 case NormalizationCheck.Maybe:
440 // for those forms with composition, it cannot be checked here
444 return source == Normalize (source, type);
// Check-only mode: the builder argument is null, so the helpers read from
// source.
448 i = CombineHangul (null, source, i > 0 ? i - 1 : i);
452 i = TryComposeWithPreviousStarter (null, source, i);
// Normalizes source, dispatching to Compose or Decompose.
// NOTE(review): the construct that selects between the two returns from type
// is not visible in this excerpt; presumably the composing forms go to Compose
// and the decomposing forms to Decompose -- confirm against the full file.
461 public static string Normalize (string source, int type)
466 return Compose (source, type);
469 return Decompose (source, type);
// Raw pointers to the normalization tables; both static constructors in this
// file assign them.
// Zero-terminated mapping entries, indexed via CharMapIdx and the helper index.
474 static int* mappedChars;
// Read by CharMapIdx.
475 static short* charMapIndex;
// Read by GetPrimaryCompositeHelperIndex.
476 static short* helperIndex;
// Read by GetPrimaryCompositeFromMapIndex.
477 static ushort* mapIdxToComposite;
// Read by GetCombiningClass.
478 static byte* combiningClass;
// NOTE(review): a property named IsReady is also declared further down;
// presumably the two belong to alternative build configurations whose
// preprocessor directives are not visible in this excerpt.
482 public static readonly bool IsReady = true; // always
// Static constructor, array-backed variant: takes a pointer to each managed
// table array with a fixed statement and stores it in the matching static
// pointer field.
// NOTE(review): the assignments for the first four tables are not visible in
// this excerpt. A pointer obtained in a fixed statement is only guaranteed
// valid while that statement runs; storing it in a static field assumes the
// arrays never move afterwards -- confirm how the *Arr arrays are allocated.
484 static Normalization ()
486 fixed (byte* tmp = propsArr) {
489 fixed (int* tmp = mappedCharsArr) {
492 fixed (short* tmp = charMapIndexArr) {
495 fixed (short* tmp = helperIndexArr) {
498 fixed (ushort* tmp = mapIdxToCompositeArr) {
499 mapIdxToComposite = tmp;
501 fixed (byte* tmp = combiningClassArr) {
502 combiningClass = tmp;
// Lock object. NOTE(review): its use is not visible in this excerpt.
507 static object forLock = new object ();
// Backing flag for the IsReady property below.
508 public static readonly bool isReady;
// Exposes isReady.
510 public static bool IsReady {
511 get { return isReady; }
// Implemented by the runtime (InternalCall): returns, through six out
// parameters, the addresses of the normalization tables. The static
// constructor that calls it casts them to the typed table pointers.
514 [MethodImpl (MethodImplOptions.InternalCall)]
515 static extern void load_normalization_resource (
516 out IntPtr props, out IntPtr mappedChars,
517 out IntPtr charMapIndex, out IntPtr helperIndex,
518 out IntPtr mapIdxToComposite, out IntPtr combiningClass);
// Static constructor, runtime-backed variant: asks the runtime for the table
// addresses and casts each one to the element type of its pointer field.
// NOTE(review): the use of p1, any locking, and the assignment of isReady are
// not visible in this excerpt.
520 static Normalization ()
522 IntPtr p1, p2, p3, p4, p5, p6;
524 load_normalization_resource (out p1, out p2, out p3, out p4, out p5, out p6);
526 mappedChars = (int*) p2;
527 charMapIndex = (short*) p3;
528 helperIndex = (short*) p4;
529 mapIdxToComposite = (ushort*) p5;
530 combiningClass = (byte*) p6;
540 // autogenerated code or icall to fill array runs here