2 using System.Globalization;
4 using System.Runtime.CompilerServices;
6 using NUtil = Mono.Globalization.Unicode.NormalizationTableUtil;
// Result of a per-character quick check. The members referenced by the
// Normalization class are Yes, No and Maybe.
10 internal enum NormalizationCheck {
// Table-driven composition / decomposition of strings. The lookup tables are
// reached through unmanaged pointers, hence `unsafe`.
16 internal unsafe class Normalization
// Bit flags tested against the value returned by PropValue ().
// QuickCheck answers No when this bit is set (decomposing check, see QuickCheck).
18 public const int NoNfd = 1;
// QuickCheck answers No when this bit is set (second decomposing check).
19 public const int NoNfkd = 2;
// QuickCheck answers No when this bit is set ...
20 public const int NoNfc = 4;
// ... and Maybe when NoNfc is clear but this bit is set.
21 public const int MaybeNfc = 8;
// Same No / Maybe pair, used by the other composing check in QuickCheck.
22 public const int NoNfkc = 16;
23 public const int MaybeNfkc = 32;
// TryCompose tests that this bit is clear on a candidate composite.
24 public const int FullCompositionExclusion = 64;
// Tested by CanBePrimaryComposite for code points outside 0x3400..0x9FBB.
25 public const int IsUnsafe = 128;
26 // public const int ExpandOnNfd = 256;
27 // public const int ExpandOnNfc = 512;
28 // public const int ExpandOnNfkd = 1024;
29 // public const int ExpandOnNfkc = 2048;
// Returns the props table entry for code point cp; the table slot is
// computed by NUtil.PropIdx. Callers mask the result with the flag
// constants (NoNfd, NoNfc, IsUnsafe, ...).
31 static uint PropValue (int cp)
33 return props [NUtil.PropIdx (cp)];
// Returns the charMapIndex table entry for cp; the table slot is computed
// by NUtil.MapIdx. GetCanonical compares the result with 0 and otherwise
// uses it as a position in mappedChars.
36 static int CharMapIdx (int cp)
38 return charMapIndex [NUtil.MapIdx (cp)];
// Returns the combiningClass table entry for c; the table slot is computed
// by NUtil.Combining.ToIndex. Callers treat 0 as "starter" and compare
// non-zero values for ordering.
41 static byte GetCombiningClass (int c)
43 return combiningClass [NUtil.Combining.ToIndex (c)];
// Returns the mapIdxToComposite table entry for src; the table slot is
// computed by NUtil.Composite.ToIndex. TryCompose passes a position in
// mappedChars as src and treats the result as the composed code point.
46 static int GetPrimaryCompositeFromMapIndex (int src)
48 return mapIdxToComposite [NUtil.Composite.ToIndex (src)];
// Returns the helperIndex table entry for cp; the table slot is computed by
// NUtil.Helper.ToIndex. TryComposeWithPreviousStarter uses the result as the
// position in mappedChars where decompositions involving cp start, and
// CanBePrimaryComposite compares it with 0.
51 static int GetPrimaryCompositeHelperIndex (int cp)
53 return helperIndex [NUtil.Helper.ToIndex (cp)];
// Produces the composed form of source for the given checkType: the text is
// decomposed first and then run through Combine. Returns source itself when
// no StringBuilder ended up being allocated.
56 private static string Compose (string source, int checkType)
58 StringBuilder sb = null;
59 // Decompose to NFD or NFKD depending on our target
// checkType 2 decomposes with type 3; every other value decomposes with type 1.
60 Decompose (source, ref sb, checkType == 2 ? 3 : 1);
// NOTE(review): two Combine calls follow, one working from the original
// string and one on the builder; the condition choosing between them is
// presumably whether Decompose left sb null - confirm.
62 sb = Combine (source, 0, checkType);
64 Combine (sb, 0, checkType);
66 return sb != null ? sb.ToString () : source;
// String entry point for composition. Walks source from index 0, asking
// QuickCheck about each character and comparing the answer with
// NormalizationCheck.Yes. On the hand-off path a StringBuilder with about
// 10% spare capacity is allocated and the StringBuilder overload of Combine
// takes over at index i.
// NOTE(review): `start` is not read by any statement shown here (the scan
// begins at 0) - confirm whether the parameter is actually used.
69 private static StringBuilder Combine (string source, int start, int checkType)
71 for (int i = 0; i < source.Length; i++) {
72 if (QuickCheck (source [i], checkType) == NormalizationCheck.Yes)
74 StringBuilder sb = new StringBuilder (source.Length + source.Length / 10);
76 Combine (sb, i, checkType);
// Tells whether code point i may take part in composition.
// For 0x3400..0x9FBB the answer is "helper index is non-zero"; for every
// other value it is "the IsUnsafe property bit is set".
83 private static bool CanBePrimaryComposite (int i)
85 if (i >= 0x3400 && i <= 0x9FBB)
86 return GetPrimaryCompositeHelperIndex (i) != 0;
87 return (PropValue (i) & IsUnsafe) != 0;
// Composition pass over sb beginning at index i. A Hangul pass
// (CombineHangul) runs first; afterwards the loop asks QuickCheck about each
// character and lets TryComposeWithPreviousStarter supply the next index.
90 private static void Combine (StringBuilder sb, int i, int checkType)
92 // Back off one character as we may be looking at a V or T jamo.
93 CombineHangul (sb, null, i > 0 ? i - 1 : i);
95 while (i < sb.Length) {
96 if (QuickCheck (sb [i], checkType) == NormalizationCheck.Yes) {
// The return value of TryComposeWithPreviousStarter becomes the loop index.
101 i = TryComposeWithPreviousStarter (sb, null, i);
// Hangul composition starting at index `current`. Characters are read
// through Fetch, i.e. from sb when it is non-null and from s otherwise.
// `last` holds the previous character (or the syllable built so far) and is
// paired with each following character: L + V gives an LV syllable,
// LV + T gives an LVT syllable.
// NOTE(review): the visible statements write to sb [i - 1] unconditionally,
// yet IsNormalized calls this method with sb == null; the guarding
// statements are not shown here - confirm.
105 private static int CombineHangul (StringBuilder sb, string s, int current)
107 int length = sb != null ? sb.Length : s.Length;
108 int last = Fetch (sb, s, current);
110 for (int i = current + 1; i < length; ++i) {
111 int ch = Fetch (sb, s, i);
113 // 1. check to see if two current characters are L and V
// LIndex / VIndex are offsets from the first leading / vowel jamo.
115 int LIndex = last - HangulLBase;
116 if (0 <= LIndex && LIndex < HangulLCount) {
117 int VIndex = ch - HangulVBase;
118 if (0 <= VIndex && VIndex < HangulVCount) {
122 // make syllable of form LV
// Each (L, V) pair owns HangulTCount consecutive syllables; the first one
// is the form without a trailing consonant.
124 last = HangulSBase + (LIndex * HangulVCount + VIndex) * HangulTCount;
126 sb [i - 1] = (char) last; // reset last
129 continue; // discard ch
134 // 2. check to see if two current characters are LV and T
// SIndex % HangulTCount == 0 selects syllables that have no trailing
// consonant yet.
136 int SIndex = last - HangulSBase;
137 if (0 <= SIndex && SIndex < HangulSCount && (SIndex % HangulTCount) == 0) {
138 int TIndex = ch - HangulTBase;
// TIndex 0 is excluded: GetCanonicalHangul emits no trailing jamo when the
// computed T equals HangulTBase.
139 if (0 < TIndex && TIndex < HangulTCount) {
143 // make syllable of form LVT
147 sb [i - 1] = (char) last; // reset last
150 continue; // discard ch
153 // if neither case was true, just add the character
// Reads the character at index i from sb when sb is non-null, otherwise
// from s, and returns it widened to int. This lets the composition helpers
// run over either a StringBuilder or a plain string.
160 static int Fetch (StringBuilder sb, string s, int i)
162 return (int) (sb != null ? sb [i] : s [i]);
165 // Cf. figure 7, section 1.3 of http://unicode.org/reports/tr15/.
// Looks backwards from `current` for the nearest character whose combining
// class is 0 (the starter) and then tries to compose that starter with the
// characters that follow it. Input comes from sb when non-null, else from s
// (see Fetch). The int result is used by callers as the next index to
// examine.
166 static int TryComposeWithPreviousStarter (StringBuilder sb, string s, int current)
168 // Backtrack to previous starter.
// Two cases: `current` is itself a starter (only the immediately preceding
// position is inspected), or it is not (walk back over non-starters).
170 if (GetCombiningClass (Fetch (sb, s, current)) == 0) {
171 if (i < 0 || GetCombiningClass (Fetch (sb, s, i)) != 0)
174 while (i >= 0 && GetCombiningClass (Fetch (sb, s, i)) != 0)
180 int starter = Fetch (sb, s, i);
182 // The various decompositions involving starter follow this index.
183 int comp_idx = GetPrimaryCompositeHelperIndex (starter);
187 int length = (sb != null ? sb.Length : s.Length);
// -1 cannot equal any value returned by GetCombiningClass (a byte), so the
// first candidate is never skipped by the equality test below.
188 int prevCombiningClass = -1;
189 for (int j = i + 1; j < length; j++) {
190 int candidate = Fetch (sb, s, j);
192 int combiningClass = GetCombiningClass (candidate);
193 if (combiningClass == prevCombiningClass)
194 // We skipped over a guy with the same class, without
195 // combining. Skip this one, too.
198 int composed = TryCompose (comp_idx, starter, candidate);
201 // Not normalized, and we are only checking.
204 // Full Unicode warning: This will break when the underlying
205 // tables are extended.
// The composite is stored as a single char, so it must fit in 16 bits.
206 sb [i] = (char) composed;
212 // Gray box. We're done.
213 if (combiningClass == 0)
216 prevCombiningClass = combiningClass;
// Searches mappedChars, beginning at position i, for the entry made of
// exactly { starter, candidate, 0 }. Entries are zero-terminated and the
// search lasts while the entry at i begins with starter.
222 static int TryCompose (int i, int starter, int candidate)
224 while (mappedChars [i] == starter) {
225 if (mappedChars [i + 1] == candidate &&
226 mappedChars [i + 2] == 0) {
// Position i identifies the entry; the table maps it to the composite.
227 int composed = GetPrimaryCompositeFromMapIndex (i);
// Only composites without the FullCompositionExclusion bit qualify.
229 if ((PropValue (composed) & FullCompositionExclusion) == 0)
// Advance over the rest of the current zero-terminated entry.
234 while (mappedChars [i] != 0)
// Returns the decomposed form of source for checkType. The work is done by
// the ref-StringBuilder overload; when that leaves sb null the original
// string instance is returned.
242 static string Decompose (string source, int checkType)
244 StringBuilder sb = null;
245 Decompose (source, ref sb, checkType);
246 return sb != null ? sb.ToString () : source;
// Decomposes source into sb. Every character for which QuickCheck answers
// No is expanded by DecomposeChar, which is given sb, a scratch buffer and
// the running `start` offset by reference. The remaining tail of source is
// appended afterwards and the result is passed to ReorderCanonical.
249 static void Decompose (string source,
250 ref StringBuilder sb, int checkType)
254 for (int i = 0; i < source.Length; i++)
255 if (QuickCheck (source [i], checkType) == NormalizationCheck.No)
256 DecomposeChar (ref sb, ref buf, source,
257 i, checkType, ref start);
// Copies source [start ..] - the part after the last expanded character.
// NOTE(review): presumably guarded by a null check on sb - confirm.
259 sb.Append (source, start, source.Length - start);
260 ReorderCanonical (source, ref sb, 1);
// Puts characters in combining-class order. Two scans are visible: one over
// src and one over sb beginning at `start`. The sb scan reads sb [i - 1],
// so `start` has to be at least 1; the calls shown in this class pass 1 or
// a loop index that starts at 1.
263 static void ReorderCanonical (string src, ref StringBuilder sb, int start)
266 // check only with src.
267 for (int i = 1; i < src.Length; i++) {
268 int level = GetCombiningClass (src [i]);
// Previous character has a higher class than this one: out of order.
271 if (GetCombiningClass (src [i - 1]) > level) {
// Copy src into a fresh builder and redo the work on the builder from i.
272 sb = new StringBuilder (src.Length);
273 sb.Append (src, 0, src.Length);
274 ReorderCanonical (src, ref sb, i);
280 // check only with sb
// No increment in the loop header; the index is adjusted inside the body.
281 for (int i = start; i < sb.Length; ) {
282 int level = GetCombiningClass (sb [i]);
// Class 0, or a predecessor with class <= level, is already in order.
283 if (level == 0 || GetCombiningClass (sb [i - 1]) <= level) {
291 // Apply recursively.
// Expands the character s [i] into sb.
//   sb        - output builder; created here with 100 chars of headroom.
//   buf       - scratch array that GetCanonical fills with code points.
//   s, i      - input string and the index of the character to expand.
//   checkType - normalization type forwarded to GetCanonical.
//   start     - index in s of the first character not yet copied to sb.
// Code points above U+FFFF are written as UTF-16 surrogate pairs.
297 static void DecomposeChar (ref StringBuilder sb,
298 ref int [] buf, string s, int i, int checkType, ref int start)
301 sb = new StringBuilder (s.Length + 100);
// Flush the untouched run s [start .. i) before appending the expansion.
302 sb.Append (s, start, i - start);
// n is the number of code points GetCanonical stored in buf for s [i].
305 int n = GetCanonical (s [i], buf, 0, checkType);
306 for (int x = 0; x < n; x++) {
// Every BMP value, U+FFFF included, fits in one UTF-16 code unit.
307 if (buf [x] <= char.MaxValue)
308 sb.Append ((char) buf [x]);
// Supplementary code point: subtract 0x10000, then split the remaining
// 20 bits into a high half (0xD800 + upper 10 bits) and a low half
// (0xDC00 + lower 10 bits). Parentheses are required because `+` binds
// tighter than `>>`.
310 sb.Append ((char) (0xD800 + ((buf [x] - 0x10000) >> 10)));
311 sb.Append ((char) (0xDC00 + ((buf [x] - 0x10000) & 0x03FF)));
// Per-character quick check for normalization type `type`. Four branches
// follow, one per type; the integer mapping used by the NormalizationForm
// overloads is FormD = 1, FormKC = 2, FormKD = 3, with 0 for the remaining
// form. NOTE(review): the branch order below appears to follow 0, 1, 2, 3 -
// confirm against the case labels.
317 public static NormalizationCheck QuickCheck (char c, int type)
// Composing check: No if NoNfc is set, else Maybe if MaybeNfc is set, else Yes.
322 v = PropValue ((int) c);
323 return (v & NoNfc) == 0 ?
324 (v & MaybeNfc) == 0 ?
325 NormalizationCheck.Yes :
326 NormalizationCheck.Maybe :
327 NormalizationCheck.No;
// Decomposing check: U+AC00..U+D7A3 (HangulSBase .. HangulSBase +
// HangulSCount - 1) always answers No; otherwise the NoNfd bit decides.
329 if ('\uAC00' <= c && c <= '\uD7A3')
330 return NormalizationCheck.No;
331 return (PropValue ((int) c) & NoNfd) != 0 ?
332 NormalizationCheck.No : NormalizationCheck.Yes;
// Composing check using the NoNfkc / MaybeNfkc bits.
334 v = PropValue ((int) c);
335 return (v & NoNfkc) != 0 ? NormalizationCheck.No :
336 (v & MaybeNfkc) != 0 ?
337 NormalizationCheck.Maybe :
338 NormalizationCheck.Yes;
// Decomposing check using the NoNfkd bit, with the same Hangul shortcut.
340 if ('\uAC00' <= c && c <= '\uD7A3')
341 return NormalizationCheck.No;
342 return (PropValue ((int) c) & NoNfkd) != 0 ?
343 NormalizationCheck.No : NormalizationCheck.Yes;
347 /* for now we don't use FC_NFKC closure
348 public static bool IsMultiForm (char c)
350 return (PropValue ((int) c) & 0xF0000000) != 0;
353 public static char SingleForm (char c)
355 uint v = PropValue ((int) c);
356 int idx = (int) ((v & 0x7FFF0000) >> 16);
357 return (char) singleNorm [idx];
360 public static void MultiForm (char c, char [] buf, int index)
362 // FIXME: handle surrogate
363 uint v = PropValue ((int) c);
364 int midx = (int) ((v & 0x7FFF0000) >> 16);
365 buf [index] = (char) multiNorm [midx];
366 buf [index + 1] = (char) multiNorm [midx + 1];
367 buf [index + 2] = (char) multiNorm [midx + 2];
368 buf [index + 3] = (char) multiNorm [midx + 3];
369 if (buf [index + 3] != 0)
370 buf [index + 4] = (char) 0; // zero termination
// Constants for arithmetic Hangul (de)composition: the first code point of
// the syllable (S), leading (L), vowel (V) and trailing (T) ranges, and the
// size of each range. HangulTBase is one below the first trailing jamo, so
// T index 0 means "no trailing consonant" (see GetCanonicalHangul).
374 const int HangulSBase = 0xAC00, HangulLBase = 0x1100,
375 HangulVBase = 0x1161, HangulTBase = 0x11A7,
376 HangulLCount = 19, HangulVCount = 21, HangulTCount = 28,
377 HangulNCount = HangulVCount * HangulTCount, // 588
378 HangulSCount = HangulLCount * HangulNCount; // 11172
// Arithmetic decomposition of a Hangul syllable s into jamo, written to buf
// from position bufIdx. GetCanonical detects success by the returned index
// being greater than the bufIdx it passed in.
380 private static int GetCanonicalHangul (int s, int [] buf, int bufIdx)
382 int idx = s - HangulSBase;
// Outside the syllable block: nothing to do here.
383 if (idx < 0 || idx >= HangulSCount) {
// Split the syllable offset into leading, vowel and trailing parts.
387 int L = HangulLBase + idx / HangulNCount;
388 int V = HangulVBase + (idx % HangulNCount) / HangulTCount;
389 int T = HangulTBase + idx % HangulTCount;
// T == HangulTBase means trailing index 0, i.e. no trailing consonant.
393 if (T != HangulTBase) {
// Zero-terminate the output.
396 buf [bufIdx] = (char) 0;
// Writes the full decomposition of code point c into buf starting at
// bufIdx and returns the index after the last value written (this is how
// the recursive call below and DecomposeChar use the result).
400 static int GetCanonical (int c, int [] buf, int bufIdx, int checkType)
// Hangul syllables are handled arithmetically; a larger index signals that
// GetCanonicalHangul wrote something.
402 int newBufIdx = GetCanonicalHangul (c, buf, bufIdx);
403 if (newBufIdx > bufIdx)
// i is the position of c's mapping in mappedChars. Index 0, or an entry
// whose first element is c itself, is handled separately from the
// expansion loop below.
406 int i = CharMapIdx (c);
407 if (i == 0 || mappedChars [i] == c)
410 // Character c maps to one or more decomposed chars.
411 for (; mappedChars [i] != 0; i++) {
412 int nth = mappedChars [i];
414 // http://www.unicode.org/reports/tr15/tr15-31.html, 1.3:
415 // Full decomposition involves recursive application of the
416 // Decomposition_Mapping values. Note that QuickCheck does
417 // not currently support astral plane codepoints.
// A BMP value that QuickCheck accepts is stored as is; any other value is
// expanded by recursing.
418 if (nth <= 0xffff && QuickCheck ((char)nth, checkType) == NormalizationCheck.Yes)
419 buf [bufIdx++] = nth;
421 bufIdx = GetCanonical (nth, buf, bufIdx, checkType);
// Maps a NormalizationForm onto the integer type codes used inside this
// class and forwards to IsNormalized (string, int):
// FormD -> 1, FormKC -> 2, FormKD -> 3; code 0 is used for the remaining
// case (presumably FormC - its label is not among the lines shown).
428 public static bool IsNormalized (string source, NormalizationForm normalizationForm)
430 switch (normalizationForm) {
432 return IsNormalized (source, 0);
433 case NormalizationForm.FormD:
434 return IsNormalized (source, 1);
435 case NormalizationForm.FormKC:
436 return IsNormalized (source, 2);
437 case NormalizationForm.FormKD:
438 return IsNormalized (source, 3);
// Checks whether source is already normalized for the integer `type`.
// Each character is examined for combining-class order and then
// classified with QuickCheck; the composition helpers are run in
// read-only mode (sb == null) so that nothing is modified.
442 public static bool IsNormalized (string source, int type)
// The index is advanced inside the body, not in the loop header.
445 for (int i = 0; i < source.Length; ) {
446 int cc = GetCombiningClass (source [i]);
// A non-starter with a lower class than its predecessor is out of order.
447 if (cc != 0 && cc < prevCC)
451 switch (QuickCheck (source [i], type)) {
452 case NormalizationCheck.Yes:
455 case NormalizationCheck.No:
457 case NormalizationCheck.Maybe:
458 // for those forms with composition, it cannot be checked here
// Falls back to normalizing and comparing with the input.
462 return source == Normalize (source, type);
// Passing null for sb makes the helpers read from `source` via Fetch.
466 i = CombineHangul (null, source, i > 0 ? i - 1 : i);
470 i = TryComposeWithPreviousStarter (null, source, i);
// Maps a NormalizationForm onto the integer type codes used inside this
// class and forwards to Normalize (string, int):
// FormD -> 1, FormKC -> 2, FormKD -> 3; code 0 is used for the remaining
// case (presumably FormC - its label is not among the lines shown).
479 public static string Normalize (string source, NormalizationForm normalizationForm)
481 switch (normalizationForm) {
483 return Normalization.Normalize (source, 0);
484 case NormalizationForm.FormD:
485 return Normalization.Normalize (source, 1);
486 case NormalizationForm.FormKC:
487 return Normalization.Normalize (source, 2);
488 case NormalizationForm.FormKD:
489 return Normalization.Normalize (source, 3);
// Normalizes source for the integer `type`, dispatching to Compose or to
// Decompose. NOTE(review): the selecting conditions are not shown;
// presumably the composing types go to Compose and the others to
// Decompose - confirm.
493 public static string Normalize (string source, int type)
498 return Compose (source, type);
501 return Decompose (source, type);
// Raw pointers to the lookup tables; they are assigned in the static
// constructor.
// Zero-terminated decomposition entries (see TryCompose / GetCanonical).
506 static int* mappedChars;
// Read by CharMapIdx.
507 static short* charMapIndex;
// Read by GetPrimaryCompositeHelperIndex.
508 static short* helperIndex;
// Read by GetPrimaryCompositeFromMapIndex.
509 static ushort* mapIdxToComposite;
// Read by GetCombiningClass.
510 static byte* combiningClass;
// This variant of the class reports the tables as available unconditionally.
514 public static readonly bool IsReady = true; // always
// Static constructor for the variant whose tables are managed arrays
// (propsArr, mappedCharsArr, ...): each array is pinned with `fixed` and
// its address is stored in the matching static pointer field.
// NOTE(review): the pointers are kept in static fields and used after the
// `fixed` statements have ended, while pinning only lasts for the duration
// of each statement - confirm that these arrays can never be relocated.
516 static Normalization ()
518 fixed (byte* tmp = propsArr) {
521 fixed (int* tmp = mappedCharsArr) {
524 fixed (short* tmp = charMapIndexArr) {
527 fixed (short* tmp = helperIndexArr) {
530 fixed (ushort* tmp = mapIdxToCompositeArr) {
531 mapIdxToComposite = tmp;
533 fixed (byte* tmp = combiningClassArr) {
534 combiningClass = tmp;
// Second set of declarations for readiness reporting. It cannot coexist
// with the IsReady field declared earlier, so the two are presumably in
// alternative conditional-compilation branches - confirm.
// NOTE(review): no use of forLock appears in the lines shown - confirm it
// is still needed.
539 static object forLock = new object ();
540 public static readonly bool isReady;
// Read-only view of the isReady field.
542 public static bool IsReady {
543 get { return isReady; }
// Runtime-internal call that hands back the six table addresses through
// its out parameters, in the order props, mappedChars, charMapIndex,
// helperIndex, mapIdxToComposite, combiningClass.
546 [MethodImpl (MethodImplOptions.InternalCall)]
547 static extern void load_normalization_resource (
548 out IntPtr props, out IntPtr mappedChars,
549 out IntPtr charMapIndex, out IntPtr helperIndex,
550 out IntPtr mapIdxToComposite, out IntPtr combiningClass);
// Static constructor for the variant whose tables come from the runtime:
// load_normalization_resource supplies six addresses, which are cast to
// the element type of each table pointer.
552 static Normalization ()
554 IntPtr p1, p2, p3, p4, p5, p6;
556 load_normalization_resource (out p1, out p2, out p3, out p4, out p5, out p6);
// p1 corresponds to the `props` out parameter; p2..p6 are assigned below.
558 mappedChars = (int*) p2;
559 charMapIndex = (short*) p3;
560 helperIndex = (short*) p4;
561 mapIdxToComposite = (ushort*) p5;
562 combiningClass = (byte*) p6;
572 // autogenerated code or icall to fill array runs here