using NUtil = Mono.Globalization.Unicode.NormalizationTableUtil;
-namespace Mono.Globalization.Unicode
+namespace System.Text
{
internal enum NormalizationCheck {
Yes,
return charMapIndex [NUtil.MapIdx (cp)];
}
- static int GetComposedStringLength (int ch)
- {
- int start = charMapIndex [NUtil.MapIdx (ch)];
- int i = start;
- while (mappedChars [i] != 0)
- i++;
- return i - start;
- }
-
static byte GetCombiningClass (int c)
{
return combiningClass [NUtil.Combining.ToIndex (c)];
return mapIdxToComposite [NUtil.Composite.ToIndex (src)];
}
- static short GetPrimaryCompositeHelperIndex (int cp)
+ static int GetPrimaryCompositeHelperIndex (int cp) // helperIndex lookup for cp; TryComposeWithPreviousStarter uses the result as a start position in mappedChars and treats 0 as "no entry"
{
return helperIndex [NUtil.Helper.ToIndex (cp)];
}
- static int GetPrimaryCompositeCharIndex (object chars, int start)
- {
- string s = chars as string;
- StringBuilder sb = chars as StringBuilder;
- char startCh = s != null ? s [start] : sb [start];
- int charsLength = sb != null ? sb.Length : s.Length;
-
- int idx = GetPrimaryCompositeHelperIndex ((int) startCh);
- if (idx == 0)
- return 0;
- while (mappedChars [idx] == startCh) {
- for (int i = 1, j = 1; ; i++, j++) {
- if (mappedChars [idx + i] == 0)
- // matched
- return idx;
- if (start + i >= charsLength)
- return 0; // didn't match
-
- // handle blocked characters here.
- char curCh;
- int combiningClass;
- int nextCB = 0;
- do {
- curCh = s != null ?
- s [start + j] :
- sb [start + j];
- combiningClass = GetCombiningClass (curCh);
- if (++j + start >= charsLength ||
- combiningClass == 0)
- break;
- nextCB = GetCombiningClass (
- s != null ?
- s [start + j] :
- sb [start + j]);
- } while (nextCB > 0 && combiningClass >= nextCB);
- j--;
- if (mappedChars [idx + i] == curCh)
- continue;
- if (mappedChars [idx + i] > curCh)
- return 0; // no match
- // otherwise move idx to next item
- while (mappedChars [i] != 0)
- i++;
- idx += i + 1;
- break;
- }
- }
- // reached to end of entries
- return 0;
- }
-
private static string Compose (string source, int checkType)
{
StringBuilder sb = null;
- Decompose (source, ref sb, checkType);
+ // Decompose to NFD or NKFD depending on our target
+ Decompose (source, ref sb, checkType == 2 ? 3 : 1);
if (sb == null)
sb = Combine (source, 0, checkType);
else
return null;
}
+/*
private static bool CanBePrimaryComposite (int i)
{
if (i >= 0x3400 && i <= 0x9FBB)
return GetPrimaryCompositeHelperIndex (i) != 0;
return (PropValue (i) & IsUnsafe) != 0;
}
-
- private static void Combine (StringBuilder sb, int start, int checkType)
+*/
+ private static void Combine (StringBuilder sb, int i, int checkType) // composes sb in place from index i onward: Hangul jamo first, then starter + following-character pairs
{
- for (int i = start; i < sb.Length; i++) {
- switch (QuickCheck (sb [i], checkType)) {
- case NormalizationCheck.Yes:
+ // Back off one character as we may be looking at a V or T jamo.
+ CombineHangul (sb, null, i > 0 ? i - 1 : i); // s is null, so CombineHangul edits sb rather than only probing
+
+ while (i < sb.Length) { // sb.Length is re-read each pass because compositions remove characters
+ if (QuickCheck (sb [i], checkType) == NormalizationCheck.Yes) {
+ i++;
continue;
- case NormalizationCheck.No:
- break;
- case NormalizationCheck.Maybe:
- if (i == 0)
- continue;
- else
- break;
}
- int cur = i;
- // FIXME: It should check "blocked" too
- for (;i >= 0; i--)
- if (!CanBePrimaryComposite ((int) sb [i]))
- break;
- i++;
- int idx = 0;
- for (; i < cur; i++) {
- idx = GetPrimaryCompositeMapIndex (sb, (int) sb [i], i);
- if (idx > 0)
- break;
- }
- if (idx == 0) {
- i = cur;
- continue;
+ i = TryComposeWithPreviousStarter (sb, null, i); // yields the next index to examine; the same index again after a successful composition
+ }
+ }
+
+ private static int CombineHangul (StringBuilder sb, string s, int current) // merges Hangul L+V and LV+T pairs from index current to the end; with sb == null it only probes s and reports -1 if a pair would merge
+ {
+ int length = sb != null ? sb.Length : s.Length;
+ int last = Fetch (sb, s, current); // holds the value at position i - 1 throughout the loop, including freshly merged syllables
+
+ for (int i = current + 1; i < length; ++i) {
+ int ch = Fetch (sb, s, i);
+
+ // 1. check to see if two current characters are L and V
+
+ int LIndex = last - HangulLBase;
+ if (0 <= LIndex && LIndex < HangulLCount) {
+ int VIndex = ch - HangulVBase;
+ if (0 <= VIndex && VIndex < HangulVCount) {
+ if (sb == null)
+ return -1; // probe mode: a composable pair exists, nothing is modified
+
+ // make syllable of form LV
+
+ last = HangulSBase + (LIndex * HangulVCount + VIndex) * HangulTCount;
+
+ sb [i - 1] = (char) last; // reset last
+ sb.Remove (i, 1);
+ i--; length--; // sb shrank by one, so the loop's ++i lands on the same position again
+ continue; // discard ch
+ }
}
- int ch = GetPrimaryCompositeFromMapIndex (idx);
- int len = GetComposedStringLength (ch);
- if (ch == 0 || len == 0)
- throw new SystemException ("Internal error: should not happen.");
- int removed = 0;
- sb.Insert (i++, (char) ch); // always single character
-
- // handle blocked characters here.
- while (removed < len) {
- if (i + 1 < sb.Length) {
- int cb = GetCombiningClass (sb [i]);
- if (cb > 0) {
- int next = GetCombiningClass (sb [i + 1]);
- if (next != 0 && cb >= next) {
- i++;
- continue;
- }
- }
+
+
+ // 2. check to see if two current characters are LV and T
+
+ int SIndex = last - HangulSBase;
+ if (0 <= SIndex && SIndex < HangulSCount && (SIndex % HangulTCount) == 0) { // remainder 0 selects syllables built without a T offset
+ int TIndex = ch - HangulTBase;
+ if (0 < TIndex && TIndex < HangulTCount) {
+ if (sb == null)
+ return -1; // probe mode, as in step 1
+
+ // make syllable of form LVT
+
+ last += TIndex;
+
+ sb [i - 1] = (char) last; // reset last
+ sb.Remove (i, 1);
+ i--; length--;
+ continue; // discard ch
}
- sb.Remove (i, 1);
- removed++;
}
- i = cur - 1; // apply recursively
+ // if neither case was true, just add the character
+ last = ch;
+ }
+
+ return length; // sb length after any merges, or s.Length when only probing
+ }
+
+		// Reads the UTF-16 code unit at index i, taking it from sb when a
+		// builder is supplied and from s otherwise, and widens it to int.
+		static int Fetch (StringBuilder sb, string s, int i)
+		{
+			if (sb == null)
+				return s [i];
+
+			return sb [i];
+		}
+
+		// Tries to compose the nearest preceding starter with one of the
+		// characters that follow it.
+		// Cf. figure 7, section 1.3 of http://unicode.org/reports/tr15/.
+		//
+		// With a StringBuilder the composition is applied in place and the
+		// index to examine next is returned. With sb == null the text in s is
+		// only probed: -1 means a composition would have happened.
+		static int TryComposeWithPreviousStarter (StringBuilder sb, string s, int current)
+		{
+			bool currentIsStarter = GetCombiningClass (Fetch (sb, s, current)) == 0;
+
+			// Locate the starter to compose with. A starter only looks at the
+			// character directly before it; a combining mark backtracks over
+			// any run of non-starters.
+			int starterPos = current - 1;
+			if (!currentIsStarter) {
+				while (starterPos >= 0 && GetCombiningClass (Fetch (sb, s, starterPos)) != 0)
+					starterPos--;
+			}
+			if (starterPos < 0)
+				return current + 1;
+			if (currentIsStarter && GetCombiningClass (Fetch (sb, s, starterPos)) != 0)
+				return current + 1;
+
+			int starter = Fetch (sb, s, starterPos);
+
+			// The various decompositions involving starter follow this index.
+			int tableStart = GetPrimaryCompositeHelperIndex (starter);
+			if (tableStart == 0)
+				return current + 1;
+
+			int length = sb == null ? s.Length : sb.Length;
+			int skippedClass = -1;
+			for (int pos = starterPos + 1; pos < length; pos++) {
+				int candidate = Fetch (sb, s, pos);
+				int candidateClass = GetCombiningClass (candidate);
+
+				// A candidate whose class equals that of a character we already
+				// passed over without combining is skipped as well.
+				if (candidateClass != skippedClass) {
+					int composed = TryCompose (tableStart, starter, candidate);
+					if (composed != 0) {
+						// Not normalized, and we are only checking.
+						if (sb == null)
+							return -1;
+
+						// Full Unicode warning: This will break when the underlying
+						// tables are extended.
+						sb [starterPos] = (char) composed;
+						sb.Remove (pos, 1);
+
+						return current;
+					}
+
+					// Gray box. We're done.
+					if (candidateClass == 0)
+						return pos + 1;
+
+					skippedClass = candidateClass;
+				}
}
+
+			return length;
}
- static int GetPrimaryCompositeMapIndex (object o, int cur, int bufferPos)
+ static int TryCompose (int i, int starter, int candidate) // returns the composite for the pair (starter, candidate) found in the table entries beginning at i, or 0 when none qualifies
{
- if ((PropValue (cur) & FullCompositionExclusion) != 0)
- return 0;
- if (GetCombiningClass (cur) != 0)
- return 0; // not a starter
- return GetPrimaryCompositeCharIndex (o, bufferPos);
+ while (mappedChars [i] == starter) { // walks consecutive 0-terminated entries for as long as they begin with starter
+ if (mappedChars [i + 1] == candidate &&
+ mappedChars [i + 2] == 0) { // the entry is exactly the two-element sequence starter, candidate
+ int composed = GetPrimaryCompositeFromMapIndex (i);
+
+ if ((PropValue (composed) & FullCompositionExclusion) == 0) // composites carrying the exclusion flag are never produced
+ return composed;
+ }
+
+ // Skip this entry.
+ while (mappedChars [i] != 0)
+ i++;
+ i++; // step over the 0 terminator to the first element of the next entry
+ }
+
+ return 0;
}
static string Decompose (string source, int checkType)
for (int i = 0; i < source.Length; i++)
if (QuickCheck (source [i], checkType) == NormalizationCheck.No)
DecomposeChar (ref sb, ref buf, source,
- i, ref start);
+ i, checkType, ref start);
if (sb != null)
sb.Append (source, start, source.Length - start);
ReorderCanonical (source, ref sb, 1);
return;
}
// check only with sb
- for (int i = start; i < sb.Length; i++) {
+ for (int i = start; i < sb.Length; ) {
int level = GetCombiningClass (sb [i]);
- if (level == 0)
+ if (level == 0 || GetCombiningClass (sb [i - 1]) <= level) {
+ i++;
continue;
- if (GetCombiningClass (sb [i - 1]) > level) {
- char c = sb [i - 1];
- sb [i - 1] = sb [i];
- sb [i] = c;
- i--; // apply recursively
}
+
+ char c = sb [i - 1];
+ sb [i - 1] = sb [i];
+ sb [i] = c;
+ // Apply recursively.
+ if (i > 1)
+ i--;
}
}
static void DecomposeChar (ref StringBuilder sb,
- ref int [] buf, string s, int i, ref int start)
+ ref int [] buf, string s, int i, int checkType, ref int start)
{
if (sb == null)
sb = new StringBuilder (s.Length + 100);
sb.Append (s, start, i - start);
if (buf == null)
buf = new int [19];
- GetCanonical (s [i], buf, 0);
- for (int x = 0; ; x++) {
- if (buf [x] == 0)
- break;
+ int n = GetCanonical (s [i], buf, 0, checkType);
+ for (int x = 0; x < n; x++) {
if (buf [x] < char.MaxValue)
sb.Append ((char) buf [x]);
else { // surrogate
}
*/
- public static void GetCanonical (int c, int [] buf, int bufIdx)
+ const int HangulSBase = 0xAC00, HangulLBase = 0x1100,
+ HangulVBase = 0x1161, HangulTBase = 0x11A7,
+ HangulLCount = 19, HangulVCount = 21, HangulTCount = 28,
+ HangulNCount = HangulVCount * HangulTCount, // 588
+ HangulSCount = HangulLCount * HangulNCount; // 11172
+
+ private static int GetCanonicalHangul (int s, int [] buf, int bufIdx) // writes the L, V and optional T jamo of syllable s into buf at bufIdx; returns the index after the last jamo, or bufIdx unchanged when s is outside the syllable range
{
- for (int i = CharMapIdx (c); mappedChars [i] != 0; i++)
- buf [bufIdx++] = mappedChars [i];
+ int idx = s - HangulSBase;
+ if (idx < 0 || idx >= HangulSCount) { // not one of the HangulSCount code points starting at HangulSBase
+ return bufIdx;
+ }
+
+ int L = HangulLBase + idx / HangulNCount;
+ int V = HangulVBase + (idx % HangulNCount) / HangulTCount;
+ int T = HangulTBase + idx % HangulTCount;
+
+ buf [bufIdx++] = L;
+ buf [bufIdx++] = V;
+ if (T != HangulTBase) { // offset 0 within the T range emits no third jamo
+ buf [bufIdx++] = T;
+ }
buf [bufIdx] = (char) 0;
+ return bufIdx; // the 0 stored at buf [bufIdx] just above is a terminator and is not counted
+ }
+
+		// Appends the full decomposition of c to buf starting at bufIdx and
+		// returns the index just past the last value written. checkType is
+		// handed to QuickCheck to decide which mapped values are already final.
+		static int GetCanonical (int c, int [] buf, int bufIdx, int checkType)
+		{
+			// GetCanonicalHangul only moves the index when it wrote jamo.
+			int afterHangul = GetCanonicalHangul (c, buf, bufIdx);
+			if (afterHangul > bufIdx)
+				return afterHangul;
+
+			int mapPos = CharMapIdx (c);
+			if (mapPos == 0 || mappedChars [mapPos] == c) {
+				// No mapping to follow: c is emitted as it is.
+				buf [bufIdx] = c;
+				return bufIdx + 1;
+			}
+
+			// Character c maps to one or more decomposed chars; walk them up
+			// to the 0 terminator.
+			//
+			// http://www.unicode.org/reports/tr15/tr15-31.html, 1.3:
+			// Full decomposition involves recursive application of the
+			// Decomposition_Mapping values. Note that QuickCheck does
+			// not currently support astral plane codepoints.
+			int next = bufIdx;
+			while (mappedChars [mapPos] != 0) {
+				int mapped = mappedChars [mapPos++];
+				bool alreadyFinal = mapped <= 0xffff &&
+					QuickCheck ((char) mapped, checkType) == NormalizationCheck.Yes;
+
+				if (alreadyFinal)
+					buf [next++] = mapped;
+				else
+					next = GetCanonical (mapped, buf, next, checkType);
+			}
+
+			return next;
+		}
+
+		// Translates normalizationForm into the integer check type taken by
+		// the int overload (FormD = 1, FormKC = 2, FormKD = 3, any other
+		// value = 0) and forwards to that overload.
+		public static bool IsNormalized (string source, NormalizationForm normalizationForm)
+		{
+			int checkType;
+			if (normalizationForm == NormalizationForm.FormD)
+				checkType = 1;
+			else if (normalizationForm == NormalizationForm.FormKC)
+				checkType = 2;
+			else if (normalizationForm == NormalizationForm.FormKD)
+				checkType = 3;
+			else
+				checkType = 0;
+
+			return IsNormalized (source, checkType);
}
public static bool IsNormalized (string source, int type)
{
int prevCC = -1;
- for (int i = 0; i < source.Length; i++) {
+ for (int i = 0; i < source.Length; ) { // i is advanced inside the switch rather than in the loop header
int cc = GetCombiningClass (source [i]);
if (cc != 0 && cc < prevCC)
return false;
prevCC = cc;
+
switch (QuickCheck (source [i], type)) {
case NormalizationCheck.Yes:
+ i++;
break;
case NormalizationCheck.No:
return false;
case NormalizationCheck.Maybe:
- // partly copied from Combine()
- int cur = i;
- // FIXME: It should check "blocked" too
- for (;i >= 0; i--)
- if (!CanBePrimaryComposite ((int) source [i]))
- break;
- i++;
- // Now i is the "starter"
- for (; i < cur; i++) {
- if (GetPrimaryCompositeCharIndex (source, i) != 0)
- return false;
+ // for those forms with composition, it cannot be checked here
+ switch (type) {
+ case 0: // NFC
+ case 2: // NFKC
+ return source == Normalize (source, type); // decided by comparing against the fully normalized string
}
+ // go on... (reached only when type is neither 0 nor 2)
+
+ i = CombineHangul (null, source, i > 0 ? i - 1 : i); // probe only: -1 if a Hangul pair would merge, otherwise source.Length
+ if (i < 0)
+ return false;
+
+ i = TryComposeWithPreviousStarter (null, source, i); // NOTE(review): i equals source.Length here whenever CombineHangul found nothing, and this call fetches at that index - confirm the path is unreachable or guard it
+ if (i < 0)
+ return false;
break;
}
}
return true;
}
+		// Translates normalizationForm into the integer type taken by the
+		// int overload (FormD = 1, FormKC = 2, FormKD = 3, any other
+		// value = 0) and returns that overload's result.
+		public static string Normalize (string source, NormalizationForm normalizationForm)
+		{
+			int type =
+				normalizationForm == NormalizationForm.FormD ? 1 :
+				normalizationForm == NormalizationForm.FormKC ? 2 :
+				normalizationForm == NormalizationForm.FormKD ? 3 :
+				0;
+
+			return Normalization.Normalize (source, type);
+		}
+
public static string Normalize (string source, int type)
{
switch (type) {
case 2:
return Compose (source, type);
case 1:
+ case 3:
return Decompose (source, type);
}
}