Merge pull request #2454 from tastywheattasteslikechicken/FixVtableAbort
[mono.git] / mcs / class / corlib / Mono.Globalization.Unicode / Normalization.cs
index e1db54bcf88f586ebd1d706b7e4f656583bfc703..42a0a2a04308fe1408f7789932663f90813d9f83 100644 (file)
@@ -5,7 +5,7 @@ using System.Runtime.CompilerServices;
 
 using NUtil = Mono.Globalization.Unicode.NormalizationTableUtil;
 
-namespace Mono.Globalization.Unicode
+namespace System.Text
 {
        internal enum NormalizationCheck {
                Yes,
@@ -38,15 +38,6 @@ namespace Mono.Globalization.Unicode
                        return charMapIndex [NUtil.MapIdx (cp)];
                }
 
-               static int GetComposedStringLength (int ch)
-               {
-                       int start = charMapIndex [NUtil.MapIdx (ch)];
-                       int i = start;
-                       while (mappedChars [i] != 0)
-                               i++;
-                       return i - start;
-               }
-
                static byte GetCombiningClass (int c)
                {
                        return combiningClass [NUtil.Combining.ToIndex (c)];
@@ -57,48 +48,16 @@ namespace Mono.Globalization.Unicode
                        return mapIdxToComposite [NUtil.Composite.ToIndex (src)];
                }
 
-               static short GetPrimaryCompositeHelperIndex (int cp)
+               static int GetPrimaryCompositeHelperIndex (int cp)
                {
                        return helperIndex [NUtil.Helper.ToIndex (cp)];
                }
 
-               static int GetPrimaryCompositeCharIndex (object chars, int start, int charsLength)
-               {
-                       string s = chars as string;
-                       StringBuilder sb = chars as StringBuilder;
-                       char startCh = s != null ? s [start] : sb [start];
-
-                       int idx = GetPrimaryCompositeHelperIndex ((int) startCh);
-                       if (idx == 0)
-                               return 0;
-                       while (mappedChars [idx] == startCh) {
-                               for (int i = 1; ; i++) {
-                                       if (mappedChars [idx + i] == 0)
-                                               // match
-                                               return idx;
-                                       if (start + i < charsLength)
-                                               return 0; // no match
-                                       char curCh = s != null ?
-                                               s [start + i] : sb [start + i];
-                                       if (mappedChars [idx + i] == curCh)
-                                               continue;
-                                       if (mappedChars [idx + i] > curCh)
-                                               return 0; // no match
-                                       // otherwise move idx to next item
-                                       while (mappedChars [i] != 0)
-                                               i++;
-                                       idx = i + 1;
-                                       break;
-                               }
-                       }
-                       // reached to end of entries
-                       return 0;
-               }
-
                private static string Compose (string source, int checkType)
                {
                        StringBuilder sb = null;
-                       Decompose (source, ref sb, checkType);
+                       // Decompose to NFD or NFKD depending on our target
+                       Decompose (source, ref sb, checkType == 2 ? 3 : 1);
                        if (sb == null)
                                sb = Combine (source, 0, checkType);
                        else
@@ -112,7 +71,7 @@ namespace Mono.Globalization.Unicode
                        for (int i = 0; i < source.Length; i++) {
                                if (QuickCheck (source [i], checkType) == NormalizationCheck.Yes)
                                        continue;
-                               StringBuilder sb = new StringBuilder (source.Length);
+                               StringBuilder sb = new StringBuilder (source.Length + source.Length / 10);
                                sb.Append (source);
                                Combine (sb, i, checkType);
                                return sb;
@@ -120,66 +79,164 @@ namespace Mono.Globalization.Unicode
                        return null;
                }
 
+/*
                private static bool CanBePrimaryComposite (int i)
                {
                        if (i >= 0x3400 && i <= 0x9FBB)
                                return GetPrimaryCompositeHelperIndex (i) != 0;
                        return (PropValue (i) & IsUnsafe) != 0;
                }
-
-               private static void Combine (StringBuilder sb, int start, int checkType)
+*/
+               private static void Combine (StringBuilder sb, int i, int checkType)
                {
-                       for (int i = start; i < sb.Length; i++) {
-                               switch (QuickCheck (sb [i], checkType)) {
-                               case NormalizationCheck.Yes:
+                       // Back off one character as we may be looking at a V or T jamo.
+                       CombineHangul (sb, null, i > 0 ? i - 1 : i);
+
+                       while (i < sb.Length) {
+                               if (QuickCheck (sb [i], checkType) == NormalizationCheck.Yes) {
+                                       i++;
                                        continue;
-                               case NormalizationCheck.No:
-                                       break;
-                               case NormalizationCheck.Maybe:
-                                       if (i == 0)
-                                               continue;
-                                       else
-                                               break;
                                }
 
-                               int cur = i;
-                               // FIXME: It should check "blocked" too
-                               for (;i >= 0; i--)
-                                       if (!CanBePrimaryComposite ((int) sb [i]))
-                                               break;
-                               i++;
-                               // Now i is the "starter"
-                               int idx = 0;
-                               for (; i < cur; i++) {
-                                       idx = GetPrimaryCompositeMapIndex (sb, (int) sb [i], sb.Length, i);
-                                       if (idx > 0)
-                                               break;
+                               i = TryComposeWithPreviousStarter (sb, null, i);
+                       }
+               }
+
+               private static int CombineHangul (StringBuilder sb, string s, int current)
+               {
+                       int length = sb != null ? sb.Length : s.Length;
+                       int last = Fetch (sb, s, current);
+
+                       for (int i = current + 1; i < length; ++i) {
+                               int ch = Fetch (sb, s, i);
+
+                               // 1. check to see if two current characters are L and V
+
+                               int LIndex = last - HangulLBase;
+                               if (0 <= LIndex && LIndex < HangulLCount) {
+                                       int VIndex = ch - HangulVBase;
+                                       if (0 <= VIndex && VIndex < HangulVCount) {
+                                               if (sb == null)
+                                                       return -1;
+
+                                               // make syllable of form LV
+
+                                               last = HangulSBase + (LIndex * HangulVCount + VIndex) * HangulTCount;
+
+                                               sb [i - 1] = (char) last; // reset last
+                                               sb.Remove (i, 1);
+                                               i--; length--;
+                                               continue; // discard ch
+                                       }
                                }
-                               if (idx == 0) {
-                                       i = cur;
-                                       continue;
+
+
+                               // 2. check to see if two current characters are LV and T
+
+                               int SIndex = last - HangulSBase;
+                               if (0 <= SIndex && SIndex < HangulSCount && (SIndex % HangulTCount) == 0) {
+                                       int TIndex = ch - HangulTBase;
+                                       if (0 < TIndex && TIndex < HangulTCount) {
+                                               if (sb == null)
+                                                       return -1;
+
+                                               // make syllable of form LVT
+
+                                               last += TIndex;
+
+                                               sb [i - 1] = (char) last; // reset last
+                                               sb.Remove (i, 1);
+                                               i--; length--;
+                                               continue; // discard ch
+                                       }
                                }
-                               int ch = GetPrimaryCompositeFromMapIndex (idx);
-                               int len = GetComposedStringLength (ch);
-                               if (ch == 0 || len == 0) {
-                                       // FIXME: this actually happens
-                                       // throw new SystemException ("Internal error: should not happen.");
-                                       i = cur;
+                               // if neither case was true, just add the character
+                               last = ch;
+                       }
+
+                       return length;
+               }
+
+               static int Fetch (StringBuilder sb, string s, int i)
+               {
+                       return (int) (sb != null ? sb [i] : s [i]);
+               }
+
+               // Cf. figure 7, section 1.3 of http://unicode.org/reports/tr15/.
+               static int TryComposeWithPreviousStarter (StringBuilder sb, string s, int current)
+               {
+                       // Backtrack to previous starter.
+                       int i = current - 1;
+                       if (GetCombiningClass (Fetch (sb, s, current)) == 0) {
+                               if (i < 0 || GetCombiningClass (Fetch (sb, s, i)) != 0)
+                                       return current + 1;
+                       } else {
+                               while (i >= 0 && GetCombiningClass (Fetch (sb, s, i)) != 0)
+                                       i--;
+                               if (i < 0)
+                                       return current + 1;
+                       }
+
+                       int starter = Fetch (sb, s, i);
+
+                       // The various decompositions involving starter follow this index.
+                       int comp_idx = GetPrimaryCompositeHelperIndex (starter);
+                       if (comp_idx == 0)
+                               return current + 1;
+
+                       int length = (sb != null ? sb.Length : s.Length);
+                       int prevCombiningClass = -1;
+                       for (int j = i + 1; j < length; j++) {
+                               int candidate = Fetch (sb, s, j);
+
+                               int combiningClass = GetCombiningClass (candidate);
+                               if (combiningClass == prevCombiningClass)
+                                       // An earlier mark with the same combining class did not
+                                       // combine, so this one is blocked.  Skip it, too.
                                        continue;
+
+                               int composed = TryCompose (comp_idx, starter, candidate);
+                               if (composed != 0) {
+                                       if (sb == null)
+                                               // Not normalized, and we are only checking.
+                                               return -1;
+
+                                       // Full Unicode warning: the (char) cast below assumes a BMP
+                                       // composite and will break when the underlying tables are extended.
+                                       sb [i] = (char) composed;
+                                       sb.Remove (j, 1);
+
+                                       return current;
                                }
-                               sb.Remove (i, len);
-                               sb.Insert (i, (char) ch); // always single character
-                               i = cur - 1; // apply recursively
+
+                               // Gray box.  We're done.
+                               if (combiningClass == 0)
+                                       return j + 1;
+
+                               prevCombiningClass = combiningClass;
                        }
+
+                       return length;
                }
 
-               static int GetPrimaryCompositeMapIndex (object o, int cur, int length, int bufferPos)
+               static int TryCompose (int i, int starter, int candidate)
                {
-                       if ((PropValue (cur) & FullCompositionExclusion) != 0)
-                               return 0;
-                       if (GetCombiningClass (cur) != 0)
-                               return 0; // not a starter
-                       return GetPrimaryCompositeCharIndex (o, bufferPos, length);
+                       while (mappedChars [i] == starter) {
+                               if (mappedChars [i + 1] == candidate &&
+                                   mappedChars [i + 2] == 0) {
+                                       int composed = GetPrimaryCompositeFromMapIndex (i);
+
+                                       if ((PropValue (composed) & FullCompositionExclusion) == 0)
+                                               return composed;
+                               }
+
+                               // Skip this entry.
+                               while (mappedChars [i] != 0)
+                                       i++;
+                               i++;
+                       }
+
+                       return 0;
                }
 
                static string Decompose (string source, int checkType)
@@ -197,10 +254,10 @@ namespace Mono.Globalization.Unicode
                        for (int i = 0; i < source.Length; i++)
                                if (QuickCheck (source [i], checkType) == NormalizationCheck.No)
                                        DecomposeChar (ref sb, ref buf, source,
-                                               i, ref start);
+                                               i, checkType, ref start);
                        if (sb != null)
                                sb.Append (source, start, source.Length - start);
-//                     ReorderCanonical (source, ref sb, 1);
+                       ReorderCanonical (source, ref sb, 1);
                }
 
                static void ReorderCanonical (string src, ref StringBuilder sb, int start)
@@ -213,7 +270,7 @@ namespace Mono.Globalization.Unicode
                                                continue;
                                        if (GetCombiningClass (src [i - 1]) > level) {
                                                sb = new StringBuilder (src.Length);
-                                               sb.Append (src, 0, i - 1);
+                                               sb.Append (src, 0, src.Length);
                                                ReorderCanonical (src, ref sb, i);
                                                return;
                                        }
@@ -221,31 +278,32 @@ namespace Mono.Globalization.Unicode
                                return;
                        }
                        // check only with sb
-                       for (int i = start; i < sb.Length; i++) {
+                       for (int i = start; i < sb.Length; ) {
                                int level = GetCombiningClass (sb [i]);
-                               if (level == 0)
+                               if (level == 0 || GetCombiningClass (sb [i - 1]) <= level) {
+                                       i++;
                                        continue;
-                               if (GetCombiningClass (sb [i - 1]) > level) {
-                                       char c = sb [i - 1];
-                                       sb [i - 1] = sb [i];
-                                       sb [i] = c;
-                                       i--; // apply recursively
                                }
+
+                               char c = sb [i - 1];
+                               sb [i - 1] = sb [i];
+                               sb [i] = c;
+                               // Apply recursively.
+                               if (i > 1)
+                                       i--;
                        }
                }
 
                static void DecomposeChar (ref StringBuilder sb,
-                       ref int [] buf, string s, int i, ref int start)
+                       ref int [] buf, string s, int i, int checkType, ref int start)
                {
                        if (sb == null)
                                sb = new StringBuilder (s.Length + 100);
                        sb.Append (s, start, i - start);
                        if (buf == null)
                                buf = new int [19];
-                       GetCanonical (s [i], buf, 0);
-                       for (int x = 0; ; x++) {
-                               if (buf [x] == 0)
-                                       break;
+                       int n = GetCanonical (s [i], buf, 0, checkType);
+                       for (int x = 0; x < n; x++) {
                                if (buf [x] < char.MaxValue)
                                        sb.Append ((char) buf [x]);
                                else { // surrogate
@@ -313,45 +371,125 @@ namespace Mono.Globalization.Unicode
                }
                */
 
-               public static void GetCanonical (int c, int [] buf, int bufIdx)
+               const int HangulSBase = 0xAC00, HangulLBase = 0x1100,
+                                 HangulVBase = 0x1161, HangulTBase = 0x11A7,
+                                 HangulLCount = 19, HangulVCount = 21, HangulTCount = 28,
+                                 HangulNCount = HangulVCount * HangulTCount,   // 588
+                                 HangulSCount = HangulLCount * HangulNCount;   // 11172
+
+               private static int GetCanonicalHangul (int s, int [] buf, int bufIdx)
                {
-                       for (int i = CharMapIdx (c); mappedChars [i] != 0; i++)
-                               buf [bufIdx++] = mappedChars [i];
+                       int idx = s - HangulSBase;
+                       if (idx < 0 || idx >= HangulSCount) {
+                               return bufIdx;
+                       }
+
+                       int L = HangulLBase + idx / HangulNCount;
+                       int V = HangulVBase + (idx % HangulNCount) / HangulTCount;
+                       int T = HangulTBase + idx % HangulTCount;
+
+                       buf [bufIdx++] = L;
+                       buf [bufIdx++] = V;
+                       if (T != HangulTBase) {
+                               buf [bufIdx++] = T;
+                       }
                        buf [bufIdx] = (char) 0;
+                       return bufIdx;
+               }
+
+               static int GetCanonical (int c, int [] buf, int bufIdx, int checkType)
+               {
+                       int newBufIdx = GetCanonicalHangul (c, buf, bufIdx);
+                       if (newBufIdx > bufIdx)
+                               return newBufIdx;
+                       int i = CharMapIdx (c);
+                       if (i == 0 || mappedChars [i] == c)
+                               buf [bufIdx++] = c;
+                       else {
+                               // Character c maps to one or more decomposed chars.
+                               for (; mappedChars [i] != 0; i++) {
+                                       int nth = mappedChars [i];
+
+                                       // http://www.unicode.org/reports/tr15/tr15-31.html, 1.3:
+                                       // Full decomposition involves recursive application of the
+                                       // Decomposition_Mapping values.  Note that QuickCheck does
+                                       // not currently support astral plane codepoints.
+                                       if (nth <= 0xffff && QuickCheck ((char)nth, checkType) == NormalizationCheck.Yes)
+                                               buf [bufIdx++] = nth;
+                                       else
+                                               bufIdx = GetCanonical (nth, buf, bufIdx, checkType);
+                               }
+                       }
+
+                       return bufIdx;
+               }
+
+               public static bool IsNormalized (string source, NormalizationForm normalizationForm)
+               {
+                       switch (normalizationForm) {
+                       default:
+                               return IsNormalized (source, 0);
+                       case NormalizationForm.FormD:
+                               return IsNormalized (source, 1);
+                       case NormalizationForm.FormKC:
+                               return IsNormalized (source, 2);
+                       case NormalizationForm.FormKD:
+                               return IsNormalized (source, 3);
+                       }
                }
 
                public static bool IsNormalized (string source, int type)
                {
-//                     int prevCC = -1;
-                       for (int i = 0; i < source.Length; i++) {
-//                             int cc = GetCombiningClass (source [i]);
-//                             if (cc != 0 && cc < prevCC)
-//                                     return false;
-//                             prevCC = cc;
+                       int prevCC = -1;
+                       for (int i = 0; i < source.Length; ) {
+                               int cc = GetCombiningClass (source [i]);
+                               if (cc != 0 && cc < prevCC)
+                                       return false;
+                               prevCC = cc;
+
                                switch (QuickCheck (source [i], type)) {
                                case NormalizationCheck.Yes:
+                                       i++;
                                        break;
                                case NormalizationCheck.No:
                                        return false;
                                case NormalizationCheck.Maybe:
-                                       // partly copied from Combine()
-                                       int cur = i;
-                                       // FIXME: It should check "blocked" too
-                                       for (;i >= 0; i--)
-                                               if (!CanBePrimaryComposite ((int) source [i]))
-                                                       break;
-                                       i++;
-                                       // Now i is the "starter"
-                                       for (; i < cur; i++) {
-                                               if (GetPrimaryCompositeCharIndex (source, i, source.Length) != 0)
-                                                       return false;
+                                       // for those forms with composition, it cannot be checked here
+                                       switch (type) {
+                                       case 0: // NFC
+                                       case 2: // NFKC
+                                               return source == Normalize (source, type);
                                        }
+                                       // go on...
+
+                                       i = CombineHangul (null, source, i > 0 ? i - 1 : i);
+                                       if (i < 0)
+                                               return false;
+
+                                       i = TryComposeWithPreviousStarter (null, source, i);
+                                       if (i < 0)
+                                               return false;
                                        break;
                                }
                        }
                        return true;
                }
 
+               public static string Normalize (string source, NormalizationForm normalizationForm)
+               {
+                       switch (normalizationForm) {
+                       default:
+                               return Normalization.Normalize (source, 0);
+                       case NormalizationForm.FormD:
+                               return Normalization.Normalize (source, 1);
+                       case NormalizationForm.FormKC:
+                               return Normalization.Normalize (source, 2);
+                       case NormalizationForm.FormKD:
+                               return Normalization.Normalize (source, 3);
+                       }
+               }
+
                public static string Normalize (string source, int type)
                {
                        switch (type) {
@@ -359,6 +497,7 @@ namespace Mono.Globalization.Unicode
                        case 2:
                                return Compose (source, type);
                        case 1:
+                       case 3:
                                return Decompose (source, type);
                        }
                }
@@ -406,26 +545,21 @@ namespace Mono.Globalization.Unicode
 
                [MethodImpl (MethodImplOptions.InternalCall)]
                static extern void load_normalization_resource (
-                       byte** props, byte** mappedChars, byte** charMapIndex,
-                       byte** helperIndex, byte** mapIdxToComposite,
-                       byte** combiningClass);
+                       out IntPtr props, out IntPtr mappedChars,
+                       out IntPtr charMapIndex, out IntPtr helperIndex,
+                       out IntPtr mapIdxToComposite, out IntPtr combiningClass);
 
                static Normalization ()
                {
+                       IntPtr p1, p2, p3, p4, p5, p6;
                        lock (forLock) {
-                       fixed (byte** addrProps = &props) {
-                       fixed (int** addrMappedChars = &mappedChars) {
-                       fixed (short** addrCharMapIndex = &charMapIndex) {
-                       fixed (short** addrHelperIndex = &helperIndex) {
-                       fixed (ushort** addrMapIdxToComposite = &mapIdxToComposite) {
-                       fixed (byte** addrCombiningClass = &combiningClass) {
-                               load_normalization_resource (addrProps,
-                               (byte**) addrMappedChars,
-                               (byte**) addrCharMapIndex,
-                               (byte**) addrHelperIndex,
-                               (byte**) addrMapIdxToComposite,
-                               (byte**) addrCombiningClass);
-                       } } } } } }
+                               load_normalization_resource (out p1, out p2, out p3, out p4, out p5, out p6);
+                               props = (byte*) p1;
+                               mappedChars = (int*) p2;
+                               charMapIndex = (short*) p3;
+                               helperIndex = (short*) p4;
+                               mapIdxToComposite = (ushort*) p5;
+                               combiningClass = (byte*) p6;
                        }
 
                        isReady = true;