2009-09-18 Atsushi Enomoto <atsushi@ximian.com>
author Atsushi Eno <atsushieno@gmail.com>
Sat, 19 Sep 2009 04:37:12 +0000 (04:37 -0000)
committer Atsushi Eno <atsushieno@gmail.com>
Sat, 19 Sep 2009 04:37:12 +0000 (04:37 -0000)
* Normalization.cs : Handle blocked characters which are not
  immediately next to the primary composite character. This fixes
  some Arabic string sequence normalization.
* Makefile : fix test build.

* StringTest.cs : add more normalization tests.

svn path=/trunk/mcs/; revision=142249

mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog
mcs/class/corlib/Mono.Globalization.Unicode/Makefile
mcs/class/corlib/Mono.Globalization.Unicode/Normalization.cs
mcs/class/corlib/Test/System/ChangeLog
mcs/class/corlib/Test/System/StringTest.cs

index 79166d88b2026b568d01bd557e4480aad343bce6..9f55f9d4da25691acd221a91e908feb27af63555 100644 (file)
@@ -1,3 +1,10 @@
+2009-09-18  Atsushi Enomoto  <atsushi@ximian.com>
+
+       * Normalization.cs : Handle blocked characters which are not
+         immediately next to the primary composite character. This fixes
+         some Arabic string sequence normalization.
+       * Makefile : fix test build.
+
 2009-09-17  Atsushi Enomoto  <atsushi@ximian.com>
 
        * Normalization.cs : some renaming for disambiguation.
index 021a653b973616985fd16dfb2df033637def6a5c..2c6cddbda1ff765cd042772d4a60d5f698906adb 100644 (file)
@@ -146,7 +146,7 @@ $(UCA_DATA) :
        mv core.zip common downloaded
        touch core.zip
 
-$(NORM_TEST) :
+$(NORM_TEST) : downloaded/NormalizationTest.txt
        wget wget http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt
        mv NormalizationTest.txt downloaded/
        touch downloaded/NormalizationTest.txt
index 343b91e1fd087b1dd1de347e6863a4096ee00065..9f58ea84ddabba25596e93c60e2d5f6171991082 100644 (file)
@@ -73,7 +73,11 @@ namespace Mono.Globalization.Unicode
                        if (idx == 0)
                                return 0;
                        while (mappedChars [idx] == startCh) {
+                               int prevCB = 0;
+                               int combiningClass = 0;
                                for (int i = 1, j = 1; ; i++, j++) {
+                                       prevCB = combiningClass;
+
                                        if (mappedChars [idx + i] == 0)
                                                // matched
                                                return idx;
@@ -82,26 +86,31 @@ namespace Mono.Globalization.Unicode
 
                                        // handle blocked characters here.
                                        char curCh;
-                                       int combiningClass;
-                                       int nextCB = 0;
+                                       bool match = false;
                                        do {
                                                curCh = s != null ?
                                                        s [start + j] :
                                                        sb [start + j];
                                                combiningClass = GetCombiningClass (curCh);
-                                               if (++j + start >= charsLength ||
-                                                       combiningClass == 0)
+                                               if (mappedChars [idx + i] == curCh) {
+                                                       match = true;
                                                        break;
-                                               nextCB = GetCombiningClass (
-                                                       s != null ?
-                                                       s [start + j] :
-                                                       sb [start + j]);
-                                       } while (nextCB > 0 && combiningClass >= nextCB);
-                                       j--;
-                                       if (mappedChars [idx + i] == curCh)
-                                               continue;
-                                       if (mappedChars [idx + i] > curCh)
-                                               return 0; // no match
+                                               }
+                                               if (combiningClass < prevCB) // blocked. Give up this map entry.
+                                                       break;
+                                               if (++j + start >= charsLength || combiningClass == 0)
+                                                       break;
+                                       } while (true);
+
+                                       if (match)
+                                               continue; // check next character in the current map entry string.
+                                       if (prevCB < combiningClass) {
+                                               j--;
+                                               if (mappedChars [idx + i] == curCh)
+                                                       continue;
+                                               //if (mappedChars [idx + i] > curCh)
+                                               //      return 0; // no match
+                                       }
                                        // otherwise move idx to next item
                                        while (mappedChars [i] != 0)
                                                i++;
@@ -154,7 +163,7 @@ namespace Mono.Globalization.Unicode
                                int cur = i;
                                // FIXME: It should check "blocked" too
                                for (;i > 0; i--) // this loop does not check sb[0], but regardless of the condition below it should not go under 0.
-                                       if (!CanBePrimaryComposite ((int) sb [i]))
+                                       if (GetCombiningClass ((int) sb [i]) == 0)
                                                break;
 
                                int idx = 0; // index to mappedChars
@@ -167,29 +176,25 @@ namespace Mono.Globalization.Unicode
                                        i = cur;
                                        continue;
                                }
+
                                int prim = GetPrimaryCompositeFromMapIndex (idx);
                                int len = GetNormalizedStringLength (prim);
                                if (prim == 0 || len == 0)
-                                       throw new SystemException ("Internal error: should not happen.");
+                                       throw new SystemException ("Internal error: should not happen. Input: " + sb);
                                int removed = 0;
                                sb.Insert (i++, (char) prim); // always single character
 
                                // handle blocked characters here.
                                while (removed < len) {
-                                       if (i + 1 < sb.Length) {
-                                               int cb = GetCombiningClass (sb [i]);
-                                               if (cb > 0) {
-                                                       int next = GetCombiningClass (sb [i + 1]);
-                                                       if (next != 0 && cb >= next) {
-                                                               i++;
-                                                               continue;
-                                                       }
-                                               }
+                                       if (sb [i] == mappedChars [idx + removed]) {
+                                               sb.Remove (i, 1);
+                                               removed++;
+                                               // otherwise, skip it.
                                        }
-                                       sb.Remove (i, 1);
-                                       removed++;
+                                       else
+                                               i++;
                                }
-                               i = cur - 1; // apply recursively
+                               i = cur - 1;
                        }
                }
 
@@ -364,11 +369,10 @@ namespace Mono.Globalization.Unicode
                                        
                                        // partly copied from Combine()
                                        int cur = i;
-                                       // FIXME: It should check "blocked" too
-                                       for (;i >= 0; i--)
-                                               if (!CanBePrimaryComposite ((int) source [i]))
+                                       for (;i > 0; i--) // this loop does not check sb[0], but regardless of the condition below it should not go under 0.
+                                               if (GetCombiningClass ((int) source [i]) == 0)
                                                        break;
-                                       i++;
+                                       //i++;
                                        // Now i is the "starter"
                                        for (; i < cur; i++) {
                                                if (GetPrimaryCompositeCharIndex (source, i) != 0)
index 0f473177210a59cf2e435283cd854066ecc835f7..e63cb0f9a97c4054a09194233c95f69782265588 100644 (file)
@@ -1,3 +1,7 @@
+2009-09-18  Atsushi Enomoto  <atsushi@ximian.com>
+
+       * StringTest.cs : add more normalization tests.
+
 2009-08-26  Sebastien Pouliot  <sebastien@ximian.com>
 
        * TimeSpanTest.cs: Add test case where seconds are not specified.
index 681295bcfe25adecb7da396711b9cdd17de8c13d..20047644bee5e92720bfd899285b7346442a51fc 100644 (file)
@@ -4228,6 +4228,22 @@ public class StringTest
                Assert.AreEqual (s2, s1.Normalize (NormalizationForm.FormC), "#1");
                Assert.AreEqual (s2, s1.Normalize (NormalizationForm.FormKC), "#2");
        }
+
+       [Test]
+       public void Normalize3 () // Normalizes one Arabic character sequence into all four forms and compares each result with a hard-coded expectation.
+       {
+               var s = new string (new char [] { '\u064A', '\u064F', '\u0648', '\u0654', '\u0652', '\u064A', '\u064F', '\u0648', '\u0654' }); // input: U+0648 U+0654 occurs twice; the first occurrence is followed by U+0652
+
+               var formC = new string (new char [] { '\u064A', '\u064F', '\u0624', '\u0652', '\u064a', '\u064f', '\u0624' }); // expected FormC: each U+0648/U+0654 pair is replaced by U+0624, U+0652 is kept
+               var formD = new string (new char [] { '\u064A', '\u064F', '\u0648', '\u0652', '\u0654', '\u064a', '\u064f', '\u0648', '\u0654' }); // expected FormD: same code points as the input, but U+0652 now precedes the first U+0654
+               var formKC = new string (new char [] { '\u064A', '\u064F', '\u0624', '\u0652', '\u064a', '\u064f', '\u0624' }); // expected FormKC: same code points as formC
+               var formKD = new string (new char [] { '\u064A', '\u064F', '\u0648', '\u0652', '\u0654', '\u064a', '\u064f', '\u0648', '\u0654' }); // expected FormKD: same code points as formD
+
+               Assert.AreEqual (formD, s.Normalize (NormalizationForm.FormD), "#1"); // the "#n" message identifies which form's assertion failed
+               Assert.AreEqual (formC, s.Normalize (NormalizationForm.FormC), "#2"); // in the FormD expectation above U+0652 sits between U+0648 and U+0654, so this presumably exercises the non-adjacent composition case - confirm against Normalization.cs
+               Assert.AreEqual (formKD, s.Normalize (NormalizationForm.FormKD), "#3");
+               Assert.AreEqual (formKC, s.Normalize (NormalizationForm.FormKC), "#4");
+       }
 #endif
 }