2009-09-17 Atsushi Enomoto <atsushi@ximian.com>
author Atsushi Eno <atsushieno@gmail.com>
Fri, 18 Sep 2009 17:07:51 +0000 (17:07 -0000)
committer Atsushi Eno <atsushieno@gmail.com>
Fri, 18 Sep 2009 17:07:51 +0000 (17:07 -0000)
* Normalization.cs : some renaming for disambiguation.
* NormalizationTableUtil.cs : fix some wrong ranges in
  mapIdxToComposite. This fixes some Arabic normalization (and more).
* normalization-notes.txt : added some notes on the implementation.

svn path=/trunk/mcs/; revision=142211

mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog
mcs/class/corlib/Mono.Globalization.Unicode/Normalization.cs
mcs/class/corlib/Mono.Globalization.Unicode/NormalizationTableUtil.cs
mcs/class/corlib/Mono.Globalization.Unicode/normalization-notes.txt [new file with mode: 0644]

index 1c7b2f7fb3bd8202807f3c3b61df5a1c33c763e8..79166d88b2026b568d01bd557e4480aad343bce6 100644 (file)
@@ -1,3 +1,10 @@
+2009-09-17  Atsushi Enomoto  <atsushi@ximian.com>
+
+       * Normalization.cs : some renaming for disambiguation.
+       * NormalizationTableUtil.cs : fix some wrong ranges in
+         mapIdxToComposite. This fixes some Arabic normalization (and more).
+       * normalization-notes.txt : added some notes on the implementation.
+
 2008-06-19  Atsushi Enomoto  <atsushi@ximian.com>
 
        * Normalization.cs :
index 2da5ef144e9beca01f39710c0505ce188094e34b..343b91e1fd087b1dd1de347e6863a4096ee00065 100644 (file)
@@ -38,7 +38,7 @@ namespace Mono.Globalization.Unicode
                        return charMapIndex [NUtil.MapIdx (cp)];
                }
 
-               static int GetComposedStringLength (int ch)
+               static int GetNormalizedStringLength (int ch)
                {
                        int start = charMapIndex [NUtil.MapIdx (ch)];
                        int i = start;
@@ -157,7 +157,7 @@ namespace Mono.Globalization.Unicode
                                        if (!CanBePrimaryComposite ((int) sb [i]))
                                                break;
 
-                               int idx = 0;
+                               int idx = 0; // index to mappedChars
                                for (; i < cur; i++) {
                                        idx = GetPrimaryCompositeMapIndex (sb, (int) sb [i], i);
                                        if (idx > 0)
@@ -167,12 +167,12 @@ namespace Mono.Globalization.Unicode
                                        i = cur;
                                        continue;
                                }
-                               int ch = GetPrimaryCompositeFromMapIndex (idx);
-                               int len = GetComposedStringLength (ch);
-                               if (ch == 0 || len == 0)
+                               int prim = GetPrimaryCompositeFromMapIndex (idx);
+                               int len = GetNormalizedStringLength (prim);
+                               if (prim == 0 || len == 0)
                                        throw new SystemException ("Internal error: should not happen.");
                                int removed = 0;
-                               sb.Insert (i++, (char) ch); // always single character
+                               sb.Insert (i++, (char) prim); // always single character
 
                                // handle blocked characters here.
                                while (removed < len) {
index 38e47f94fdfc813cce2e7469fafb27658ab9b21a..cde020a41b92ddf822b997c44ca1216c645a7c31 100644 (file)
@@ -55,10 +55,10 @@ namespace Mono.Globalization.Unicode
                        // since mapToCompositeIndex only holds canonical
                        // mappings, those indexes could be still shorten.
                        int [] compositeStarts = new int [] {
-                               0x480, 0x1450, 0x16D0
+                               0x480, 0x1410, 0x1670
                                };
                        int [] compositeEnds = new int [] {
-                               0x10C0, 0x15D0, 0x2190
+                               0x1080, 0x1580, 0x21B0
                                };
                        int [] helperStarts = new int [] {
                                0, 0x900, 0x1D00, 0x2500, 0x3000, 0x3B90,
diff --git a/mcs/class/corlib/Mono.Globalization.Unicode/normalization-notes.txt b/mcs/class/corlib/Mono.Globalization.Unicode/normalization-notes.txt
new file mode 100644 (file)
index 0000000..4421352
--- /dev/null
@@ -0,0 +1,79 @@
+* Normalization implementation notes
+
+** Basics
+
+       Unicode normalization is implemented as String.Normalize(), which
+       supports all of FormD, FormC, FormKD and FormKC.
+
+       FormD and FormKD decompose the input string.
+       FormC and FormKC combine the decomposed input string.
+
+       Mono's Unicode Normalization methods are implemented in
+       Mono.Globalization.Unicode.Normalization.
+
+*** Normalization array resources
+
+       The Normalization implementation involves a lot of array lookups.
+       The arrays mostly represent data from the UCD (Unicode Character
+       Database), which is essential to Unicode Normalization.
+
+       By default (in the release), the arrays are defined as C arrays in
+       normalization-table.h and then loaded via icalls (see the static
+       constructor).
+
+       Alternatively, for debugging purposes, you can switch to managed
+       array lookup instead. The arrays are then defined in
+       NormalizationGenerated.cs.
+
+       Both .h and -Generated.cs files can be generated by running
+       create-normalization-source.exe, which reads UCD and emits them.
+
+       There are 6 arrays in our implementation. Each is listed below with
+       its size in brackets:
+
+       - byte props [char.MaxValue]:
+         Stores "properties" for each character, where the "properties"
+         are a dedicated set of the properties for normalization as defined
+         in "DerivedNormalizationProps.txt".
+         It is used for quick check (NF*_QC) etc.
+
+       - int mappedChars []:
+         Stores all the normalized strings in the mapping entries expanded
+         as an array of chars. Element at 0 is 0. Each of the strings is
+         NULL-terminated (ends with 0). The entries are sorted first in the
+         order of the primary composite (source) char, and second in the
+         order of the normalized string.
+
+         For example, if the length of the normalized string of the first
+         mapping entry is 2, then [1] holds the first character of the
+         normalized string of the first mapping entry. [2] holds the second
+         character of the normalized string of the first mapping entry.
+
+       - short charMapIndex [char.MaxValue]:
+         Stores the indexes to the mapping for each primary composite (source)
+         Unicode character. If there is no mapping for the character, then
+         the index value is 0.
+
+         Note that mapping information is not directly stored in any of the
+         arrays.
+
+         example:
+                 mappedChars: [A1, A2, B1, C1, C2, D1, D2, D3, E1]
+                 charMapIndex: [0, 2, 3, 5, 8]
+
+       - short helperIndex [char.MaxValue]:
+         Stores the index to mappedChars of the first character of the
+         first entry of the normalized strings for each character (note
+         that it is *not* a map from the primary composite but from the
+         head of the normalized strings).
+         If there is no mapping for the character, then the index value
+         is 0.
+
+       - ushort mapIdxToComposite [maps.Length]:
+         Stores the primary composite (source) character for each mapping,
+         where the key is the index to mappedChars.
+         It is a "reversed" charMapIndex array (which is char-to-mapidx).
+
+         example: char src = (char) mapIdxToComposite [mapIdx];
+
+       - byte combiningClass [char.MaxValue]:
+         Stores the UCD CombiningClass value for each Unicode character.
+