From: Atsushi Eno <atsushieno@gmail.com>
Date: Fri, 18 Sep 2009 17:07:51 +0000 (-0000)
Subject: 2009-09-17  Atsushi Enomoto  <atsushi@ximian.com>
X-Git-Url: http://wien.tomnetworks.com/gitweb/?a=commitdiff_plain;h=09acefa4596526d1a14b0182888d5d3ded291b5f;p=mono.git

2009-09-17  Atsushi Enomoto  <atsushi@ximian.com>

	* Normalization.cs : some renaming for disambiguation.
	* NormalizationTableUtil.cs : fix some wrong ranges in
	  mapIdxToComposite. This fixes some Arabic normalization (and more).
	* normalization-notes.txt : added some notes on the implementation.


svn path=/trunk/mcs/; revision=142211
---

diff --git a/mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog b/mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog
index 1c7b2f7fb3b..79166d88b20 100644
--- a/mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog
+++ b/mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog
@@ -1,3 +1,10 @@
+2009-09-17  Atsushi Enomoto  <atsushi@ximian.com>
+
+	* Normalization.cs : some renaming for disambiguation.
+	* NormalizationTableUtil.cs : fix some wrong ranges in
+	  mapIdxToComposite. This fixes some Arabic normalization (and more).
+	* normalization-notes.txt : added some notes on the implementation.
+
 2008-06-19  Atsushi Enomoto  <atsushi@ximian.com>
 
 	* Normalization.cs :
diff --git a/mcs/class/corlib/Mono.Globalization.Unicode/Normalization.cs b/mcs/class/corlib/Mono.Globalization.Unicode/Normalization.cs
index 2da5ef144e9..343b91e1fd0 100644
--- a/mcs/class/corlib/Mono.Globalization.Unicode/Normalization.cs
+++ b/mcs/class/corlib/Mono.Globalization.Unicode/Normalization.cs
@@ -38,7 +38,7 @@ namespace Mono.Globalization.Unicode
 			return charMapIndex [NUtil.MapIdx (cp)];
 		}
 
-		static int GetComposedStringLength (int ch)
+		static int GetNormalizedStringLength (int ch)
 		{
 			int start = charMapIndex [NUtil.MapIdx (ch)];
 			int i = start;
@@ -157,7 +157,7 @@ namespace Mono.Globalization.Unicode
 					if (!CanBePrimaryComposite ((int) sb [i]))
 						break;
 
-				int idx = 0;
+				int idx = 0; // index to mappedChars
 				for (; i < cur; i++) {
 					idx = GetPrimaryCompositeMapIndex (sb, (int) sb [i], i);
 					if (idx > 0)
@@ -167,12 +167,12 @@ namespace Mono.Globalization.Unicode
 					i = cur;
 					continue;
 				}
-				int ch = GetPrimaryCompositeFromMapIndex (idx);
-				int len = GetComposedStringLength (ch);
-				if (ch == 0 || len == 0)
+				int prim = GetPrimaryCompositeFromMapIndex (idx);
+				int len = GetNormalizedStringLength (prim);
+				if (prim == 0 || len == 0)
 					throw new SystemException ("Internal error: should not happen.");
 				int removed = 0;
-				sb.Insert (i++, (char) ch); // always single character
+				sb.Insert (i++, (char) prim); // always single character
 
 				// handle blocked characters here.
 				while (removed < len) {
diff --git a/mcs/class/corlib/Mono.Globalization.Unicode/NormalizationTableUtil.cs b/mcs/class/corlib/Mono.Globalization.Unicode/NormalizationTableUtil.cs
index 38e47f94fdf..cde020a41b9 100644
--- a/mcs/class/corlib/Mono.Globalization.Unicode/NormalizationTableUtil.cs
+++ b/mcs/class/corlib/Mono.Globalization.Unicode/NormalizationTableUtil.cs
@@ -55,10 +55,10 @@ namespace Mono.Globalization.Unicode
 			// since mapToCompositeIndex only holds canonical
 			// mappings, those indexes could be still shorten.
 			int [] compositeStarts = new int [] {
-				0x480, 0x1450, 0x16D0
+				0x480, 0x1410, 0x1670
 				};
 			int [] compositeEnds = new int [] {
-				0x10C0, 0x15D0, 0x2190
+				0x1080, 0x1580, 0x21B0
 				};
 			int [] helperStarts = new int [] {
 				0, 0x900, 0x1D00, 0x2500, 0x3000, 0x3B90,
diff --git a/mcs/class/corlib/Mono.Globalization.Unicode/normalization-notes.txt b/mcs/class/corlib/Mono.Globalization.Unicode/normalization-notes.txt
new file mode 100644
index 00000000000..44213525afd
--- /dev/null
+++ b/mcs/class/corlib/Mono.Globalization.Unicode/normalization-notes.txt
@@ -0,0 +1,79 @@
+* Normalization implementation notes
+
+** Basics
+
+	Unicode normalization is implemented as String.Normalize(), which
+	supports all of FormD, FormC, FormKD and FormKC.
+
+	FormD and FormKD decompose the input string.
+	FormC and FormKC combine the decomposed input string.
+
+	Mono's Unicode Normalization methods are implemented in
+	Mono.Globalization.Unicode.Normalization.
+
+*** Normalization array resources
+
+	The Normalization implementation involves a lot of array lookups.
+	The arrays mostly represent the UCD (Unicode Character Database),
+	which is essential to Unicode Normalization.
+
+	By default (in the release), the arrays are defined as C arrays
+	(in normalization-table.h) and then loaded via icalls (see the
+	static constructor).
+
+	Alternatively, for debugging purposes, you can switch to managed array
+	lookup instead. The arrays are then defined in
+	NormalizationGenerated.cs.
+
+	Both .h and -Generated.cs files can be generated by running
+	create-normalization-source.exe, which reads UCD and emits them.
+
+	There are 6 arrays in our implementation, listed as "type name [size]":
+
+	- byte props [char.MaxValue]:
+	  Stores "properties" for each character, where the "properties"
+	  are the dedicated set of properties for normalization as defined
+	  in "DerivedNormalizationProps.txt".
+	  It is used for quick check (NF*_QC) etc.
+
+	- int mappedChars []:
+	  Stores all the normalized strings in the mapping entries expanded
+	  as an array of chars. Element at 0 is 0. Each of the strings is
+	  NULL-terminated (ends with 0). The entries are sorted first in the
+	  order of the primary composite (source) char, and second in the
+	  order of the normalized string.
+
+	  For example, if the length of the normalized string of the first
+	  mapping entry is 2, then [1] holds the first character of the
+	  normalized string of the first mapping entry. [2] holds the second
+	  character of the normalized string of the first mapping entry.
+
+	- short charMapIndex [char.MaxValue]:
+	  Stores the indexes to the mapping for each primary composite (source)
+	  Unicode character. If there is no mapping for the character, then
+	  the index value is 0.
+
+	  Note that mapping information is not directly stored in any of the
+	  arrays.
+
+	  example:
+		  mappedChars: [A1, A2, B1, C1, C2, D1, D2, D3, E1]
+		  charMapIndex: [0, 2, 3, 5, 8]
+
+	- short helperIndex [char.MaxValue]
+	  Stores the index to mappedChars of the first character of the
+	  first entry of the normalized strings for each character (note
+	  that it is *not* a map from the primary composite but from the
+	  head of the normalized strings).
+	  If there is no mapping for the character, then 0 is returned.
+
+	- ushort mapIdxToComposite [maps.Length]:
+	  Stores the primary composite (source) character for each mapping,
+	  where the key is the index to mappedChars.
+	  It is a "reversed" charMapIndex array (which is char-to-mapidx).
+
+	  example: char src = (char) mapIdxToComposite [mapIdx];
+
+	- byte combiningClass [char.MaxValue]:
+	  Stores the UCD CombiningClass value for each Unicode character.
+