From: Atsushi Eno Date: Fri, 18 Sep 2009 17:07:51 +0000 (-0000) Subject: 2009-09-17 Atsushi Enomoto X-Git-Url: http://wien.tomnetworks.com/gitweb/?a=commitdiff_plain;h=09acefa4596526d1a14b0182888d5d3ded291b5f;p=mono.git 2009-09-17 Atsushi Enomoto * Normalization.cs : some renaming for disambiguation. * NormalizationTableUtil.cs : fix some wrong ranges in mapIdxToComposite. This fixes some Arabic normalization (and more). * normalization-notes.txt : added some notes on the implementation. svn path=/trunk/mcs/; revision=142211 --- diff --git a/mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog b/mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog index 1c7b2f7fb3b..79166d88b20 100644 --- a/mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog +++ b/mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog @@ -1,3 +1,10 @@ +2009-09-17 Atsushi Enomoto + + * Normalization.cs : some renaming for disambiguation. + * NormalizationTableUtil.cs : fix some wrong ranges in + mapIdxToComposite. This fixes some Arabic normalization (and more). + * normalization-notes.txt : added some notes on the implementation. 
+ 2008-06-19 Atsushi Enomoto * Normalization.cs : diff --git a/mcs/class/corlib/Mono.Globalization.Unicode/Normalization.cs b/mcs/class/corlib/Mono.Globalization.Unicode/Normalization.cs index 2da5ef144e9..343b91e1fd0 100644 --- a/mcs/class/corlib/Mono.Globalization.Unicode/Normalization.cs +++ b/mcs/class/corlib/Mono.Globalization.Unicode/Normalization.cs @@ -38,7 +38,7 @@ namespace Mono.Globalization.Unicode return charMapIndex [NUtil.MapIdx (cp)]; } - static int GetComposedStringLength (int ch) + static int GetNormalizedStringLength (int ch) { int start = charMapIndex [NUtil.MapIdx (ch)]; int i = start; @@ -157,7 +157,7 @@ namespace Mono.Globalization.Unicode if (!CanBePrimaryComposite ((int) sb [i])) break; - int idx = 0; + int idx = 0; // index to mappedChars for (; i < cur; i++) { idx = GetPrimaryCompositeMapIndex (sb, (int) sb [i], i); if (idx > 0) @@ -167,12 +167,12 @@ namespace Mono.Globalization.Unicode i = cur; continue; } - int ch = GetPrimaryCompositeFromMapIndex (idx); - int len = GetComposedStringLength (ch); - if (ch == 0 || len == 0) + int prim = GetPrimaryCompositeFromMapIndex (idx); + int len = GetNormalizedStringLength (prim); + if (prim == 0 || len == 0) throw new SystemException ("Internal error: should not happen."); int removed = 0; - sb.Insert (i++, (char) ch); // always single character + sb.Insert (i++, (char) prim); // always single character // handle blocked characters here. while (removed < len) { diff --git a/mcs/class/corlib/Mono.Globalization.Unicode/NormalizationTableUtil.cs b/mcs/class/corlib/Mono.Globalization.Unicode/NormalizationTableUtil.cs index 38e47f94fdf..cde020a41b9 100644 --- a/mcs/class/corlib/Mono.Globalization.Unicode/NormalizationTableUtil.cs +++ b/mcs/class/corlib/Mono.Globalization.Unicode/NormalizationTableUtil.cs @@ -55,10 +55,10 @@ namespace Mono.Globalization.Unicode // since mapToCompositeIndex only holds canonical // mappings, those indexes could be still shorten. 
int [] compositeStarts = new int [] { - 0x480, 0x1450, 0x16D0 + 0x480, 0x1410, 0x1670 }; int [] compositeEnds = new int [] { - 0x10C0, 0x15D0, 0x2190 + 0x1080, 0x1580, 0x21B0 }; int [] helperStarts = new int [] { 0, 0x900, 0x1D00, 0x2500, 0x3000, 0x3B90, diff --git a/mcs/class/corlib/Mono.Globalization.Unicode/normalization-notes.txt b/mcs/class/corlib/Mono.Globalization.Unicode/normalization-notes.txt new file mode 100644 index 00000000000..44213525afd --- /dev/null +++ b/mcs/class/corlib/Mono.Globalization.Unicode/normalization-notes.txt @@ -0,0 +1,79 @@ +* Normalization implementation notes + +** Basics + + Unicode normalization is implemented as String.Normalize(), which + supports all of FormD, FormC, FormKD and FormKC. + + FormD and FormKD decompose the input string. + FormC and FormKC combine the decomposed input string. + + Mono's Unicode Normalization methods are implemented in + Mono.Globalization.Unicode.Normalization. + +*** Normalization array resources + + The Normalization implementation involves a lot of array lookups, + most of which represent UCD (Unicode Character Database) data that is + essential to Unicode Normalization. + + By default (in the release), the arrays are defined as C arrays and + then loaded via icalls (see the static constructor). They are defined in + normalization-table.h. + + Alternatively, for debugging purposes, you can switch to managed array + lookup instead. The arrays are then defined in + NormalizationGenerated.cs. + + Both .h and -Generated.cs files can be generated by running + create-normalization-source.exe, which reads UCD and emits them. + + There are 6 arrays in our implementation. Each array is of [size]: + + - byte props [char.MaxValue]: + Stores "properties" for each character, where the "properties" + are a dedicated set of properties for normalization as defined + in "DerivedNormalizationProps.txt". + It is used for quick checks (NF*_QC) etc. 
+ + - int mappedChars []: + Stores all the normalized strings in the mapping entries expanded + as an array of chars. Element at 0 is 0. Each of the strings is + NULL-terminated (ends with 0). The entries are sorted first in the + order of the primary composite (source) char, and second in the + order of the normalized string. + + For example, if the length of the normalized string of the first + mapping entry is 2, then [1] holds the first character of the + normalized string of the first mapping entry. [2] holds the second + character of the normalized string of the first mapping entry. + + - short charMapIndex [char.MaxValue]: + Stores the indexes to the mapping for each primary composite (source) + Unicode character. If there is no mapping for the character, then + the index value is 0. + + Note that mapping information is not directly stored in any of the + arrays. + + example: + mappedChars: [A1, A2, B1, C1, C2, D1, D2, D3, E1] + charMapIndex: [0, 2, 3, 5, 8] + + - short helperIndex [char.MaxValue]: + Stores the index to mappedChars of the first character of the + first entry of the normalized strings for each character (note + that it is *not* a map from the primary composite but from the head of + the normalized strings). + If there is no mapping for the character, then 0 is returned. + + - ushort mapIdxToComposite [maps.Length]: + Stores the primary composite (source) character for each mapping, + where the key is the index to mappedChars. + It is a "reversed" charMapIndex array (which is char-to-mapidx). + + example: char src = (char) mapIdxToComposite [mapIdx]; + + - byte combiningClass [char.MaxValue]: + Stores the UCD CombiningClass value for each Unicode character. +