return charMapIndex [NUtil.MapIdx (cp)];
}
- static int GetComposedStringLength (int ch)
+ static int GetNormalizedStringLength (int ch)
{
int start = charMapIndex [NUtil.MapIdx (ch)];
int i = start;
if (!CanBePrimaryComposite ((int) sb [i]))
break;
- int idx = 0;
+ int idx = 0; // index to mappedChars
for (; i < cur; i++) {
idx = GetPrimaryCompositeMapIndex (sb, (int) sb [i], i);
if (idx > 0)
i = cur;
continue;
}
- int ch = GetPrimaryCompositeFromMapIndex (idx);
- int len = GetComposedStringLength (ch);
- if (ch == 0 || len == 0)
+ int prim = GetPrimaryCompositeFromMapIndex (idx);
+ int len = GetNormalizedStringLength (prim);
+ if (prim == 0 || len == 0)
throw new SystemException ("Internal error: should not happen.");
int removed = 0;
- sb.Insert (i++, (char) ch); // always single character
+ sb.Insert (i++, (char) prim); // always single character
// handle blocked characters here.
while (removed < len) {
--- /dev/null
+* Normalization implementation notes
+
+** Basics
+
+ Unicode normalization is implemented as String.Normalize(), which
+ supports all of FormD, FormC, FormKD and FormKC.
+
+ FormD and FormKD decompose the input string.
+ FormC and FormKC combine the decomposed input string.
+
+ Mono's Unicode Normalization methods are implemented in
+ Mono.Globalization.Unicode.Normalization.
+
+*** Normalization array resources
+
+ The Normalization implementation involves a lot of array lookups.
+ The arrays mostly represent data from the UCD (Unicode Character
+ Database), which is essential to Unicode Normalization.
+
+ By default (in the release), the arrays are defined as C arrays and
+ then loaded via icalls (see the static constructor). They are defined
+ in normalization-table.h.
+
+ Alternatively, for debugging purposes, you can switch to managed array
+ lookup instead. The arrays are then defined in
+ NormalizationGenerated.cs.
+
+ Both the .h and the -Generated.cs files can be generated by running
+ create-normalization-source.exe, which reads the UCD and emits them.
+
+ There are 6 arrays in our implementation. Each one is listed below in
+ the form "type name [size]":
+
+ - byte props [char.MaxValue]:
+ Stores "properties" for each character, where the "properties"
+ are a dedicated set of properties for normalization as defined
+ in "DerivedNormalizationProps.txt".
+ It is used for quick checks (NF*_QC) etc.
+
+ - int mappedChars []:
+ Stores all the normalized strings in the mapping entries expanded
+ as an array of chars. Element at 0 is 0. Each of the strings is
+ NULL-terminated (ends with 0). The entries are sorted first in the
+ order of the primary composite (source) char, and second in the
+ order of the normalized string.
+
+ For example, if the length of the normalized string of the first
+ mapping entry is 2, then [1] holds the first character of the
+ normalized string of the first mapping entry. [2] holds the second
+ character of the normalized string of the first mapping entry.
+
+ - short charMapIndex [char.MaxValue]:
+ Stores the indexes to the mapping for each primary composite (source)
+ Unicode character. If there is no mapping for the character, then
+ the index value is 0.
+
+ Note that mapping information is not directly stored in any of the
+ arrays.
+
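+ example (simplified: the leading 0 element and the 0 terminators
+ of mappedChars are omitted here, so the indexes below are relative
+ to this shortened array):
+ mappedChars: [A1, A2, B1, C1, C2, D1, D2, D3, E1]
+ charMapIndex: [0, 2, 3, 5, 8]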
+
+ - short helperIndex [char.MaxValue]:
+ Stores the index to mappedChars of the first character of the
+ first entry of the normalized strings for each character (note
+ that it is *not* a map from the primary composite but from the head
+ of the normalized strings).
+ If there is no mapping for the character, then the value is 0.
+
+ - ushort mapIdxToComposite [maps.Length]:
+ Stores the primary composite (source) character for each mapping,
+ where the key is the index to mappedChars.
+ It is a "reversed" charMapIndex array (which is char-to-mapidx).
+
+ example: char src = (char) mapIdxToComposite [mapIdx];
+
+ - byte combiningClass [char.MaxValue]:
+ Stores the UCD CombiningClass value for each Unicode character.
+