Merge pull request #1659 from alexanderkyte/stringbuilder-referencesource
[mono.git] / mcs / class / corlib / System.Globalization / TextInfo.cs
1 //
2 // System.Globalization.TextInfo.cs
3 //
4 // Authors:
5 //      Dick Porter (dick@ximian.com)
6 //      Duncan Mak (duncan@ximian.com)
7 //      Atsushi Enomoto (atsushi@ximian.com)
8 //      Sebastien Pouliot  <sebastien@ximian.com>
9 //
10 // (C) 2002 Ximian, Inc.
11 // (C) 2005 Novell, Inc.
12 //
13 // TODO:
14 //   Missing the various code page mappings.
15 //   Missing the OnDeserialization implementation.
16 //
17 // Copyright (C) 2004, 2005 Novell, Inc (http://www.novell.com)
18 //
19 // Permission is hereby granted, free of charge, to any person obtaining
20 // a copy of this software and associated documentation files (the
21 // "Software"), to deal in the Software without restriction, including
22 // without limitation the rights to use, copy, modify, merge, publish,
23 // distribute, sublicense, and/or sell copies of the Software, and to
24 // permit persons to whom the Software is furnished to do so, subject to
25 // the following conditions:
26 // 
27 // The above copyright notice and this permission notice shall be
28 // included in all copies or substantial portions of the Software.
29 // 
30 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
31 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
32 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
33 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
34 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
35 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
36 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
37 //
38
39 using System.Runtime.CompilerServices;
40 using System.Runtime.Serialization;
41 using System.Runtime.InteropServices;
42 using System.Text;
43 using System.Diagnostics.Contracts;
44
45 namespace System.Globalization {
46
47         [Serializable]
48         [ComVisible (true)]
49         [MonoTODO ("IDeserializationCallback isn't implemented.")]
50         public class TextInfo: IDeserializationCallback, ICloneable
51         {
52                 static TextInfo ()
53                 {
54                         unsafe {
55                                 GetDataTablePointersLite (out to_lower_data_low, out to_lower_data_high, out to_upper_data_low, out to_upper_data_high);
56                         }
57                 }
58                 
59                 private readonly unsafe static ushort *to_lower_data_low;
60                 private readonly unsafe static ushort *to_lower_data_high;
61                 private readonly unsafe static ushort *to_upper_data_low;
62                 private readonly unsafe static ushort *to_upper_data_high;
63                 [MethodImplAttribute(System.Runtime.CompilerServices.MethodImplOptions.InternalCall)]
64                 private unsafe static extern void GetDataTablePointersLite (out ushort *to_lower_data_low, out ushort *to_lower_data_high,
65                         out ushort *to_upper_data_low, out ushort *to_upper_data_high);
66
67                 static char ToLowerInvariant (char c)
68                 {
69                         unsafe {
70                                 if (c <= ((char)0x24cf))
71                                         return (char) to_lower_data_low [c];
72                                 if (c >= ((char)0xff21))
73                                         return (char) to_lower_data_high[c - 0xff21];
74                         }
75                         return c;
76                 }
77
78                 static char ToUpperInvariant (char c)
79                 {
80                         unsafe {
81                                 if (c <= ((char)0x24e9))
82                                         return (char) to_upper_data_low [c];
83                                 if (c >= ((char)0xff21))
84                                         return (char) to_upper_data_high [c - 0xff21];
85                         }
86                         return c;
87                 }
88                 
89                 [StructLayout (LayoutKind.Sequential)]
90                 struct Data {
91                         public int ansi;
92                         public int ebcdic;
93                         public int mac;
94                         public int oem;
95                         public bool right_to_left;
96                         public byte list_sep;
97                 }
98
99                 string m_listSeparator;
100                 bool m_isReadOnly;
101                 string customCultureName;
102
103 #pragma warning disable 169
104                 [NonSerialized]
105                 int m_nDataItem;
106                 bool m_useUserOverride;
107 #pragma warning restore 169             
108
109                 int m_win32LangID;
110
111                 [NonSerialized]
112                 readonly CultureInfo ci;
113
114                 [NonSerialized]
115                 readonly bool handleDotI;
116
117                 [NonSerialized]
118                 readonly Data data;
119
120                 internal unsafe TextInfo (CultureInfo ci, int lcid, void* data, bool read_only)
121                 {
122                         this.m_isReadOnly = read_only;
123                         this.m_win32LangID = lcid;
124                         this.ci = ci;
125                         if (data != null)
126                                 this.data = *(Data*) data;
127                         else {
128                                 this.data = new Data ();
129                                 this.data.list_sep = (byte) ',';
130                         }
131
132                         CultureInfo tmp = ci;
133                         while (tmp.Parent != null && tmp.Parent.LCID != 0x7F && tmp.Parent != tmp)
134                                 tmp = tmp.Parent;
135
136                         if (tmp != null) {
137                                 switch (tmp.LCID) {
138                                 case 44: // Azeri (az)
139                                 case 31: // Turkish (tr)
140                                         handleDotI = true;
141                                         break;
142                                 }
143                         }
144                 }
145
146                 private TextInfo (TextInfo textInfo)
147                 {
148                         m_win32LangID = textInfo.m_win32LangID;
149                         m_nDataItem = textInfo.m_nDataItem;
150                         m_useUserOverride = textInfo.m_useUserOverride;
151                         m_listSeparator = textInfo.ListSeparator;
152                         customCultureName = textInfo.CultureName;
153                         ci = textInfo.ci;
154                         handleDotI = textInfo.handleDotI;
155                         data = textInfo.data;
156                 }
157
158                 public virtual int ANSICodePage
159                 {
160                         get {
161                                 return data.ansi;
162                         }
163                 }
164
165                 public virtual int EBCDICCodePage
166                 {
167                         get {
168                                 return data.ebcdic;
169                         }
170                 }
171
172                 [ComVisible (false)]
173                 public int LCID {
174                         get { return m_win32LangID; }
175                 }
176
177                 public virtual string ListSeparator {
178                         get {
179                                 if (m_listSeparator == null)
180                                         m_listSeparator = ((char) data.list_sep).ToString ();
181                                 return m_listSeparator;
182                         }
183                         [ComVisible (false)]
184                         set { m_listSeparator = value; }
185                 }
186
187                 public virtual int MacCodePage
188                 {
189                         get {
190                                 return data.mac;
191                         }
192                 }
193
194                 public virtual int OEMCodePage
195                 {
196                         get {
197                                 return data.oem;
198                         }
199                 }
200
201                 [ComVisible (false)]
202                 public string CultureName {
203                         get {
204                                 if (customCultureName == null)
205                                         customCultureName = ci == null ? String.Empty : ci.Name;
206                                 return customCultureName;
207                         }
208                 }
209
210                 [ComVisible (false)]
211                 public bool IsReadOnly {
212                         get { return m_isReadOnly; }
213                 }
214
215                 [ComVisible (false)]
216                 public bool IsRightToLeft {
217                         get {
218                                 return data.right_to_left;
219                         }
220                 }
221
222                 public override bool Equals (object obj)
223                 {
224                         if (obj == null)
225                                 return false;
226                         TextInfo other = obj as TextInfo;
227                         if (other == null)
228                                 return false;
229                         if (other.m_win32LangID != m_win32LangID)
230                                 return false;
231                         if (other.ci != ci)
232                                 return false;
233                         return true;
234                 }
235
236                 public override int GetHashCode()
237                 {
238                         return (m_win32LangID);
239                 }
240                 
241                 public override string ToString()
242                 {
243                         return "TextInfo - " + m_win32LangID;
244                 }
245
246                 public string ToTitleCase (string str)
247                 {
248                         if(str == null)
249                                 throw new ArgumentNullException ("str");
250
251                         StringBuilder sb = null;
252                         int i = 0;
253                         int start = 0;
254                         while (i < str.Length) {
255                                 if (!Char.IsLetter (str [i++]))
256                                         continue;
257                                 i--;
258                                 char t = ToTitleCase (str [i]);
259                                 bool capitalize = true;
260                                 if (t == str [i]) {
261                                         capitalize = false;
262                                         bool allTitle = true;
263                                         // if the word is all titlecase,
264                                         // then don't capitalize it.
265                                         int saved = i;
266                                         while (++i < str.Length) {
267                                                 var ch = str [i];
268                                                 var category = char.GetUnicodeCategory (ch);
269                                                 if (IsSeparator (category))
270                                                         break;
271                                                 t = ToTitleCase (ch);
272                                                 if (t != ch) {
273                                                         allTitle = false;
274                                                         break;
275                                                 }
276                                         }
277                                         if (allTitle)
278                                                 continue;
279                                         i = saved;
280
281                                         // still check if all remaining
282                                         // characters are lowercase,
283                                         // where we don't have to modify
284                                         // the source word.
285                                         while (++i < str.Length) {
286                                                 var ch = str [i];
287                                                 var category = char.GetUnicodeCategory (ch);
288                                                 if (IsSeparator (category))
289                                                         break;
290                                                 if (ToLower (ch) != ch) {
291                                                         capitalize = true;
292                                                         i = saved;
293                                                         break;
294                                                 }
295                                         }
296                                 }
297
298                                 if (capitalize) {
299                                         if (sb == null)
300                                                 sb = new StringBuilder (str.Length);
301                                         sb.Append (str, start, i - start);
302                                         sb.Append (ToTitleCase (str [i]));
303                                         start = i + 1;
304                                         while (++i < str.Length) {
305                                                 var ch = str [i];
306                                                 var category = char.GetUnicodeCategory (ch);
307                                                 if (IsSeparator (category))
308                                                         break;
309                                                 sb.Append (ToLower (ch));
310                                         }
311                                         start = i;
312                                 }
313                         }
314                         if (sb != null)
315                                 sb.Append (str, start, str.Length - start);
316
317                         return sb != null ? sb.ToString () : str;
318                 }
319
320                 static bool IsSeparator (UnicodeCategory category)
321                 {
322                         switch (category) {
323                         case UnicodeCategory.SpaceSeparator:
324                         case UnicodeCategory.LineSeparator:
325                         case UnicodeCategory.ParagraphSeparator:
326                         case UnicodeCategory.Control:
327                         case UnicodeCategory.Format:
328                         case UnicodeCategory.ConnectorPunctuation:
329                         case UnicodeCategory.DashPunctuation:
330                         case UnicodeCategory.OpenPunctuation:
331                         case UnicodeCategory.ClosePunctuation:
332                         case UnicodeCategory.InitialQuotePunctuation:
333                         case UnicodeCategory.FinalQuotePunctuation:
334                         case UnicodeCategory.OtherPunctuation:
335                                 return true;
336                         }
337
338                         return false;
339                 }
340
341                 // Only Azeri and Turkish have their own special cases.
342                 // Other than them, all languages have common special case
343                 // (enumerable enough).
344                 public virtual char ToLower (char c)
345                 {
346                         // quick ASCII range check
347                         if (c < 0x40 || 0x60 < c && c < 128)
348                                 return c;
349                         else if ('A' <= c && c <= 'Z' && (!handleDotI || c != 'I'))
350                                 return (char) (c + 0x20);
351
352                         if (ci == null || ci.LCID == 0x7F)
353                                 return ToLowerInvariant (c);
354
355                         switch (c) {
356                         case '\u0049': // Latin uppercase I
357                                 if (handleDotI)
358                                         return '\u0131'; // I becomes dotless i
359                                 break;
360                         case '\u0130': // I-dotted
361                                 return '\u0069'; // i
362
363                         case '\u01c5': // LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON
364                                 return '\u01c6';
365                         // \u01c7 -> \u01c9 (LJ) : invariant
366                         case '\u01c8': // LATIN CAPITAL LETTER L WITH SMALL LETTER J
367                                 return '\u01c9';
368                         // \u01ca -> \u01cc (NJ) : invariant
369                         case '\u01cb': // LATIN CAPITAL LETTER N WITH SMALL LETTER J
370                                 return '\u01cc';
371                         // WITH CARON : invariant
372                         // WITH DIAERESIS AND * : invariant
373
374                         case '\u01f2': // LATIN CAPITAL LETTER D WITH SMALL LETTER Z
375                                 return '\u01f3';
376                         case '\u03d2':  // ? it is not in ICU
377                                 return '\u03c5';
378                         case '\u03d3':  // ? it is not in ICU
379                                 return '\u03cd';
380                         case '\u03d4':  // ? it is not in ICU
381                                 return '\u03cb';
382                         }
383                         return ToLowerInvariant (c);
384                 }
385
386                 public virtual char ToUpper (char c)
387                 {
388                         // quick ASCII range check
389                         if (c < 0x60)
390                                 return c;
391                         else if ('a' <= c && c <= 'z' && (!handleDotI || c != 'i'))
392                                 return (char) (c - 0x20);
393
394                         if (ci == null || ci.LCID == 0x7F)
395                                 return ToUpperInvariant (c);
396
397                         switch (c) {
398                         case '\u0069': // Latin lowercase i
399                                 if (handleDotI)
400                                         return '\u0130'; // dotted capital I
401                                 break;
402                         case '\u0131': // dotless i
403                                 return '\u0049'; // I
404
405                         case '\u01c5': // see ToLower()
406                                 return '\u01c4';
407                         case '\u01c8': // see ToLower()
408                                 return '\u01c7';
409                         case '\u01cb': // see ToLower()
410                                 return '\u01ca';
411                         case '\u01f2': // see ToLower()
412                                 return '\u01f1';
413                         case '\u0390': // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
414                                 return '\u03aa'; // it is not in ICU
415                         case '\u03b0': // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
416                                 return '\u03ab'; // it is not in ICU
417                         case '\u03d0': // GREEK BETA
418                                 return '\u0392';
419                         case '\u03d1': // GREEK THETA
420                                 return '\u0398';
421                         case '\u03d5': // GREEK PHI
422                                 return '\u03a6';
423                         case '\u03d6': // GREEK PI
424                                 return '\u03a0';
425                         case '\u03f0': // GREEK KAPPA
426                                 return '\u039a';
427                         case '\u03f1': // GREEK RHO
428                                 return '\u03a1';
429                         // am not sure why miscellaneous GREEK symbols are 
430                         // not handled here.
431                         }
432
433                         return ToUpperInvariant (c);
434                 }
435
436                 private char ToTitleCase (char c)
437                 {
438                         // Handle some Latin characters.
439                         switch (c) {
440                         case '\u01c4':
441                         case '\u01c5':
442                         case '\u01c6':
443                                 return '\u01c5';
444                         case '\u01c7':
445                         case '\u01c8':
446                         case '\u01c9':
447                                 return '\u01c8';
448                         case '\u01ca':
449                         case '\u01cb':
450                         case '\u01cc':
451                                 return '\u01cb';
452                         case '\u01f1':
453                         case '\u01f2':
454                         case '\u01f3':
455                                 return '\u01f2';
456                         }
457                         if ('\u2170' <= c && c <= '\u217f' || // Roman numbers
458                                 '\u24d0' <= c && c <= '\u24e9')
459                                 return c;
460                         return ToUpper (c);
461                 }
462
463                 public unsafe virtual string ToLower (string str)
464                 {
465                         // In ICU (3.2) there are a few cases that one single
466                         // character results in multiple characters in e.g.
467                         // tr-TR culture. So I tried brute force conversion
468                         // test with single character as a string input, but 
469                         // there was no such conversion. So I think it just
470                         // invokes ToLower(char).
471                         if (str == null)
472                                 throw new ArgumentNullException ("str");
473
474                         if (str.Length == 0)
475                                 return String.Empty;
476
477                         string tmp = String.FastAllocateString (str.Length);
478                         fixed (char* source = str, dest = tmp) {
479
480                                 char* destPtr = (char*)dest;
481                                 char* sourcePtr = (char*)source;
482
483                                 for (int n = 0; n < str.Length; n++) {
484                                         *destPtr = ToLower (*sourcePtr);
485                                         sourcePtr++;
486                                         destPtr++;
487                                 }
488                         }
489                         return tmp;
490                 }
491
492                 public unsafe virtual string ToUpper (string str)
493                 {
494                         // In ICU (3.2) there is a case that string
495                         // is handled beyond per-character conversion, but
496                         // it is only lt-LT culture where MS.NET does not
497                         // handle any special transliteration. So I keep
498                         // ToUpper() just as character conversion.
499                         if (str == null)
500                                 throw new ArgumentNullException ("str");
501
502                         if (str.Length == 0)
503                                 return String.Empty;
504
505                         string tmp = String.FastAllocateString (str.Length);
506                         fixed (char* source = str, dest = tmp) {
507
508                                 char* destPtr = (char*)dest;
509                                 char* sourcePtr = (char*)source;
510
511                                 for (int n = 0; n < str.Length; n++) {
512                                         *destPtr = ToUpper (*sourcePtr);
513                                         sourcePtr++;
514                                         destPtr++;
515                                 }
516                         }
517                         return tmp;
518                 }
519
520                 [ComVisible (false)]
521                 public static TextInfo ReadOnly (TextInfo textInfo)
522                 {
523                         if (textInfo == null)
524                                 throw new ArgumentNullException ("textInfo");
525
526                         TextInfo ti = new TextInfo (textInfo);
527                         ti.m_isReadOnly = true;
528                         return ti;
529                 }
530
                /* IDeserializationCallback interface */
                // Explicit interface implementation that currently does nothing.
                // The [NonSerialized] fields (ci, handleDotI, data) are therefore
                // not restored by this callback.
                [MonoTODO]
                void IDeserializationCallback.OnDeserialization(object sender)
                {
                        // FIXME: we need to re-create "data" in order to get most properties working
                }
537
538                 /* IClonable */
539                 [ComVisible (false)]
540                 public virtual object Clone ()
541                 {
542                         return new TextInfo (this);
543                 }
544
545                 internal int GetCaseInsensitiveHashCode (string str)
546                 {
547                         return StringComparer.CurrentCultureIgnoreCase.GetHashCode (str);
548                 }
549
550                 internal static unsafe int GetHashCodeOrdinalIgnoreCase (string s)
551                 {
552                         var length = s.Length;
553                         fixed (char * c = s) {
554                                 char * cc = c;
555                                 char * end = cc + length - 1;
556                                 int h = 0;
557                                 for (;cc < end; cc += 2) {
558                                         h = (h << 5) - h + Char.ToUpperInvariant (*cc);
559                                         h = (h << 5) - h + Char.ToUpperInvariant (cc [1]);
560                                 }
561                                 ++end;
562                                 if (cc < end)
563                                         h = (h << 5) - h + Char.ToUpperInvariant (*cc);
564                                 return h;
565                         }
566                 }
567
568                 internal static unsafe int CompareOrdinalIgnoreCase(String str1, String str2)
569                 {
570                         return CompareOrdinalIgnoreCaseEx (str1, 0, str2, 0, str1.Length, str2.Length);
571                 }
572
573                 internal static int CompareOrdinalIgnoreCaseEx (String strA, int indexA, String strB, int indexB, int lenA, int lenB)
574                 {
575                         return CompareOrdinalCaseInsensitiveUnchecked (strA, indexA, lenA, strB, indexB, lenB);
576                 }
577
578                 static unsafe int CompareOrdinalCaseInsensitiveUnchecked (String strA, int indexA, int lenA, String strB, int indexB, int lenB)
579                 {
580                         if (strA == null) {
581                                 return strB == null ? 0 : -1;
582                         }
583                         if (strB == null) {
584                                 return 1;
585                         }
586                         int lengthA = Math.Min (lenA, strA.Length - indexA);
587                         int lengthB = Math.Min (lenB, strB.Length - indexB);
588
589                         if (lengthA == lengthB && Object.ReferenceEquals (strA, strB))
590                                 return 0;
591
592                         fixed (char* aptr = strA, bptr = strB) {
593                                 char* ap = aptr + indexA;
594                                 char* end = ap + Math.Min (lengthA, lengthB);
595                                 char* bp = bptr + indexB;
596                                 while (ap < end) {
597                                         if (*ap != *bp) {
598                                                 char c1 = Char.ToUpperInvariant (*ap);
599                                                 char c2 = Char.ToUpperInvariant (*bp);
600                                                 if (c1 != c2)
601                                                         return c1 - c2;
602                                         }
603                                         ap++;
604                                         bp++;
605                                 }
606                                 return lengthA - lengthB;
607                         }
608                 }
609
610                 internal static unsafe int LastIndexOfStringOrdinalIgnoreCase(String source, String value, int startIndex, int count)
611                 {
612                         int valueLen = value.Length;
613                         if (count < valueLen)
614                                 return -1;
615
616                         if (valueLen == 0)
617                                 return startIndex;
618
619                         fixed (char* thisptr = source, valueptr = value) {
620                                 char* ap = thisptr + startIndex - valueLen + 1;
621                                 char* thisEnd = ap - count + valueLen - 1;
622                                 while (ap != thisEnd) {
623                                         for (int i = 0; i < valueLen; i++) {
624                                                 if (Char.ToUpperInvariant (ap[i]) != Char.ToUpperInvariant (valueptr[i]))
625                                                         goto NextVal;
626                                         }
627                                         return (int)(ap - thisptr);
628                                         NextVal:
629                                         ap--;
630                                 }
631                         }
632                         return -1;
633                 }
634
635                 internal static int IndexOfStringOrdinalIgnoreCase(String source, String value, int startIndex, int count)
636                 {
637             Contract.Assert(source != null, "[TextInfo.IndexOfStringOrdinalIgnoreCase] Caller should've validated source != null");
638             Contract.Assert(value != null, "[TextInfo.IndexOfStringOrdinalIgnoreCase] Caller should've validated value != null");
639             Contract.Assert(startIndex + count <= source.Length, "[TextInfo.IndexOfStringOrdinalIgnoreCase] Caller should've validated startIndex + count <= source.Length");
640
641             // We return 0 if both inputs are empty strings
642             if (source.Length == 0 && value.Length == 0)
643             {
644                 return 0;
645             }
646
647             // the search space within [source] starts at offset [startIndex] inclusive and includes
648             // [count] characters (thus the last included character is at index [startIndex + count -1]
649             // [end] is the index of the next character after the search space
650             // (it points past the end of the search space)
651             int end = startIndex + count;
652             
653             // maxStartIndex is the index beyond which we never *start* searching, inclusive; in other words;
654             // a search could include characters beyond maxStartIndex, but we'd never begin a search at an 
655             // index strictly greater than maxStartIndex. 
656             int maxStartIndex = end - value.Length;
657
658             for (; startIndex <= maxStartIndex; startIndex++)
659             {
660                 // We should always have the same or more characters left to search than our actual pattern
661                 Contract.Assert(end - startIndex >= value.Length);
662                 // since this is an ordinal comparison, we can assume that the lengths must match
663                 if (CompareOrdinalIgnoreCaseEx(source, startIndex, value, 0, value.Length, value.Length) == 0)
664                 {
665                     return startIndex;
666                 }
667             }
668             
669             // Not found
670             return -1;
671                 }
672         }
673 }