3 // Permission is hereby granted, free of charge, to any person obtaining
4 // a copy of this software and associated documentation files (the
5 // "Software"), to deal in the Software without restriction, including
6 // without limitation the rights to use, copy, modify, merge, publish,
7 // distribute, sublicense, and/or sell copies of the Software, and to
8 // permit persons to whom the Software is furnished to do so, subject to
9 // the following conditions:
11 // The above copyright notice and this permission notice shall be
12 // included in all copies or substantial portions of the Software.
14 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 /// http://www.w3.org/TR/REC-xml
30 /// Includes code and Ideas from org.apache.xerces.util.XMLChar class of Xerces 2.0.1
31 /// However, No surrogate support is included in this class.
32 /// This class is currently public. Make it internal after testing completes
34 internal class XmlConstructs
// The four characters listed by production [3] S: space, line feed, tab, carriage return.
36 internal static readonly char [] WhitespaceChars = {' ', '\n', '\t', '\r'};
38 /** Character flags. */
// One byte of mask bits per 16-bit char value (1 << 16 entries), filled in by the static constructor.
39 internal static readonly byte [] CHARS = new byte [1 << 16];
41 /** Valid character mask. */
42 internal const int VALID = 0x01;
44 /** Space character mask. */
45 internal const int SPACE = 0x02;
47 /** Name start character mask. */
48 internal const int NAME_START = 0x04;
50 /** Name character mask. */
51 internal const int NAME = 0x08;
53 /** Pubid character mask. */
54 internal const int PUBID = 0x10;
/**
 * Content character mask. Special characters are those that can
 * be considered the start of markup, such as '<' and '&'.
 * The various newline characters are considered special as well.
 * All other valid XML characters can be considered content.
 *
 * This is an optimization for the inner loop of character scanning.
 */
// Set together with VALID for every valid character, then cleared again for '<', '&', '\n', '\r' and ']'.
63 internal const int CONTENT = 0x20;
65 /** NCName start character mask. */
// Assigned alongside NAME_START everywhere in the static constructor, then cleared for ':'.
66 internal const int NCNAME_START = 0x40;
68 /** NCName character mask. */
// Assigned alongside NAME everywhere in the static constructor, then cleared for ':'.
69 internal const int NCNAME = 0x80;
// Static constructor: fills the CHARS flag table from the XML 1.0 character-class productions.
// The "...Range" tables hold inclusive (first, last) pairs that the loops below walk two
// entries at a time; the "...Char" tables hold individual code points.
71 static XmlConstructs ()
74 // [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] |
75 // [#xE000-#xFFFD] | [#x10000-#x10FFFF]
// NOTE(review): presumably the contents of charRange (pairs 0x9-0xA, 0xD-0xD, 0x20-0xD7FF,
// 0xE000-0xFFFD); the declaration line is not visible here. Code points above 0xFFFF are not
// represented because CHARS only has 1 << 16 entries.
78 0x0009, 0x000A, 0x000D, 0x000D, 0x0020, 0xD7FF, 0xE000, 0xFFFD,
82 // [3] S ::= (#x20 | #x9 | #xD | #xA)+
// NOTE(review): presumably the contents of spaceChar.
85 0x0020, 0x0009, 0x000D, 0x000A,
89 // [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' |
90 // CombiningChar | Extender
// NOTE(review): presumably the contents of nameChar.
93 0x002D, 0x002E, // '-' and '.'
97 // [5] Name ::= (Letter | '_' | ':') (NameChar)*
100 int[] nameStartChar = {
101 0x003A, 0x005F, // ':' and '_'
105 // [13] PubidChar ::= #x20 | 0xD | 0xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%]
// NOTE(review): the next two rows presumably belong to pubidChar (single code points) and
// pubidRange (inclusive pairs) respectively - confirm against the declarations.
109 0x000A, 0x000D, 0x0020, 0x0021, 0x0023, 0x0024, 0x0025, 0x003D,
114 0x0027, 0x003B, 0x003F, 0x005A, 0x0061, 0x007A
118 // [84] Letter ::= BaseChar | Ideographic
// Inclusive (first, last) pairs of letter code points.
121 int[] letterRange = {
123 0x0041, 0x005A, 0x0061, 0x007A, 0x00C0, 0x00D6, 0x00D8, 0x00F6,
124 0x00F8, 0x0131, 0x0134, 0x013E, 0x0141, 0x0148, 0x014A, 0x017E,
125 0x0180, 0x01C3, 0x01CD, 0x01F0, 0x01F4, 0x01F5, 0x01FA, 0x0217,
126 0x0250, 0x02A8, 0x02BB, 0x02C1, 0x0388, 0x038A, 0x038E, 0x03A1,
127 0x03A3, 0x03CE, 0x03D0, 0x03D6, 0x03E2, 0x03F3, 0x0401, 0x040C,
128 0x040E, 0x044F, 0x0451, 0x045C, 0x045E, 0x0481, 0x0490, 0x04C4,
129 0x04C7, 0x04C8, 0x04CB, 0x04CC, 0x04D0, 0x04EB, 0x04EE, 0x04F5,
130 0x04F8, 0x04F9, 0x0531, 0x0556, 0x0561, 0x0586, 0x05D0, 0x05EA,
131 0x05F0, 0x05F2, 0x0621, 0x063A, 0x0641, 0x064A, 0x0671, 0x06B7,
132 0x06BA, 0x06BE, 0x06C0, 0x06CE, 0x06D0, 0x06D3, 0x06E5, 0x06E6,
133 0x0905, 0x0939, 0x0958, 0x0961, 0x0985, 0x098C, 0x098F, 0x0990,
134 0x0993, 0x09A8, 0x09AA, 0x09B0, 0x09B6, 0x09B9, 0x09DC, 0x09DD,
135 0x09DF, 0x09E1, 0x09F0, 0x09F1, 0x0A05, 0x0A0A, 0x0A0F, 0x0A10,
136 0x0A13, 0x0A28, 0x0A2A, 0x0A30, 0x0A32, 0x0A33, 0x0A35, 0x0A36,
137 0x0A38, 0x0A39, 0x0A59, 0x0A5C, 0x0A72, 0x0A74, 0x0A85, 0x0A8B,
138 0x0A8F, 0x0A91, 0x0A93, 0x0AA8, 0x0AAA, 0x0AB0, 0x0AB2, 0x0AB3,
139 0x0AB5, 0x0AB9, 0x0B05, 0x0B0C, 0x0B0F, 0x0B10, 0x0B13, 0x0B28,
140 0x0B2A, 0x0B30, 0x0B32, 0x0B33, 0x0B36, 0x0B39, 0x0B5C, 0x0B5D,
141 0x0B5F, 0x0B61, 0x0B85, 0x0B8A, 0x0B8E, 0x0B90, 0x0B92, 0x0B95,
142 0x0B99, 0x0B9A, 0x0B9E, 0x0B9F, 0x0BA3, 0x0BA4, 0x0BA8, 0x0BAA,
143 0x0BAE, 0x0BB5, 0x0BB7, 0x0BB9, 0x0C05, 0x0C0C, 0x0C0E, 0x0C10,
144 0x0C12, 0x0C28, 0x0C2A, 0x0C33, 0x0C35, 0x0C39, 0x0C60, 0x0C61,
145 0x0C85, 0x0C8C, 0x0C8E, 0x0C90, 0x0C92, 0x0CA8, 0x0CAA, 0x0CB3,
146 0x0CB5, 0x0CB9, 0x0CE0, 0x0CE1, 0x0D05, 0x0D0C, 0x0D0E, 0x0D10,
147 0x0D12, 0x0D28, 0x0D2A, 0x0D39, 0x0D60, 0x0D61, 0x0E01, 0x0E2E,
148 0x0E32, 0x0E33, 0x0E40, 0x0E45, 0x0E81, 0x0E82, 0x0E87, 0x0E88,
149 0x0E94, 0x0E97, 0x0E99, 0x0E9F, 0x0EA1, 0x0EA3, 0x0EAA, 0x0EAB,
150 0x0EAD, 0x0EAE, 0x0EB2, 0x0EB3, 0x0EC0, 0x0EC4, 0x0F40, 0x0F47,
151 0x0F49, 0x0F69, 0x10A0, 0x10C5, 0x10D0, 0x10F6, 0x1102, 0x1103,
152 0x1105, 0x1107, 0x110B, 0x110C, 0x110E, 0x1112, 0x1154, 0x1155,
153 0x115F, 0x1161, 0x116D, 0x116E, 0x1172, 0x1173, 0x11AE, 0x11AF,
154 0x11B7, 0x11B8, 0x11BC, 0x11C2, 0x1E00, 0x1E9B, 0x1EA0, 0x1EF9,
155 0x1F00, 0x1F15, 0x1F18, 0x1F1D, 0x1F20, 0x1F45, 0x1F48, 0x1F4D,
156 0x1F50, 0x1F57, 0x1F5F, 0x1F7D, 0x1F80, 0x1FB4, 0x1FB6, 0x1FBC,
157 0x1FC2, 0x1FC4, 0x1FC6, 0x1FCC, 0x1FD0, 0x1FD3, 0x1FD6, 0x1FDB,
158 0x1FE0, 0x1FEC, 0x1FF2, 0x1FF4, 0x1FF6, 0x1FFC, 0x212A, 0x212B,
159 0x2180, 0x2182, 0x3041, 0x3094, 0x30A1, 0x30FA, 0x3105, 0x312C,
162 0x3021, 0x3029, 0x4E00, 0x9FA5,
// NOTE(review): the following rows presumably form letterChar (individual letter code points).
166 0x0386, 0x038C, 0x03DA, 0x03DC, 0x03DE, 0x03E0, 0x0559, 0x06D5,
167 0x093D, 0x09B2, 0x0A5E, 0x0A8D, 0x0ABD, 0x0AE0, 0x0B3D, 0x0B9C,
168 0x0CDE, 0x0E30, 0x0E84, 0x0E8A, 0x0E8D, 0x0EA5, 0x0EA7, 0x0EB0,
169 0x0EBD, 0x1100, 0x1109, 0x113C, 0x113E, 0x1140, 0x114C, 0x114E,
170 0x1150, 0x1159, 0x1163, 0x1165, 0x1167, 0x1169, 0x1175, 0x119E,
171 0x11A8, 0x11AB, 0x11BA, 0x11EB, 0x11F0, 0x11F9, 0x1F59, 0x1F5B,
172 0x1F5D, 0x1FBE, 0x2126, 0x212E,
178 // [87] CombiningChar ::= ...
// Inclusive (first, last) pairs of combining characters.
181 int[] combiningCharRange = {
182 0x0300, 0x0345, 0x0360, 0x0361, 0x0483, 0x0486, 0x0591, 0x05A1,
183 0x05A3, 0x05B9, 0x05BB, 0x05BD, 0x05C1, 0x05C2, 0x064B, 0x0652,
184 0x06D6, 0x06DC, 0x06DD, 0x06DF, 0x06E0, 0x06E4, 0x06E7, 0x06E8,
185 0x06EA, 0x06ED, 0x0901, 0x0903, 0x093E, 0x094C, 0x0951, 0x0954,
186 0x0962, 0x0963, 0x0981, 0x0983, 0x09C0, 0x09C4, 0x09C7, 0x09C8,
187 0x09CB, 0x09CD, 0x09E2, 0x09E3, 0x0A40, 0x0A42, 0x0A47, 0x0A48,
188 0x0A4B, 0x0A4D, 0x0A70, 0x0A71, 0x0A81, 0x0A83, 0x0ABE, 0x0AC5,
189 0x0AC7, 0x0AC9, 0x0ACB, 0x0ACD, 0x0B01, 0x0B03, 0x0B3E, 0x0B43,
190 0x0B47, 0x0B48, 0x0B4B, 0x0B4D, 0x0B56, 0x0B57, 0x0B82, 0x0B83,
191 0x0BBE, 0x0BC2, 0x0BC6, 0x0BC8, 0x0BCA, 0x0BCD, 0x0C01, 0x0C03,
192 0x0C3E, 0x0C44, 0x0C46, 0x0C48, 0x0C4A, 0x0C4D, 0x0C55, 0x0C56,
193 0x0C82, 0x0C83, 0x0CBE, 0x0CC4, 0x0CC6, 0x0CC8, 0x0CCA, 0x0CCD,
194 0x0CD5, 0x0CD6, 0x0D02, 0x0D03, 0x0D3E, 0x0D43, 0x0D46, 0x0D48,
195 0x0D4A, 0x0D4D, 0x0E34, 0x0E3A, 0x0E47, 0x0E4E, 0x0EB4, 0x0EB9,
196 0x0EBB, 0x0EBC, 0x0EC8, 0x0ECD, 0x0F18, 0x0F19, 0x0F71, 0x0F84,
197 0x0F86, 0x0F8B, 0x0F90, 0x0F95, 0x0F99, 0x0FAD, 0x0FB1, 0x0FB7,
198 0x20D0, 0x20DC, 0x302A, 0x302F,
// Individual combining characters.
201 int[] combiningCharChar = {
202 0x05BF, 0x05C4, 0x0670, 0x093C, 0x094D, 0x09BC, 0x09BE, 0x09BF,
203 0x09D7, 0x0A02, 0x0A3C, 0x0A3E, 0x0A3F, 0x0ABC, 0x0B3C, 0x0BD7,
204 0x0D57, 0x0E31, 0x0EB1, 0x0F35, 0x0F37, 0x0F39, 0x0F3E, 0x0F3F,
205 0x0F97, 0x0FB9, 0x20E1, 0x3099, 0x309A,
209 // [88] Digit ::= ...
// NOTE(review): presumably the contents of digitRange (inclusive pairs).
213 0x0030, 0x0039, 0x0660, 0x0669, 0x06F0, 0x06F9, 0x0966, 0x096F,
214 0x09E6, 0x09EF, 0x0A66, 0x0A6F, 0x0AE6, 0x0AEF, 0x0B66, 0x0B6F,
215 0x0BE7, 0x0BEF, 0x0C66, 0x0C6F, 0x0CE6, 0x0CEF, 0x0D66, 0x0D6F,
216 0x0E50, 0x0E59, 0x0ED0, 0x0ED9, 0x0F20, 0x0F29,
220 // [89] Extender ::= ...
// Inclusive (first, last) pairs of extender characters.
223 int[] extenderRange = {
224 0x3031, 0x3035, 0x309D, 0x309E, 0x30FC, 0x30FE,
// Individual extender characters.
227 int[] extenderChar = {
228 0x00B7, 0x02D0, 0x02D1, 0x0387, 0x0640, 0x0E46, 0x0EC6, 0x3005,
232 // SpecialChar ::= '<', '&', '\n', '\r', ']'
// Characters that lose the CONTENT flag in the "remove special characters" loop below.
235 int[] specialChar = {
236 '<', '&', '\n', '\r', ']',
243 // set valid characters
// Every code point inside a charRange pair starts out as both VALID and CONTENT.
244 for (int i = 0; i < charRange.Length; i += 2)
246 for (int j = charRange[i]; j <= charRange[i + 1]; j++)
248 CHARS[j] = (byte)(CHARS[j] | VALID | CONTENT);
252 // remove special characters
// Special characters stay VALID; only their CONTENT bit is cleared.
253 for (int i = 0; i < specialChar.Length; i++)
255 CHARS[specialChar[i]] = (byte)(CHARS[specialChar[i]] & ~CONTENT);
258 // set space characters
259 for (int i = 0; i < spaceChar.Length; i++)
261 CHARS[spaceChar[i]] = (byte)(CHARS[spaceChar[i]] | SPACE);
264 // set name start characters
// A name start character is also a name character, so all four name bits are set here;
// ':' has the two NCName bits removed again further down.
265 for (int i = 0; i < nameStartChar.Length; i++)
267 CHARS[nameStartChar[i]] = (byte)(CHARS[nameStartChar[i]] | NAME_START | NAME |
268 NCNAME_START | NCNAME);
270 for (int i = 0; i < letterRange.Length; i += 2)
272 for (int j = letterRange[i]; j <= letterRange[i + 1]; j++)
274 CHARS[j] = (byte)(CHARS[j] | NAME_START | NAME | NCNAME_START | NCNAME);
277 for (int i = 0; i < letterChar.Length; i++)
279 CHARS[letterChar[i]] = (byte)(CHARS[letterChar[i]] | NAME_START | NAME |
280 NCNAME_START | NCNAME);
283 // set name characters
// These classes may appear inside a name but not start one: only NAME and NCNAME are set.
284 for (int i = 0; i < nameChar.Length; i++)
286 CHARS[nameChar[i]] = (byte)(CHARS[nameChar[i]] | NAME | NCNAME);
288 for (int i = 0; i < digitRange.Length; i += 2)
290 for (int j = digitRange[i]; j <= digitRange[i + 1]; j++)
292 CHARS[j] = (byte)(CHARS[j] | NAME | NCNAME);
295 for (int i = 0; i < combiningCharRange.Length; i += 2)
297 for (int j = combiningCharRange[i]; j <= combiningCharRange[i + 1]; j++)
299 CHARS[j] = (byte)(CHARS[j] | NAME | NCNAME);
302 for (int i = 0; i < combiningCharChar.Length; i++)
304 CHARS[combiningCharChar[i]] = (byte)(CHARS[combiningCharChar[i]] | NAME | NCNAME);
306 for (int i = 0; i < extenderRange.Length; i += 2)
308 for (int j = extenderRange[i]; j <= extenderRange[i + 1]; j++)
310 CHARS[j] = (byte)(CHARS[j] | NAME | NCNAME);
313 for (int i = 0; i < extenderChar.Length; i++)
315 CHARS[extenderChar[i]] = (byte)(CHARS[extenderChar[i]] | NAME | NCNAME);
318 // remove ':' from allowable NCNAME_START and NCNAME chars
// Must run after the name loops above, which set these bits for ':' via nameStartChar.
319 CHARS[':'] = (byte)(CHARS[':'] & ~(NCNAME_START | NCNAME));
321 // set Pubid characters
322 for (int i = 0; i < pubidChar.Length; i++)
324 CHARS[pubidChar[i]] = (byte)(CHARS[pubidChar[i]] | PUBID);
326 for (int i = 0; i < pubidRange.Length; i += 2)
328 for (int j = pubidRange[i]; j <= pubidRange[i + 1]; j++)
330 CHARS[j] = (byte)(CHARS[j] | PUBID);
/// <summary>
/// Tells whether a character is a legal XML character, i.e. whether its
/// entry in the flag table carries the VALID bit.
/// </summary>
/// <param name="c">The character to check.</param>
/// <returns>true when <paramref name="c"/> is non-zero and flagged VALID; otherwise false.</returns>
public static bool IsValid(char c)
{
	// U+0000 is rejected up front, without consulting the table.
	if (c == 0)
		return false;

	return (CHARS[c] & VALID) != 0;
}
/// Overload of IsValid that takes the character as an int.
346 public static bool IsValid (int c)
// NOTE(review): no range check on c is visible next to this lookup, and CHARS has only
// 1 << 16 entries - confirm that values >= CHARS.Length are handled before reaching here.
350 return c > 0 && ((CHARS[c] & VALID) != 0);
// NOTE(review): presumably the logical negation of IsValid(char) - confirm against the method body.
354 /// Returns true if the specified character is invalid.
356 /// <param name="c">The character to check.</param>
357 public static bool IsInvalid(char c)
/// Overload of IsInvalid that takes the character as an int.
// NOTE(review): presumably the logical negation of IsValid(int) - confirm against the method body.
362 public static bool IsInvalid(int c)
/// <summary>
/// Tells whether a character counts as plain content, i.e. whether its
/// entry in the flag table carries the CONTENT bit.
/// </summary>
/// <param name="c">The character to check.</param>
/// <returns>true when the CONTENT bit is set for <paramref name="c"/>.</returns>
public static bool IsContent(char c)
{
	int flags = CHARS[c];
	return (flags & CONTENT) != 0;
}
/// <summary>
/// Overload of IsContent that takes the character as an int.
/// </summary>
/// <param name="c">The character value to check.</param>
/// <returns>
/// false for values that are not strictly positive or that fall beyond the
/// flag table; otherwise whether the CONTENT bit is set.
/// </returns>
public static bool IsContent(int c)
{
	if (c <= 0 || c >= CHARS.Length)
		return false;

	return (CHARS[c] & CONTENT) != 0;
}
/// <summary>
/// Tells whether a character can be considered markup.
/// The markup characters are '&lt;', '&amp;' and '%'.
/// </summary>
/// <param name="c">The character to check.</param>
/// <returns>true when <paramref name="c"/> is one of the three markup characters.</returns>
public static bool IsMarkup(char c)
{
	switch (c)
	{
		case '<':
		case '&':
		case '%':
			return true;
		default:
			return false;
	}
}
/// <summary>
/// Overload of IsMarkup that takes the character as an int.
/// </summary>
/// <param name="c">The character value to check.</param>
/// <returns>
/// false for values that are not strictly positive or that fall beyond the
/// flag table; otherwise whether the value is '&lt;', '&amp;' or '%'.
/// </returns>
public static bool IsMarkup(int c)
{
	if (c <= 0 || c >= CHARS.Length)
		return false;

	switch (c)
	{
		case '<':
		case '&':
		case '%':
			return true;
		default:
			return false;
	}
}
/// <summary>
/// Tells whether a character is a space character as defined by
/// production [3] in the XML 1.0 specification.
/// </summary>
/// <param name="c">The character to check.</param>
/// <returns>true when the SPACE bit is set for <paramref name="c"/>.</returns>
public static bool IsWhitespace (char c)
{
	int flags = CHARS[c];
	return (flags & SPACE) != 0;
}
/// <summary>
/// Overload of IsWhitespace that takes the character as an int.
/// </summary>
/// <param name="c">The character value to check.</param>
/// <returns>
/// false for values that are not strictly positive or that fall beyond the
/// flag table; otherwise whether the SPACE bit is set.
/// </returns>
public static bool IsWhitespace (int c)
{
	if (c <= 0 || c >= CHARS.Length)
		return false;

	return (CHARS[c] & SPACE) != 0;
}
/// <summary>
/// Tells whether a character may start a Name as defined by
/// production [5] in the XML 1.0 specification.
/// </summary>
/// <param name="c">The character to check.</param>
/// <returns>true when the NAME_START bit is set for <paramref name="c"/>.</returns>
public static bool IsFirstNameChar (char c)
{
	int flags = CHARS[c];
	return (flags & NAME_START) != 0;
}
/// <summary>
/// Overload of IsFirstNameChar that takes the character as an int.
/// </summary>
/// <param name="c">The character value to check.</param>
/// <returns>
/// false for values that are not strictly positive or that fall beyond the
/// flag table; otherwise whether the NAME_START bit is set.
/// </returns>
public static bool IsFirstNameChar (int c)
{
	if (c <= 0 || c >= CHARS.Length)
		return false;

	return (CHARS[c] & NAME_START) != 0;
}
/// <summary>
/// Tells whether a character may appear in a Name as defined by
/// production [4] in the XML 1.0 specification.
/// </summary>
/// <param name="c">The character to check.</param>
/// <returns>true when the NAME bit is set for <paramref name="c"/>.</returns>
public static bool IsNameChar(char c)
{
	int flags = CHARS[c];
	return (flags & NAME) != 0;
}
/// <summary>
/// Overload of IsNameChar that takes the character as an int.
/// </summary>
/// <param name="c">The character value to check.</param>
/// <returns>
/// false for values that are not strictly positive or that fall beyond the
/// flag table; otherwise whether the NAME bit is set.
/// </returns>
public static bool IsNameChar(int c)
{
	if (c <= 0 || c >= CHARS.Length)
		return false;

	return (CHARS[c] & NAME) != 0;
}
/// <summary>
/// Tells whether a character may start an NCName as defined by
/// production [4] in Namespaces in XML.
/// </summary>
/// <param name="c">The character to check.</param>
/// <returns>true when the NCNAME_START bit is set for <paramref name="c"/>.</returns>
public static bool IsNCNameStart(char c)
{
	int flags = CHARS[c];
	return (flags & NCNAME_START) != 0;
}
/// <summary>
/// Overload of IsNCNameStart that takes the character as an int.
/// </summary>
/// <param name="c">The character value to check.</param>
/// <returns>
/// false for values that are not strictly positive or that fall beyond the
/// flag table; otherwise whether the NCNAME_START bit is set.
/// </returns>
public static bool IsNCNameStart(int c)
{
	if (c <= 0 || c >= CHARS.Length)
		return false;

	return (CHARS[c] & NCNAME_START) != 0;
}
/// <summary>
/// Tells whether a character may appear in an NCName as defined by
/// production [5] in Namespaces in XML.
/// </summary>
/// <param name="c">The character to check.</param>
/// <returns>true when the NCNAME bit is set for <paramref name="c"/>.</returns>
public static bool IsNCNameChar(char c)
{
	int flags = CHARS[c];
	return (flags & NCNAME) != 0;
}
/// <summary>
/// Overload of IsNCNameChar that takes the character as an int.
/// </summary>
/// <param name="c">The character value to check.</param>
/// <returns>
/// false for values that are not strictly positive or that fall beyond the
/// flag table; otherwise whether the NCNAME bit is set.
/// </returns>
public static bool IsNCNameChar(int c)
{
	if (c <= 0 || c >= CHARS.Length)
		return false;

	return (CHARS[c] & NCNAME) != 0;
}
/// <summary>
/// Tells whether a character is a valid Pubid character as defined by
/// production [13] in the XML 1.0 specification.
/// </summary>
/// <param name="c">The character to check.</param>
/// <returns>true when the PUBID bit is set for <paramref name="c"/>.</returns>
public static bool IsPubidChar (char c)
{
	int flags = CHARS[c];
	return (flags & PUBID) != 0;
}
/// <summary>
/// Overload of IsPubidChar that takes the character as an int.
/// </summary>
/// <param name="c">The character value to check.</param>
/// <returns>
/// false for values that are not strictly positive or that fall beyond the
/// flag table; otherwise whether the PUBID bit is set.
/// </returns>
public static bool IsPubidChar (int c)
{
	if (c <= 0 || c >= CHARS.Length)
		return false;

	return (CHARS[c] & PUBID) != 0;
}
492 /// Check to see if a string is a valid Name according to [5]
493 /// in the XML 1.0 Recommendation
495 /// <param name="name">The string to check</param>
/// <param name="err">Receives an XmlException describing the first problem found
/// (empty string, bad first character, or bad later character).</param>
496 public static bool IsValidName(String name, out Exception err)
// An empty string is never a Name.
499 if (name.Length == 0)
501 err = new XmlException("Name can not be an empty string",null);
// The first character must satisfy IsFirstNameChar.
505 if( IsFirstNameChar (ch) == false)
507 err = new XmlException("The character '"+ch+"' cannot start a Name",null);
// Every character from index 1 onward must satisfy IsNameChar.
510 for (int i = 1; i < name.Length; i++ )
513 if( IsNameChar (ch) == false )
515 err = new XmlException("The character '"+ch+"' is not allowed in a Name",null);
// Name check that reports its result as an int instead of a bool/exception pair.
// NOTE(review): the meaning of the int result (presumably an offending index or a
// sentinel) cannot be determined from the checks below - confirm against the return statements.
522 public static int IsValidName (string name)
// Empty input is handled separately.
524 if (name.Length == 0)
// The first character must satisfy IsFirstNameChar.
526 if (!IsFirstNameChar (name [0]))
// Every character from index 1 onward must satisfy IsNameChar.
528 for (int i=1; i<name.Length; i++)
529 if (!IsNameChar (name [i]))
535 /// Check to see if a string is a valid NCName according to [4]
536 /// from the XML Namespaces 1.0 Recommendation
538 /// <param name="ncName">The string to check</param>
/// <param name="err">Receives an XmlException describing the first problem found
/// (empty string, bad first character, or bad later character).</param>
539 public static bool IsValidNCName(String ncName, out Exception err)
// An empty string is never an NCName.
542 if (ncName.Length == 0)
544 err = new XmlException("NCName can not be an empty string",null);
// The first character must satisfy IsNCNameStart.
548 if( IsNCNameStart(ch) == false)
550 err = new XmlException("The character '"+ch+"' cannot start a NCName",null);
// Every character from index 1 onward must satisfy IsNCNameChar.
553 for (int i = 1; i < ncName.Length; i++ )
556 if( IsNCNameChar (ch) == false )
558 err = new XmlException("The character '"+ch+"' is not allowed in a NCName",null);
566 /// Check to see if a string is a valid Nmtoken according to [7]
567 /// in the XML 1.0 Recommendation
569 /// <param name="nmtoken">The string to check.</param>
/// <param name="err">Receives an XmlException describing the first problem found
/// (empty string or a character that is not a name character).</param>
570 public static bool IsValidNmtoken(String nmtoken, out Exception err)
// An empty string is never an NMTOKEN.
573 if (nmtoken.Length == 0)
575 err = new XmlException("NMTOKEN can not be an empty string", null);
// Unlike a Name, an Nmtoken has no special first character: every character,
// starting at index 0, must satisfy IsNameChar.
578 for (int i = 0; i < nmtoken.Length; i++ )
580 char ch = nmtoken[i];
581 if( ! IsNameChar (ch) )
583 err = new XmlException("The character '"+ch+"' is not allowed in a NMTOKEN",null);
593 /// Returns true if the encoding name is a valid IANA encoding.
594 /// This method does not verify that there is a decoder available
595 /// for this encoding, only that the characters are valid for an
596 /// IANA encoding name.
598 /// <param name="ianaEncoding">The encoding to check.</param>
599 /// <returns></returns>
600 public static bool IsValidIANAEncoding(String ianaEncoding)
// A null name is not examined at all.
602 if (ianaEncoding != null)
604 int length = ianaEncoding.Length;
// The first character must be an ASCII letter.
607 char c = ianaEncoding[0];
608 if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
// Later characters are tested against ASCII letters, digits, '.' and '_'.
// NOTE(review): the condition continues past the terms shown here, so further
// characters may be accepted - confirm the complete list.
610 for (int i = 1; i < length; i++)
613 if ((c < 'A' || c > 'Z') && (c < 'a' || c > 'z') &&
614 (c < '0' || c > '9') && c != '.' && c != '_' &&
// Boolean Name check without error reporting: the first character is tested with
// IsFirstNameChar and every character from index 1 onward with IsNameChar.
627 public static bool IsName (string str)
631 if (!IsFirstNameChar (str [0]))
633 for (int i = 1; i < str.Length; i++)
634 if (!IsNameChar (str [i]))
// Boolean NCName check without error reporting.
// NOTE(review): the first character is tested with IsFirstNameChar rather than
// IsNCNameStart. Because the loop below starts at index 0, a leading ':' is still
// caught by the IsNCNameChar test, but IsValidNCName uses IsNCNameStart plus a loop
// from index 1 - consider aligning the two.
639 public static bool IsNCName (string str)
643 if (!IsFirstNameChar (str [0]))
645 for (int i = 0; i < str.Length; i++)
646 if (!IsNCNameChar (str [i]))
// Boolean Nmtoken check without error reporting: every character, starting at
// index 0, is tested with IsNameChar (there is no separate first-character rule).
651 public static bool IsNmToken (string str)
655 for (int i = 0; i < str.Length; i++)
656 if (!IsNameChar (str [i]))
// String overload: tests each character with IsWhitespace(char) and returns false
// at the first character that is not whitespace.
// NOTE(review): the result for an empty or all-whitespace string is presumably true - confirm.
661 public static bool IsWhitespace (string str)
663 for (int i = 0; i < str.Length; i++)
664 if (!IsWhitespace (str [i])) return false;
669 public static int GetPredefinedEntity (string name)