6 /// http://www.w3.org/TR/REC-xml
\r
/// Includes code and ideas from the org.apache.xerces.util.XMLChar class of Xerces 2.0.1.
/// However, no surrogate support is included in this class.
/// This class is currently public. Make it internal after testing completes.
\r
13 public class XmlConstructs
\r
/**
 * Character flags. One entry per 16-bit char value (65536 entries); each entry
 * is a bitwise OR of the masks declared below and is filled in by the static
 * constructor. The reference is readonly so the table cannot be swapped out.
 */
internal static readonly byte[] CHARS = new byte[1 << 16];

/** Valid character mask. */
internal const int VALID = 0x01;

/** Space character mask. */
internal const int SPACE = 0x02;

/** Name start character mask. */
internal const int NAME_START = 0x04;

/** Name character mask. */
internal const int NAME = 0x08;

/** Pubid character mask. */
internal const int PUBID = 0x10;

/**
 * Content character mask. Special characters are those that can
 * be considered the start of markup, such as '<' and '&'.
 * The various newline characters are considered special as well.
 * All other valid XML characters can be considered content.
 *
 * This is an optimization for the inner loop of character scanning.
 */
internal const int CONTENT = 0x20;

/** NCName start character mask. */
internal const int NCNAME_START = 0x40;

/** NCName character mask. */
internal const int NCNAME = 0x80;
\r
48 static XmlConstructs()
\r
51 // [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] |
\r
52 // [#xE000-#xFFFD] | [#x10000-#x10FFFF]
\r
54 int[] charRange = {
\r
55 0x0009, 0x000A, 0x000D, 0x000D, 0x0020, 0xD7FF, 0xE000, 0xFFFD,
\r
59 // [3] S ::= (#x20 | #x9 | #xD | #xA)+
\r
61 int[] spaceChar = {
\r
62 0x0020, 0x0009, 0x000D, 0x000A,
\r
66 // [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' |
\r
67 // CombiningChar | Extender
\r
70 0x002D, 0x002E, // '-' and '.'
\r
74 // [5] Name ::= (Letter | '_' | ':') (NameChar)*
\r
77 int[] nameStartChar = {
\r
78 0x003A, 0x005F, // ':' and '_'
\r
82 // [13] PubidChar ::= #x20 | 0xD | 0xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%]
\r
86 0x000A, 0x000D, 0x0020, 0x0021, 0x0023, 0x0024, 0x0025, 0x003D,
\r
90 int[] pubidRange = {
\r
91 0x0027, 0x003B, 0x003F, 0x005A, 0x0061, 0x007A
\r
95 // [84] Letter ::= BaseChar | Ideographic
\r
98 int[] letterRange = {
\r
100 0x0041, 0x005A, 0x0061, 0x007A, 0x00C0, 0x00D6, 0x00D8, 0x00F6,
\r
101 0x00F8, 0x0131, 0x0134, 0x013E, 0x0141, 0x0148, 0x014A, 0x017E,
\r
102 0x0180, 0x01C3, 0x01CD, 0x01F0, 0x01F4, 0x01F5, 0x01FA, 0x0217,
\r
103 0x0250, 0x02A8, 0x02BB, 0x02C1, 0x0388, 0x038A, 0x038E, 0x03A1,
\r
104 0x03A3, 0x03CE, 0x03D0, 0x03D6, 0x03E2, 0x03F3, 0x0401, 0x040C,
\r
105 0x040E, 0x044F, 0x0451, 0x045C, 0x045E, 0x0481, 0x0490, 0x04C4,
\r
106 0x04C7, 0x04C8, 0x04CB, 0x04CC, 0x04D0, 0x04EB, 0x04EE, 0x04F5,
\r
107 0x04F8, 0x04F9, 0x0531, 0x0556, 0x0561, 0x0586, 0x05D0, 0x05EA,
\r
108 0x05F0, 0x05F2, 0x0621, 0x063A, 0x0641, 0x064A, 0x0671, 0x06B7,
\r
109 0x06BA, 0x06BE, 0x06C0, 0x06CE, 0x06D0, 0x06D3, 0x06E5, 0x06E6,
\r
110 0x0905, 0x0939, 0x0958, 0x0961, 0x0985, 0x098C, 0x098F, 0x0990,
\r
111 0x0993, 0x09A8, 0x09AA, 0x09B0, 0x09B6, 0x09B9, 0x09DC, 0x09DD,
\r
112 0x09DF, 0x09E1, 0x09F0, 0x09F1, 0x0A05, 0x0A0A, 0x0A0F, 0x0A10,
\r
113 0x0A13, 0x0A28, 0x0A2A, 0x0A30, 0x0A32, 0x0A33, 0x0A35, 0x0A36,
\r
114 0x0A38, 0x0A39, 0x0A59, 0x0A5C, 0x0A72, 0x0A74, 0x0A85, 0x0A8B,
\r
115 0x0A8F, 0x0A91, 0x0A93, 0x0AA8, 0x0AAA, 0x0AB0, 0x0AB2, 0x0AB3,
\r
116 0x0AB5, 0x0AB9, 0x0B05, 0x0B0C, 0x0B0F, 0x0B10, 0x0B13, 0x0B28,
\r
117 0x0B2A, 0x0B30, 0x0B32, 0x0B33, 0x0B36, 0x0B39, 0x0B5C, 0x0B5D,
\r
118 0x0B5F, 0x0B61, 0x0B85, 0x0B8A, 0x0B8E, 0x0B90, 0x0B92, 0x0B95,
\r
119 0x0B99, 0x0B9A, 0x0B9E, 0x0B9F, 0x0BA3, 0x0BA4, 0x0BA8, 0x0BAA,
\r
120 0x0BAE, 0x0BB5, 0x0BB7, 0x0BB9, 0x0C05, 0x0C0C, 0x0C0E, 0x0C10,
\r
121 0x0C12, 0x0C28, 0x0C2A, 0x0C33, 0x0C35, 0x0C39, 0x0C60, 0x0C61,
\r
122 0x0C85, 0x0C8C, 0x0C8E, 0x0C90, 0x0C92, 0x0CA8, 0x0CAA, 0x0CB3,
\r
123 0x0CB5, 0x0CB9, 0x0CE0, 0x0CE1, 0x0D05, 0x0D0C, 0x0D0E, 0x0D10,
\r
124 0x0D12, 0x0D28, 0x0D2A, 0x0D39, 0x0D60, 0x0D61, 0x0E01, 0x0E2E,
\r
125 0x0E32, 0x0E33, 0x0E40, 0x0E45, 0x0E81, 0x0E82, 0x0E87, 0x0E88,
\r
126 0x0E94, 0x0E97, 0x0E99, 0x0E9F, 0x0EA1, 0x0EA3, 0x0EAA, 0x0EAB,
\r
127 0x0EAD, 0x0EAE, 0x0EB2, 0x0EB3, 0x0EC0, 0x0EC4, 0x0F40, 0x0F47,
\r
128 0x0F49, 0x0F69, 0x10A0, 0x10C5, 0x10D0, 0x10F6, 0x1102, 0x1103,
\r
129 0x1105, 0x1107, 0x110B, 0x110C, 0x110E, 0x1112, 0x1154, 0x1155,
\r
130 0x115F, 0x1161, 0x116D, 0x116E, 0x1172, 0x1173, 0x11AE, 0x11AF,
\r
131 0x11B7, 0x11B8, 0x11BC, 0x11C2, 0x1E00, 0x1E9B, 0x1EA0, 0x1EF9,
\r
132 0x1F00, 0x1F15, 0x1F18, 0x1F1D, 0x1F20, 0x1F45, 0x1F48, 0x1F4D,
\r
133 0x1F50, 0x1F57, 0x1F5F, 0x1F7D, 0x1F80, 0x1FB4, 0x1FB6, 0x1FBC,
\r
134 0x1FC2, 0x1FC4, 0x1FC6, 0x1FCC, 0x1FD0, 0x1FD3, 0x1FD6, 0x1FDB,
\r
135 0x1FE0, 0x1FEC, 0x1FF2, 0x1FF4, 0x1FF6, 0x1FFC, 0x212A, 0x212B,
\r
136 0x2180, 0x2182, 0x3041, 0x3094, 0x30A1, 0x30FA, 0x3105, 0x312C,
\r
139 0x3021, 0x3029, 0x4E00, 0x9FA5,
\r
141 int[] letterChar = {
\r
143 0x0386, 0x038C, 0x03DA, 0x03DC, 0x03DE, 0x03E0, 0x0559, 0x06D5,
\r
144 0x093D, 0x09B2, 0x0A5E, 0x0A8D, 0x0ABD, 0x0AE0, 0x0B3D, 0x0B9C,
\r
145 0x0CDE, 0x0E30, 0x0E84, 0x0E8A, 0x0E8D, 0x0EA5, 0x0EA7, 0x0EB0,
\r
146 0x0EBD, 0x1100, 0x1109, 0x113C, 0x113E, 0x1140, 0x114C, 0x114E,
\r
147 0x1150, 0x1159, 0x1163, 0x1165, 0x1167, 0x1169, 0x1175, 0x119E,
\r
148 0x11A8, 0x11AB, 0x11BA, 0x11EB, 0x11F0, 0x11F9, 0x1F59, 0x1F5B,
\r
149 0x1F5D, 0x1FBE, 0x2126, 0x212E,
\r
155 // [87] CombiningChar ::= ...
\r
158 int[] combiningCharRange = {
\r
159 0x0300, 0x0345, 0x0360, 0x0361, 0x0483, 0x0486, 0x0591, 0x05A1,
\r
160 0x05A3, 0x05B9, 0x05BB, 0x05BD, 0x05C1, 0x05C2, 0x064B, 0x0652,
\r
161 0x06D6, 0x06DC, 0x06DD, 0x06DF, 0x06E0, 0x06E4, 0x06E7, 0x06E8,
\r
162 0x06EA, 0x06ED, 0x0901, 0x0903, 0x093E, 0x094C, 0x0951, 0x0954,
\r
163 0x0962, 0x0963, 0x0981, 0x0983, 0x09C0, 0x09C4, 0x09C7, 0x09C8,
\r
164 0x09CB, 0x09CD, 0x09E2, 0x09E3, 0x0A40, 0x0A42, 0x0A47, 0x0A48,
\r
165 0x0A4B, 0x0A4D, 0x0A70, 0x0A71, 0x0A81, 0x0A83, 0x0ABE, 0x0AC5,
\r
166 0x0AC7, 0x0AC9, 0x0ACB, 0x0ACD, 0x0B01, 0x0B03, 0x0B3E, 0x0B43,
\r
167 0x0B47, 0x0B48, 0x0B4B, 0x0B4D, 0x0B56, 0x0B57, 0x0B82, 0x0B83,
\r
168 0x0BBE, 0x0BC2, 0x0BC6, 0x0BC8, 0x0BCA, 0x0BCD, 0x0C01, 0x0C03,
\r
169 0x0C3E, 0x0C44, 0x0C46, 0x0C48, 0x0C4A, 0x0C4D, 0x0C55, 0x0C56,
\r
170 0x0C82, 0x0C83, 0x0CBE, 0x0CC4, 0x0CC6, 0x0CC8, 0x0CCA, 0x0CCD,
\r
171 0x0CD5, 0x0CD6, 0x0D02, 0x0D03, 0x0D3E, 0x0D43, 0x0D46, 0x0D48,
\r
172 0x0D4A, 0x0D4D, 0x0E34, 0x0E3A, 0x0E47, 0x0E4E, 0x0EB4, 0x0EB9,
\r
173 0x0EBB, 0x0EBC, 0x0EC8, 0x0ECD, 0x0F18, 0x0F19, 0x0F71, 0x0F84,
\r
174 0x0F86, 0x0F8B, 0x0F90, 0x0F95, 0x0F99, 0x0FAD, 0x0FB1, 0x0FB7,
\r
175 0x20D0, 0x20DC, 0x302A, 0x302F,
\r
178 int[] combiningCharChar = {
\r
179 0x05BF, 0x05C4, 0x0670, 0x093C, 0x094D, 0x09BC, 0x09BE, 0x09BF,
\r
180 0x09D7, 0x0A02, 0x0A3C, 0x0A3E, 0x0A3F, 0x0ABC, 0x0B3C, 0x0BD7,
\r
181 0x0D57, 0x0E31, 0x0EB1, 0x0F35, 0x0F37, 0x0F39, 0x0F3E, 0x0F3F,
\r
182 0x0F97, 0x0FB9, 0x20E1, 0x3099, 0x309A,
\r
186 // [88] Digit ::= ...
\r
189 int[] digitRange = {
\r
190 0x0030, 0x0039, 0x0660, 0x0669, 0x06F0, 0x06F9, 0x0966, 0x096F,
\r
191 0x09E6, 0x09EF, 0x0A66, 0x0A6F, 0x0AE6, 0x0AEF, 0x0B66, 0x0B6F,
\r
192 0x0BE7, 0x0BEF, 0x0C66, 0x0C6F, 0x0CE6, 0x0CEF, 0x0D66, 0x0D6F,
\r
193 0x0E50, 0x0E59, 0x0ED0, 0x0ED9, 0x0F20, 0x0F29,
\r
197 // [89] Extender ::= ...
\r
200 int[] extenderRange = {
\r
201 0x3031, 0x3035, 0x309D, 0x309E, 0x30FC, 0x30FE,
\r
204 int[] extenderChar = {
\r
205 0x00B7, 0x02D0, 0x02D1, 0x0387, 0x0640, 0x0E46, 0x0EC6, 0x3005,
\r
209 // SpecialChar ::= '<', '&', '\n', '\r', ']'
\r
212 int[] specialChar = {
\r
213 '<', '&', '\n', '\r', ']',
\r
220 // set valid characters
\r
221 for (int i = 0; i < charRange.Length; i += 2)
\r
223 for (int j = charRange[i]; j <= charRange[i + 1]; j++)
\r
225 CHARS[j] = (byte)(CHARS[j] | VALID | CONTENT);
\r
229 // remove special characters
\r
230 for (int i = 0; i < specialChar.Length; i++)
\r
232 CHARS[specialChar[i]] = (byte)(CHARS[specialChar[i]] & ~CONTENT);
\r
235 // set space characters
\r
236 for (int i = 0; i < spaceChar.Length; i++)
\r
238 CHARS[spaceChar[i]] = (byte)(CHARS[spaceChar[i]] | SPACE);
\r
241 // set name start characters
\r
242 for (int i = 0; i < nameStartChar.Length; i++)
\r
244 CHARS[nameStartChar[i]] = (byte)(CHARS[nameStartChar[i]] | NAME_START | NAME |
\r
245 NCNAME_START | NCNAME);
\r
247 for (int i = 0; i < letterRange.Length; i += 2)
\r
249 for (int j = letterRange[i]; j <= letterRange[i + 1]; j++)
\r
251 CHARS[j] = (byte)(CHARS[j] | NAME_START | NAME | NCNAME_START | NCNAME);
\r
254 for (int i = 0; i < letterChar.Length; i++)
\r
256 CHARS[letterChar[i]] = (byte)(CHARS[letterChar[i]] | NAME_START | NAME |
\r
257 NCNAME_START | NCNAME);
\r
260 // set name characters
\r
261 for (int i = 0; i < nameChar.Length; i++)
\r
263 CHARS[nameChar[i]] = (byte)(CHARS[nameChar[i]] | NAME | NCNAME);
\r
265 for (int i = 0; i < digitRange.Length; i += 2)
\r
267 for (int j = digitRange[i]; j <= digitRange[i + 1]; j++)
\r
269 CHARS[j] = (byte)(CHARS[j] | NAME | NCNAME);
\r
272 for (int i = 0; i < combiningCharRange.Length; i += 2)
\r
274 for (int j = combiningCharRange[i]; j <= combiningCharRange[i + 1]; j++)
\r
276 CHARS[j] = (byte)(CHARS[j] | NAME | NCNAME);
\r
279 for (int i = 0; i < combiningCharChar.Length; i++)
\r
281 CHARS[combiningCharChar[i]] = (byte)(CHARS[combiningCharChar[i]] | NAME | NCNAME);
\r
283 for (int i = 0; i < extenderRange.Length; i += 2)
\r
285 for (int j = extenderRange[i]; j <= extenderRange[i + 1]; j++)
\r
287 CHARS[j] = (byte)(CHARS[j] | NAME | NCNAME);
\r
290 for (int i = 0; i < extenderChar.Length; i++)
\r
292 CHARS[extenderChar[i]] = (byte)(CHARS[extenderChar[i]] | NAME | NCNAME);
\r
295 // remove ':' from allowable NCNAME_START and NCNAME chars
\r
296 CHARS[':'] = (byte)(CHARS[':'] & ~(NCNAME_START | NCNAME));
\r
298 // set Pubid characters
\r
299 for (int i = 0; i < pubidChar.Length; i++)
\r
301 CHARS[pubidChar[i]] = (byte)(CHARS[pubidChar[i]] | PUBID);
\r
303 for (int i = 0; i < pubidRange.Length; i += 2)
\r
305 for (int j = pubidRange[i]; j <= pubidRange[i + 1]; j++)
\r
307 CHARS[j] = (byte)(CHARS[j] | PUBID);
\r
/// <summary>
/// Private constructor: keeps code outside this class from creating instances.
/// Every member visible in this class is static.
/// </summary>
private XmlConstructs()
{
}
\r
/// <summary>
/// Tells whether a character carries the VALID flag in the lookup table.
/// </summary>
/// <param name="c">The character to check.</param>
/// <returns>true when the VALID bit is set for <paramref name="c"/>; otherwise false.</returns>
public static bool IsValid(char c)
{
    int flags = CHARS[c];
    return (flags & VALID) != 0;
}
\r
/// <summary>
/// Tells whether a character is invalid, i.e. the exact negation of <see cref="IsValid"/>.
/// </summary>
/// <param name="c">The character to check.</param>
/// <returns>true when <see cref="IsValid"/> reports false for <paramref name="c"/>.</returns>
public static bool IsInvalid(char c)
{
    bool valid = IsValid(c);
    return valid == false;
}
\r
/// <summary>
/// Tells whether a character can be considered content, i.e. carries the
/// CONTENT flag in the lookup table.
/// </summary>
/// <param name="c">The character to check.</param>
/// <returns>true when the CONTENT bit is set for <paramref name="c"/>; otherwise false.</returns>
public static bool IsContent(char c)
{
    int flags = CHARS[c];
    return (flags & CONTENT) != 0;
}
\r
/// <summary>
/// Tells whether a character can be considered markup.
/// The markup characters are '&lt;', '&amp;', and '%'.
/// </summary>
/// <param name="c">The character to check.</param>
/// <returns>true for '&lt;', '&amp;' or '%'; false for every other character.</returns>
public static bool IsMarkup(char c)
{
    switch (c)
    {
        case '<':
        case '&':
        case '%':
            return true;
        default:
            return false;
    }
}
\r
/// <summary>
/// Tells whether a character is a space character as defined by
/// production [3] in the XML 1.0 specification.
/// </summary>
/// <param name="c">The character to check.</param>
/// <returns>true when the SPACE bit is set for <paramref name="c"/>; otherwise false.</returns>
public static bool IsSpace(char c)
{
    int flags = CHARS[c];
    return (flags & SPACE) != 0;
}
\r
/// <summary>
/// Integer overload of the space test (production [3] in the XML 1.0 specification).
/// </summary>
/// <param name="c">The character value to check.</param>
/// <returns>
/// true when <paramref name="c"/> is positive, lies inside the lookup table and has
/// the SPACE bit set; false for every other value.
/// </returns>
public static bool IsSpace(int c)
{
    // The table holds 1 << 16 entries, so values at or above that would index
    // past its end; report them as "not a space" instead of throwing.
    return c > 0 && c < CHARS.Length && (CHARS[c] & SPACE) != 0;
}
\r
/// <summary>
/// Tells whether a character is a valid name start character as defined by
/// production [5] in the XML 1.0 specification.
/// </summary>
/// <param name="c">The character to check.</param>
/// <returns>true when the NAME_START bit is set for <paramref name="c"/>; otherwise false.</returns>
public static bool IsNameStart(char c)
{
    int flags = CHARS[c];
    return (flags & NAME_START) != 0;
}
\r
/// <summary>
/// Integer overload of the name start test (production [5] in the XML 1.0 specification).
/// </summary>
/// <param name="c">The character value to check.</param>
/// <returns>
/// true when <paramref name="c"/> is positive, lies inside the lookup table and has
/// the NAME_START bit set; false for every other value.
/// </returns>
public static bool IsNameStart(int c)
{
    // The table holds 1 << 16 entries, so values at or above that would index
    // past its end; report them as "not a name start" instead of throwing.
    return c > 0 && c < CHARS.Length && (CHARS[c] & NAME_START) != 0;
}
\r
/// <summary>
/// Tells whether a character is a valid name character as defined by
/// production [4] in the XML 1.0 specification.
/// </summary>
/// <param name="c">The character to check.</param>
/// <returns>true when the NAME bit is set for <paramref name="c"/>; otherwise false.</returns>
public static bool IsName(char c)
{
    int flags = CHARS[c];
    return (flags & NAME) != 0;
}
\r
/// <summary>
/// Integer overload of the name character test (production [4] in the XML 1.0 specification).
/// </summary>
/// <param name="c">The character value to check.</param>
/// <returns>
/// true when <paramref name="c"/> is positive, lies inside the lookup table and has
/// the NAME bit set; false for every other value.
/// </returns>
public static bool IsName(int c)
{
    // The table holds 1 << 16 entries, so values at or above that would index
    // past its end; report them as "not a name character" instead of throwing.
    return c > 0 && c < CHARS.Length && (CHARS[c] & NAME) != 0;
}
\r
/// <summary>
/// Tells whether a character is a valid NCName start character as defined by
/// production [4] in the Namespaces in XML recommendation.
/// </summary>
/// <param name="c">The character to check.</param>
/// <returns>true when the NCNAME_START bit is set for <paramref name="c"/>; otherwise false.</returns>
public static bool IsNCNameStart(char c)
{
    int flags = CHARS[c];
    return (flags & NCNAME_START) != 0;
}
\r
/// <summary>
/// Tells whether a character is a valid NCName character as defined by
/// production [5] in the Namespaces in XML recommendation.
/// </summary>
/// <param name="c">The character to check.</param>
/// <returns>true when the NCNAME bit is set for <paramref name="c"/>; otherwise false.</returns>
public static bool IsNCName(char c)
{
    int flags = CHARS[c];
    return (flags & NCNAME) != 0;
}
\r
/// <summary>
/// Integer overload of the NCName character test (production [5] in the
/// Namespaces in XML recommendation).
/// </summary>
/// <param name="c">The character value to check.</param>
/// <returns>
/// true when <paramref name="c"/> is positive, lies inside the lookup table and has
/// the NCNAME bit set; false for every other value.
/// </returns>
public static bool IsNCName(int c)
{
    // The table holds 1 << 16 entries, so values at or above that would index
    // past its end; report them as "not an NCName character" instead of throwing.
    return c > 0 && c < CHARS.Length && (CHARS[c] & NCNAME) != 0;
}
\r
/// <summary>
/// Tells whether a character is a valid Pubid character as defined by
/// production [13] in the XML 1.0 specification.
/// </summary>
/// <param name="c">The character to check.</param>
/// <returns>true when the PUBID bit is set for <paramref name="c"/>; otherwise false.</returns>
public static bool IsPubid(char c)
{
    int flags = CHARS[c];
    return (flags & PUBID) != 0;
}
\r
/// <summary>
/// Integer overload of the Pubid character test (production [13] in the XML 1.0 specification).
/// </summary>
/// <param name="c">The character value to check.</param>
/// <returns>
/// true when <paramref name="c"/> is positive, lies inside the lookup table and has
/// the PUBID bit set; false for every other value.
/// </returns>
public static bool IsPubid(int c)
{
    // The table holds 1 << 16 entries, so values at or above that would index
    // past its end; report them as "not a Pubid character" instead of throwing.
    return c > 0 && c < CHARS.Length && (CHARS[c] & PUBID) != 0;
}
\r
/// <summary>
/// Check to see if a string is a valid Name according to [5]
/// in the XML 1.0 Recommendation.
/// </summary>
/// <param name="name">The string to check.</param>
/// <param name="err">
/// Receives an XmlException describing the first problem found, or null when
/// the string is a valid Name.
/// </param>
/// <returns>true if <paramref name="name"/> is a valid Name; otherwise false.</returns>
public static bool IsValidName(String name, out Exception err)
{
    // The out parameter must be assigned on every path, including success.
    err = null;

    if (name.Length == 0)
    {
        err = new XmlException("Name can not be an empty string",null);
        return false;
    }

    // The first character is checked against the name-start rule.
    char ch = name[0];
    if (!IsNameStart(ch))
    {
        err = new XmlException("The character '"+ch+"' cannot start a Name",null);
        return false;
    }

    // Every remaining character is checked against the name-character rule.
    for (int i = 1; i < name.Length; i++)
    {
        ch = name[i];
        if (!IsName(ch))
        {
            err = new XmlException("The character '"+ch+"' is not allowed in a Name",null);
            return false;
        }
    }

    return true;
}
\r
/// <summary>
/// Check to see if a string is a valid NCName according to [4]
/// from the XML Namespaces 1.0 Recommendation.
/// </summary>
/// <param name="ncName">The string to check.</param>
/// <param name="err">
/// Receives an XmlException describing the first problem found, or null when
/// the string is a valid NCName.
/// </param>
/// <returns>true if <paramref name="ncName"/> is a valid NCName; otherwise false.</returns>
public static bool IsValidNCName(String ncName, out Exception err)
{
    // The out parameter must be assigned on every path, including success.
    err = null;

    if (ncName.Length == 0)
    {
        err = new XmlException("NCName can not be an empty string",null);
        return false;
    }

    // The first character is checked against the NCName-start rule.
    char ch = ncName[0];
    if (!IsNCNameStart(ch))
    {
        err = new XmlException("The character '"+ch+"' cannot start a NCName",null);
        return false;
    }

    // Every remaining character is checked against the NCName-character rule.
    for (int i = 1; i < ncName.Length; i++)
    {
        ch = ncName[i];
        if (!IsNCName(ch))
        {
            err = new XmlException("The character '"+ch+"' is not allowed in a NCName",null);
            return false;
        }
    }

    return true;
}
\r
/// <summary>
/// Check to see if a string is a valid Nmtoken according to [7]
/// in the XML 1.0 Recommendation.
/// </summary>
/// <param name="nmtoken">The string to check.</param>
/// <param name="err">
/// Receives an XmlException describing the first problem found, or null when
/// the string is a valid Nmtoken.
/// </param>
/// <returns>true if <paramref name="nmtoken"/> is a valid Nmtoken; otherwise false.</returns>
public static bool IsValidNmtoken(String nmtoken, out Exception err)
{
    // The out parameter must be assigned on every path, including success.
    err = null;

    if (nmtoken.Length == 0)
    {
        err = new XmlException("NMTOKEN can not be an empty string", null);
        return false;
    }

    // Unlike a Name, the loop starts at index 0: the first character gets the
    // same name-character check as the rest, with no separate name-start check.
    for (int i = 0; i < nmtoken.Length; i++)
    {
        char ch = nmtoken[i];
        if (!IsName(ch))
        {
            err = new XmlException("The character '"+ch+"' is not allowed in a NMTOKEN",null);
            return false;
        }
    }

    return true;
}
\r
536 /// Returns true if the encoding name is a valid IANA encoding.
\r
537 /// This method does not verify that there is a decoder available
\r
538 /// for this encoding, only that the characters are valid for an
\r
539 /// IANA encoding name.
\r
541 /// <param name="ianaEncoding">The encoding to check.</param>
\r
542 /// <returns></returns>
\r
543 public static bool IsValidIANAEncoding(String ianaEncoding)
\r
545 if (ianaEncoding != null)
\r
547 int length = ianaEncoding.Length;
\r
550 char c = ianaEncoding[0];
\r
551 if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
\r
553 for (int i = 1; i < length; i++)
\r
555 c = ianaEncoding[i];
\r
556 if ((c < 'A' || c > 'Z') && (c < 'a' || c > 'z') &&
\r
557 (c < '0' || c > '9') && c != '.' && c != '_' &&
\r