1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31 package org.dom4j.io.aelfred;
32
33 import java.io.BufferedInputStream;
34 import java.io.CharConversionException;
35 import java.io.EOFException;
36 import java.io.IOException;
37 import java.io.InputStream;
38 import java.io.InputStreamReader;
39 import java.io.Reader;
40 import java.net.URL;
41 import java.net.URLConnection;
42 import java.util.ArrayList;
43 import java.util.HashMap;
44 import java.util.Iterator;
45
46 import org.xml.sax.SAXException;
47
48
49
50
51 /***
52 * Parse XML documents and return parse events through call-backs.
53 * Use the <code>SAXDriver</code> class as your entry point, as the
54 * internal parser interfaces are subject to change.
55 *
56 * @author Written by David Megginson <dmeggins@microstar.com>
57 * (version 1.2a with bugfixes)
58 * @author Updated by David Brownell <david-b@pacbell.net>
59 * @version $Date: 2002/05/24 14:41:55 $
60 * @see SAXDriver
61 * @deprecated Use Aelfred2 instead! THIS CLASS WILL BE REMOVED IN dom4j-1.6 !!
62 */
63 final class XmlParser
64 {
65
66
67
68
69 private final static boolean USE_CHEATS = true;
70
71
72
73
74
75
76
/**
 * Creates a parser that has no event handler attached yet.
 *
 * <p>Construction only calls {@code cleanupVariables ()}. A handler has to
 * be supplied through {@link #setHandler} before {@link #doParse} is used;
 * {@code doParse} rejects a missing handler with an
 * {@code IllegalStateException}.
 */
XmlParser() {
    cleanupVariables();
}
87
88
89 /***
90 * Set the handler that will receive parsing events.
91 * @param handler The handler to receive callback events.
92 * @see #parse
93 */
94
95 void setHandler (SAXDriver handler)
96 {
97 this.handler = handler;
98 }
99
100
101 /***
102 * Parse an XML document from the character stream, byte stream, or URI
103 * that you provide (in that order of preference). Any URI that you
104 * supply will become the base URI for resolving relative URI, and may
105 * be used to acquire a reader or byte stream.
106 *
107 * <p>You may parse more than one document, but that must be done
108 * sequentially. Only one thread at a time may use this parser.
109 *
110 * @param systemId The URI of the document; should never be null,
111 * but may be so iff a reader <em>or</em> a stream is provided.
112 * @param publicId The public identifier of the document, or null.
113 * @param reader A character stream; must be null if stream isn't.
114 * @param stream A byte input stream; must be null if reader isn't.
115 * @param encoding The suggested encoding, or null if unknown.
116 * @exception java.lang.Exception Basically SAXException or IOException
117 */
118
119 void doParse (
120 String systemId,
121 String publicId,
122 Reader reader,
123 InputStream stream,
124 String encoding
125 ) throws Exception
126 {
127 if (handler == null)
128 throw new IllegalStateException ("no callback handler");
129
130 basePublicId = publicId;
131 baseURI = systemId;
132 baseReader = reader;
133 baseInputStream = stream;
134
135 initializeVariables ();
136
137
138
139
140 setInternalEntity ("amp", "&");
141 setInternalEntity ("lt", "<");
142 setInternalEntity ("gt", ">");
143 setInternalEntity ("apos", "'");
144 setInternalEntity ("quot", """);
145
146 handler.startDocument ();
147
148 pushURL ("[document]", basePublicId, baseURI,
149 baseReader, baseInputStream, encoding);
150
151 try {
152 parseDocument ();
153 handler.endDocument ();
154 } finally {
155 if (baseReader != null)
156 try { baseReader.close ();
157 } catch (IOException e) {
158 if (baseInputStream != null)
159 try { baseInputStream.close ();
160 } catch (IOException e) {
161 if (is != null)
162 try { is.close ();
163 } catch (IOException e) {
164 if (reader != null)
165 try {
166 reader.close ();
167 } catch (IOException e) {
168 }
169 cleanupVariables ();
170 }
171 }
172
173
174
175
176
177
178
179
180
181
182 /***
183 * Constant: an element has not been declared.
184 * @see #getElementContentType
185 */
186 public final static int CONTENT_UNDECLARED = 0;
187
188 /***
189 * Constant: the element has a content model of ANY.
190 * @see #getElementContentType
191 */
192 public final static int CONTENT_ANY = 1;
193
194 /***
195 * Constant: the element has declared content of EMPTY.
196 * @see #getElementContentType
197 */
198 public final static int CONTENT_EMPTY = 2;
199
200 /***
201 * Constant: the element has mixed content.
202 * @see #getElementContentType
203 */
204 public final static int CONTENT_MIXED = 3;
205
206 /***
207 * Constant: the element has element content.
208 * @see #getElementContentType
209 */
210 public final static int CONTENT_ELEMENTS = 4;
211
212
213
214
215
216
217 /***
218 * Constant: the entity has not been declared.
219 * @see #getEntityType
220 */
221 public final static int ENTITY_UNDECLARED = 0;
222
223 /***
224 * Constant: the entity is internal.
225 * @see #getEntityType
226 */
227 public final static int ENTITY_INTERNAL = 1;
228
229 /***
230 * Constant: the entity is external, non-XML data.
231 * @see #getEntityType
232 */
233 public final static int ENTITY_NDATA = 2;
234
235 /***
236 * Constant: the entity is external XML data.
237 * @see #getEntityType
238 */
239 public final static int ENTITY_TEXT = 3;
240
241
242
243
244
245
246 /***
247 * Constant: the attribute has not been declared for this element type.
248 * @see #getAttributeType
249 */
250 public final static int ATTRIBUTE_UNDECLARED = 0;
251
252 /***
253 * Constant: the attribute value is a string value.
254 * @see #getAttributeType
255 */
256 public final static int ATTRIBUTE_CDATA = 1;
257
258 /***
259 * Constant: the attribute value is a unique identifier.
260 * @see #getAttributeType
261 */
262 public final static int ATTRIBUTE_ID = 2;
263
264 /***
265 * Constant: the attribute value is a reference to a unique identifier.
266 * @see #getAttributeType
267 */
268 public final static int ATTRIBUTE_IDREF = 3;
269
270 /***
271 * Constant: the attribute value is a list of ID references.
272 * @see #getAttributeType
273 */
274 public final static int ATTRIBUTE_IDREFS = 4;
275
276 /***
277 * Constant: the attribute value is the name of an entity.
278 * @see #getAttributeType
279 */
280 public final static int ATTRIBUTE_ENTITY = 5;
281
282 /***
283 * Constant: the attribute value is a list of entity names.
284 * @see #getAttributeType
285 */
286 public final static int ATTRIBUTE_ENTITIES = 6;
287
288 /***
289 * Constant: the attribute value is a name token.
290 * @see #getAttributeType
291 */
292 public final static int ATTRIBUTE_NMTOKEN = 7;
293
294 /***
295 * Constant: the attribute value is a list of name tokens.
296 * @see #getAttributeType
297 */
298 public final static int ATTRIBUTE_NMTOKENS = 8;
299
300 /***
301 * Constant: the attribute value is a token from an enumeration.
302 * @see #getAttributeType
303 */
304 public final static int ATTRIBUTE_ENUMERATED = 9;
305
306 /***
307 * Constant: the attribute is the name of a notation.
308 * @see #getAttributeType
309 */
310 public final static int ATTRIBUTE_NOTATION = 10;
311
312
313
314
315
316
317
318 /***
319 * Hash table of attribute types.
320 */
321 private static HashMap attributeTypeHash;
322 static {
323 attributeTypeHash = new HashMap (13);
324 attributeTypeHash.put ("CDATA", new Integer (ATTRIBUTE_CDATA));
325 attributeTypeHash.put ("ID", new Integer (ATTRIBUTE_ID));
326 attributeTypeHash.put ("IDREF", new Integer (ATTRIBUTE_IDREF));
327 attributeTypeHash.put ("IDREFS", new Integer (ATTRIBUTE_IDREFS));
328 attributeTypeHash.put ("ENTITY", new Integer (ATTRIBUTE_ENTITY));
329 attributeTypeHash.put ("ENTITIES", new Integer (ATTRIBUTE_ENTITIES));
330 attributeTypeHash.put ("NMTOKEN", new Integer (ATTRIBUTE_NMTOKEN));
331 attributeTypeHash.put ("NMTOKENS", new Integer (ATTRIBUTE_NMTOKENS));
332 attributeTypeHash.put ("NOTATION", new Integer (ATTRIBUTE_NOTATION));
333 }
334
335
336
337
338
339 private final static int ENCODING_EXTERNAL = 0;
340 private final static int ENCODING_UTF_8 = 1;
341 private final static int ENCODING_ISO_8859_1 = 2;
342 private final static int ENCODING_UCS_2_12 = 3;
343 private final static int ENCODING_UCS_2_21 = 4;
344 private final static int ENCODING_UCS_4_1234 = 5;
345 private final static int ENCODING_UCS_4_4321 = 6;
346 private final static int ENCODING_UCS_4_2143 = 7;
347 private final static int ENCODING_UCS_4_3412 = 8;
348 private final static int ENCODING_ASCII = 9;
349
350
351
352
353
354
355 /***
356 * Constant: the attribute is not declared.
357 * @see #getAttributeDefaultValueType
358 */
359 public final static int ATTRIBUTE_DEFAULT_UNDECLARED = 30;
360
361 /***
362 * Constant: the attribute has a literal default value specified.
363 * @see #getAttributeDefaultValueType
364 * @see #getAttributeDefaultValue
365 */
366 public final static int ATTRIBUTE_DEFAULT_SPECIFIED = 31;
367
368 /***
369 * Constant: the attribute was declared #IMPLIED.
370 * @see #getAttributeDefaultValueType
371 */
372 public final static int ATTRIBUTE_DEFAULT_IMPLIED = 32;
373
374 /***
375 * Constant: the attribute was declared #REQUIRED.
376 * @see #getAttributeDefaultValueType
377 */
378 public final static int ATTRIBUTE_DEFAULT_REQUIRED = 33;
379
380 /***
381 * Constant: the attribute was declared #FIXED.
382 * @see #getAttributeDefaultValueType
383 * @see #getAttributeDefaultValue
384 */
385 public final static int ATTRIBUTE_DEFAULT_FIXED = 34;
386
387
388
389
390
391 private final static int INPUT_NONE = 0;
392 private final static int INPUT_INTERNAL = 1;
393 private final static int INPUT_EXTERNAL = 2;
394 private final static int INPUT_STREAM = 3;
395 private final static int INPUT_BUFFER = 4;
396 private final static int INPUT_READER = 5;
397
398
399
400
401
402
403 private final static int LIT_ENTITY_REF = 2;
404
405 private final static int LIT_NORMALIZE = 4;
406
407 private final static int LIT_ATTRIBUTE = 8;
408
409 private final static int LIT_DISABLE_PE = 16;
410
411 private final static int LIT_DISABLE_CREF = 32;
412
413 private final static int LIT_DISABLE_EREF = 64;
414
415 private final static int LIT_ENTITY_CHECK = 128;
416
417
418
419
420
421
422 private final static int CONTEXT_NORMAL = 0;
423 private final static int CONTEXT_LITERAL = 1;
424
425
426
427
428
429
430
431 /***
432 * Report an error.
433 * @param message The error message.
434 * @param textFound The text that caused the error (or null).
435 * @see SAXDriver#error
436 * @see #line
437 */
438 private void error (String message, String textFound, String textExpected)
439 throws SAXException
440 {
441 if (textFound != null) {
442 message = message + " (found \"" + textFound + "\")";
443 }
444 if (textExpected != null) {
445 message = message + " (expected \"" + textExpected + "\")";
446 }
447 String uri = null;
448
449 if (externalEntity != null) {
450 uri = externalEntity.getURL ().toString ();
451 }
452 handler.error (message, uri, line, column);
453
454
455 throw new SAXException (message);
456 }
457
458
459 /***
460 * Report a serious error.
461 * @param message The error message.
462 * @param textFound The text that caused the error (or null).
463 */
464 private void error (String message, char textFound, String textExpected)
465 throws SAXException
466 {
467 error (message, new Character (textFound).toString (), textExpected);
468 }
469
470 /*** Report typical case fatal errors. */
471 private void error (String message)
472 throws SAXException
473 {
474 error (message, null, null);
475 }
476
477
478
479
480
481
482
483 /***
484 * Parse an XML document.
485 * <pre>
486 * [1] document ::= prolog element Misc*
487 * </pre>
488 * <p>This is the top-level parsing function for a single XML
489 * document. As a minimum, a well-formed document must have
490 * a document element, and a valid document must have a prolog
491 * (one with doctype) as well.
492 */
493 private void parseDocument ()
494 throws Exception
495 {
496 char c;
497 try {
498 parseProlog ();
499 require ('<');
500 parseElement ();
501 } catch (EOFException ee) {
502 error("premature end of file", "[EOF]", null);
503 }
504
505 try {
506 parseMisc ();
507 c = readCh ();
508 error ("unexpected characters after document end", c, null);
509 } catch (EOFException e) {
510 return;
511 }
512 }
513
514
515 /***
516 * Skip a comment.
517 * <pre>
518 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* "-->"
519 * </pre>
520 * <p> (The <code><!--</code> has already been read.)
521 */
522 private void parseComment ()
523 throws Exception
524 {
525 char c;
526 boolean saved = expandPE;
527
528 expandPE = false;
529 parseUntil ("--");
530 require ('>');
531 expandPE = saved;
532 handler.comment (dataBuffer, 0, dataBufferPos);
533 dataBufferPos = 0;
534 }
535
536
537 /***
538 * Parse a processing instruction and do a call-back.
539 * <pre>
540 * [16] PI ::= '<?' PITarget
541 * (S (Char* - (Char* '?>' Char*)))?
542 * '?>'
543 * [17] PITarget ::= Name - ( ('X'|'x') ('M'|m') ('L'|l') )
544 * </pre>
545 * <p> (The <code><?</code> has already been read.)
546 */
547 private void parsePI ()
548 throws SAXException, IOException
549 {
550 String name;
551 boolean saved = expandPE;
552
553 expandPE = false;
554 name = readNmtoken (true);
555 if ("xml".equalsIgnoreCase (name))
556 error ("Illegal processing instruction target", name, null);
557 if (!tryRead ("?>")) {
558 requireWhitespace ();
559 parseUntil ("?>");
560 }
561 expandPE = saved;
562 handler.processingInstruction (name, dataBufferToString ());
563 }
564
565
566 /***
567 * Parse a CDATA section.
568 * <pre>
569 * [18] CDSect ::= CDStart CData CDEnd
570 * [19] CDStart ::= '<![CDATA['
571 * [20] CData ::= (Char* - (Char* ']]>' Char*))
572 * [21] CDEnd ::= ']]>'
573 * </pre>
574 * <p> (The '<![CDATA[' has already been read.)
575 */
576 private void parseCDSect ()
577 throws Exception
578 {
579 parseUntil ("]]>");
580 dataBufferFlush ();
581 }
582
583
584 /***
585 * Parse the prolog of an XML document.
586 * <pre>
587 * [22] prolog ::= XMLDecl? Misc* (Doctypedecl Misc*)?
588 * </pre>
589 * <p>There are a couple of tricks here. First, it is necessary to
590 * declare the XML default attributes after the DTD (if present)
591 * has been read. [??] Second, it is not possible to expand general
592 * references in attribute value literals until after the entire
593 * DTD (if present) has been parsed.
594 * <p>We do not look for the XML declaration here, because it was
595 * handled by pushURL ().
596 * @see pushURL
597 */
598 private void parseProlog ()
599 throws Exception
600 {
601 parseMisc ();
602
603 if (tryRead ("<!DOCTYPE")) {
604 parseDoctypedecl ();
605 parseMisc ();
606 }
607 }
608
609
610 /***
611 * Parse the XML declaration.
612 * <pre>
613 * [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
614 * [24] VersionInfo ::= S 'version' Eq
615 * ("'" VersionNum "'" | '"' VersionNum '"' )
616 * [26] VersionNum ::= ([a-zA-Z0-9_.:] | '-')*
617 * [32] SDDecl ::= S 'standalone' Eq
618 * ( "'"" ('yes' | 'no') "'"" | '"' ("yes" | "no") '"' )
619 * [80] EncodingDecl ::= S 'encoding' Eq
620 * ( "'" EncName "'" | "'" EncName "'" )
621 * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
622 * </pre>
623 * <p> (The <code><?xml</code> and whitespace have already been read.)
624 * @return the encoding in the declaration, uppercased; or null
625 * @see #parseTextDecl
626 * @see #setupDecoding
627 */
628 private String parseXMLDecl (boolean ignoreEncoding)
629 throws SAXException, IOException
630 {
631 String version;
632 String encodingName = null;
633 String standalone = null;
634 boolean white;
635 int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
636
637
638 require ("version");
639 parseEq ();
640 version = readLiteral (flags);
641 if (!version.equals ("1.0")) {
642 error ("unsupported XML version", version, "1.0");
643 }
644
645
646 white = tryWhitespace ();
647 if (tryRead ("encoding")) {
648 if (!white)
649 error ("whitespace required before 'encoding='");
650 parseEq ();
651 encodingName = readLiteral (flags);
652 if (!ignoreEncoding)
653 setupDecoding (encodingName);
654 }
655
656
657 if (encodingName != null)
658 white = tryWhitespace ();
659 if (tryRead ("standalone")) {
660 if (!white)
661 error ("whitespace required before 'standalone='");
662 parseEq ();
663 standalone = readLiteral (flags);
664 if (! ("yes".equals (standalone) || "no".equals (standalone)))
665 error ("standalone flag must be 'yes' or 'no'");
666 }
667
668 skipWhitespace ();
669 require ("?>");
670
671 return encodingName;
672 }
673
674
675 /***
676 * Parse a text declaration.
677 * <pre>
678 * [79] TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>'
679 * [80] EncodingDecl ::= S 'encoding' Eq
680 * ( '"' EncName '"' | "'" EncName "'" )
681 * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
682 * </pre>
683 * <p> (The <code><?xml</code>' and whitespace have already been read.)
684 * @return the encoding in the declaration, uppercased; or null
685 * @see #parseXMLDecl
686 * @see #setupDecoding
687 */
688 private String parseTextDecl (boolean ignoreEncoding)
689 throws SAXException, IOException
690 {
691 String encodingName = null;
692 int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
693
694
695 if (tryRead ("version")) {
696 String version;
697 parseEq ();
698 version = readLiteral (flags);
699 if (!version.equals ("1.0")) {
700 error ("unsupported XML version", version, "1.0");
701 }
702 requireWhitespace ();
703 }
704
705
706
707 require ("encoding");
708 parseEq ();
709 encodingName = readLiteral (flags);
710 if (!ignoreEncoding)
711 setupDecoding (encodingName);
712
713 skipWhitespace ();
714 require ("?>");
715
716 return encodingName;
717 }
718
719
720 /***
721 * Sets up internal state so that we can decode an entity using the
722 * specified encoding. This is used when we start to read an entity
723 * and we have been given knowledge of its encoding before we start to
724 * read any data (e.g. from a SAX input source or from a MIME type).
725 *
726 * <p> It is also used after autodetection, at which point only very
727 * limited adjustments to the encoding may be used (switching between
728 * related builtin decoders).
729 *
730 * @param encodingName The name of the encoding specified by the user.
731 * @exception IOException if the encoding isn't supported either
732 * internally to this parser, or by the hosting JVM.
733 * @see #parseXMLDecl
734 * @see #parseTextDecl
735 */
736 private void setupDecoding (String encodingName)
737 throws SAXException, IOException
738 {
739 encodingName = encodingName.toUpperCase ();
740
741
742
743
744
745
746
747
748
749 if (encoding == ENCODING_UTF_8 || encoding == ENCODING_EXTERNAL) {
750 if (encodingName.equals ("ISO-8859-1")
751 || encodingName.equals ("8859_1")
752 || encodingName.equals ("ISO8859_1")
753 ) {
754 encoding = ENCODING_ISO_8859_1;
755 return;
756 } else if (encodingName.equals ("US-ASCII")
757 || encodingName.equals ("ASCII")) {
758 encoding = ENCODING_ASCII;
759 return;
760 } else if (encodingName.equals ("UTF-8")
761 || encodingName.equals ("UTF8")) {
762 encoding = ENCODING_UTF_8;
763 return;
764 } else if (encoding != ENCODING_EXTERNAL) {
765
766 error ("unsupported ASCII-derived encoding",
767 encodingName,
768 "UTF-8, US-ASCII, or ISO-8859-1");
769 }
770
771
772 }
773
774
775 if (encoding == ENCODING_UCS_2_12 || encoding == ENCODING_UCS_2_21) {
776 if (!(encodingName.equals ("ISO-10646-UCS-2")
777 || encodingName.equals ("UTF-16")
778 || encodingName.equals ("UTF-16BE")
779 || encodingName.equals ("UTF-16LE")))
780 error ("unsupported Unicode encoding",
781 encodingName,
782 "UTF-16");
783 return;
784 }
785
786
787 if (encoding == ENCODING_UCS_4_1234
788 || encoding == ENCODING_UCS_4_4321
789 || encoding == ENCODING_UCS_4_2143
790 || encoding == ENCODING_UCS_4_3412) {
791 if (!encodingName.equals ("ISO-10646-UCS-4"))
792 error ("unsupported 32-bit encoding",
793 encodingName,
794 "ISO-10646-UCS-4");
795 return;
796 }
797
798
799
800
801
802 if (encodingName.equals ("UTF-16BE")) {
803 encoding = ENCODING_UCS_2_12;
804 return;
805 }
806 if (encodingName.equals ("UTF-16LE")) {
807 encoding = ENCODING_UCS_2_21;
808 return;
809 }
810
811
812
813
814
815 if (encodingName.equals ("UTF-16")
816 || encodingName.equals ("ISO-10646-UCS-2"))
817 encodingName = "Unicode";
818
819
820 reader = new InputStreamReader (is, encodingName);
821 sourceType = INPUT_READER;
822 is = null;
823 }
824
825
826 /***
827 * Parse miscellaneous markup outside the document element and DOCTYPE
828 * declaration.
829 * <pre>
830 * [27] Misc ::= Comment | PI | S
831 * </pre>
832 */
833 private void parseMisc ()
834 throws Exception
835 {
836 while (true) {
837 skipWhitespace ();
838 if (tryRead ("<?")) {
839 parsePI ();
840 } else if (tryRead ("<!--")) {
841 parseComment ();
842 } else {
843 return;
844 }
845 }
846 }
847
848
849 /***
850 * Parse a document type declaration.
851 * <pre>
852 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
853 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
854 * </pre>
855 * <p> (The <code><!DOCTYPE</code> has already been read.)
856 */
857 private void parseDoctypedecl ()
858 throws Exception
859 {
860 char c;
861 String doctypeName, ids[];
862
863
864 requireWhitespace ();
865 doctypeName = readNmtoken (true);
866
867
868 skipWhitespace ();
869 ids = readExternalIds (false);
870
871
872 handler.doctypeDecl (doctypeName, ids [0], ids [1]);
873
874
875 skipWhitespace ();
876 if (tryRead ('[')) {
877
878
879 while (true) {
880 expandPE = true;
881 skipWhitespace ();
882 expandPE = false;
883 if (tryRead (']')) {
884 break;
885 } else {
886
887 peIsError = expandPE = true;
888 parseMarkupdecl ();
889 peIsError = expandPE = false;
890 }
891 }
892 }
893
894
895 if (ids [1] != null) {
896 pushURL ("[external subset]", ids [0], ids [1], null, null, null);
897
898
899 while (true) {
900 expandPE = true;
901 skipWhitespace ();
902 expandPE = false;
903 if (tryRead ('>')) {
904 break;
905 } else {
906 expandPE = true;
907 parseMarkupdecl ();
908 expandPE = false;
909 }
910 }
911 } else {
912
913 skipWhitespace ();
914 require ('>');
915 }
916
917
918 handler.endDoctype ();
919 expandPE = false;
920 }
921
922
923 /***
924 * Parse a markup declaration in the internal or external DTD subset.
925 * <pre>
926 * [29] markupdecl ::= elementdecl | Attlistdecl | EntityDecl
927 * | NotationDecl | PI | Comment
928 * [30] extSubsetDecl ::= (markupdecl | conditionalSect
929 * | PEReference | S) *
930 * </pre>
931 * <p> Reading toplevel PE references is handled as a lexical issue
932 * by the caller, as is whitespace.
933 */
934 private void parseMarkupdecl ()
935 throws Exception
936 {
937 if (tryRead ("<!ELEMENT")) {
938 parseElementdecl ();
939 } else if (tryRead ("<!ATTLIST")) {
940 parseAttlistDecl ();
941 } else if (tryRead ("<!ENTITY")) {
942 parseEntityDecl ();
943 } else if (tryRead ("<!NOTATION")) {
944 parseNotationDecl ();
945 } else if (tryRead ("<?")) {
946 parsePI ();
947 } else if (tryRead ("<!--")) {
948 parseComment ();
949 } else if (tryRead ("<![")) {
950 if (inputStack.size () > 0)
951 parseConditionalSect ();
952 else
953 error ("conditional sections illegal in internal subset");
954 } else {
955 error ("expected markup declaration");
956 }
957 }
958
959
960 /***
961 * Parse an element, with its tags.
962 * <pre>
963 * [39] element ::= EmptyElementTag | STag content ETag
964 * [40] STag ::= '<' Name (S Attribute)* S? '>'
965 * [44] EmptyElementTag ::= '<' Name (S Attribute)* S? '/>'
966 * </pre>
967 * <p> (The '<' has already been read.)
968 * <p>NOTE: this method actually chains onto parseContent (), if necessary,
969 * and parseContent () will take care of calling parseETag ().
970 */
971 private void parseElement ()
972 throws Exception
973 {
974 String gi;
975 char c;
976 int oldElementContent = currentElementContent;
977 String oldElement = currentElement;
978 Object element [];
979
980
981
982 tagAttributePos = 0;
983
984
985 gi = readNmtoken (true);
986
987
988 currentElement = gi;
989 element = (Object []) elementInfo.get (gi);
990 currentElementContent = getContentType (element, CONTENT_ANY);
991
992
993
994 boolean white = tryWhitespace ();
995 c = readCh ();
996 while (c != '/' && c != '>') {
997 unread (c);
998 if (!white)
999 error ("need whitespace between attributes");
1000 parseAttribute (gi);
1001 white = tryWhitespace ();
1002 c = readCh ();
1003 }
1004
1005
1006 Iterator atts = declaredAttributes (element);
1007 if (atts != null) {
1008 String aname;
1009 loop:
1010 while (atts.hasNext ()) {
1011 aname = (String) atts.next ();
1012
1013 for (int i = 0; i < tagAttributePos; i++) {
1014 if (tagAttributes [i] == aname) {
1015 continue loop;
1016 }
1017 }
1018
1019 handler.attribute (aname,
1020 getAttributeExpandedValue (gi, aname),
1021 false);
1022 }
1023 }
1024
1025
1026
1027
1028 switch (c) {
1029 case '>':
1030 handler.startElement (gi);
1031 parseContent ();
1032 break;
1033 case '/':
1034 require ('>');
1035 handler.startElement (gi);
1036 handler.endElement (gi);
1037 break;
1038 }
1039
1040
1041 currentElement = oldElement;
1042 currentElementContent = oldElementContent;
1043 }
1044
1045
1046 /***
1047 * Parse an attribute assignment.
1048 * <pre>
1049 * [41] Attribute ::= Name Eq AttValue
1050 * </pre>
1051 * @param name The name of the attribute's element.
1052 * @see SAXDriver#attribute
1053 */
1054 private void parseAttribute (String name)
1055 throws Exception
1056 {
1057 String aname;
1058 int type;
1059 String value;
1060 int flags = LIT_ATTRIBUTE | LIT_ENTITY_REF;
1061
1062
1063 aname = readNmtoken (true);
1064 type = getAttributeType (name, aname);
1065
1066
1067 parseEq ();
1068
1069
1070
1071 if (type == ATTRIBUTE_CDATA || type == ATTRIBUTE_UNDECLARED) {
1072 value = readLiteral (flags);
1073 } else {
1074 value = readLiteral (flags | LIT_NORMALIZE);
1075 }
1076
1077
1078 for (int i = 0; i < tagAttributePos; i++)
1079 if (aname.equals (tagAttributes [i]))
1080 error ("duplicate attribute", aname, null);
1081
1082
1083
1084 handler.attribute (aname, value, true);
1085 dataBufferPos = 0;
1086
1087
1088
1089 if (tagAttributePos == tagAttributes.length) {
1090 String newAttrib[] = new String [tagAttributes.length * 2];
1091 System.arraycopy (tagAttributes, 0, newAttrib, 0, tagAttributePos);
1092 tagAttributes = newAttrib;
1093 }
1094 tagAttributes [tagAttributePos++] = aname;
1095 }
1096
1097
1098 /***
1099 * Parse an equals sign surrounded by optional whitespace.
1100 * <pre>
1101 * [25] Eq ::= S? '=' S?
1102 * </pre>
1103 */
1104 private void parseEq ()
1105 throws SAXException, IOException
1106 {
1107 skipWhitespace ();
1108 require ('=');
1109 skipWhitespace ();
1110 }
1111
1112
1113 /***
1114 * Parse an end tag.
1115 * <pre>
1116 * [42] ETag ::= '</' Name S? '>'
1117 * </pre>
1118 * <p>NOTE: parseContent () chains to here, we already read the
1119 * "</".
1120 */
1121 private void parseETag ()
1122 throws Exception
1123 {
1124 require (currentElement);
1125 skipWhitespace ();
1126 require ('>');
1127 handler.endElement (currentElement);
1128
1129
1130 }
1131
1132
1133 /***
1134 * Parse the content of an element.
1135 * <pre>
1136 * [43] content ::= (element | CharData | Reference
1137 * | CDSect | PI | Comment)*
1138 * [67] Reference ::= EntityRef | CharRef
1139 * </pre>
1140 * <p> NOTE: consumes ETtag.
1141 */
1142 private void parseContent ()
1143 throws Exception
1144 {
1145 String data;
1146 char c;
1147
1148 while (true) {
1149 switch (currentElementContent) {
1150 case CONTENT_ANY:
1151 case CONTENT_MIXED:
1152 case CONTENT_UNDECLARED:
1153 case CONTENT_EMPTY:
1154 parseCharData ();
1155 break;
1156 case CONTENT_ELEMENTS:
1157 parseWhitespace ();
1158 break;
1159 }
1160
1161
1162 c = readCh ();
1163 switch (c) {
1164
1165 case '&':
1166
1167 c = readCh ();
1168 if (c == '#') {
1169 parseCharRef ();
1170 } else {
1171 unread (c);
1172 parseEntityRef (true);
1173 }
1174 break;
1175
1176 case '<':
1177 dataBufferFlush ();
1178 c = readCh ();
1179 switch (c) {
1180 case '!':
1181 c = readCh ();
1182 switch (c) {
1183 case '-':
1184 require ('-');
1185 parseComment ();
1186 break;
1187 case '[':
1188 require ("CDATA[");
1189 handler.startCDATA ();
1190 inCDATA = true;
1191 parseCDSect ();
1192 inCDATA = false;
1193 handler.endCDATA ();
1194 break;
1195 default:
1196 error ("expected comment or CDATA section", c, null);
1197 break;
1198 }
1199 break;
1200
1201 case '?':
1202 parsePI ();
1203 break;
1204
1205 case '/':
1206 parseETag ();
1207 return;
1208
1209 default:
1210 unread (c);
1211 parseElement ();
1212 break;
1213 }
1214 }
1215 }
1216 }
1217
1218
1219 /***
1220 * Parse an element type declaration.
1221 * <pre>
1222 * [45] elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>'
1223 * </pre>
1224 * <p> NOTE: the '<!ELEMENT' has already been read.
1225 */
1226 private void parseElementdecl ()
1227 throws Exception
1228 {
1229 String name;
1230
1231 requireWhitespace ();
1232
1233 name = readNmtoken (true);
1234
1235 requireWhitespace ();
1236
1237 parseContentspec (name);
1238
1239 skipWhitespace ();
1240 require ('>');
1241 }
1242
1243
1244 /***
1245 * Content specification.
1246 * <pre>
1247 * [46] contentspec ::= 'EMPTY' | 'ANY' | Mixed | elements
1248 * </pre>
1249 */
1250 private void parseContentspec (String name)
1251 throws Exception
1252 {
1253 if (tryRead ("EMPTY")) {
1254 setElement (name, CONTENT_EMPTY, null, null);
1255 return;
1256 } else if (tryRead ("ANY")) {
1257 setElement (name, CONTENT_ANY, null, null);
1258 return;
1259 } else {
1260 require ('(');
1261 dataBufferAppend ('(');
1262 skipWhitespace ();
1263 if (tryRead ("#PCDATA")) {
1264 dataBufferAppend ("#PCDATA");
1265 parseMixed ();
1266 setElement (name, CONTENT_MIXED, dataBufferToString (), null);
1267 } else {
1268 parseElements ();
1269 setElement (name, CONTENT_ELEMENTS,
1270 dataBufferToString (), null);
1271 }
1272 }
1273 }
1274
1275
1276 /***
1277 * Parse an element-content model.
1278 * <pre>
1279 * [47] elements ::= (choice | seq) ('?' | '*' | '+')?
1280 * [49] choice ::= '(' S? cp (S? '|' S? cp)+ S? ')'
1281 * [50] seq ::= '(' S? cp (S? ',' S? cp)* S? ')'
1282 * </pre>
1283 *
1284 * <p> NOTE: the opening '(' and S have already been read.
1285 */
1286 private void parseElements ()
1287 throws Exception
1288 {
1289 char c;
1290 char sep;
1291
1292
1293 skipWhitespace ();
1294 parseCp ();
1295
1296
1297 skipWhitespace ();
1298 c = readCh ();
1299 switch (c) {
1300 case ')':
1301 dataBufferAppend (')');
1302 c = readCh ();
1303 switch (c) {
1304 case '*':
1305 case '+':
1306 case '?':
1307 dataBufferAppend (c);
1308 break;
1309 default:
1310 unread (c);
1311 }
1312 return;
1313 case ',':
1314 case '|':
1315 sep = c;
1316 dataBufferAppend (c);
1317 break;
1318 default:
1319 error ("bad separator in content model", c, null);
1320 return;
1321 }
1322
1323
1324 while (true) {
1325 skipWhitespace ();
1326 parseCp ();
1327 skipWhitespace ();
1328 c = readCh ();
1329 if (c == ')') {
1330 dataBufferAppend (')');
1331 break;
1332 } else if (c != sep) {
1333 error ("bad separator in content model", c, null);
1334 return;
1335 } else {
1336 dataBufferAppend (c);
1337 }
1338 }
1339
1340
1341 c = readCh ();
1342 switch (c) {
1343 case '?':
1344 case '*':
1345 case '+':
1346 dataBufferAppend (c);
1347 return;
1348 default:
1349 unread (c);
1350 return;
1351 }
1352 }
1353
1354
1355 /***
1356 * Parse a content particle.
1357 * <pre>
1358 * [48] cp ::= (Name | choice | seq) ('?' | '*' | '+')?
1359 * </pre>
1360 */
1361 private void parseCp ()
1362 throws Exception
1363 {
1364 char c;
1365
1366 if (tryRead ('(')) {
1367 dataBufferAppend ('(');
1368 parseElements ();
1369 } else {
1370 dataBufferAppend (readNmtoken (true));
1371 c = readCh ();
1372 switch (c) {
1373 case '?':
1374 case '*':
1375 case '+':
1376 dataBufferAppend (c);
1377 break;
1378 default:
1379 unread (c);
1380 break;
1381 }
1382 }
1383 }
1384
1385
1386 /***
1387 * Parse mixed content.
1388 * <pre>
1389 * [51] Mixed ::= '(' S? ( '#PCDATA' (S? '|' S? Name)*) S? ')*'
1390 * | '(' S? ('#PCDATA') S? ')'
1391 * </pre>
1392 */
1393 private void parseMixed ()
1394 throws Exception
1395 {
1396 char c;
1397
1398
1399 skipWhitespace ();
1400 if (tryRead (')')) {
1401 dataBufferAppend (")");
1402 if (tryRead ('*')) {
1403 dataBufferAppend("*");
1404 }
1405 return;
1406 }
1407
1408
1409 skipWhitespace ();
1410 while (!tryRead (")*")) {
1411 require ('|');
1412 dataBufferAppend ('|');
1413 skipWhitespace ();
1414 dataBufferAppend (readNmtoken (true));
1415 skipWhitespace ();
1416 }
1417 dataBufferAppend (")*");
1418 }
1419
1420
1421 /***
1422 * Parse an attribute list declaration.
1423 * <pre>
1424 * [52] AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>'
1425 * </pre>
1426 * <p>NOTE: the '<!ATTLIST' has already been read.
1427 */
1428 private void parseAttlistDecl ()
1429 throws Exception
1430 {
1431 String elementName;
1432
1433 requireWhitespace ();
1434 elementName = readNmtoken (true);
1435 boolean white = tryWhitespace ();
1436 while (!tryRead ('>')) {
1437 if (!white)
1438 error ("whitespace required before attribute definition");
1439 parseAttDef (elementName);
1440 white = tryWhitespace ();
1441 }
1442 }
1443
1444
1445 /***
1446 * Parse a single attribute definition.
1447 * <pre>
1448 * [53] AttDef ::= S Name S AttType S DefaultDecl
1449 * </pre>
1450 */
1451 private void parseAttDef (String elementName)
1452 throws Exception
1453 {
1454 String name;
1455 int type;
1456 String enumer = null;
1457
1458
1459 name = readNmtoken (true);
1460
1461
1462 requireWhitespace ();
1463 type = readAttType ();
1464
1465
1466
1467 if (type == ATTRIBUTE_ENUMERATED || type == ATTRIBUTE_NOTATION) {
1468 enumer = dataBufferToString ();
1469 }
1470
1471
1472 requireWhitespace ();
1473 parseDefault (elementName, name, type, enumer);
1474 }
1475
1476
1477 /***
1478 * Parse the attribute type.
1479 * <pre>
1480 * [54] AttType ::= StringType | TokenizedType | EnumeratedType
1481 * [55] StringType ::= 'CDATA'
1482 * [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY'
1483 * | 'ENTITIES' | 'NMTOKEN' | 'NMTOKENS'
1484 * [57] EnumeratedType ::= NotationType | Enumeration
1485 * </pre>
1486 */
1487 private int readAttType ()
1488 throws Exception
1489 {
1490 String typeString;
1491 Integer type;
1492
1493 if (tryRead ('(')) {
1494 parseEnumeration (false);
1495 return ATTRIBUTE_ENUMERATED;
1496 } else {
1497 typeString = readNmtoken (true);
1498 if (typeString.equals ("NOTATION")) {
1499 parseNotationType ();
1500 }
1501 type = (Integer) attributeTypeHash.get (typeString);
1502 if (type == null) {
1503 error ("illegal attribute type", typeString, null);
1504 return ATTRIBUTE_UNDECLARED;
1505 } else {
1506 return type.intValue ();
1507 }
1508 }
1509 }
1510
1511
1512 /***
1513 * Parse an enumeration.
1514 * <pre>
1515 * [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')'
1516 * </pre>
1517 * <p>NOTE: the '(' has already been read.
1518 */
1519 private void parseEnumeration (boolean isNames)
1520 throws Exception
1521 {
1522 char c;
1523
1524 dataBufferAppend ('(');
1525
1526
1527 skipWhitespace ();
1528 dataBufferAppend (readNmtoken (isNames));
1529
1530 skipWhitespace ();
1531 while (!tryRead (')')) {
1532 require ('|');
1533 dataBufferAppend ('|');
1534 skipWhitespace ();
1535 dataBufferAppend (readNmtoken (isNames));
1536 skipWhitespace ();
1537 }
1538 dataBufferAppend (')');
1539 }
1540
1541
1542 /***
1543 * Parse a notation type for an attribute.
1544 * <pre>
1545 * [58] NotationType ::= 'NOTATION' S '(' S? NameNtoks
1546 * (S? '|' S? name)* S? ')'
1547 * </pre>
1548 * <p>NOTE: the 'NOTATION' has already been read
1549 */
1550 private void parseNotationType ()
1551 throws Exception
1552 {
1553 requireWhitespace ();
1554 require ('(');
1555
1556 parseEnumeration (true);
1557 }
1558
1559
1560 /***
1561 * Parse the default value for an attribute.
1562 * <pre>
1563 * [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED'
1564 * | (('#FIXED' S)? AttValue)
1565 * </pre>
1566 */
1567 private void parseDefault (
1568 String elementName,
1569 String name,
1570 int type,
1571 String enumer
1572 ) throws Exception
1573 {
1574 int valueType = ATTRIBUTE_DEFAULT_SPECIFIED;
1575 String value = null;
1576 int flags = LIT_ATTRIBUTE | LIT_DISABLE_CREF | LIT_ENTITY_CHECK;
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587 if (tryRead ('#')) {
1588 if (tryRead ("FIXED")) {
1589 valueType = ATTRIBUTE_DEFAULT_FIXED;
1590 requireWhitespace ();
1591 value = readLiteral (flags);
1592 } else if (tryRead ("REQUIRED")) {
1593 valueType = ATTRIBUTE_DEFAULT_REQUIRED;
1594 } else if (tryRead ("IMPLIED")) {
1595 valueType = ATTRIBUTE_DEFAULT_IMPLIED;
1596 } else {
1597 error ("illegal keyword for attribute default value");
1598 }
1599 } else
1600 value = readLiteral (flags);
1601 setAttribute (elementName, name, type, enumer, value, valueType);
1602 }
1603
1604
1605 /***
1606 * Parse a conditional section.
1607 * <pre>
1608 * [61] conditionalSect ::= includeSect || ignoreSect
1609 * [62] includeSect ::= '<![' S? 'INCLUDE' S? '['
1610 * extSubsetDecl ']]>'
1611 * [63] ignoreSect ::= '<![' S? 'IGNORE' S? '['
1612 * ignoreSectContents* ']]>'
1613 * [64] ignoreSectContents ::= Ignore
1614 * ('<![' ignoreSectContents* ']]>' Ignore )*
1615 * [65] Ignore ::= Char* - (Char* ( '<![' | ']]>') Char* )
1616 * </pre>
1617 * <p> NOTE: the '>![' has already been read.
1618 */
1619 private void parseConditionalSect ()
1620 throws Exception
1621 {
1622 skipWhitespace ();
1623 if (tryRead ("INCLUDE")) {
1624 skipWhitespace ();
1625 require ('[');
1626 skipWhitespace ();
1627 while (!tryRead ("]]>")) {
1628 parseMarkupdecl ();
1629 skipWhitespace ();
1630 }
1631 } else if (tryRead ("IGNORE")) {
1632 skipWhitespace ();
1633 require ('[');
1634 int nesting = 1;
1635 char c;
1636 expandPE = false;
1637 for (int nest = 1; nest > 0;) {
1638 c = readCh ();
1639 switch (c) {
1640 case '<':
1641 if (tryRead ("![")) {
1642 nest++;
1643 }
1644 case ']':
1645 if (tryRead ("]>")) {
1646 nest--;
1647 }
1648 }
1649 }
1650 expandPE = true;
1651 } else {
1652 error ("conditional section must begin with INCLUDE or IGNORE");
1653 }
1654 }
1655
1656
1657 /***
1658 * Read and interpret a character reference.
1659 * <pre>
1660 * [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
1661 * </pre>
1662 * <p>NOTE: the '&#' has already been read.
1663 */
1664 private void parseCharRef ()
1665 throws SAXException, IOException
1666 {
1667 int value = 0;
1668 char c;
1669
1670 if (tryRead ('x')) {
1671 loop1:
1672 while (true) {
1673 c = readCh ();
1674 switch (c) {
1675 case '0':
1676 case '1':
1677 case '2':
1678 case '3':
1679 case '4':
1680 case '5':
1681 case '6':
1682 case '7':
1683 case '8':
1684 case '9':
1685 case 'a':
1686 case 'A':
1687 case 'b':
1688 case 'B':
1689 case 'c':
1690 case 'C':
1691 case 'd':
1692 case 'D':
1693 case 'e':
1694 case 'E':
1695 case 'f':
1696 case 'F':
1697 value *= 16;
1698 value += Integer.parseInt (new Character (c).toString (),
1699 16);
1700 break;
1701 case ';':
1702 break loop1;
1703 default:
1704 error ("illegal character in character reference", c, null);
1705 break loop1;
1706 }
1707 }
1708 } else {
1709 loop2:
1710 while (true) {
1711 c = readCh ();
1712 switch (c) {
1713 case '0':
1714 case '1':
1715 case '2':
1716 case '3':
1717 case '4':
1718 case '5':
1719 case '6':
1720 case '7':
1721 case '8':
1722 case '9':
1723 value *= 10;
1724 value += Integer.parseInt (new Character (c).toString (),
1725 10);
1726 break;
1727 case ';':
1728 break loop2;
1729 default:
1730 error ("illegal character in character reference", c, null);
1731 break loop2;
1732 }
1733 }
1734 }
1735
1736
1737 if ((value < 0x0020
1738 && ! (value == '\n' || value == '\t' || value == '\r'))
1739 || (value >= 0xD800 && value <= 0xDFFF)
1740 || value == 0xFFFE || value == 0xFFFF
1741 || value > 0x0010ffff)
1742 error ("illegal XML character reference U+"
1743 + Integer.toHexString (value));
1744
1745
1746
1747 if (value <= 0x0000ffff) {
1748
1749 dataBufferAppend ((char) value);
1750 } else if (value <= 0x0010ffff) {
1751 value -= 0x10000;
1752
1753 dataBufferAppend ((char) (0xd800 | (value >> 10)));
1754 dataBufferAppend ((char) (0xdc00 | (value & 0x0003ff)));
1755 } else {
1756
1757 error ("character reference " + value + " is too large for UTF-16",
1758 new Integer (value).toString (), null);
1759 }
1760 }
1761
1762
1763 /***
1764 * Parse and expand an entity reference.
1765 * <pre>
1766 * [68] EntityRef ::= '&' Name ';'
1767 * </pre>
1768 * <p>NOTE: the '&' has already been read.
1769 * @param externalAllowed External entities are allowed here.
1770 */
1771 private void parseEntityRef (boolean externalAllowed)
1772 throws SAXException, IOException
1773 {
1774 String name;
1775
1776 name = readNmtoken (true);
1777 require (';');
1778 switch (getEntityType (name)) {
1779 case ENTITY_UNDECLARED:
1780 error ("reference to undeclared entity", name, null);
1781 break;
1782 case ENTITY_INTERNAL:
1783 pushString (name, getEntityValue (name));
1784 break;
1785 case ENTITY_TEXT:
1786 if (externalAllowed) {
1787 pushURL (name, getEntityPublicId (name),
1788 getEntitySystemId (name),
1789 null, null, null);
1790 } else {
1791 error ("reference to external entity in attribute value.",
1792 name, null);
1793 }
1794 break;
1795 case ENTITY_NDATA:
1796 if (externalAllowed) {
1797 error ("unparsed entity reference in content", name, null);
1798 } else {
1799 error ("reference to external entity in attribute value.",
1800 name, null);
1801 }
1802 break;
1803 }
1804 }
1805
1806
1807 /***
1808 * Parse and expand a parameter entity reference.
1809 * <pre>
1810 * [69] PEReference ::= '%' Name ';'
1811 * </pre>
1812 * <p>NOTE: the '%' has already been read.
1813 */
1814 private void parsePEReference ()
1815 throws SAXException, IOException
1816 {
1817 String name;
1818
1819 name = "%" + readNmtoken (true);
1820 require (';');
1821 switch (getEntityType (name)) {
1822 case ENTITY_UNDECLARED:
1823
1824
1825
1826
1827
1828 break;
1829 case ENTITY_INTERNAL:
1830 if (inLiteral)
1831 pushString (name, getEntityValue (name));
1832 else
1833 pushString (name, " " + getEntityValue (name) + ' ');
1834 break;
1835 case ENTITY_TEXT:
1836 if (!inLiteral)
1837 pushString (null, " ");
1838 pushURL (name, getEntityPublicId (name),
1839 getEntitySystemId (name),
1840 null, null, null);
1841 if (!inLiteral)
1842 pushString (null, " ");
1843 break;
1844 }
1845 }
1846
1847 /***
1848 * Parse an entity declaration.
1849 * <pre>
1850 * [70] EntityDecl ::= GEDecl | PEDecl
1851 * [71] GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>'
1852 * [72] PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
1853 * [73] EntityDef ::= EntityValue | (ExternalID NDataDecl?)
1854 * [74] PEDef ::= EntityValue | ExternalID
1855 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
1856 * | 'PUBLIC' S PubidLiteral S SystemLiteral
1857 * [76] NDataDecl ::= S 'NDATA' S Name
1858 * </pre>
1859 * <p>NOTE: the '<!ENTITY' has already been read.
1860 */
1861 private void parseEntityDecl ()
1862 throws Exception
1863 {
1864 char c;
1865 boolean peFlag = false;
1866 String name, value, notationName, ids[];
1867
1868
1869 expandPE = false;
1870 requireWhitespace ();
1871 if (tryRead ('%')) {
1872 peFlag = true;
1873 requireWhitespace ();
1874 }
1875 expandPE = true;
1876
1877
1878
1879 name = readNmtoken (true);
1880 if (peFlag) {
1881 name = "%" + name;
1882 }
1883
1884
1885 requireWhitespace ();
1886 c = readCh ();
1887 unread (c);
1888 if (c == '"' || c == '\'') {
1889
1890
1891 value = readLiteral (0);
1892 setInternalEntity (name, value);
1893 } else {
1894
1895 ids = readExternalIds (false);
1896 if (ids [1] == null) {
1897 error ("system identifer missing", name, null);
1898 }
1899
1900
1901 boolean white = tryWhitespace ();
1902 if (!peFlag && tryRead ("NDATA")) {
1903 if (!white)
1904 error ("whitespace required before NDATA");
1905 requireWhitespace ();
1906 notationName = readNmtoken (true);
1907 setExternalDataEntity (name, ids [0], ids [1], notationName);
1908 } else {
1909 setExternalTextEntity (name, ids [0], ids [1]);
1910 }
1911 }
1912
1913
1914 skipWhitespace ();
1915 require ('>');
1916 }
1917
1918
1919 /***
1920 * Parse a notation declaration.
1921 * <pre>
1922 * [82] NotationDecl ::= '<!NOTATION' S Name S
1923 * (ExternalID | PublicID) S? '>'
1924 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
1925 * </pre>
1926 * <P>NOTE: the '<!NOTATION' has already been read.
1927 */
1928 private void parseNotationDecl ()
1929 throws Exception
1930 {
1931 String nname, ids[];
1932
1933
1934 requireWhitespace ();
1935 nname = readNmtoken (true);
1936
1937 requireWhitespace ();
1938
1939
1940 ids = readExternalIds (true);
1941 if (ids [0] == null && ids [1] == null) {
1942 error ("external identifer missing", nname, null);
1943 }
1944
1945
1946 setNotation (nname, ids [0], ids [1]);
1947
1948 skipWhitespace ();
1949 require ('>');
1950 }
1951
1952
1953 /***
1954 * Parse character data.
1955 * <pre>
1956 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
1957 * </pre>
1958 */
1959 private void parseCharData ()
1960 throws Exception
1961 {
1962 char c;
1963
1964
1965
1966
1967
1968
1969 if (USE_CHEATS) {
1970 int lineAugment = 0;
1971 int columnAugment = 0;
1972
1973 loop:
1974 for (int i = readBufferPos; i < readBufferLength; i++) {
1975 switch (c = readBuffer [i]) {
1976 case '\n':
1977 lineAugment++;
1978 columnAugment = 0;
1979 break;
1980 case '&':
1981 case '<':
1982 int start = readBufferPos;
1983 columnAugment++;
1984 readBufferPos = i;
1985 if (lineAugment > 0) {
1986 line += lineAugment;
1987 column = columnAugment;
1988 } else {
1989 column += columnAugment;
1990 }
1991 dataBufferAppend (readBuffer, start, i - start);
1992 return;
1993 case ']':
1994
1995 if ((i + 2) < readBufferLength) {
1996 if (readBuffer [i + 1] == ']'
1997 && readBuffer [i + 2] == '>') {
1998 error ("character data may not contain ']]>'");
1999 }
2000 }
2001 columnAugment++;
2002 break;
2003 default:
2004 if (c < 0x0020 || c > 0xFFFD)
2005 error ("illegal XML character U+"
2006 + Integer.toHexString (c));
2007
2008 case '\r':
2009 case '\t':
2010 columnAugment++;
2011 }
2012 }
2013 }
2014
2015
2016
2017 while (true) {
2018 c = readCh ();
2019 switch (c) {
2020 case '<':
2021 case '&':
2022 unread (c);
2023 return;
2024
2025 default:
2026 dataBufferAppend (c);
2027 break;
2028 }
2029 }
2030 }
2031
2032
2033
2034
2035
2036
2037 /***
2038 * Require whitespace characters.
2039 */
2040 private void requireWhitespace ()
2041 throws SAXException, IOException
2042 {
2043 char c = readCh ();
2044 if (isWhitespace (c)) {
2045 skipWhitespace ();
2046 } else {
2047 error ("whitespace required", c, null);
2048 }
2049 }
2050
2051
2052 /***
2053 * Parse whitespace characters, and leave them in the data buffer.
2054 */
2055 private void parseWhitespace ()
2056 throws Exception
2057 {
2058 char c = readCh ();
2059 while (isWhitespace (c)) {
2060 dataBufferAppend (c);
2061 c = readCh ();
2062 }
2063 unread (c);
2064 }
2065
2066
2067 /***
2068 * Skip whitespace characters.
2069 * <pre>
2070 * [3] S ::= (#x20 | #x9 | #xd | #xa)+
2071 * </pre>
2072 */
2073 private void skipWhitespace ()
2074 throws SAXException, IOException
2075 {
2076
2077
2078
2079
2080 if (USE_CHEATS) {
2081 int lineAugment = 0;
2082 int columnAugment = 0;
2083
2084 loop:
2085 for (int i = readBufferPos; i < readBufferLength; i++) {
2086 switch (readBuffer [i]) {
2087 case ' ':
2088 case '\t':
2089 case '\r':
2090 columnAugment++;
2091 break;
2092 case '\n':
2093 lineAugment++;
2094 columnAugment = 0;
2095 break;
2096 case '%':
2097 if (expandPE)
2098 break loop;
2099
2100 default:
2101 readBufferPos = i;
2102 if (lineAugment > 0) {
2103 line += lineAugment;
2104 column = columnAugment;
2105 } else {
2106 column += columnAugment;
2107 }
2108 return;
2109 }
2110 }
2111 }
2112
2113
2114 char c = readCh ();
2115 while (isWhitespace (c)) {
2116 c = readCh ();
2117 }
2118 unread (c);
2119 }
2120
2121
2122 /***
2123 * Read a name or (when parsing an enumeration) name token.
2124 * <pre>
2125 * [5] Name ::= (Letter | '_' | ':') (NameChar)*
2126 * [7] Nmtoken ::= (NameChar)+
2127 * </pre>
2128 */
2129 private String readNmtoken (boolean isName)
2130 throws SAXException, IOException
2131 {
2132 char c;
2133
2134 if (USE_CHEATS) {
2135 loop:
2136 for (int i = readBufferPos; i < readBufferLength; i++) {
2137 c = readBuffer [i];
2138 switch (c) {
2139 case '%':
2140 if (expandPE)
2141 break loop;
2142
2143
2144
2145 case '<': case '>': case '&':
2146 case ',': case '|': case '*': case '+': case '?':
2147 case ')':
2148 case '=':
2149 case '\'': case '"':
2150 case '[':
2151 case ' ': case '\t': case '\r': case '\n':
2152 case ';':
2153 case '/':
2154 int start = readBufferPos;
2155 if (i == start)
2156 error ("name expected", readBuffer [i], null);
2157 readBufferPos = i;
2158 return intern (readBuffer, start, i - start);
2159
2160 default:
2161
2162
2163 if (i == readBufferPos && isName) {
2164 if (!Character.isUnicodeIdentifierStart (c)
2165 && c != ':' && c != '_')
2166 error ("Not a name start character, U+"
2167 + Integer.toHexString (c));
2168 } else if (!Character.isUnicodeIdentifierPart (c)
2169 && c != '-' && c != ':' && c != '_' && c != '.'
2170 && !isExtender (c))
2171 error ("Not a name character, U+"
2172 + Integer.toHexString (c));
2173 }
2174 }
2175 }
2176
2177 nameBufferPos = 0;
2178
2179
2180 loop:
2181 while (true) {
2182 c = readCh ();
2183 switch (c) {
2184 case '%':
2185 case '<': case '>': case '&':
2186 case ',': case '|': case '*': case '+': case '?':
2187 case ')':
2188 case '=':
2189 case '\'': case '"':
2190 case '[':
2191 case ' ': case '\t': case '\n': case '\r':
2192 case ';':
2193 case '/':
2194 unread (c);
2195 if (nameBufferPos == 0) {
2196 error ("name expected");
2197 }
2198
2199 if (isName
2200 && !Character.isUnicodeIdentifierStart (
2201 nameBuffer [0])
2202 && ":_".indexOf (nameBuffer [0]) == -1)
2203 error ("Not a name start character, U+"
2204 + Integer.toHexString (nameBuffer [0]));
2205 String s = intern (nameBuffer, 0, nameBufferPos);
2206 nameBufferPos = 0;
2207 return s;
2208 default:
2209
2210
2211 if ((nameBufferPos != 0 || !isName)
2212 && !Character.isUnicodeIdentifierPart (c)
2213 && ":-_.".indexOf (c) == -1
2214 && !isExtender (c))
2215 error ("Not a name character, U+"
2216 + Integer.toHexString (c));
2217 if (nameBufferPos >= nameBuffer.length)
2218 nameBuffer =
2219 (char[]) extendArray (nameBuffer,
2220 nameBuffer.length, nameBufferPos);
2221 nameBuffer [nameBufferPos++] = c;
2222 }
2223 }
2224 }
2225
2226 private static boolean isExtender (char c)
2227 {
2228
2229 return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387
2230 || c == 0x0640 || c == 0x0e46 || c == 0x0ec6 || c == 0x3005
2231 || (c >= 0x3031 && c <= 0x3035)
2232 || (c >= 0x309d && c <= 0x309e)
2233 || (c >= 0x30fc && c <= 0x30fe);
2234 }
2235
2236
2237 /***
2238 * Read a literal. With matching single or double quotes as
2239 * delimiters (and not embedded!) this is used to parse:
2240 * <pre>
2241 * [9] EntityValue ::= ... ([^%&] | PEReference | Reference)* ...
2242 * [10] AttValue ::= ... ([^<&] | Reference)* ...
2243 * [11] SystemLiteral ::= ... (URLchar - "'")* ...
2244 * [12] PubidLiteral ::= ... (PubidChar - "'")* ...
2245 * </pre>
2246 * as well as the quoted strings in XML and text declarations
2247 * (for version, encoding, and standalone) which have their
2248 * own constraints.
2249 */
2250 private String readLiteral (int flags)
2251 throws SAXException, IOException
2252 {
2253 char delim, c;
2254 int startLine = line;
2255 boolean saved = expandPE;
2256
2257
2258 delim = readCh ();
2259 if (delim != '"' && delim != '\'' && delim != (char) 0) {
2260 error ("expected '\"' or \"'\"", delim, null);
2261 return null;
2262 }
2263 inLiteral = true;
2264 if ((flags & LIT_DISABLE_PE) != 0)
2265 expandPE = false;
2266
2267
2268
2269
2270 char ourBuf [] = readBuffer;
2271
2272
2273 try {
2274 c = readCh ();
2275 loop:
2276 while (! (c == delim && readBuffer == ourBuf)) {
2277 switch (c) {
2278
2279 case '\n':
2280 case '\r':
2281 case '\t':
2282 if ((flags & LIT_ATTRIBUTE) != 0)
2283 c = ' ';
2284 break;
2285 case '&':
2286 c = readCh ();
2287
2288
2289 if (c == '#') {
2290 if ((flags & LIT_DISABLE_CREF) != 0) {
2291 dataBufferAppend ('&');
2292 dataBufferAppend ('#');
2293 continue;
2294 }
2295 parseCharRef ();
2296
2297
2298 } else {
2299 unread (c);
2300
2301 if ((flags & LIT_ENTITY_REF) > 0) {
2302 parseEntityRef (false);
2303
2304
2305 } else if ((flags & LIT_DISABLE_EREF) != 0) {
2306 dataBufferAppend ('&');
2307
2308
2309 } else {
2310 String name = readNmtoken (true);
2311 require (';');
2312 if ((flags & LIT_ENTITY_CHECK) != 0
2313 && getEntityType (name) ==
2314 ENTITY_UNDECLARED) {
2315 error ("General entity '" + name
2316 + "' must be declared before use");
2317 }
2318 dataBufferAppend ('&');
2319 dataBufferAppend (name);
2320 dataBufferAppend (';');
2321 }
2322 }
2323 c = readCh ();
2324 continue loop;
2325
2326 case '<':
2327
2328
2329 if ((flags & LIT_ATTRIBUTE) != 0)
2330 error ("attribute values may not contain '<'");
2331 break;
2332
2333
2334
2335 default:
2336 break;
2337 }
2338 dataBufferAppend (c);
2339 c = readCh ();
2340 }
2341 } catch (EOFException e) {
2342 error ("end of input while looking for delimiter (started on line "
2343 + startLine + ')', null, new Character (delim).toString ());
2344 }
2345 inLiteral = false;
2346 expandPE = saved;
2347
2348
2349 if ((flags & LIT_NORMALIZE) > 0) {
2350 dataBufferNormalize ();
2351 }
2352
2353
2354 return dataBufferToString ();
2355 }
2356
2357
2358 /***
2359 * Try reading external identifiers.
2360 * A system identifier is not required for notations.
2361 * @param inNotation Are we in a notation?
2362 * @return A two-member String array containing the identifiers.
2363 */
2364 private String[] readExternalIds (boolean inNotation)
2365 throws Exception
2366 {
2367 char c;
2368 String ids[] = new String [2];
2369 int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
2370
2371 if (tryRead ("PUBLIC")) {
2372 requireWhitespace ();
2373 ids [0] = readLiteral (LIT_NORMALIZE | flags);
2374 if (inNotation) {
2375 skipWhitespace ();
2376 c = readCh ();
2377 unread (c);
2378 if (c == '"' || c == '\'') {
2379 ids [1] = readLiteral (flags);
2380 }
2381 } else {
2382 requireWhitespace ();
2383 ids [1] = readLiteral (flags);
2384 }
2385
2386 for (int i = 0; i < ids [0].length (); i++) {
2387 c = ids [0].charAt (i);
2388 if (c >= 'a' && c <= 'z')
2389 continue;
2390 if (c >= 'A' && c <= 'Z')
2391 continue;
2392 if (" \r\n0123456789-' ()+,./:=?;!*#@$_%".indexOf (c) != -1)
2393 continue;
2394 error ("illegal PUBLIC id character U+"
2395 + Integer.toHexString (c));
2396 }
2397 } else if (tryRead ("SYSTEM")) {
2398 requireWhitespace ();
2399 ids [1] = readLiteral (flags);
2400 }
2401
2402
2403
2404
2405
2406 return ids;
2407 }
2408
2409
2410 /***
2411 * Test if a character is whitespace.
2412 * <pre>
2413 * [3] S ::= (#x20 | #x9 | #xd | #xa)+
2414 * </pre>
2415 * @param c The character to test.
2416 * @return true if the character is whitespace.
2417 */
2418 private final boolean isWhitespace (char c)
2419 {
2420 if (c > 0x20)
2421 return false;
2422 if (c == 0x20 || c == 0x0a || c == 0x09 || c == 0x0d)
2423 return true;
2424 return false;
2425 }
2426
2427
2428
2429
2430
2431
2432
2433 /***
2434 * Add a character to the data buffer.
2435 */
2436 private void dataBufferAppend (char c)
2437 {
2438
2439 if (dataBufferPos >= dataBuffer.length)
2440 dataBuffer =
2441 (char[]) extendArray (dataBuffer,
2442 dataBuffer.length, dataBufferPos);
2443 dataBuffer [dataBufferPos++] = c;
2444 }
2445
2446
2447 /***
2448 * Add a string to the data buffer.
2449 */
2450 private void dataBufferAppend (String s)
2451 {
2452 dataBufferAppend (s.toCharArray (), 0, s.length ());
2453 }
2454
2455
2456 /***
2457 * Append (part of) a character array to the data buffer.
2458 */
2459 private void dataBufferAppend (char ch[], int start, int length)
2460 {
2461 dataBuffer = (char[])
2462 extendArray (dataBuffer, dataBuffer.length,
2463 dataBufferPos + length);
2464
2465 System.arraycopy (ch, start, dataBuffer, dataBufferPos, length);
2466 dataBufferPos += length;
2467 }
2468
2469
2470 /***
2471 * Normalise whitespace in the data buffer.
2472 */
2473 private void dataBufferNormalize ()
2474 {
2475 int i = 0;
2476 int j = 0;
2477 int end = dataBufferPos;
2478
2479
2480 while (j < end && isWhitespace (dataBuffer [j])) {
2481 j++;
2482 }
2483
2484
2485 while (end > j && isWhitespace (dataBuffer [end - 1])) {
2486 end --;
2487 }
2488
2489
2490 while (j < end) {
2491
2492 char c = dataBuffer [j++];
2493
2494
2495
2496 if (isWhitespace (c)) {
2497 while (j < end && isWhitespace (dataBuffer [j++])) {}
2498
2499 dataBuffer [i++] = ' ';
2500 dataBuffer [i++] = dataBuffer [j - 1];
2501 } else {
2502 dataBuffer [i++] = c;
2503 }
2504 }
2505
2506
2507 dataBufferPos = i;
2508 }
2509
2510
2511 /***
2512 * Convert the data buffer to a string.
2513 */
2514 private String dataBufferToString ()
2515 {
2516 String s = new String (dataBuffer, 0, dataBufferPos);
2517 dataBufferPos = 0;
2518 return s;
2519 }
2520
2521
2522 /***
2523 * Flush the contents of the data buffer to the handler, as
2524 * appropriate, and reset the buffer for new input.
2525 */
2526 private void dataBufferFlush ()
2527 throws SAXException
2528 {
2529 if (currentElementContent == CONTENT_ELEMENTS
2530 && dataBufferPos > 0
2531 && !inCDATA
2532 ) {
2533
2534
2535 for (int i = 0; i < dataBufferPos; i++) {
2536 if (!isWhitespace (dataBuffer [i])) {
2537 handler.charData (dataBuffer, 0, dataBufferPos);
2538 dataBufferPos = 0;
2539 }
2540 }
2541 if (dataBufferPos > 0) {
2542 handler.ignorableWhitespace (dataBuffer, 0, dataBufferPos);
2543 dataBufferPos = 0;
2544 }
2545 } else if (dataBufferPos > 0) {
2546 handler.charData (dataBuffer, 0, dataBufferPos);
2547 dataBufferPos = 0;
2548 }
2549 }
2550
2551
2552 /***
2553 * Require a string to appear, or throw an exception.
2554 * <p><em>Precondition:</em> Entity expansion is not required.
2555 * <p><em>Precondition:</em> data buffer has no characters that
2556 * will get sent to the application.
2557 */
2558 private void require (String delim)
2559 throws SAXException, IOException
2560 {
2561 int length = delim.length ();
2562 char ch [];
2563
2564 if (length < dataBuffer.length) {
2565 ch = dataBuffer;
2566 delim.getChars (0, length, ch, 0);
2567 } else
2568 ch = delim.toCharArray ();
2569
2570 if (USE_CHEATS
2571 && length <= (readBufferLength - readBufferPos)) {
2572 int offset = readBufferPos;
2573
2574 for (int i = 0; i < length; i++, offset++)
2575 if (ch [i] != readBuffer [offset])
2576 error ("required string", null, delim);
2577 readBufferPos = offset;
2578
2579 } else {
2580 for (int i = 0; i < length; i++)
2581 require (ch [i]);
2582 }
2583 }
2584
2585
2586 /***
2587 * Require a character to appear, or throw an exception.
2588 */
2589 private void require (char delim)
2590 throws SAXException, IOException
2591 {
2592 char c = readCh ();
2593
2594 if (c != delim) {
2595 error ("required character", c, new Character (delim).toString ());
2596 }
2597 }
2598
2599
2600 /***
2601 * Create an interned string from a character array.
2602 * Ælfred uses this method to create an interned version
2603 * of all names and name tokens, so that it can test equality
2604 * with <code>==</code> instead of <code>String.equals ()</code>.
2605 *
2606 * <p>This is much more efficient than constructing a non-interned
2607 * string first, and then interning it.
2608 *
2609 * @param ch an array of characters for building the string.
2610 * @param start the starting position in the array.
2611 * @param length the number of characters to place in the string.
2612 * @return an interned string.
2613 * @see #intern (String)
2614 * @see java.lang.String#intern
2615 */
2616 public String intern (char ch[], int start, int length)
2617 {
2618 int index = 0;
2619 int hash = 0;
2620 Object bucket [];
2621
2622
2623 for (int i = start; i < start + length; i++)
2624 hash = 31 * hash + ch [i];
2625 hash = (hash & 0x7fffffff) % SYMBOL_TABLE_LENGTH;
2626
2627
2628 if ((bucket = symbolTable [hash]) == null) {
2629
2630 bucket = new Object [8];
2631
2632
2633
2634 } else {
2635 while (index < bucket.length) {
2636 char chFound [] = (char []) bucket [index];
2637
2638
2639 if (chFound == null)
2640 break;
2641
2642
2643 if (chFound.length == length) {
2644 for (int i = 0; i < chFound.length; i++) {
2645
2646 if (ch [start + i] != chFound [i]) {
2647 break;
2648 } else if (i == length - 1) {
2649
2650 return (String) bucket [index + 1];
2651 }
2652 }
2653 }
2654 index += 2;
2655 }
2656
2657
2658
2659 bucket = (Object []) extendArray (bucket, bucket.length, index);
2660 }
2661 symbolTable [hash] = bucket;
2662
2663
2664
2665 String s = new String (ch, start, length).intern ();
2666 bucket [index] = s.toCharArray ();
2667 bucket [index + 1] = s;
2668 return s;
2669 }
2670
2671
2672 /***
2673 * Ensure the capacity of an array, allocating a new one if
2674 * necessary. Usually called only a handful of times.
2675 */
2676 private Object extendArray (Object array, int currentSize, int requiredSize)
2677 {
2678 if (requiredSize < currentSize) {
2679 return array;
2680 } else {
2681 Object newArray = null;
2682 int newSize = currentSize * 2;
2683
2684 if (newSize <= requiredSize)
2685 newSize = requiredSize + 1;
2686
2687 if (array instanceof char[])
2688 newArray = new char [newSize];
2689 else if (array instanceof Object[])
2690 newArray = new Object [newSize];
2691 else
2692 throw new RuntimeException ();
2693
2694 System.arraycopy (array, 0, newArray, 0, currentSize);
2695 return newArray;
2696 }
2697 }
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709 /***
2710 * Get the declared elements for an XML document.
2711 * <p>The results will be valid only after the DTD (if any) has been
2712 * parsed.
2713 * @return An enumeration of all element types declared for this
2714 * document (as Strings).
2715 * @see #getElementContentType
2716 * @see #getElementContentModel
2717 */
2718 public Iterator declaredElements ()
2719 {
2720 return elementInfo.keySet().iterator();
2721 }
2722
2723
2724 /***
2725 * Look up the content type of an element.
2726 * @param element element info vector
2727 * @param defaultType value for null vector
2728 * @return An integer constant representing the content type.
2729 * @see #CONTENT_UNDECLARED
2730 * @see #CONTENT_ANY
2731 * @see #CONTENT_EMPTY
2732 * @see #CONTENT_MIXED
2733 * @see #CONTENT_ELEMENTS
2734 */
2735 private int getContentType (Object element [], int defaultType)
2736 {
2737 if (element == null)
2738 return defaultType;
2739 else
2740 return ((Integer) element [0]).intValue ();
2741 }
2742
2743
2744 /***
2745 * Look up the content type of an element.
2746 * @param name The element type name.
2747 * @return An integer constant representing the content type.
2748 * @see #getElementContentModel
2749 * @see #CONTENT_UNDECLARED
2750 * @see #CONTENT_ANY
2751 * @see #CONTENT_EMPTY
2752 * @see #CONTENT_MIXED
2753 * @see #CONTENT_ELEMENTS
2754 */
2755 public int getElementContentType (String name)
2756 {
2757 Object element [] = (Object []) elementInfo.get (name);
2758 return getContentType (element, CONTENT_UNDECLARED);
2759 }
2760
2761
2762 /***
2763 * Look up the content model of an element.
2764 * <p>The result will always be null unless the content type is
2765 * CONTENT_ELEMENTS or CONTENT_MIXED.
2766 * @param name The element type name.
2767 * @return The normalised content model, as a string.
2768 * @see #getElementContentType
2769 */
2770 public String getElementContentModel (String name)
2771 {
2772 Object element[] = (Object[]) elementInfo.get (name);
2773 if (element == null) {
2774 return null;
2775 } else {
2776 return (String) element [1];
2777 }
2778 }
2779
2780
2781 /***
2782 * Register an element.
2783 * Array format:
2784 * element type
2785 * attribute hash table
2786 */
2787 private void setElement (String name, int contentType,
2788 String contentModel, HashMap attributes)
2789 throws Exception
2790 {
2791 Object element[];
2792
2793
2794 element = (Object[]) elementInfo.get (name);
2795
2796
2797 if (element == null) {
2798 element = new Object [3];
2799 element [0] = new Integer (CONTENT_UNDECLARED);
2800 element [1] = null;
2801 element [2] = null;
2802 } else if (contentType != CONTENT_UNDECLARED
2803 && ((Integer) element [0]).intValue () != CONTENT_UNDECLARED
2804 ) {
2805
2806 return;
2807 }
2808
2809
2810 if (contentType != CONTENT_UNDECLARED) {
2811 element [0] = new Integer (contentType);
2812 }
2813
2814
2815 if (contentModel != null) {
2816 element [1] = contentModel;
2817 }
2818
2819
2820 if (attributes != null) {
2821 element [2] = attributes;
2822 }
2823
2824
2825 elementInfo.put (name, element);
2826 }
2827
2828
2829 /***
2830 * Look up the attribute hash table for an element.
2831 * The hash table is the second item in the element array.
2832 */
2833 private HashMap getElementAttributes (String name)
2834 {
2835 Object element[] = (Object[]) elementInfo.get (name);
2836 if (element == null) {
2837 return null;
2838 } else {
2839 return (HashMap) element [2];
2840 }
2841 }
2842
2843
2844
2845
2846
2847
2848
2849 /***
2850 * Get the declared attributes for an element type.
2851 * @param elname The name of the element type.
2852 * @return An Iterator of all the attributes declared for
2853 * a specific element type. The results will be valid only
2854 * after the DTD (if any) has been parsed.
2855 * @see #getAttributeType
2856 * @see #getAttributeIterator
2857 * @see #getAttributeDefaultValueType
2858 * @see #getAttributeDefaultValue
2859 * @see #getAttributeExpandedValue
2860 */
2861 private Iterator declaredAttributes (Object element [])
2862 {
2863 HashMap attlist;
2864
2865 if (element == null)
2866 return null;
2867 if ((attlist = (HashMap) element [2]) == null)
2868 return null;
2869 return attlist.keySet().iterator();
2870 }
2871
2872 /***
2873 * Get the declared attributes for an element type.
2874 * @param elname The name of the element type.
2875 * @return An Iterator of all the attributes declared for
2876 * a specific element type. The results will be valid only
2877 * after the DTD (if any) has been parsed.
2878 * @see #getAttributeType
2879 * @see #getAttributeIterator
2880 * @see #getAttributeDefaultValueType
2881 * @see #getAttributeDefaultValue
2882 * @see #getAttributeExpandedValue
2883 */
2884 public Iterator declaredAttributes (String elname)
2885 {
2886 return declaredAttributes ((Object []) elementInfo.get (elname));
2887 }
2888
2889
2890 /***
2891 * Retrieve the declared type of an attribute.
2892 * @param name The name of the associated element.
2893 * @param aname The name of the attribute.
2894 * @return An integer constant representing the attribute type.
2895 * @see #ATTRIBUTE_UNDECLARED
2896 * @see #ATTRIBUTE_CDATA
2897 * @see #ATTRIBUTE_ID
2898 * @see #ATTRIBUTE_IDREF
2899 * @see #ATTRIBUTE_IDREFS
2900 * @see #ATTRIBUTE_ENTITY
2901 * @see #ATTRIBUTE_ENTITIES
2902 * @see #ATTRIBUTE_NMTOKEN
2903 * @see #ATTRIBUTE_NMTOKENS
2904 * @see #ATTRIBUTE_ENUMERATED
2905 * @see #ATTRIBUTE_NOTATION
2906 */
2907 public int getAttributeType (String name, String aname)
2908 {
2909 Object attribute[] = getAttribute (name, aname);
2910 if (attribute == null) {
2911 return ATTRIBUTE_UNDECLARED;
2912 } else {
2913 return ((Integer) attribute [0]).intValue ();
2914 }
2915 }
2916
2917
2918 /***
2919 * Retrieve the allowed values for an enumerated attribute type.
2920 * @param name The name of the associated element.
2921 * @param aname The name of the attribute.
2922 * @return A string containing the token list.
2923 * @see #ATTRIBUTE_ENUMERATED
2924 * @see #ATTRIBUTE_NOTATION
2925 */
2926 public String getAttributeIterator (String name, String aname)
2927 {
2928 Object attribute[] = getAttribute (name, aname);
2929 if (attribute == null) {
2930 return null;
2931 } else {
2932 return (String) attribute [3];
2933 }
2934 }
2935
2936
2937 /***
2938 * Retrieve the default value of a declared attribute.
2939 * @param name The name of the associated element.
2940 * @param aname The name of the attribute.
2941 * @return The default value, or null if the attribute was
2942 * #IMPLIED or simply undeclared and unspecified.
2943 * @see #getAttributeExpandedValue
2944 */
2945 public String getAttributeDefaultValue (String name, String aname)
2946 {
2947 Object attribute[] = getAttribute (name, aname);
2948 if (attribute == null) {
2949 return null;
2950 } else {
2951 return (String) attribute [1];
2952 }
2953 }
2954
2955
2956 /***
2957 * Retrieve the expanded value of a declared attribute.
2958 * <p>General entities will be expanded (once).
2959 * @param name The name of the associated element.
2960 * @param aname The name of the attribute.
2961 * @return The expanded default value, or null if the attribute was
2962 * #IMPLIED or simply undeclared
2963 * @see #getAttributeDefaultValue
2964 */
2965 public String getAttributeExpandedValue (String name, String aname)
2966 throws Exception
2967 {
2968 Object attribute[] = getAttribute (name, aname);
2969
2970 if (attribute == null) {
2971 return null;
2972 } else if (attribute [4] == null && attribute [1] != null) {
2973
2974
2975 char buf [] = new char [1];
2976 int flags = LIT_ENTITY_REF | LIT_ATTRIBUTE;
2977 int type = getAttributeType (name, aname);
2978
2979 if (type != ATTRIBUTE_CDATA && type != ATTRIBUTE_UNDECLARED)
2980 flags |= LIT_NORMALIZE;
2981 buf [0] = '"';
2982 pushCharArray (null, buf, 0, 1);
2983 pushString (null, (String) attribute [1]);
2984 pushCharArray (null, buf, 0, 1);
2985 attribute [4] = readLiteral (flags);
2986 }
2987 return (String) attribute [4];
2988 }
2989
2990
2991 /***
2992 * Retrieve the default value type of a declared attribute.
2993 * @see #ATTRIBUTE_DEFAULT_SPECIFIED
2994 * @see #ATTRIBUTE_DEFAULT_IMPLIED
2995 * @see #ATTRIBUTE_DEFAULT_REQUIRED
2996 * @see #ATTRIBUTE_DEFAULT_FIXED
2997 */
2998 public int getAttributeDefaultValueType (String name, String aname)
2999 {
3000 Object attribute[] = getAttribute (name, aname);
3001 if (attribute == null) {
3002 return ATTRIBUTE_DEFAULT_UNDECLARED;
3003 } else {
3004 return ((Integer) attribute [2]).intValue ();
3005 }
3006 }
3007
3008
3009 /***
3010 * Register an attribute declaration for later retrieval.
3011 * Format:
3012 * - String type
3013 * - String default value
3014 * - int value type
3015 */
3016 private void setAttribute (String elName, String name, int type,
3017 String enumeration,
3018 String value, int valueType)
3019 throws Exception
3020 {
3021 HashMap attlist;
3022 Object attribute[];
3023
3024
3025 attlist = getElementAttributes (elName);
3026 if (attlist == null) {
3027 attlist = new HashMap ();
3028 }
3029
3030
3031 if (attlist.get (name) != null) {
3032 return;
3033 } else {
3034 attribute = new Object [5];
3035 attribute [0] = new Integer (type);
3036 attribute [1] = value;
3037 attribute [2] = new Integer (valueType);
3038 attribute [3] = enumeration;
3039 attribute [4] = null;
3040 attlist.put (name, attribute);
3041
3042
3043
3044 setElement (elName, CONTENT_UNDECLARED, null, attlist);
3045 }
3046 }
3047
3048
3049 /***
3050 * Retrieve the three-member array representing an
3051 * attribute declaration.
3052 */
3053 private Object[] getAttribute (String elName, String name)
3054 {
3055 HashMap attlist;
3056 Object attribute[];
3057
3058 attlist = getElementAttributes (elName);
3059 if (attlist == null) {
3060 return null;
3061 }
3062
3063 attribute = (Object[]) attlist.get (name);
3064 return attribute;
3065 }
3066
3067
3068
3069
3070
3071
3072 /***
3073 * Get declared entities.
3074 * @return An Iterator of all the entities declared for
3075 * this XML document. The results will be valid only
3076 * after the DTD (if any) has been parsed.
3077 * @see #getEntityType
3078 * @see #getEntityPublicId
3079 * @see #getEntitySystemId
3080 * @see #getEntityValue
3081 * @see #getEntityNotationName
3082 */
3083 public Iterator declaredEntities ()
3084 {
3085 return entityInfo.keySet().iterator();
3086 }
3087
3088
3089 /***
3090 * Find the type of an entity.
3091 * @returns An integer constant representing the entity type.
3092 * @see #ENTITY_UNDECLARED
3093 * @see #ENTITY_INTERNAL
3094 * @see #ENTITY_NDATA
3095 * @see #ENTITY_TEXT
3096 */
3097 public int getEntityType (String ename)
3098 {
3099 Object entity[] = (Object[]) entityInfo.get (ename);
3100 if (entity == null) {
3101 return ENTITY_UNDECLARED;
3102 } else {
3103 return ((Integer) entity [0]).intValue ();
3104 }
3105 }
3106
3107
3108 /***
3109 * Return an external entity's public identifier, if any.
3110 * @param ename The name of the external entity.
3111 * @return The entity's system identifier, or null if the
3112 * entity was not declared, if it is not an
3113 * external entity, or if no public identifier was
3114 * provided.
3115 * @see #getEntityType
3116 */
3117 public String getEntityPublicId (String ename)
3118 {
3119 Object entity[] = (Object[]) entityInfo.get (ename);
3120 if (entity == null) {
3121 return null;
3122 } else {
3123 return (String) entity [1];
3124 }
3125 }
3126
3127
3128 /***
3129 * Return an external entity's system identifier.
3130 * @param ename The name of the external entity.
3131 * @return The entity's system identifier, or null if the
3132 * entity was not declared, or if it is not an
3133 * external entity.
3134 * @see #getEntityType
3135 */
3136 public String getEntitySystemId (String ename)
3137 {
3138 Object entity[] = (Object[]) entityInfo.get (ename);
3139 if (entity == null) {
3140 return null;
3141 } else {
3142 return (String) entity [2];
3143 }
3144 }
3145
3146
3147 /***
3148 * Return the value of an internal entity.
3149 * @param ename The name of the internal entity.
3150 * @return The entity's value, or null if the entity was
3151 * not declared, or if it is not an internal entity.
3152 * @see #getEntityType
3153 */
3154 public String getEntityValue (String ename)
3155 {
3156 Object entity[] = (Object[]) entityInfo.get (ename);
3157 if (entity == null) {
3158 return null;
3159 } else {
3160 return (String) entity [3];
3161 }
3162 }
3163
3164
3165 /***
3166 * Get the notation name associated with an NDATA entity.
3167 * @param ename The NDATA entity name.
3168 * @return The associated notation name, or null if the
3169 * entity was not declared, or if it is not an
3170 * NDATA entity.
3171 * @see #getEntityType
3172 */
3173 public String getEntityNotationName (String eName)
3174 {
3175 Object entity[] = (Object[]) entityInfo.get (eName);
3176 if (entity == null) {
3177 return null;
3178 } else {
3179 return (String) entity [4];
3180 }
3181 }
3182
3183
3184 /***
3185 * Register an entity declaration for later retrieval.
3186 */
3187 private void setInternalEntity (String eName, String value)
3188 {
3189 setEntity (eName, ENTITY_INTERNAL, null, null, value, null);
3190 }
3191
3192
3193 /***
3194 * Register an external data entity.
3195 */
3196 private void setExternalDataEntity (String eName, String pubid,
3197 String sysid, String nName)
3198 {
3199 setEntity (eName, ENTITY_NDATA, pubid, sysid, null, nName);
3200 }
3201
3202
3203 /***
3204 * Register an external text entity.
3205 */
3206 private void setExternalTextEntity (String eName,
3207 String pubid, String sysid)
3208 {
3209 setEntity (eName, ENTITY_TEXT, pubid, sysid, null, null);
3210 }
3211
3212
3213 /***
3214 * Register an entity declaration for later retrieval.
3215 */
3216 private void setEntity (String eName, int eClass,
3217 String pubid, String sysid,
3218 String value, String nName)
3219 {
3220 Object entity[];
3221
3222 if (entityInfo.get (eName) == null) {
3223 entity = new Object [5];
3224 entity [0] = new Integer (eClass);
3225 entity [1] = pubid;
3226 entity [2] = sysid;
3227 entity [3] = value;
3228 entity [4] = nName;
3229
3230 entityInfo.put (eName, entity);
3231 }
3232 }
3233
3234
3235
3236
3237
3238
3239 /***
3240 * Get declared notations.
3241 * @return An Iterator of all the notations declared for
3242 * this XML document. The results will be valid only
3243 * after the DTD (if any) has been parsed.
3244 * @see #getNotationPublicId
3245 * @see #getNotationSystemId
3246 */
3247 public Iterator declaredNotations ()
3248 {
3249 return notationInfo.keySet().iterator();
3250 }
3251
3252
3253 /***
3254 * Look up the public identifier for a notation.
3255 * You will normally use this method to look up a notation
3256 * that was provided as an attribute value or for an NDATA entity.
3257 * @param nname The name of the notation.
3258 * @return A string containing the public identifier, or null
3259 * if none was provided or if no such notation was
3260 * declared.
3261 * @see #getNotationSystemId
3262 */
3263 public String getNotationPublicId (String nname)
3264 {
3265 Object notation[] = (Object[]) notationInfo.get (nname);
3266 if (notation == null) {
3267 return null;
3268 } else {
3269 return (String) notation [0];
3270 }
3271 }
3272
3273
3274 /***
3275 * Look up the system identifier for a notation.
3276 * You will normally use this method to look up a notation
3277 * that was provided as an attribute value or for an NDATA entity.
3278 * @param nname The name of the notation.
3279 * @return A string containing the system identifier, or null
3280 * if no such notation was declared.
3281 * @see #getNotationPublicId
3282 */
3283 public String getNotationSystemId (String nname)
3284 {
3285 Object notation[] = (Object[]) notationInfo.get (nname);
3286 if (notation == null) {
3287 return null;
3288 } else {
3289 return (String) notation [1];
3290 }
3291 }
3292
3293
3294 /***
3295 * Register a notation declaration for later retrieval.
3296 * Format:
3297 * - public id
3298 * - system id
3299 */
3300 private void setNotation (String nname, String pubid, String sysid)
3301 throws Exception
3302 {
3303 Object notation[];
3304
3305 if (notationInfo.get (nname) == null) {
3306 notation = new Object [2];
3307 notation [0] = pubid;
3308 notation [1] = sysid;
3309 notationInfo.put (nname, notation);
3310 } else {
3311
3312
3313 }
3314 }
3315
3316
3317
3318
3319
3320
3321
3322 /***
3323 * Return the current line number.
3324 */
3325 public int getLineNumber ()
3326 {
3327 return line;
3328 }
3329
3330
3331 /***
3332 * Return the current column number.
3333 */
3334 public int getColumnNumber ()
3335 {
3336 return column;
3337 }
3338
3339
3340
3341
3342
3343
3344
3345 /***
3346 * Read a single character from the readBuffer.
3347 * <p>The readDataChunk () method maintains the buffer.
3348 * <p>If we hit the end of an entity, try to pop the stack and
3349 * keep going.
3350 * <p> (This approach doesn't really enforce XML's rules about
3351 * entity boundaries, but this is not currently a validating
3352 * parser).
3353 * <p>This routine also attempts to keep track of the current
3354 * position in external entities, but it's not entirely accurate.
3355 * @return The next available input character.
3356 * @see #unread (char)
3357 * @see #unread (String)
3358 * @see #readDataChunk
3359 * @see #readBuffer
3360 * @see #line
3361 * @return The next character from the current input source.
3362 */
3363 private char readCh ()
3364 throws SAXException, IOException
3365 {
3366 char c;
3367
3368
3369
3370
3371
3372 while (readBufferPos >= readBufferLength) {
3373 switch (sourceType) {
3374 case INPUT_READER:
3375 case INPUT_EXTERNAL:
3376 case INPUT_STREAM:
3377 readDataChunk ();
3378 while (readBufferLength < 1) {
3379 popInput ();
3380 if (readBufferLength < 1) {
3381 readDataChunk ();
3382 }
3383 }
3384 break;
3385
3386 default:
3387
3388 popInput ();
3389 break;
3390 }
3391 }
3392
3393 c = readBuffer [readBufferPos++];
3394
3395 if (c == '\n') {
3396 line++;
3397 column = 0;
3398 } else {
3399 if (c == '<')
3400
3401 else if ((c < 0x0020 && (c != '\t') && (c != '\r')) || c > 0xFFFD)
3402 error ("illegal XML character U+"
3403 + Integer.toHexString (c));
3404
3405
3406
3407
3408
3409 else if (c == '%' && expandPE) {
3410 if (peIsError)
3411 error ("PE reference within decl in internal subset.");
3412 parsePEReference ();
3413 return readCh ();
3414 }
3415 column++;
3416 }
3417
3418 return c;
3419 }
3420
3421
3422 /***
3423 * Push a single character back onto the current input stream.
3424 * <p>This method usually pushes the character back onto
3425 * the readBuffer, while the unread (String) method treats the
3426 * string as a new internal entity.
3427 * <p>I don't think that this would ever be called with
3428 * readBufferPos = 0, because the methods always reads a character
3429 * before unreading it, but just in case, I've added a boundary
3430 * condition.
3431 * @param c The character to push back.
3432 * @see #readCh
3433 * @see #unread (String)
3434 * @see #unread (char[])
3435 * @see #readBuffer
3436 */
3437 private void unread (char c)
3438 throws SAXException
3439 {
3440
3441 if (c == '\n') {
3442 line--;
3443 column = -1;
3444 }
3445 if (readBufferPos > 0) {
3446 readBuffer [--readBufferPos] = c;
3447 } else {
3448 pushString (null, new Character (c).toString ());
3449 }
3450 }
3451
3452
3453 /***
3454 * Push a char array back onto the current input stream.
3455 * <p>NOTE: you must <em>never</em> push back characters that you
3456 * haven't actually read: use pushString () instead.
3457 * @see #readCh
3458 * @see #unread (char)
3459 * @see #unread (String)
3460 * @see #readBuffer
3461 * @see #pushString
3462 */
3463 private void unread (char ch[], int length)
3464 throws SAXException
3465 {
3466 for (int i = 0; i < length; i++) {
3467 if (ch [i] == '\n') {
3468 line--;
3469 column = -1;
3470 }
3471 }
3472 if (length < readBufferPos) {
3473 readBufferPos -= length;
3474 } else {
3475 pushCharArray (null, ch, 0, length);
3476 sourceType = INPUT_BUFFER;
3477 }
3478 }
3479
3480
3481 /***
3482 * Push a new external input source.
3483 * The source will be some kind of parsed entity, such as a PE
3484 * (including the external DTD subset) or content for the body.
3485 * <p>TODO: Right now, this method always attempts to autodetect
3486 * the encoding; in the future, it should allow the caller to
3487 * request an encoding explicitly, and it should also look at the
3488 * headers with an HTTP connection.
3489 * @param url The java.net.URL object for the entity.
3490 * @see SAXDriver#resolveEntity
3491 * @see #pushString
3492 * @see #sourceType
3493 * @see #pushInput
3494 * @see #detectEncoding
3495 * @see #sourceType
3496 * @see #readBuffer
3497 */
3498 private void pushURL (
3499 String ename,
3500 String publicId,
3501 String systemId,
3502 Reader reader,
3503 InputStream stream,
3504 String encoding
3505 ) throws SAXException, IOException
3506 {
3507 URL url;
3508 boolean ignoreEncoding = false;
3509
3510
3511 pushInput (ename);
3512
3513
3514
3515 readBuffer = new char [READ_BUFFER_MAX + 4];
3516 readBufferPos = 0;
3517 readBufferLength = 0;
3518 readBufferOverflow = -1;
3519 is = null;
3520 line = 1;
3521
3522 currentByteCount = 0;
3523
3524
3525
3526
3527
3528 if (systemId != null && externalEntity != null) {
3529 systemId = new URL (externalEntity.getURL (), systemId).toString ();
3530 } else if (baseURI != null) {
3531 systemId = new URL (new URL (baseURI), systemId).toString ();
3532
3533 }
3534
3535
3536
3537
3538 if (reader == null && stream == null && systemId != null) {
3539 Object input = handler.resolveEntity (publicId, systemId);
3540 if (input != null) {
3541 if (input instanceof String) {
3542 systemId = (String) input;
3543 } else if (input instanceof InputStream) {
3544 stream = (InputStream) input;
3545 } else if (input instanceof Reader) {
3546 reader = (Reader) input;
3547 }
3548 }
3549 }
3550
3551
3552 if (systemId != null) {
3553 handler.startExternalEntity (systemId);
3554 } else {
3555 handler.startExternalEntity ("[unidentified data stream]");
3556 }
3557
3558
3559
3560 if (reader != null) {
3561 sourceType = INPUT_READER;
3562 this.reader = reader;
3563 tryEncodingDecl (true);
3564 return;
3565 }
3566
3567
3568
3569 if (stream != null) {
3570 sourceType = INPUT_STREAM;
3571 is = stream;
3572 url = null;
3573 } else {
3574
3575
3576
3577 sourceType = INPUT_EXTERNAL;
3578 url = new URL (systemId);
3579
3580 externalEntity = url.openConnection ();
3581 externalEntity.connect ();
3582 is = externalEntity.getInputStream ();
3583 }
3584
3585
3586
3587 if (!is.markSupported ()) {
3588 is = new BufferedInputStream (is);
3589 }
3590
3591
3592 if (encoding == null && externalEntity != null) {
3593
3594
3595
3596 if (!"file".equals (externalEntity.getURL ().getProtocol ())) {
3597 int temp;
3598
3599
3600
3601 encoding = externalEntity.getContentType ();
3602
3603
3604 if (encoding==null) {
3605 temp = -1;
3606 } else {
3607 temp = encoding.indexOf ("charset");
3608 }
3609
3610
3611
3612
3613 if (temp < 0)
3614 encoding = null;
3615 else {
3616 temp = encoding.indexOf ('=', temp + 7);
3617 encoding = encoding.substring (temp + 1);
3618 if ((temp = encoding.indexOf (';')) > 0)
3619 encoding = encoding.substring (0, temp);
3620
3621
3622 if ((temp = encoding.indexOf ('(')) > 0)
3623 encoding = encoding.substring (0, temp);
3624
3625 if ((temp = encoding.indexOf ('"')) > 0)
3626 encoding = encoding.substring (temp + 1,
3627 encoding.indexOf ('"', temp + 2));
3628 encoding.trim ();
3629 }
3630 }
3631 }
3632
3633
3634 if (encoding != null) {
3635 this.encoding = ENCODING_EXTERNAL;
3636 setupDecoding (encoding);
3637 ignoreEncoding = true;
3638
3639
3640 } else {
3641 detectEncoding ();
3642 ignoreEncoding = false;
3643 }
3644
3645
3646 tryEncodingDecl (ignoreEncoding);
3647 }
3648
3649
3650 /***
3651 * Check for an encoding declaration. This is the second part of the
3652 * XML encoding autodetection algorithm, relying on detectEncoding to
3653 * get to the point that this part can read any encoding declaration
3654 * in the document (using only US-ASCII characters).
3655 *
3656 * <p> Because this part starts to fill parser buffers with this data,
3657 * it's tricky to to a reader so that Java's built-in decoders can be
3658 * used for the character encodings that aren't built in to this parser
3659 * (such as EUC-JP, KOI8-R, Big5, etc).
3660 *
3661 * @return any encoding in the declaration, uppercased; or null
3662 * @see detectEncoding
3663 */
3664 private String tryEncodingDecl (boolean ignoreEncoding)
3665 throws SAXException, IOException
3666 {
3667
3668 if (tryRead ("<?xml")) {
3669 dataBufferFlush ();
3670 if (tryWhitespace ()) {
3671 if (inputStack.size () > 0) {
3672 return parseTextDecl (ignoreEncoding);
3673 } else {
3674 return parseXMLDecl (ignoreEncoding);
3675 }
3676 } else {
3677 unread ("xml".toCharArray (), 3);
3678 parsePI ();
3679 }
3680 }
3681 return null;
3682 }
3683
3684
3685 /***
3686 * Attempt to detect the encoding of an entity.
3687 * <p>The trick here (as suggested in the XML standard) is that
3688 * any entity not in UTF-8, or in UCS-2 with a byte-order mark,
3689 * <b>must</b> begin with an XML declaration or an encoding
3690 * declaration; we simply have to look for "<?xml" in various
3691 * encodings.
3692 * <p>This method has no way to distinguish among 8-bit encodings.
3693 * Instead, it sets up for UTF-8, then (possibly) revises its assumption
3694 * later in setupDecoding (). Any ASCII-derived 8-bit encoding
3695 * should work, but most will be rejected later by setupDecoding ().
3696 * <p>I don't currently detect EBCDIC, since I'm concerned that it
3697 * could also be a valid UTF-8 sequence; I'll have to do more checking
3698 * later.
3699 * @see #tryEncoding (byte[], byte, byte, byte, byte)
3700 * @see #tryEncoding (byte[], byte, byte)
3701 * @see #setupDecoding
3702 * @see #read8bitEncodingDeclaration
3703 */
3704 private void detectEncoding ()
3705 throws SAXException, IOException
3706 {
3707 byte signature[] = new byte [4];
3708
3709
3710
3711 is.mark (4);
3712 is.read (signature);
3713 is.reset ();
3714
3715
3716
3717
3718 if (tryEncoding (signature, (byte) 0x00, (byte) 0x00,
3719 (byte) 0x00, (byte) 0x3c)) {
3720
3721
3722 encoding = ENCODING_UCS_4_1234;
3723
3724 } else if (tryEncoding (signature, (byte) 0x3c, (byte) 0x00,
3725 (byte) 0x00, (byte) 0x00)) {
3726
3727 encoding = ENCODING_UCS_4_4321;
3728
3729 } else if (tryEncoding (signature, (byte) 0x00, (byte) 0x00,
3730 (byte) 0x3c, (byte) 0x00)) {
3731
3732 encoding = ENCODING_UCS_4_2143;
3733
3734 } else if (tryEncoding (signature, (byte) 0x00, (byte) 0x3c,
3735 (byte) 0x00, (byte) 0x00)) {
3736
3737 encoding = ENCODING_UCS_4_3412;
3738
3739
3740
3741 }
3742
3743
3744
3745
3746
3747
3748
3749 else if (tryEncoding (signature, (byte) 0xfe, (byte) 0xff)) {
3750
3751
3752 encoding = ENCODING_UCS_2_12;
3753 is.read (); is.read ();
3754
3755 } else if (tryEncoding (signature, (byte) 0xff, (byte) 0xfe)) {
3756
3757
3758 encoding = ENCODING_UCS_2_21;
3759 is.read (); is.read ();
3760
3761 } else if (tryEncoding (signature, (byte) 0x00, (byte) 0x3c,
3762 (byte) 0x00, (byte) 0x3f)) {
3763
3764
3765 encoding = ENCODING_UCS_2_12;
3766 error ("no byte-order mark for UCS-2 entity");
3767
3768 } else if (tryEncoding (signature, (byte) 0x3c, (byte) 0x00,
3769 (byte) 0x3f, (byte) 0x00)) {
3770
3771
3772 encoding = ENCODING_UCS_2_21;
3773 error ("no byte-order mark for UCS-2 entity");
3774 }
3775
3776
3777
3778
3779 else if (tryEncoding (signature, (byte) 0x3c, (byte) 0x3f,
3780 (byte) 0x78, (byte) 0x6d)) {
3781
3782
3783 encoding = ENCODING_UTF_8;
3784 read8bitEncodingDeclaration ();
3785
3786 } else {
3787
3788
3789
3790
3791 encoding = ENCODING_UTF_8;
3792 }
3793 }
3794
3795
3796 /***
3797 * Check for a four-byte signature.
3798 * <p>Utility routine for detectEncoding ().
3799 * <p>Always looks for some part of "<?XML" in a specific encoding.
3800 * @param sig The first four bytes read.
3801 * @param b1 The first byte of the signature
3802 * @param b2 The second byte of the signature
3803 * @param b3 The third byte of the signature
3804 * @param b4 The fourth byte of the signature
3805 * @see #detectEncoding
3806 */
3807 private static boolean tryEncoding (
3808 byte sig[], byte b1, byte b2, byte b3, byte b4)
3809 {
3810 return (sig [0] == b1 && sig [1] == b2
3811 && sig [2] == b3 && sig [3] == b4);
3812 }
3813
3814
3815 /***
3816 * Check for a two-byte signature.
3817 * <p>Looks for a UCS-2 byte-order mark.
3818 * <p>Utility routine for detectEncoding ().
3819 * @param sig The first four bytes read.
3820 * @param b1 The first byte of the signature
3821 * @param b2 The second byte of the signature
3822 * @see #detectEncoding
3823 */
3824 private static boolean tryEncoding (byte sig[], byte b1, byte b2)
3825 {
3826 return ((sig [0] == b1) && (sig [1] == b2));
3827 }
3828
3829
3830 /***
3831 * This method pushes a string back onto input.
3832 * <p>It is useful either as the expansion of an internal entity,
3833 * or for backtracking during the parse.
3834 * <p>Call pushCharArray () to do the actual work.
3835 * @param s The string to push back onto input.
3836 * @see #pushCharArray
3837 */
3838 private void pushString (String ename, String s)
3839 throws SAXException
3840 {
3841 char ch[] = s.toCharArray ();
3842 pushCharArray (ename, ch, 0, ch.length);
3843 }
3844
3845
3846 /***
3847 * Push a new internal input source.
3848 * <p>This method is useful for expanding an internal entity,
3849 * or for unreading a string of characters. It creates a new
3850 * readBuffer containing the characters in the array, instead
3851 * of characters converted from an input byte stream.
3852 * @param ch The char array to push.
3853 * @see #pushString
3854 * @see #pushURL
3855 * @see #readBuffer
3856 * @see #sourceType
3857 * @see #pushInput
3858 */
3859 private void pushCharArray (String ename, char ch[], int start, int length)
3860 throws SAXException
3861 {
3862
3863 pushInput (ename);
3864 sourceType = INPUT_INTERNAL;
3865 readBuffer = ch;
3866 readBufferPos = start;
3867 readBufferLength = length;
3868 readBufferOverflow = -1;
3869 }
3870
3871
3872 /***
3873 * Save the current input source onto the stack.
3874 * <p>This method saves all of the global variables associated with
3875 * the current input source, so that they can be restored when a new
3876 * input source has finished. It also tests for entity recursion.
3877 * <p>The method saves the following global variables onto a stack
3878 * using a fixed-length array:
3879 * <ol>
3880 * <li>sourceType
3881 * <li>externalEntity
3882 * <li>readBuffer
3883 * <li>readBufferPos
3884 * <li>readBufferLength
3885 * <li>line
3886 * <li>encoding
3887 * </ol>
3888 * @param ename The name of the entity (if any) causing the new input.
3889 * @see #popInput
3890 * @see #sourceType
3891 * @see #externalEntity
3892 * @see #readBuffer
3893 * @see #readBufferPos
3894 * @see #readBufferLength
3895 * @see #line
3896 * @see #encoding
3897 */
3898 private void pushInput (String ename)
3899 throws SAXException
3900 {
3901 Object input[] = new Object [12];
3902
3903
3904 if (ename != null) {
3905 Iterator entities = entityStack.iterator ();
3906 while (entities.hasNext ()) {
3907 String e = (String) entities.next ();
3908 if (e == ename) {
3909 error ("recursive reference to entity", ename, null);
3910 }
3911 }
3912 }
3913 entityStack.add (ename);
3914
3915
3916 if (sourceType == INPUT_NONE) {
3917 return;
3918 }
3919
3920
3921
3922 input [0] = new Integer (sourceType);
3923 input [1] = externalEntity;
3924 input [2] = readBuffer;
3925 input [3] = new Integer (readBufferPos);
3926 input [4] = new Integer (readBufferLength);
3927 input [5] = new Integer (line);
3928 input [6] = new Integer (encoding);
3929 input [7] = new Integer (readBufferOverflow);
3930 input [8] = is;
3931 input [9] = new Integer (currentByteCount);
3932 input [10] = new Integer (column);
3933 input [11] = reader;
3934
3935
3936 inputStack.add (input);
3937 }
3938
3939
3940 /***
3941 * Restore a previous input source.
3942 * <p>This method restores all of the global variables associated with
3943 * the current input source.
3944 * @exception java.io.EOFException
3945 * If there are no more entries on the input stack.
3946 * @see #pushInput
3947 * @see #sourceType
3948 * @see #externalEntity
3949 * @see #readBuffer
3950 * @see #readBufferPos
3951 * @see #readBufferLength
3952 * @see #line
3953 * @see #encoding
3954 */
3955 private void popInput ()
3956 throws SAXException, IOException
3957 {
3958 Object input[];
3959
3960
3961 switch (sourceType) {
3962
3963 case INPUT_EXTERNAL:
3964 if (externalEntity != null) {
3965 handler.endExternalEntity (
3966 externalEntity.getURL ().toString ());
3967 }
3968 break;
3969 case INPUT_STREAM:
3970 if (baseURI != null) {
3971 handler.endExternalEntity (baseURI);
3972 }
3973 is.close ();
3974 break;
3975 case INPUT_READER:
3976 if (baseURI != null) {
3977 handler.endExternalEntity (baseURI);
3978 }
3979 reader.close ();
3980 break;
3981 }
3982
3983
3984
3985 if (inputStack.isEmpty ()) {
3986 throw new EOFException ("no more input");
3987 } else {
3988 String s;
3989 input = (Object[]) inputStack.remove ( inputStack.size() - 1 );
3990 s = (String) entityStack.remove ( entityStack.size() - 1 );
3991 }
3992
3993 sourceType = ((Integer) input [0]).intValue ();
3994 externalEntity = (URLConnection) input [1];
3995 readBuffer = (char[]) input [2];
3996 readBufferPos = ((Integer) input [3]).intValue ();
3997 readBufferLength = ((Integer) input [4]).intValue ();
3998 line = ((Integer) input [5]).intValue ();
3999 encoding = ((Integer) input [6]).intValue ();
4000 readBufferOverflow = ((Integer) input [7]).intValue ();
4001 is = (InputStream) input [8];
4002 currentByteCount = ((Integer) input [9]).intValue ();
4003 column = ((Integer) input [10]).intValue ();
4004 reader = (Reader) input [11];
4005 }
4006
4007
4008 /***
4009 * Return true if we can read the expected character.
4010 * <p>Note that the character will be removed from the input stream
4011 * on success, but will be put back on failure. Do not attempt to
4012 * read the character again if the method succeeds.
4013 * @param delim The character that should appear next. For a
4014 * insensitive match, you must supply this in upper-case.
4015 * @return true if the character was successfully read, or false if
4016 * it was not.
4017 * @see #tryRead (String)
4018 */
4019 private boolean tryRead (char delim)
4020 throws SAXException, IOException
4021 {
4022 char c;
4023
4024
4025 c = readCh ();
4026
4027
4028
4029 if (c == delim) {
4030 return true;
4031 } else {
4032 unread (c);
4033 return false;
4034 }
4035 }
4036
4037
4038 /***
4039 * Return true if we can read the expected string.
4040 * <p>This is simply a convenience method.
4041 * <p>Note that the string will be removed from the input stream
4042 * on success, but will be put back on failure. Do not attempt to
4043 * read the string again if the method succeeds.
4044 * <p>This method will push back a character rather than an
4045 * array whenever possible (probably the majority of cases).
4046 * <p><b>NOTE:</b> This method currently has a hard-coded limit
4047 * of 100 characters for the delimiter.
4048 * @param delim The string that should appear next.
4049 * @return true if the string was successfully read, or false if
4050 * it was not.
4051 * @see #tryRead (char)
4052 */
4053 private boolean tryRead (String delim)
4054 throws SAXException, IOException
4055 {
4056 char ch[] = delim.toCharArray ();
4057 char c;
4058
4059
4060
4061
4062 for (int i = 0; i < ch.length; i++) {
4063 c = readCh ();
4064 if (c != ch [i]) {
4065 unread (c);
4066 if (i != 0) {
4067 unread (ch, i);
4068 }
4069 return false;
4070 }
4071 }
4072 return true;
4073 }
4074
4075
4076
4077 /***
4078 * Return true if we can read some whitespace.
4079 * <p>This is simply a convenience method.
4080 * <p>This method will push back a character rather than an
4081 * array whenever possible (probably the majority of cases).
4082 * @return true if whitespace was found.
4083 */
4084 private boolean tryWhitespace ()
4085 throws SAXException, IOException
4086 {
4087 char c;
4088 c = readCh ();
4089 if (isWhitespace (c)) {
4090 skipWhitespace ();
4091 return true;
4092 } else {
4093 unread (c);
4094 return false;
4095 }
4096 }
4097
4098
4099 /***
4100 * Read all data until we find the specified string.
4101 * This is useful for scanning CDATA sections and PIs.
4102 * <p>This is inefficient right now, since it calls tryRead ()
4103 * for every character.
4104 * @param delim The string delimiter
4105 * @see #tryRead (String, boolean)
4106 * @see #readCh
4107 */
4108 private void parseUntil (String delim)
4109 throws SAXException, IOException
4110 {
4111 char c;
4112 int startLine = line;
4113
4114 try {
4115 while (!tryRead (delim)) {
4116 c = readCh ();
4117 dataBufferAppend (c);
4118 }
4119 } catch (EOFException e) {
4120 error ("end of input while looking for delimiter "
4121 + "(started on line " + startLine
4122 + ')', null, delim);
4123 }
4124 }
4125
4126
4127 /***
4128 * Read just the encoding declaration (or XML declaration) at the
4129 * start of an external entity.
4130 * When this method is called, we know that the declaration is
4131 * present (or appears to be). We also know that the entity is
4132 * in some sort of ASCII-derived 8-bit encoding.
4133 * The idea of this is to let us read what the 8-bit encoding is
4134 * before we've committed to converting any more of the file; the
4135 * XML or encoding declaration must be in 7-bit ASCII, so we're
4136 * safe as long as we don't go past it.
4137 */
4138 private void read8bitEncodingDeclaration ()
4139 throws SAXException, IOException
4140 {
4141 int ch;
4142 readBufferPos = readBufferLength = 0;
4143
4144 while (true) {
4145 ch = is.read ();
4146 readBuffer [readBufferLength++] = (char) ch;
4147 switch (ch) {
4148 case (int) '>':
4149 return;
4150 case - 1:
4151 error ("end of file before end of XML or encoding declaration.",
4152 null, "?>");
4153 }
4154 if (readBuffer.length == readBufferLength)
4155 error ("unfinished XML or encoding declaration");
4156 }
4157 }
4158
4159
4160
4161
4162
4163
4164
4165 /***
4166 * Read a chunk of data from an external input source.
4167 * <p>This is simply a front-end that fills the rawReadBuffer
4168 * with bytes, then calls the appropriate encoding handler.
4169 * @see #encoding
4170 * @see #rawReadBuffer
4171 * @see #readBuffer
4172 * @see #filterCR
4173 * @see #copyUtf8ReadBuffer
4174 * @see #copyIso8859_1ReadBuffer
4175 * @see #copyUcs_2ReadBuffer
4176 * @see #copyUcs_4ReadBuffer
4177 */
4178 private void readDataChunk ()
4179 throws SAXException, IOException
4180 {
4181 int count, i, j;
4182
4183
4184 if (readBufferOverflow > -1) {
4185 readBuffer [0] = (char) readBufferOverflow;
4186 readBufferOverflow = -1;
4187 readBufferPos = 1;
4188 sawCR = true;
4189 } else {
4190 readBufferPos = 0;
4191 sawCR = false;
4192 }
4193
4194
4195 if (sourceType == INPUT_READER) {
4196 count = reader.read (readBuffer,
4197 readBufferPos, READ_BUFFER_MAX - readBufferPos);
4198 if (count < 0)
4199 readBufferLength = readBufferPos;
4200 else
4201 readBufferLength = readBufferPos + count;
4202 if (readBufferLength > 0)
4203 filterCR (count >= 0);
4204 sawCR = false;
4205 return;
4206 }
4207
4208
4209 count = is.read (rawReadBuffer, 0, READ_BUFFER_MAX);
4210
4211
4212
4213
4214 if (count > 0) {
4215 switch (encoding) {
4216
4217 case ENCODING_ASCII:
4218 copyIso8859_1ReadBuffer (count, (char) 0x0080);
4219 break;
4220 case ENCODING_UTF_8:
4221 copyUtf8ReadBuffer (count);
4222 break;
4223 case ENCODING_ISO_8859_1:
4224 copyIso8859_1ReadBuffer (count, (char) 0);
4225 break;
4226
4227
4228 case ENCODING_UCS_2_12:
4229 copyUcs2ReadBuffer (count, 8, 0);
4230 break;
4231 case ENCODING_UCS_2_21:
4232 copyUcs2ReadBuffer (count, 0, 8);
4233 break;
4234
4235
4236 case ENCODING_UCS_4_1234:
4237 copyUcs4ReadBuffer (count, 24, 16, 8, 0);
4238 break;
4239 case ENCODING_UCS_4_4321:
4240 copyUcs4ReadBuffer (count, 0, 8, 16, 24);
4241 break;
4242 case ENCODING_UCS_4_2143:
4243 copyUcs4ReadBuffer (count, 16, 24, 0, 8);
4244 break;
4245 case ENCODING_UCS_4_3412:
4246 copyUcs4ReadBuffer (count, 8, 0, 24, 16);
4247 break;
4248 }
4249 } else
4250 readBufferLength = readBufferPos;
4251
4252 readBufferPos = 0;
4253
4254
4255
4256 if (sawCR) {
4257 filterCR (count >= 0);
4258 sawCR = false;
4259
4260
4261 if (readBufferLength == 0 && count >= 0)
4262 readDataChunk ();
4263 }
4264
4265 if (count > 0)
4266 currentByteCount += count;
4267 }
4268
4269
4270 /***
4271 * Filter carriage returns in the read buffer.
4272 * CRLF becomes LF; CR becomes LF.
4273 * @param moreData true iff more data might come from the same source
4274 * @see #readDataChunk
4275 * @see #readBuffer
4276 * @see #readBufferOverflow
4277 */
4278 private void filterCR (boolean moreData)
4279 {
4280 int i, j;
4281
4282 readBufferOverflow = -1;
4283
4284 loop:
4285 for (i = j = readBufferPos; j < readBufferLength; i++, j++) {
4286 switch (readBuffer [j]) {
4287 case '\r':
4288 if (j == readBufferLength - 1) {
4289 if (moreData) {
4290 readBufferOverflow = '\r';
4291 readBufferLength--;
4292 } else
4293 readBuffer [i++] = '\n';
4294 break loop;
4295 } else if (readBuffer [j + 1] == '\n') {
4296 j++;
4297 }
4298 readBuffer [i] = '\n';
4299 break;
4300
4301 case '\n':
4302 default:
4303 readBuffer [i] = readBuffer [j];
4304 break;
4305 }
4306 }
4307 readBufferLength = i;
4308 }
4309
4310 /***
4311 * Convert a buffer of UTF-8-encoded bytes into UTF-16 characters.
4312 * <p>When readDataChunk () calls this method, the raw bytes are in
4313 * rawReadBuffer, and the final characters will appear in
4314 * readBuffer.
4315 * @param count The number of bytes to convert.
4316 * @see #readDataChunk
4317 * @see #rawReadBuffer
4318 * @see #readBuffer
4319 * @see #getNextUtf8Byte
4320 */
4321 private void copyUtf8ReadBuffer (int count)
4322 throws SAXException, IOException
4323 {
4324 int i = 0;
4325 int j = readBufferPos;
4326 int b1;
4327 char c = 0;
4328
4329
4330
4331
4332
4333
4334
4335 while (i < count) {
4336 b1 = rawReadBuffer [i++];
4337
4338
4339
4340
4341 if (b1 < 0) {
4342 if ((b1 & 0xe0) == 0xc0) {
4343
4344 c = (char) (((b1 & 0x1f) << 6)
4345 | getNextUtf8Byte (i++, count));
4346 } else if ((b1 & 0xf0) == 0xe0) {
4347
4348
4349
4350 c = (char) (((b1 & 0x0f) << 12) |
4351 (getNextUtf8Byte (i++, count) << 6) |
4352 getNextUtf8Byte (i++, count));
4353 } else if ((b1 & 0xf8) == 0xf0) {
4354
4355
4356
4357
4358 int iso646 = b1 & 07;
4359 iso646 = (iso646 << 6) + getNextUtf8Byte (i++, count);
4360 iso646 = (iso646 << 6) + getNextUtf8Byte (i++, count);
4361 iso646 = (iso646 << 6) + getNextUtf8Byte (i++, count);
4362
4363 if (iso646 <= 0xffff) {
4364 c = (char) iso646;
4365 } else {
4366 if (iso646 > 0x0010ffff)
4367 encodingError (
4368 "UTF-8 value out of range for Unicode",
4369 iso646, 0);
4370 iso646 -= 0x010000;
4371 readBuffer [j++] = (char) (0xd800 | (iso646 >> 10));
4372 readBuffer [j++] = (char) (0xdc00 | (iso646 & 0x03ff));
4373 continue;
4374 }
4375 } else {
4376
4377
4378 encodingError (
4379 "unsupported five or six byte UTF-8 sequence",
4380 0xff & b1, i);
4381
4382 c = 0;
4383 }
4384 } else {
4385
4386
4387 c = (char) b1;
4388 }
4389 readBuffer [j++] = c;
4390 if (c == '\r')
4391 sawCR = true;
4392 }
4393
4394 readBufferLength = j;
4395 }
4396
4397
4398 /***
4399 * Return the next byte value in a UTF-8 sequence.
4400 * If it is not possible to get a byte from the current
4401 * entity, throw an exception.
4402 * @param pos The current position in the rawReadBuffer.
4403 * @param count The number of bytes in the rawReadBuffer
4404 * @return The significant six bits of a non-initial byte in
4405 * a UTF-8 sequence.
4406 * @exception EOFException If the sequence is incomplete.
4407 */
4408 private int getNextUtf8Byte (int pos, int count)
4409 throws SAXException, IOException
4410 {
4411 int val;
4412
4413
4414
4415 if (pos < count) {
4416 val = rawReadBuffer [pos];
4417 } else {
4418 val = is.read ();
4419 if (val == -1) {
4420 encodingError ("unfinished multi-byte UTF-8 sequence at EOF",
4421 -1, pos);
4422 }
4423 }
4424
4425
4426 if ((val & 0xc0) != 0x80) {
4427 encodingError ("bad continuation of multi-byte UTF-8 sequence",
4428 val, pos + 1);
4429 }
4430
4431
4432 return (val & 0x3f);
4433 }
4434
4435
4436 /***
4437 * Convert a buffer of US-ASCII or ISO-8859-1-encoded bytes into
4438 * UTF-16 characters.
4439 *
4440 * <p>When readDataChunk () calls this method, the raw bytes are in
4441 * rawReadBuffer, and the final characters will appear in
4442 * readBuffer.
4443 *
4444 * @param count The number of bytes to convert.
4445 * @param mask For ASCII conversion, 0x7f; else, 0xff.
4446 * @see #readDataChunk
4447 * @see #rawReadBuffer
4448 * @see #readBuffer
4449 */
4450 private void copyIso8859_1ReadBuffer (int count, char mask)
4451 throws IOException
4452 {
4453 int i, j;
4454 for (i = 0, j = readBufferPos; i < count; i++, j++) {
4455 char c = (char) (rawReadBuffer [i] & 0xff);
4456 if ((c & mask) != 0)
4457 throw new CharConversionException ("non-ASCII character U+"
4458 + Integer.toHexString (c));
4459 readBuffer [j] = c;
4460 if (c == '\r') {
4461 sawCR = true;
4462 }
4463 }
4464 readBufferLength = j;
4465 }
4466
4467
4468 /***
4469 * Convert a buffer of UCS-2-encoded bytes into UTF-16 characters
4470 * (as used in Java string manipulation).
4471 *
4472 * <p>When readDataChunk () calls this method, the raw bytes are in
4473 * rawReadBuffer, and the final characters will appear in
4474 * readBuffer.
4475 * @param count The number of bytes to convert.
4476 * @param shift1 The number of bits to shift byte 1.
4477 * @param shift2 The number of bits to shift byte 2
4478 * @see #readDataChunk
4479 * @see #rawReadBuffer
4480 * @see #readBuffer
4481 */
4482 private void copyUcs2ReadBuffer (int count, int shift1, int shift2)
4483 throws SAXException
4484 {
4485 int j = readBufferPos;
4486
4487 if (count > 0 && (count % 2) != 0) {
4488 encodingError ("odd number of bytes in UCS-2 encoding", -1, count);
4489 }
4490
4491 if (shift1 == 0) {
4492 for (int i = 0; i < count; i += 2) {
4493 char c = (char) (rawReadBuffer [i + 1] << 8);
4494 c |= 0xff & rawReadBuffer [i];
4495 readBuffer [j++] = c;
4496 if (c == '\r')
4497 sawCR = true;
4498 }
4499 } else {
4500 for (int i = 0; i < count; i += 2) {
4501 char c = (char) (rawReadBuffer [i] << 8);
4502 c |= 0xff & rawReadBuffer [i + 1];
4503 readBuffer [j++] = c;
4504 if (c == '\r')
4505 sawCR = true;
4506 }
4507 }
4508 readBufferLength = j;
4509 }
4510
4511
4512 /***
4513 * Convert a buffer of UCS-4-encoded bytes into UTF-16 characters.
4514 *
4515 * <p>When readDataChunk () calls this method, the raw bytes are in
4516 * rawReadBuffer, and the final characters will appear in
4517 * readBuffer.
4518 * <p>Java has Unicode chars, and this routine uses surrogate pairs
4519 * for ISO-10646 values between 0x00010000 and 0x000fffff. An
4520 * exception is thrown if the ISO-10646 character has no Unicode
4521 * representation.
4522 *
4523 * @param count The number of bytes to convert.
4524 * @param shift1 The number of bits to shift byte 1.
4525 * @param shift2 The number of bits to shift byte 2
4526 * @param shift3 The number of bits to shift byte 2
4527 * @param shift4 The number of bits to shift byte 2
4528 * @see #readDataChunk
4529 * @see #rawReadBuffer
4530 * @see #readBuffer
4531 */
4532 private void copyUcs4ReadBuffer (int count, int shift1, int shift2,
4533 int shift3, int shift4)
4534 throws SAXException
4535 {
4536 int j = readBufferPos;
4537 int value;
4538
4539 if (count > 0 && (count % 4) != 0) {
4540 encodingError (
4541 "number of bytes in UCS-4 encoding not divisible by 4",
4542 -1, count);
4543 }
4544 for (int i = 0; i < count; i += 4) {
4545 value = (((rawReadBuffer [i] & 0xff) << shift1) |
4546 ((rawReadBuffer [i + 1] & 0xff) << shift2) |
4547 ((rawReadBuffer [i + 2] & 0xff) << shift3) |
4548 ((rawReadBuffer [i + 3] & 0xff) << shift4));
4549 if (value < 0x0000ffff) {
4550 readBuffer [j++] = (char) value;
4551 if (value == (int) '\r') {
4552 sawCR = true;
4553 }
4554 } else if (value < 0x0010ffff) {
4555 value -= 0x010000;
4556 readBuffer [j++] = (char) (0xd8 | ((value >> 10) & 0x03ff));
4557 readBuffer [j++] = (char) (0xdc | (value & 0x03ff));
4558 } else {
4559 encodingError ("UCS-4 value out of range for Unicode",
4560 value, i);
4561 }
4562 }
4563 readBufferLength = j;
4564 }
4565
4566
4567 /***
4568 * Report a character encoding error.
4569 */
4570 private void encodingError (String message, int value, int offset)
4571 throws SAXException
4572 {
4573 String uri;
4574
4575 if (value != -1) {
4576 message = message + " (character code: 0x" +
4577 Integer.toHexString (value) + ')';
4578 }
4579 if (externalEntity != null) {
4580 uri = externalEntity.getURL ().toString ();
4581 } else {
4582 uri = baseURI;
4583 }
4584 handler.error (message, uri, -1, offset + currentByteCount);
4585 }
4586
4587
4588
4589
4590
4591
4592 /***
4593 * Re-initialize the variables for each parse.
4594 */
4595 private void initializeVariables ()
4596 {
4597
4598 line = 1;
4599 column = 0;
4600
4601
4602 dataBufferPos = 0;
4603 dataBuffer = new char [DATA_BUFFER_INITIAL];
4604 nameBufferPos = 0;
4605 nameBuffer = new char [NAME_BUFFER_INITIAL];
4606
4607
4608 elementInfo = new HashMap ();
4609 entityInfo = new HashMap ();
4610 notationInfo = new HashMap ();
4611
4612
4613
4614 currentElement = null;
4615 currentElementContent = CONTENT_UNDECLARED;
4616
4617
4618 sourceType = INPUT_NONE;
4619 inputStack = new ArrayList ();
4620 entityStack = new ArrayList ();
4621 externalEntity = null;
4622 tagAttributePos = 0;
4623 tagAttributes = new String [100];
4624 rawReadBuffer = new byte [READ_BUFFER_MAX];
4625 readBufferOverflow = -1;
4626
4627 inLiteral = false;
4628 expandPE = false;
4629 peIsError = false;
4630
4631 inCDATA = false;
4632
4633 symbolTable = new Object [SYMBOL_TABLE_LENGTH][];
4634 }
4635
4636
4637 /***
4638 * Clean up after the parse to allow some garbage collection.
4639 */
4640 private void cleanupVariables ()
4641 {
4642 dataBuffer = null;
4643 nameBuffer = null;
4644
4645 elementInfo = null;
4646 entityInfo = null;
4647 notationInfo = null;
4648
4649 currentElement = null;
4650
4651 inputStack = null;
4652 entityStack = null;
4653 externalEntity = null;
4654
4655 tagAttributes = null;
4656 rawReadBuffer = null;
4657
4658 symbolTable = null;
4659 }
4660
4661
4662
4663
4664 private SAXDriver handler;
4665
4666
4667
4668
4669 private Reader reader;
4670 private InputStream is;
4671 private int line;
4672 private int column;
4673 private int sourceType;
4674 private ArrayList inputStack;
4675 private URLConnection externalEntity;
4676 private int encoding;
4677 private int currentByteCount;
4678
4679
4680
4681
4682 private char readBuffer [];
4683 private int readBufferPos;
4684 private int readBufferLength;
4685 private int readBufferOverflow;
4686
4687
4688
4689
4690
4691 private final static int READ_BUFFER_MAX = 16384;
4692 private byte rawReadBuffer [];
4693
4694
4695
4696
4697
4698 private static int DATA_BUFFER_INITIAL = 4096;
4699 private char dataBuffer [];
4700 private int dataBufferPos;
4701
4702
4703
4704
4705 private static int NAME_BUFFER_INITIAL = 1024;
4706 private char nameBuffer [];
4707 private int nameBufferPos;
4708
4709
4710
4711
4712
4713 private HashMap elementInfo;
4714 private HashMap entityInfo;
4715 private HashMap notationInfo;
4716
4717
4718
4719
4720
4721 private String currentElement;
4722 private int currentElementContent;
4723
4724
4725
4726
4727 private String basePublicId;
4728 private String baseURI;
4729 private int baseEncoding;
4730 private Reader baseReader;
4731 private InputStream baseInputStream;
4732 private char baseInputBuffer [];
4733 private int baseInputBufferStart;
4734 private int baseInputBufferLength;
4735
4736
4737
4738
4739 private ArrayList entityStack;
4740
4741
4742
4743
4744
4745 private boolean inLiteral;
4746 private boolean expandPE;
4747 private boolean peIsError;
4748
4749
4750
4751
4752 private final static int SYMBOL_TABLE_LENGTH = 1087;
4753 private Object symbolTable [][];
4754
4755
4756
4757
4758 private String tagAttributes [];
4759 private int tagAttributePos;
4760
4761
4762
4763
4764
4765
4766 private boolean sawCR;
4767
4768
4769
4770
4771 private boolean inCDATA;
4772 }
4773