1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49 package org.dom4j.io.aelfred2;
50
51 import java.io.BufferedInputStream;
52 import java.io.CharConversionException;
53 import java.io.EOFException;
54 import java.io.InputStream;
55 import java.io.InputStreamReader;
56 import java.io.IOException;
57 import java.io.Reader;
58 import java.io.UnsupportedEncodingException;
59 import java.net.URL;
60 import java.net.URLConnection;
61
62
63
64 import java.util.Enumeration;
65 import java.util.Hashtable;
66 import java.util.Stack;
67
68 import org.xml.sax.InputSource;
69 import org.xml.sax.SAXException;
70
71
72 /***
73 * Parse XML documents and return parse events through call-backs.
74 * Use the <code>SAXDriver</code> class as your entry point, as all
75 * internal parser interfaces are subject to change.
76 *
77 * @author Written by David Megginson <dmeggins@microstar.com>
78 * (version 1.2a with bugfixes)
79 * @author Updated by David Brownell <dbrownell@users.sourceforge.net>
80 * @see SAXDriver
81 */
82 final class XmlParser
83 {
84
85 private final static boolean USE_CHEATS = true;
86
87
88
89
90
91
92
/**
 * Creates a parser that has no callback handler yet.
 * A handler must be installed through {@link #setHandler} before
 * {@link #doParse} is called; otherwise that call fails with an
 * {@code IllegalStateException}.
 *
 * @see #setHandler
 * @see #doParse
 */
XmlParser() {
}
102
103
104 /***
105 * Set the handler that will receive parsing events.
106 * @param handler The handler to receive callback events.
107 * @see #parse
108 */
109
110 void setHandler (SAXDriver handler)
111 {
112 this.handler = handler;
113 }
114
115
116 /***
117 * Parse an XML document from the character stream, byte stream, or URI
118 * that you provide (in that order of preference). Any URI that you
119 * supply will become the base URI for resolving relative URI, and may
120 * be used to acquire a reader or byte stream.
121 *
122 * <p> Only one thread at a time may use this parser; since it is
123 * private to this package, post-parse cleanup is done by the caller,
124 * which MUST NOT REUSE the parser (just null it).
125 *
126 * @param systemId Absolute URI of the document; should never be null,
127 * but may be so iff a reader <em>or</em> a stream is provided.
128 * @param publicId The public identifier of the document, or null.
129 * @param reader A character stream; must be null if stream isn't.
130 * @param stream A byte input stream; must be null if reader isn't.
131 * @param encoding The suggested encoding, or null if unknown.
132 * @exception java.lang.Exception Basically SAXException or IOException
133 */
134
135 void doParse (
136 String systemId,
137 String publicId,
138 Reader reader,
139 InputStream stream,
140 String encoding
141 ) throws Exception
142 {
143 if (handler == null)
144 throw new IllegalStateException ("no callback handler");
145
146 initializeVariables ();
147
148
149
150
151 setInternalEntity ("amp", "&");
152 setInternalEntity ("lt", "<");
153 setInternalEntity ("gt", ">");
154 setInternalEntity ("apos", "'");
155 setInternalEntity ("quot", """);
156
157 try {
158
159
160 handler.startDocument ();
161 pushURL (false, "[document]",
162
163 new String [] { publicId, systemId, null},
164 reader, stream, encoding, false);
165
166 parseDocument ();
167 } catch (EOFException e){
168
169 error("empty document, with no root element.");
170 }finally {
171 if (reader != null)
172 try { reader.close ();
173 } catch (IOException e) {
174 if (stream != null)
175 try { stream.close ();
176 } catch (IOException e) {
177 if (is != null)
178 try { is.close ();
179 } catch (IOException e) {
180 if (reader != null)
181 try {
182 reader.close ();
183 } catch (IOException e) {
184 }
185 scratch = null;
186 }
187 }
188
189
190
191
192
193
194
195
196
197
198 /***
199 * Constant: an element has not been declared.
200 * @see #getElementContentType
201 */
202 public final static int CONTENT_UNDECLARED = 0;
203
204 /***
205 * Constant: the element has a content model of ANY.
206 * @see #getElementContentType
207 */
208 public final static int CONTENT_ANY = 1;
209
210 /***
211 * Constant: the element has declared content of EMPTY.
212 * @see #getElementContentType
213 */
214 public final static int CONTENT_EMPTY = 2;
215
216 /***
217 * Constant: the element has mixed content.
218 * @see #getElementContentType
219 */
220 public final static int CONTENT_MIXED = 3;
221
222 /***
223 * Constant: the element has element content.
224 * @see #getElementContentType
225 */
226 public final static int CONTENT_ELEMENTS = 4;
227
228
229
230
231
232
233 /***
234 * Constant: the entity has not been declared.
235 * @see #getEntityType
236 */
237 public final static int ENTITY_UNDECLARED = 0;
238
239 /***
240 * Constant: the entity is internal.
241 * @see #getEntityType
242 */
243 public final static int ENTITY_INTERNAL = 1;
244
245 /***
246 * Constant: the entity is external, non-parsable data.
247 * @see #getEntityType
248 */
249 public final static int ENTITY_NDATA = 2;
250
251 /***
252 * Constant: the entity is external XML data.
253 * @see #getEntityType
254 */
255 public final static int ENTITY_TEXT = 3;
256
257
258
259
260
261
262
263
264
265 private final static int ENCODING_EXTERNAL = 0;
266 private final static int ENCODING_UTF_8 = 1;
267 private final static int ENCODING_ISO_8859_1 = 2;
268 private final static int ENCODING_UCS_2_12 = 3;
269 private final static int ENCODING_UCS_2_21 = 4;
270 private final static int ENCODING_UCS_4_1234 = 5;
271 private final static int ENCODING_UCS_4_4321 = 6;
272 private final static int ENCODING_UCS_4_2143 = 7;
273 private final static int ENCODING_UCS_4_3412 = 8;
274 private final static int ENCODING_ASCII = 9;
275
276
277
278
279
280
281 /***
282 * Constant: the attribute is not declared.
283 * @see #getAttributeDefaultValueType
284 */
285 public final static int ATTRIBUTE_DEFAULT_UNDECLARED = 30;
286
287 /***
288 * Constant: the attribute has a literal default value specified.
289 * @see #getAttributeDefaultValueType
290 * @see #getAttributeDefaultValue
291 */
292 public final static int ATTRIBUTE_DEFAULT_SPECIFIED = 31;
293
294 /***
295 * Constant: the attribute was declared #IMPLIED.
296 * @see #getAttributeDefaultValueType
297 */
298 public final static int ATTRIBUTE_DEFAULT_IMPLIED = 32;
299
300 /***
301 * Constant: the attribute was declared #REQUIRED.
302 * @see #getAttributeDefaultValueType
303 */
304 public final static int ATTRIBUTE_DEFAULT_REQUIRED = 33;
305
306 /***
307 * Constant: the attribute was declared #FIXED.
308 * @see #getAttributeDefaultValueType
309 * @see #getAttributeDefaultValue
310 */
311 public final static int ATTRIBUTE_DEFAULT_FIXED = 34;
312
313
314
315
316
317 private final static int INPUT_NONE = 0;
318 private final static int INPUT_INTERNAL = 1;
319 private final static int INPUT_STREAM = 3;
320 private final static int INPUT_READER = 5;
321
322
323
324
325
326
327 private final static int LIT_ENTITY_REF = 2;
328
329 private final static int LIT_NORMALIZE = 4;
330
331 private final static int LIT_ATTRIBUTE = 8;
332
333 private final static int LIT_DISABLE_PE = 16;
334
335 private final static int LIT_DISABLE_CREF = 32;
336
337 private final static int LIT_DISABLE_EREF = 64;
338
339 private final static int LIT_PUBID = 256;
340
341
342
343
344
345
346 private final static int CONTEXT_NORMAL = 0;
347 private final static int CONTEXT_LITERAL = 1;
348
349
350
351
352
353
354
355 /***
356 * Report an error.
357 * @param message The error message.
358 * @param textFound The text that caused the error (or null).
359 * @see SAXDriver#error
360 * @see #line
361 */
362 private void error (String message, String textFound, String textExpected)
363 throws SAXException
364 {
365 if (textFound != null) {
366 message = message + " (found \"" + textFound + "\")";
367 }
368 if (textExpected != null) {
369 message = message + " (expected \"" + textExpected + "\")";
370 }
371 handler.fatal (message);
372
373
374 throw new SAXException (message);
375 }
376
377
378 /***
379 * Report a serious error.
380 * @param message The error message.
381 * @param textFound The text that caused the error (or null).
382 */
383 private void error (String message, char textFound, String textExpected)
384 throws SAXException
385 {
386 error (message, new Character (textFound).toString (), textExpected);
387 }
388
389 /*** Report typical case fatal errors. */
390 private void error (String message)
391 throws SAXException
392 {
393 handler.fatal (message);
394 }
395
396
397
398
399
400
401
402 /***
403 * Parse an XML document.
404 * <pre>
405 * [1] document ::= prolog element Misc*
406 * </pre>
407 * <p>This is the top-level parsing function for a single XML
408 * document. As a minimum, a well-formed document must have
409 * a document element, and a valid document must have a prolog
410 * (one with doctype) as well.
411 */
412 private void parseDocument ()
413 throws Exception
414 {
415 try {
416 boolean sawDTD = parseProlog ();
417 require ('<');
418 parseElement (!sawDTD);
419 } catch (EOFException ee) {
420 error("premature end of file", "[EOF]", null);
421 }
422
423 try {
424 parseMisc ();
425 char c = readCh ();
426 error ("unexpected characters after document end", c, null);
427 } catch (EOFException e) {
428 return;
429 }
430 }
431
432 static final char startDelimComment [] = { '<', '!', '-', '-' };
433 static final char endDelimComment [] = { '-', '-' };
434
435 /***
436 * Skip a comment.
437 * <pre>
438 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* "-->"
439 * </pre>
440 * <p> (The <code><!--</code> has already been read.)
441 */
442 private void parseComment ()
443 throws Exception
444 {
445 char c;
446 boolean saved = expandPE;
447
448 expandPE = false;
449 parseUntil (endDelimComment);
450 require ('>');
451 expandPE = saved;
452 handler.comment (dataBuffer, 0, dataBufferPos);
453 dataBufferPos = 0;
454 }
455
456 static final char startDelimPI [] = { '<', '?' };
457 static final char endDelimPI [] = { '?', '>' };
458
459 /***
460 * Parse a processing instruction and do a call-back.
461 * <pre>
462 * [16] PI ::= '<?' PITarget
463 * (S (Char* - (Char* '?>' Char*)))?
464 * '?>'
465 * [17] PITarget ::= Name - ( ('X'|'x') ('M'|m') ('L'|l') )
466 * </pre>
467 * <p> (The <code><?</code> has already been read.)
468 */
469 private void parsePI ()
470 throws SAXException, IOException
471 {
472 String name;
473 boolean saved = expandPE;
474
475 expandPE = false;
476 name = readNmtoken (true);
477
478 if (name.indexOf(':') >= 0)
479 error ("Illegal character(':') in processing instruction name ", name, null);
480 if ("xml".equalsIgnoreCase (name))
481 error ("Illegal processing instruction target", name, null);
482 if (!tryRead (endDelimPI)) {
483 requireWhitespace ();
484 parseUntil (endDelimPI);
485 }
486 expandPE = saved;
487 handler.processingInstruction (name, dataBufferToString ());
488 }
489
490
491 static final char endDelimCDATA [] = { ']', ']', '>' };
492
493 private boolean isDirtyCurrentElement;
494
495 /***
496 * Parse a CDATA section.
497 * <pre>
498 * [18] CDSect ::= CDStart CData CDEnd
499 * [19] CDStart ::= '<![CDATA['
500 * [20] CData ::= (Char* - (Char* ']]>' Char*))
501 * [21] CDEnd ::= ']]>'
502 * </pre>
503 * <p> (The '<![CDATA[' has already been read.)
504 */
505 private void parseCDSect ()
506 throws Exception
507 {
508 parseUntil (endDelimCDATA);
509 dataBufferFlush ();
510 }
511
512
513 /***
514 * Parse the prolog of an XML document.
515 * <pre>
516 * [22] prolog ::= XMLDecl? Misc* (Doctypedecl Misc*)?
517 * </pre>
518 * <p>We do not look for the XML declaration here, because it was
519 * handled by pushURL ().
520 * @see pushURL
521 * @return true if a DTD was read.
522 */
523 private boolean parseProlog ()
524 throws Exception
525 {
526 parseMisc ();
527
528 if (tryRead ("<!DOCTYPE")) {
529 parseDoctypedecl ();
530 parseMisc ();
531 return true;
532 }
533 return false;
534 }
535
536 private void checkLegalVersion (String version)
537 throws SAXException
538 {
539 int len = version.length ();
540 for (int i = 0; i < len; i++) {
541 char c = version.charAt (i);
542 if ('0' <= c && c <= '9')
543 continue;
544 if (c == '_' || c == '.' || c == ':' || c == '-')
545 continue;
546 if ('a' <= c && c <= 'z')
547 continue;
548 if ('A' <= c && c <= 'Z')
549 continue;
550 error ("illegal character in version", version, "1.0");
551 }
552 }
553
554
555 /***
556 * Parse the XML declaration.
557 * <pre>
558 * [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
559 * [24] VersionInfo ::= S 'version' Eq
560 * ("'" VersionNum "'" | '"' VersionNum '"' )
561 * [26] VersionNum ::= ([a-zA-Z0-9_.:] | '-')*
562 * [32] SDDecl ::= S 'standalone' Eq
563 * ( "'"" ('yes' | 'no') "'"" | '"' ("yes" | "no") '"' )
564 * [80] EncodingDecl ::= S 'encoding' Eq
565 * ( "'" EncName "'" | "'" EncName "'" )
566 * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
567 * </pre>
568 * <p> (The <code><?xml</code> and whitespace have already been read.)
569 * @return the encoding in the declaration, uppercased; or null
570 * @see #parseTextDecl
571 * @see #setupDecoding
572 */
573 private String parseXMLDecl (boolean ignoreEncoding)
574 throws SAXException, IOException
575 {
576 String version;
577 String encodingName = null;
578 String standalone = null;
579 int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
580
581
582 require ("version");
583 parseEq ();
584 checkLegalVersion (version = readLiteral (flags));
585 if (!version.equals ("1.0")){
586 if(version.equals ("1.1")){
587 handler.warn ("expected XML version 1.0, not: " + version);
588 xmlVersion = XML_11;
589 }else {
590 error("illegal XML version", version, "1.0 or 1.1");
591 }
592 }
593 else
594 xmlVersion = XML_10;
595
596 boolean white = tryWhitespace ();
597
598 if (tryRead ("encoding")) {
599 if (!white)
600 error ("whitespace required before 'encoding='");
601 parseEq ();
602 encodingName = readLiteral (flags);
603 if (!ignoreEncoding)
604 setupDecoding (encodingName);
605 }
606
607
608 if (encodingName != null)
609 white = tryWhitespace ();
610 if (tryRead ("standalone")) {
611 if (!white)
612 error ("whitespace required before 'standalone='");
613 parseEq ();
614 standalone = readLiteral (flags);
615 if ("yes".equals (standalone))
616 docIsStandalone = true;
617 else if (!"no".equals (standalone))
618 error ("standalone flag must be 'yes' or 'no'");
619 }
620
621 skipWhitespace ();
622 require ("?>");
623
624 return encodingName;
625 }
626
627
628 /***
629 * Parse a text declaration.
630 * <pre>
631 * [79] TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>'
632 * [80] EncodingDecl ::= S 'encoding' Eq
633 * ( '"' EncName '"' | "'" EncName "'" )
634 * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
635 * </pre>
636 * <p> (The <code><?xml</code>' and whitespace have already been read.)
637 * @return the encoding in the declaration, uppercased; or null
638 * @see #parseXMLDecl
639 * @see #setupDecoding
640 */
641 private String parseTextDecl (boolean ignoreEncoding)
642 throws SAXException, IOException
643 {
644 String encodingName = null;
645 int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
646
647
648 if (tryRead ("version")) {
649 String version;
650 parseEq ();
651 checkLegalVersion (version = readLiteral (flags));
652
653 if (version.equals ("1.1")){
654 if (xmlVersion == XML_10){
655 error ("external subset has later version number.", "1.0", version);
656 }
657 handler.warn ("expected XML version 1.0, not: " + version);
658 xmlVersion = XML_11;
659 }else if(!version.equals ("1.0")) {
660 error("illegal XML version", version, "1.0 or 1.1");
661 }
662 requireWhitespace ();
663 }
664
665
666
667 require ("encoding");
668 parseEq ();
669 encodingName = readLiteral (flags);
670 if (!ignoreEncoding)
671 setupDecoding (encodingName);
672
673 skipWhitespace ();
674 require ("?>");
675
676 return encodingName;
677 }
678
679
680 /***
681 * Sets up internal state so that we can decode an entity using the
682 * specified encoding. This is used when we start to read an entity
683 * and we have been given knowledge of its encoding before we start to
684 * read any data (e.g. from a SAX input source or from a MIME type).
685 *
686 * <p> It is also used after autodetection, at which point only very
687 * limited adjustments to the encoding may be used (switching between
688 * related builtin decoders).
689 *
690 * @param encodingName The name of the encoding specified by the user.
691 * @exception IOException if the encoding isn't supported either
692 * internally to this parser, or by the hosting JVM.
693 * @see #parseXMLDecl
694 * @see #parseTextDecl
695 */
696 private void setupDecoding (String encodingName)
697 throws SAXException, IOException
698 {
699 encodingName = encodingName.toUpperCase ();
700
701
702
703
704
705
706
707
708
709 if (encoding == ENCODING_UTF_8 || encoding == ENCODING_EXTERNAL) {
710 if (encodingName.equals ("ISO-8859-1")
711 || encodingName.equals ("8859_1")
712 || encodingName.equals ("ISO8859_1")
713 ) {
714 encoding = ENCODING_ISO_8859_1;
715 return;
716 } else if (encodingName.equals ("US-ASCII")
717 || encodingName.equals ("ASCII")) {
718 encoding = ENCODING_ASCII;
719 return;
720 } else if (encodingName.equals ("UTF-8")
721 || encodingName.equals ("UTF8")) {
722 encoding = ENCODING_UTF_8;
723 return;
724 } else if (encoding != ENCODING_EXTERNAL) {
725
726 throw new UnsupportedEncodingException (encodingName);
727 }
728
729
730 }
731
732
733 if (encoding == ENCODING_UCS_2_12 || encoding == ENCODING_UCS_2_21) {
734 if (!(encodingName.equals ("ISO-10646-UCS-2")
735 || encodingName.equals ("UTF-16")
736 || encodingName.equals ("UTF-16BE")
737 || encodingName.equals ("UTF-16LE")))
738 error ("unsupported Unicode encoding",
739 encodingName,
740 "UTF-16");
741 return;
742 }
743
744
745 if (encoding == ENCODING_UCS_4_1234
746 || encoding == ENCODING_UCS_4_4321
747 || encoding == ENCODING_UCS_4_2143
748 || encoding == ENCODING_UCS_4_3412) {
749
750 if (!encodingName.equals ("ISO-10646-UCS-4"))
751 error ("unsupported 32-bit encoding",
752 encodingName,
753 "ISO-10646-UCS-4");
754 return;
755 }
756
757
758
759
760
761 if (encodingName.equals ("UTF-16BE")) {
762 encoding = ENCODING_UCS_2_12;
763 return;
764 }
765 if (encodingName.equals ("UTF-16LE")) {
766 encoding = ENCODING_UCS_2_21;
767 return;
768 }
769
770
771
772
773
774 if (encodingName.equals ("UTF-16")
775 || encodingName.equals ("ISO-10646-UCS-2"))
776 encodingName = "Unicode";
777
778
779 reader = new InputStreamReader (is, encodingName);
780 sourceType = INPUT_READER;
781 }
782
783
784 /***
785 * Parse miscellaneous markup outside the document element and DOCTYPE
786 * declaration.
787 * <pre>
788 * [27] Misc ::= Comment | PI | S
789 * </pre>
790 */
791 private void parseMisc ()
792 throws Exception
793 {
794 while (true) {
795 skipWhitespace ();
796 if (tryRead (startDelimPI)) {
797 parsePI ();
798 } else if (tryRead (startDelimComment)) {
799 parseComment ();
800 } else {
801 return;
802 }
803 }
804 }
805
806
807 /***
808 * Parse a document type declaration.
809 * <pre>
810 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
811 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
812 * </pre>
813 * <p> (The <code><!DOCTYPE</code> has already been read.)
814 */
815 private void parseDoctypedecl ()
816 throws Exception
817 {
818 String rootName, ids[];
819
820
821 requireWhitespace ();
822 rootName = readNmtoken (true);
823
824
825 skipWhitespace ();
826 ids = readExternalIds (false, true);
827
828
829 handler.doctypeDecl (rootName, ids [0], ids [1]);
830
831
832 skipWhitespace ();
833 if (tryRead ('[')) {
834
835
836 while (true) {
837 doReport = expandPE = true;
838 skipWhitespace ();
839 doReport = expandPE = false;
840 if (tryRead (']')) {
841 break;
842 } else {
843
844 peIsError = expandPE = true;
845 parseMarkupdecl ();
846 peIsError = expandPE = false;
847 }
848 }
849 }
850 skipWhitespace ();
851 require ('>');
852
853
854 InputSource subset;
855
856 if (ids [1] == null)
857 subset = handler.getExternalSubset (rootName,
858 handler.getSystemId ());
859 else
860 subset = null;
861 if (ids [1] != null || subset != null) {
862 pushString (null, ">");
863
864
865
866 if (ids [1] != null)
867 pushURL (true, "[dtd]", ids, null, null, null, true);
868 else {
869 handler.warn ("modifying document by adding external subset");
870 pushURL (true, "[dtd]",
871 new String [] { subset.getPublicId (),
872 subset.getSystemId (), null },
873 subset.getCharacterStream (),
874 subset.getByteStream (),
875 subset.getEncoding (),
876 false);
877 }
878
879
880 while (true) {
881 doReport = expandPE = true;
882 skipWhitespace ();
883 doReport = expandPE = false;
884 if (tryRead ('>')) {
885 break;
886 } else {
887 expandPE = true;
888 parseMarkupdecl ();
889 expandPE = false;
890 }
891 }
892
893
894 if (inputStack.size () != 1)
895 error ("external subset has unmatched '>'");
896 }
897
898
899 handler.endDoctype ();
900 expandPE = false;
901 doReport = true;
902 }
903
904
905 /***
906 * Parse a markup declaration in the internal or external DTD subset.
907 * <pre>
908 * [29] markupdecl ::= elementdecl | Attlistdecl | EntityDecl
909 * | NotationDecl | PI | Comment
910 * [30] extSubsetDecl ::= (markupdecl | conditionalSect
911 * | PEReference | S) *
912 * </pre>
913 * <p> Reading toplevel PE references is handled as a lexical issue
914 * by the caller, as is whitespace.
915 */
916 private void parseMarkupdecl ()
917 throws Exception
918 {
919 char saved [] = null;
920 boolean savedPE = expandPE;
921
922
923 require ('<');
924 unread ('<');
925 expandPE = false;
926
927 if (tryRead ("<!ELEMENT")) {
928 saved = readBuffer;
929 expandPE = savedPE;
930 parseElementDecl ();
931 } else if (tryRead ("<!ATTLIST")) {
932 saved = readBuffer;
933 expandPE = savedPE;
934 parseAttlistDecl ();
935 } else if (tryRead ("<!ENTITY")) {
936 saved = readBuffer;
937 expandPE = savedPE;
938 parseEntityDecl ();
939 } else if (tryRead ("<!NOTATION")) {
940 saved = readBuffer;
941 expandPE = savedPE;
942 parseNotationDecl ();
943 } else if (tryRead (startDelimPI)) {
944 saved = readBuffer;
945 expandPE = savedPE;
946 parsePI ();
947 } else if (tryRead (startDelimComment)) {
948 saved = readBuffer;
949 expandPE = savedPE;
950 parseComment ();
951 } else if (tryRead ("<![")) {
952 saved = readBuffer;
953 expandPE = savedPE;
954 if (inputStack.size () > 0)
955 parseConditionalSect (saved);
956 else
957 error ("conditional sections illegal in internal subset");
958 } else {
959 error ("expected markup declaration");
960 }
961
962
963 if (readBuffer != saved)
964 handler.verror ("Illegal Declaration/PE nesting");
965 }
966
967
968 /***
969 * Parse an element, with its tags.
970 * <pre>
971 * [39] element ::= EmptyElementTag | STag content ETag
972 * [40] STag ::= '<' Name (S Attribute)* S? '>'
973 * [44] EmptyElementTag ::= '<' Name (S Attribute)* S? '/>'
974 * </pre>
975 * <p> (The '<' has already been read.)
976 * <p>NOTE: this method actually chains onto parseContent (), if necessary,
977 * and parseContent () will take care of calling parseETag ().
978 */
979 private void parseElement (boolean maybeGetSubset)
980 throws Exception
981 {
982 String gi;
983 char c;
984 int oldElementContent = currentElementContent;
985 String oldElement = currentElement;
986 Object element [];
987
988
989
990 tagAttributePos = 0;
991
992
993 gi = readNmtoken (true);
994
995
996
997 if (maybeGetSubset) {
998 InputSource subset = handler.getExternalSubset (gi,
999 handler.getSystemId ());
1000 if (subset != null) {
1001 String publicId = subset.getPublicId ();
1002 String systemId = subset.getSystemId ();
1003
1004 handler.warn ("modifying document by adding DTD");
1005 handler.doctypeDecl (gi, publicId, systemId);
1006 pushString (null, ">");
1007
1008
1009
1010 pushURL (true, "[dtd]",
1011 new String [] { publicId, systemId, null },
1012 subset.getCharacterStream (),
1013 subset.getByteStream (),
1014 subset.getEncoding (),
1015 false);
1016
1017
1018 while (true) {
1019 doReport = expandPE = true;
1020 skipWhitespace ();
1021 doReport = expandPE = false;
1022 if (tryRead ('>')) {
1023 break;
1024 } else {
1025 expandPE = true;
1026 parseMarkupdecl ();
1027 expandPE = false;
1028 }
1029 }
1030
1031
1032 if (inputStack.size () != 1)
1033 error ("external subset has unmatched '>'");
1034
1035 handler.endDoctype ();
1036 }
1037 }
1038
1039
1040 currentElement = gi;
1041 element = (Object []) elementInfo.get (gi);
1042 currentElementContent = getContentType (element, CONTENT_ANY);
1043
1044
1045
1046 boolean white = tryWhitespace ();
1047 c = readCh ();
1048 while (c != '/' && c != '>') {
1049 unread (c);
1050 if (!white)
1051 error ("need whitespace between attributes");
1052 parseAttribute (gi);
1053 white = tryWhitespace ();
1054 c = readCh ();
1055 }
1056
1057
1058 Enumeration atts = declaredAttributes (element);
1059 if (atts != null) {
1060 String aname;
1061 loop:
1062 while (atts.hasMoreElements ()) {
1063 aname = (String) atts.nextElement ();
1064
1065 for (int i = 0; i < tagAttributePos; i++) {
1066 if (tagAttributes [i] == aname) {
1067 continue loop;
1068 }
1069 }
1070
1071 String value = getAttributeDefaultValue (gi, aname);
1072
1073 if (value == null)
1074 continue;
1075 handler.attribute (aname, value, false);
1076 }
1077 }
1078
1079
1080
1081
1082 switch (c) {
1083 case '>':
1084 handler.startElement (gi);
1085 parseContent ();
1086 break;
1087 case '/':
1088 require ('>');
1089 handler.startElement (gi);
1090 handler.endElement (gi);
1091 break;
1092 }
1093
1094
1095 currentElement = oldElement;
1096 currentElementContent = oldElementContent;
1097 }
1098
1099
1100 /***
1101 * Parse an attribute assignment.
1102 * <pre>
1103 * [41] Attribute ::= Name Eq AttValue
1104 * </pre>
1105 * @param name The name of the attribute's element.
1106 * @see SAXDriver#attribute
1107 */
1108 private void parseAttribute (String name)
1109 throws Exception
1110 {
1111 String aname;
1112 String type;
1113 String value;
1114 int flags = LIT_ATTRIBUTE | LIT_ENTITY_REF;
1115
1116
1117 aname = readNmtoken (true);
1118 type = getAttributeType (name, aname);
1119
1120
1121 parseEq ();
1122
1123
1124
1125 if (handler.getFeature (SAXDriver.FEATURE + "string-interning")) {
1126 if (type == "CDATA" || type == null) {
1127 value = readLiteral (flags);
1128 } else {
1129 value = readLiteral (flags | LIT_NORMALIZE);
1130 }
1131 } else {
1132 if (type.equals("CDATA") || type == null) {
1133 value = readLiteral (flags);
1134 } else {
1135 value = readLiteral (flags | LIT_NORMALIZE);
1136 }
1137 }
1138
1139
1140 for (int i = 0; i < tagAttributePos; i++)
1141 if (aname.equals (tagAttributes [i]))
1142 error ("duplicate attribute", aname, null);
1143
1144
1145
1146 handler.attribute (aname, value, true);
1147 dataBufferPos = 0;
1148
1149
1150
1151 if (tagAttributePos == tagAttributes.length) {
1152 String newAttrib[] = new String [tagAttributes.length * 2];
1153 System.arraycopy (tagAttributes, 0, newAttrib, 0, tagAttributePos);
1154 tagAttributes = newAttrib;
1155 }
1156 tagAttributes [tagAttributePos++] = aname;
1157 }
1158
1159
1160 /***
1161 * Parse an equals sign surrounded by optional whitespace.
1162 * <pre>
1163 * [25] Eq ::= S? '=' S?
1164 * </pre>
1165 */
1166 private void parseEq ()
1167 throws SAXException, IOException
1168 {
1169 skipWhitespace ();
1170 require ('=');
1171 skipWhitespace ();
1172 }
1173
1174
1175 /***
1176 * Parse an end tag.
1177 * <pre>
1178 * [42] ETag ::= '</' Name S? '>'
1179 * </pre>
1180 * <p>NOTE: parseContent () chains to here, we already read the
1181 * "</".
1182 */
1183 private void parseETag ()
1184 throws Exception
1185 {
1186 require (currentElement);
1187 skipWhitespace ();
1188 require ('>');
1189 handler.endElement (currentElement);
1190
1191
1192 }
1193
1194
1195 /***
1196 * Parse the content of an element.
1197 * <pre>
1198 * [43] content ::= (element | CharData | Reference
1199 * | CDSect | PI | Comment)*
1200 * [67] Reference ::= EntityRef | CharRef
1201 * </pre>
1202 * <p> NOTE: consumes ETtag.
1203 */
1204 private void parseContent ()
1205 throws Exception
1206 {
1207 char c;
1208
1209 while (true) {
1210
1211 parseCharData ();
1212
1213
1214 c = readCh ();
1215 switch (c) {
1216
1217 case '&':
1218 c = readCh ();
1219 if (c == '#') {
1220 parseCharRef ();
1221 } else {
1222 unread (c);
1223 parseEntityRef (true);
1224 }
1225 isDirtyCurrentElement = true;
1226 break;
1227
1228 case '<':
1229 dataBufferFlush ();
1230 c = readCh ();
1231 switch (c) {
1232 case '!':
1233 c = readCh ();
1234 switch (c) {
1235 case '-':
1236 require ('-');
1237 isDirtyCurrentElement = false;
1238 parseComment ();
1239 break;
1240 case '[':
1241 isDirtyCurrentElement = false;
1242 require ("CDATA[");
1243 handler.startCDATA ();
1244 inCDATA = true;
1245 parseCDSect ();
1246 inCDATA = false;
1247 handler.endCDATA ();
1248 break;
1249 default:
1250 error ("expected comment or CDATA section", c, null);
1251 break;
1252 }
1253 break;
1254
1255 case '?':
1256 isDirtyCurrentElement = false;
1257 parsePI ();
1258 break;
1259
1260 case '/':
1261 isDirtyCurrentElement = false;
1262 parseETag ();
1263 return;
1264
1265 default:
1266 isDirtyCurrentElement = false;
1267 unread (c);
1268 parseElement (false);
1269 break;
1270 }
1271 }
1272 }
1273
1274 }
1275
1276
1277 /***
1278 * Parse an element type declaration.
1279 * <pre>
1280 * [45] elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>'
1281 * </pre>
1282 * <p> NOTE: the '<!ELEMENT' has already been read.
1283 */
1284 private void parseElementDecl ()
1285 throws Exception
1286 {
1287 String name;
1288
1289 requireWhitespace ();
1290
1291 name = readNmtoken (true);
1292
1293 requireWhitespace ();
1294
1295 parseContentspec (name);
1296
1297 skipWhitespace ();
1298 require ('>');
1299 }
1300
1301
1302 /***
1303 * Content specification.
1304 * <pre>
1305 * [46] contentspec ::= 'EMPTY' | 'ANY' | Mixed | elements
1306 * </pre>
1307 */
1308 private void parseContentspec (String name)
1309 throws Exception
1310 {
1311
1312 if (tryRead ("EMPTY")) {
1313 setElement (name, CONTENT_EMPTY, null, null);
1314 if (!skippedPE)
1315 handler.getDeclHandler ().elementDecl (name, "EMPTY");
1316 return;
1317 } else if (tryRead ("ANY")) {
1318 setElement (name, CONTENT_ANY, null, null);
1319 if (!skippedPE)
1320 handler.getDeclHandler ().elementDecl (name, "ANY");
1321 return;
1322 } else {
1323 String model;
1324 char saved [];
1325
1326 require ('(');
1327 saved = readBuffer;
1328 dataBufferAppend ('(');
1329 skipWhitespace ();
1330 if (tryRead ("#PCDATA")) {
1331 dataBufferAppend ("#PCDATA");
1332 parseMixed (saved);
1333 model = dataBufferToString ();
1334 setElement (name, CONTENT_MIXED, model, null);
1335 } else {
1336 parseElements (saved);
1337 model = dataBufferToString ();
1338 setElement (name, CONTENT_ELEMENTS, model, null);
1339 }
1340 if (!skippedPE)
1341 handler.getDeclHandler ().elementDecl (name, model);
1342 }
1343 }
1344
1345 /***
1346 * Parse an element-content model.
1347 * <pre>
1348 * [47] elements ::= (choice | seq) ('?' | '*' | '+')?
1349 * [49] choice ::= '(' S? cp (S? '|' S? cp)+ S? ')'
1350 * [50] seq ::= '(' S? cp (S? ',' S? cp)* S? ')'
1351 * </pre>
1352 *
1353 * <p> NOTE: the opening '(' and S have already been read.
1354 *
1355 * @param saved Buffer for entity that should have the terminal ')'
1356 */
1357 private void parseElements (char saved [])
1358 throws Exception
1359 {
1360 char c;
1361 char sep;
1362
1363
1364 skipWhitespace ();
1365 parseCp ();
1366
1367
1368 skipWhitespace ();
1369 c = readCh ();
1370 switch (c) {
1371 case ')':
1372
1373 if (readBuffer != saved)
1374 handler.verror ("Illegal Group/PE nesting");
1375
1376 dataBufferAppend (')');
1377 c = readCh ();
1378 switch (c) {
1379 case '*':
1380 case '+':
1381 case '?':
1382 dataBufferAppend (c);
1383 break;
1384 default:
1385 unread (c);
1386 }
1387 return;
1388 case ',':
1389 case '|':
1390 sep = c;
1391 dataBufferAppend (c);
1392 break;
1393 default:
1394 error ("bad separator in content model", c, null);
1395 return;
1396 }
1397
1398
1399 while (true) {
1400 skipWhitespace ();
1401 parseCp ();
1402 skipWhitespace ();
1403 c = readCh ();
1404 if (c == ')') {
1405
1406 if (readBuffer != saved)
1407 handler.verror ("Illegal Group/PE nesting");
1408
1409 dataBufferAppend (')');
1410 break;
1411 } else if (c != sep) {
1412 error ("bad separator in content model", c, null);
1413 return;
1414 } else {
1415 dataBufferAppend (c);
1416 }
1417 }
1418
1419
1420 c = readCh ();
1421 switch (c) {
1422 case '?':
1423 case '*':
1424 case '+':
1425 dataBufferAppend (c);
1426 return;
1427 default:
1428 unread (c);
1429 return;
1430 }
1431 }
1432
1433
1434 /***
1435 * Parse a content particle.
1436 * <pre>
1437 * [48] cp ::= (Name | choice | seq) ('?' | '*' | '+')?
1438 * </pre>
1439 */
1440 private void parseCp ()
1441 throws Exception
1442 {
1443 if (tryRead ('(')) {
1444 dataBufferAppend ('(');
1445 parseElements (readBuffer);
1446 } else {
1447 dataBufferAppend (readNmtoken (true));
1448 char c = readCh ();
1449 switch (c) {
1450 case '?':
1451 case '*':
1452 case '+':
1453 dataBufferAppend (c);
1454 break;
1455 default:
1456 unread (c);
1457 break;
1458 }
1459 }
1460 }
1461
1462
1463 /***
1464 * Parse mixed content.
1465 * <pre>
1466 * [51] Mixed ::= '(' S? ( '#PCDATA' (S? '|' S? Name)*) S? ')*'
1467 * | '(' S? ('#PCDATA') S? ')'
1468 * </pre>
1469 *
1470 * @param saved Buffer for entity that should have the terminal ')'
1471 */
1472 private void parseMixed (char saved [])
1473 throws Exception
1474 {
1475
1476 skipWhitespace ();
1477 if (tryRead (')')) {
1478
1479 if (readBuffer != saved)
1480 handler.verror ("Illegal Group/PE nesting");
1481
1482 dataBufferAppend (")*");
1483 tryRead ('*');
1484 return;
1485 }
1486
1487
1488 skipWhitespace ();
1489 while (!tryRead (")")) {
1490 require ('|');
1491 dataBufferAppend ('|');
1492 skipWhitespace ();
1493 dataBufferAppend (readNmtoken (true));
1494 skipWhitespace ();
1495 }
1496
1497
1498 if (readBuffer != saved)
1499 handler.verror ("Illegal Group/PE nesting");
1500
1501 require ('*');
1502 dataBufferAppend (")*");
1503 }
1504
1505
1506 /***
1507 * Parse an attribute list declaration.
1508 * <pre>
1509 * [52] AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>'
1510 * </pre>
1511 * <p>NOTE: the '<!ATTLIST' has already been read.
1512 */
1513 private void parseAttlistDecl ()
1514 throws Exception
1515 {
1516 String elementName;
1517
1518 requireWhitespace ();
1519 elementName = readNmtoken (true);
1520 boolean white = tryWhitespace ();
1521 while (!tryRead ('>')) {
1522 if (!white)
1523 error ("whitespace required before attribute definition");
1524 parseAttDef (elementName);
1525 white = tryWhitespace ();
1526 }
1527 }
1528
1529
1530 /***
1531 * Parse a single attribute definition.
1532 * <pre>
1533 * [53] AttDef ::= S Name S AttType S DefaultDecl
1534 * </pre>
1535 */
1536 private void parseAttDef (String elementName)
1537 throws Exception
1538 {
1539 String name;
1540 String type;
1541 String enumer = null;
1542
1543
1544 name = readNmtoken (true);
1545
1546
1547 requireWhitespace ();
1548 type = readAttType ();
1549
1550
1551 if (handler.getFeature (SAXDriver.FEATURE + "string-interning")) {
1552 if ("ENUMERATION" == type || "NOTATION" == type)
1553 enumer = dataBufferToString ();
1554 } else {
1555 if ("ENUMERATION".equals(type) || "NOTATION".equals(type))
1556 enumer = dataBufferToString ();
1557 }
1558
1559
1560 requireWhitespace ();
1561 parseDefault (elementName, name, type, enumer);
1562 }
1563
1564
1565 /***
1566 * Parse the attribute type.
1567 * <pre>
1568 * [54] AttType ::= StringType | TokenizedType | EnumeratedType
1569 * [55] StringType ::= 'CDATA'
1570 * [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY'
1571 * | 'ENTITIES' | 'NMTOKEN' | 'NMTOKENS'
1572 * [57] EnumeratedType ::= NotationType | Enumeration
1573 * </pre>
1574 */
1575 private String readAttType ()
1576 throws Exception
1577 {
1578 if (tryRead ('(')) {
1579 parseEnumeration (false);
1580 return "ENUMERATION";
1581 } else {
1582 String typeString = readNmtoken (true);
1583 if (handler.getFeature (SAXDriver.FEATURE + "string-interning")) {
1584 if ("NOTATION" == typeString) {
1585 parseNotationType ();
1586 return typeString;
1587 } else if ("CDATA" == typeString
1588 || "ID" == typeString
1589 || "IDREF" == typeString
1590 || "IDREFS" == typeString
1591 || "ENTITY" == typeString
1592 || "ENTITIES" == typeString
1593 || "NMTOKEN" == typeString
1594 || "NMTOKENS" == typeString)
1595 return typeString;
1596 } else {
1597 if ("NOTATION".equals(typeString)) {
1598 parseNotationType ();
1599 return typeString;
1600 } else if ("CDATA".equals(typeString)
1601 || "ID".equals(typeString)
1602 || "IDREF".equals(typeString)
1603 || "IDREFS".equals(typeString)
1604 || "ENTITY".equals(typeString)
1605 || "ENTITIES".equals(typeString)
1606 || "NMTOKEN".equals(typeString)
1607 || "NMTOKENS".equals(typeString))
1608 return typeString;
1609 }
1610 error ("illegal attribute type", typeString, null);
1611 return null;
1612 }
1613 }
1614
1615
1616 /***
1617 * Parse an enumeration.
1618 * <pre>
1619 * [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')'
1620 * </pre>
1621 * <p>NOTE: the '(' has already been read.
1622 */
1623 private void parseEnumeration (boolean isNames)
1624 throws Exception
1625 {
1626 dataBufferAppend ('(');
1627
1628
1629 skipWhitespace ();
1630 dataBufferAppend (readNmtoken (isNames));
1631
1632 skipWhitespace ();
1633 while (!tryRead (')')) {
1634 require ('|');
1635 dataBufferAppend ('|');
1636 skipWhitespace ();
1637 dataBufferAppend (readNmtoken (isNames));
1638 skipWhitespace ();
1639 }
1640 dataBufferAppend (')');
1641 }
1642
1643
1644 /***
1645 * Parse a notation type for an attribute.
1646 * <pre>
1647 * [58] NotationType ::= 'NOTATION' S '(' S? NameNtoks
1648 * (S? '|' S? name)* S? ')'
1649 * </pre>
1650 * <p>NOTE: the 'NOTATION' has already been read
1651 */
1652 private void parseNotationType ()
1653 throws Exception
1654 {
1655 requireWhitespace ();
1656 require ('(');
1657
1658 parseEnumeration (true);
1659 }
1660
1661
1662 /***
1663 * Parse the default value for an attribute.
1664 * <pre>
1665 * [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED'
1666 * | (('#FIXED' S)? AttValue)
1667 * </pre>
1668 */
1669 private void parseDefault (
1670 String elementName,
1671 String name,
1672 String type,
1673 String enumer
1674 ) throws Exception
1675 {
1676 int valueType = ATTRIBUTE_DEFAULT_SPECIFIED;
1677 String value = null;
1678 int flags = LIT_ATTRIBUTE;
1679 boolean saved = expandPE;
1680 String defaultType = null;
1681
1682
1683
1684
1685
1686 if (!skippedPE) {
1687 flags |= LIT_ENTITY_REF;
1688 if (handler.getFeature (SAXDriver.FEATURE + "string-interning")) {
1689 if ("CDATA" != type)
1690 flags |= LIT_NORMALIZE;
1691 } else {
1692 if (!"CDATA".equals(type))
1693 flags |= LIT_NORMALIZE;
1694 }
1695 }
1696
1697 expandPE = false;
1698 if (tryRead ('#')) {
1699 if (tryRead ("FIXED")) {
1700 defaultType = "#FIXED";
1701 valueType = ATTRIBUTE_DEFAULT_FIXED;
1702 requireWhitespace ();
1703 value = readLiteral (flags);
1704 } else if (tryRead ("REQUIRED")) {
1705 defaultType = "#REQUIRED";
1706 valueType = ATTRIBUTE_DEFAULT_REQUIRED;
1707 } else if (tryRead ("IMPLIED")) {
1708 defaultType = "#IMPLIED";
1709 valueType = ATTRIBUTE_DEFAULT_IMPLIED;
1710 } else {
1711 error ("illegal keyword for attribute default value");
1712 }
1713 } else
1714 value = readLiteral (flags);
1715 expandPE = saved;
1716 setAttribute (elementName, name, type, enumer, value, valueType);
1717 if (handler.getFeature (SAXDriver.FEATURE + "string-interning")) {
1718 if ("ENUMERATION" == type)
1719 type = enumer;
1720 else if ("NOTATION" == type)
1721 type = "NOTATION " + enumer;
1722 } else {
1723 if ("ENUMERATION".equals(type))
1724 type = enumer;
1725 else if ("NOTATION".equals(type))
1726 type = "NOTATION " + enumer;
1727 }
1728 if (!skippedPE) handler.getDeclHandler ()
1729 .attributeDecl (elementName, name, type, defaultType, value);
1730 }
1731
1732
1733 /***
1734 * Parse a conditional section.
1735 * <pre>
1736 * [61] conditionalSect ::= includeSect || ignoreSect
1737 * [62] includeSect ::= '<![' S? 'INCLUDE' S? '['
1738 * extSubsetDecl ']]>'
1739 * [63] ignoreSect ::= '<![' S? 'IGNORE' S? '['
1740 * ignoreSectContents* ']]>'
1741 * [64] ignoreSectContents ::= Ignore
1742 * ('<![' ignoreSectContents* ']]>' Ignore )*
1743 * [65] Ignore ::= Char* - (Char* ( '<![' | ']]>') Char* )
1744 * </pre>
1745 * <p> NOTE: the '>![' has already been read.
1746 */
1747 private void parseConditionalSect (char saved [])
1748 throws Exception
1749 {
1750 skipWhitespace ();
1751 if (tryRead ("INCLUDE")) {
1752 skipWhitespace ();
1753 require ('[');
1754
1755 if (readBuffer != saved)
1756 handler.verror ("Illegal Conditional Section/PE nesting");
1757 skipWhitespace ();
1758 while (!tryRead ("]]>")) {
1759 parseMarkupdecl ();
1760 skipWhitespace ();
1761 }
1762 } else if (tryRead ("IGNORE")) {
1763 skipWhitespace ();
1764 require ('[');
1765
1766 if (readBuffer != saved)
1767 handler.verror ("Illegal Conditional Section/PE nesting");
1768 int nesting = 1;
1769 char c;
1770 expandPE = false;
1771 for (int nest = 1; nest > 0;) {
1772 c = readCh ();
1773 switch (c) {
1774 case '<':
1775 if (tryRead ("![")) {
1776 nest++;
1777 }
1778 case ']':
1779 if (tryRead ("]>")) {
1780 nest--;
1781 }
1782 }
1783 }
1784 expandPE = true;
1785 } else {
1786 error ("conditional section must begin with INCLUDE or IGNORE");
1787 }
1788 }
1789
1790 private void parseCharRef ()
1791 throws SAXException, IOException
1792 {
1793 parseCharRef (true
1794 }
1795
1796 /***
1797 * Try to read a character reference without consuming data from buffer.
1798 * <pre>
1799 * [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
1800 * </pre>
1801 * <p>NOTE: the '&#' has already been read.
1802 */
1803 private void tryReadCharRef ()
1804 throws SAXException, IOException
1805 {
1806 int value = 0;
1807 char c;
1808
1809 if (tryRead ('x')) {
1810 loop1:
1811 while (true) {
1812 c = readCh ();
1813 int n;
1814 switch (c) {
1815 case '0': case '1': case '2': case '3': case '4':
1816 case '5': case '6': case '7': case '8': case '9':
1817 n = c - '0';
1818 break;
1819 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1820 n = (c - 'a') + 10;
1821 break;
1822 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1823 n = (c - 'A') + 10;
1824 break;
1825 case ';':
1826 break loop1;
1827 default:
1828 error ("illegal character in character reference", c, null);
1829 break loop1;
1830 }
1831 value *= 16;
1832 value += n;
1833 }
1834 } else {
1835 loop2:
1836 while (true) {
1837 c = readCh ();
1838 switch (c) {
1839 case '0': case '1': case '2': case '3': case '4':
1840 case '5': case '6': case '7': case '8': case '9':
1841 value *= 10;
1842 value += c - '0';
1843 break;
1844 case ';':
1845 break loop2;
1846 default:
1847 error ("illegal character in character reference", c, null);
1848 break loop2;
1849 }
1850 }
1851 }
1852
1853
1854 if ((value < 0x0020
1855 && ! (value == '\n' || value == '\t' || value == '\r'))
1856 || (value >= 0xD800 && value <= 0xDFFF)
1857 || value == 0xFFFE || value == 0xFFFF
1858 || value > 0x0010ffff)
1859 error ("illegal XML character reference U+"
1860 + Integer.toHexString (value));
1861
1862
1863
1864 if (value > 0x0010ffff) {
1865
1866 error ("character reference " + value + " is too large for UTF-16",
1867 new Integer (value).toString (), null);
1868 }
1869
1870 }
1871
1872 /***
1873 * Read and interpret a character reference.
1874 * <pre>
1875 * [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
1876 * </pre>
1877 * <p>NOTE: the '&#' has already been read.
1878 */
1879 private void parseCharRef (boolean doFlush)
1880 throws SAXException, IOException
1881 {
1882 int value = 0;
1883 char c;
1884
1885 if (tryRead ('x')) {
1886 loop1:
1887 while (true) {
1888 c = readCh ();
1889 int n;
1890 switch (c) {
1891 case '0': case '1': case '2': case '3': case '4':
1892 case '5': case '6': case '7': case '8': case '9':
1893 n = c - '0';
1894 break;
1895 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1896 n = (c - 'a') + 10;
1897 break;
1898 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1899 n = (c - 'A') + 10;
1900 break;
1901 case ';':
1902 break loop1;
1903 default:
1904 error ("illegal character in character reference", c, null);
1905 break loop1;
1906 }
1907 value *= 16;
1908 value += n;
1909 }
1910 } else {
1911 loop2:
1912 while (true) {
1913 c = readCh ();
1914 switch (c) {
1915 case '0': case '1': case '2': case '3': case '4':
1916 case '5': case '6': case '7': case '8': case '9':
1917 value *= 10;
1918 value += c - '0';
1919 break;
1920 case ';':
1921 break loop2;
1922 default:
1923 error ("illegal character in character reference", c, null);
1924 break loop2;
1925 }
1926 }
1927 }
1928
1929
1930 if ((value < 0x0020
1931 && ! (value == '\n' || value == '\t' || value == '\r'))
1932 || (value >= 0xD800 && value <= 0xDFFF)
1933 || value == 0xFFFE || value == 0xFFFF
1934 || value > 0x0010ffff)
1935 error ("illegal XML character reference U+"
1936 + Integer.toHexString (value));
1937
1938
1939
1940 if (value <= 0x0000ffff) {
1941
1942 dataBufferAppend ((char) value);
1943 } else if (value <= 0x0010ffff) {
1944 value -= 0x10000;
1945
1946 dataBufferAppend ((char) (0xd800 | (value >> 10)));
1947 dataBufferAppend ((char) (0xdc00 | (value & 0x0003ff)));
1948 } else {
1949
1950 error ("character reference " + value + " is too large for UTF-16",
1951 new Integer (value).toString (), null);
1952 }
1953 if (doFlush) dataBufferFlush ();
1954 }
1955
1956
1957 /***
1958 * Parse and expand an entity reference.
1959 * <pre>
1960 * [68] EntityRef ::= '&' Name ';'
1961 * </pre>
1962 * <p>NOTE: the '&' has already been read.
1963 * @param externalAllowed External entities are allowed here.
1964 */
1965 private void parseEntityRef (boolean externalAllowed)
1966 throws SAXException, IOException
1967 {
1968 String name;
1969
1970 name = readNmtoken (true);
1971 require (';');
1972 switch (getEntityType (name)) {
1973 case ENTITY_UNDECLARED:
1974
1975
1976
1977
1978 String message;
1979
1980 message = "reference to undeclared general entity " + name;
1981 if (skippedPE && !docIsStandalone) {
1982 handler.verror (message);
1983
1984 if (externalAllowed)
1985 handler.skippedEntity (name);
1986 } else
1987 error (message);
1988 break;
1989 case ENTITY_INTERNAL:
1990 pushString (name, getEntityValue (name));
1991
1992
1993
1994 char t = readCh ();
1995 unread (t);
1996 int bufferPosMark = readBufferPos;
1997
1998 int end = readBufferPos + getEntityValue (name).length();
1999 for(int k = readBufferPos; k < end; k++){
2000 t = readCh ();
2001 if (t == '&'){
2002 t = readCh ();
2003 if (t == '#'){
2004
2005 tryReadCharRef ();
2006
2007
2008 if (readBufferPos >= end)
2009 break;
2010 k = readBufferPos;
2011 continue;
2012 }
2013 else if (Character.isLetter(t)){
2014
2015 unread (t);
2016 readNmtoken (true);
2017 require (';');
2018
2019
2020 if (readBufferPos >= end)
2021 break;
2022 k = readBufferPos;
2023 continue;
2024 }
2025 error(" malformed entity reference");
2026 }
2027
2028 }
2029 readBufferPos = bufferPosMark;
2030 break;
2031 case ENTITY_TEXT:
2032 if (externalAllowed) {
2033 pushURL (false, name, getEntityIds (name),
2034 null, null, null, true);
2035 } else {
2036 error ("reference to external entity in attribute value.",
2037 name, null);
2038 }
2039 break;
2040 case ENTITY_NDATA:
2041 if (externalAllowed) {
2042 error ("unparsed entity reference in content", name, null);
2043 } else {
2044 error ("reference to external entity in attribute value.",
2045 name, null);
2046 }
2047 break;
2048 default:
2049 throw new RuntimeException ();
2050 }
2051 }
2052
2053
2054 /***
2055 * Parse and expand a parameter entity reference.
2056 * <pre>
2057 * [69] PEReference ::= '%' Name ';'
2058 * </pre>
2059 * <p>NOTE: the '%' has already been read.
2060 */
2061 private void parsePEReference ()
2062 throws SAXException, IOException
2063 {
2064 String name;
2065
2066 name = "%" + readNmtoken (true);
2067 require (';');
2068 switch (getEntityType (name)) {
2069 case ENTITY_UNDECLARED:
2070
2071 handler.verror ("reference to undeclared parameter entity " + name);
2072
2073
2074
2075 break;
2076 case ENTITY_INTERNAL:
2077 if (inLiteral)
2078 pushString (name, getEntityValue (name));
2079 else
2080 pushString (name, ' ' + getEntityValue (name) + ' ');
2081 break;
2082 case ENTITY_TEXT:
2083 if (!inLiteral)
2084 pushString (null, " ");
2085 pushURL (true, name, getEntityIds (name), null, null, null, true);
2086 if (!inLiteral)
2087 pushString (null, " ");
2088 break;
2089 }
2090 }
2091
2092 /***
2093 * Parse an entity declaration.
2094 * <pre>
2095 * [70] EntityDecl ::= GEDecl | PEDecl
2096 * [71] GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>'
2097 * [72] PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
2098 * [73] EntityDef ::= EntityValue | (ExternalID NDataDecl?)
2099 * [74] PEDef ::= EntityValue | ExternalID
2100 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2101 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2102 * [76] NDataDecl ::= S 'NDATA' S Name
2103 * </pre>
2104 * <p>NOTE: the '<!ENTITY' has already been read.
2105 */
2106 private void parseEntityDecl ()
2107 throws Exception
2108 {
2109 boolean peFlag = false;
2110 int flags = 0;
2111
2112
2113 expandPE = false;
2114 requireWhitespace ();
2115 if (tryRead ('%')) {
2116 peFlag = true;
2117 requireWhitespace ();
2118 }
2119 expandPE = true;
2120
2121
2122
2123 String name = readNmtoken (true);
2124
2125 if (name.indexOf(':') >= 0)
2126 error ("Illegal character(':') in entity name ", name, null);
2127 if (peFlag) {
2128 name = "%" + name;
2129 }
2130
2131
2132 requireWhitespace ();
2133 char c = readCh ();
2134 unread (c);
2135 if (c == '"' || c == '\'') {
2136
2137
2138 String value = readLiteral (flags);
2139 setInternalEntity (name, value);
2140 } else {
2141
2142 String ids [] = readExternalIds (false, false);
2143
2144
2145 boolean white = tryWhitespace ();
2146 if (!peFlag && tryRead ("NDATA")) {
2147 if (!white)
2148 error ("whitespace required before NDATA");
2149 requireWhitespace ();
2150 String notationName = readNmtoken (true);
2151 if (!skippedPE) {
2152 setExternalEntity (name, ENTITY_NDATA, ids, notationName);
2153 handler.unparsedEntityDecl (name, ids, notationName);
2154 }
2155 } else if (!skippedPE) {
2156 setExternalEntity (name, ENTITY_TEXT, ids, null);
2157 handler.getDeclHandler ()
2158 .externalEntityDecl (name, ids [0],
2159 handler.resolveURIs ()
2160
2161
2162 ? handler.absolutize (ids [2], ids [1], false)
2163 : ids [1]);
2164 }
2165 }
2166
2167
2168 skipWhitespace ();
2169 require ('>');
2170 }
2171
2172
2173 /***
2174 * Parse a notation declaration.
2175 * <pre>
2176 * [82] NotationDecl ::= '<!NOTATION' S Name S
2177 * (ExternalID | PublicID) S? '>'
2178 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2179 * </pre>
2180 * <P>NOTE: the '<!NOTATION' has already been read.
2181 */
2182 private void parseNotationDecl ()
2183 throws Exception
2184 {
2185 String nname, ids[];
2186
2187
2188 requireWhitespace ();
2189 nname = readNmtoken (true);
2190
2191 if (nname.indexOf(':') >= 0)
2192 error ("Illegal character(':') in notation name ", nname, null);
2193 requireWhitespace ();
2194
2195
2196 ids = readExternalIds (true, false);
2197
2198
2199 setNotation (nname, ids);
2200
2201 skipWhitespace ();
2202 require ('>');
2203 }
2204
2205
2206 /***
2207 * Parse character data.
2208 * <pre>
2209 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2210 * </pre>
2211 */
2212 private void parseCharData ()
2213 throws Exception
2214 {
2215 char c;
2216 int state = 0;
2217 boolean pureWhite = false;
2218
2219
2220
2221
2222 if ((currentElementContent == CONTENT_ELEMENTS) && !isDirtyCurrentElement)
2223 pureWhite = true;
2224
2225
2226
2227 while (true) {
2228 int lineAugment = 0;
2229 int columnAugment = 0;
2230 int i;
2231
2232 loop:
2233 for (i = readBufferPos; i < readBufferLength; i++) {
2234 switch (c = readBuffer [i]) {
2235 case '\n':
2236 lineAugment++;
2237 columnAugment = 0;
2238
2239 break;
2240 case '\r':
2241 case '\t':
2242 case ' ':
2243
2244 columnAugment++;
2245 break;
2246 case '&':
2247 case '<':
2248 columnAugment++;
2249
2250
2251 state = 1;
2252 break loop;
2253 case ']':
2254
2255
2256 pureWhite = false;
2257 if ((i + 2) < readBufferLength) {
2258 if (readBuffer [i + 1] == ']'
2259 && readBuffer [i + 2] == '>') {
2260
2261 state = 2;
2262 break loop;
2263 }
2264 } else {
2265
2266 }
2267 columnAugment++;
2268 break;
2269 default:
2270 if ((c < 0x0020 || c > 0xFFFD)
2271 || ((c >= 0x007f) && (c <= 0x009f) && (c != 0x0085)
2272 && xmlVersion == XML_11))
2273 error ("illegal XML character U+"
2274 + Integer.toHexString (c));
2275
2276 pureWhite = false;
2277 columnAugment++;
2278 }
2279 }
2280
2281
2282 if (lineAugment > 0) {
2283 line += lineAugment;
2284 column = columnAugment;
2285 } else {
2286 column += columnAugment;
2287 }
2288
2289
2290 int length = i - readBufferPos;
2291
2292 if (length != 0) {
2293 if (pureWhite)
2294 handler.ignorableWhitespace (readBuffer,
2295 readBufferPos, length);
2296 else
2297 handler.charData (readBuffer, readBufferPos, length);
2298 readBufferPos = i;
2299 }
2300
2301 if (state != 0)
2302 break;
2303
2304
2305
2306 unread (readCh ());
2307 }
2308 if (!pureWhite)
2309 isDirtyCurrentElement = true;
2310
2311 if (state != 1)
2312 error ("character data may not contain ']]>'");
2313 }
2314
2315
2316
2317
2318
2319
2320 /***
2321 * Require whitespace characters.
2322 */
2323 private void requireWhitespace ()
2324 throws SAXException, IOException
2325 {
2326 char c = readCh ();
2327 if (isWhitespace (c)) {
2328 skipWhitespace ();
2329 } else {
2330 error ("whitespace required", c, null);
2331 }
2332 }
2333
2334
2335 /***
2336 * Skip whitespace characters.
2337 * <pre>
2338 * [3] S ::= (#x20 | #x9 | #xd | #xa)+
2339 * </pre>
2340 */
2341 private void skipWhitespace ()
2342 throws SAXException, IOException
2343 {
2344
2345
2346
2347
2348 if (USE_CHEATS) {
2349 int lineAugment = 0;
2350 int columnAugment = 0;
2351
2352 loop:
2353 for (int i = readBufferPos; i < readBufferLength; i++) {
2354 switch (readBuffer [i]) {
2355 case ' ':
2356 case '\t':
2357 case '\r':
2358 columnAugment++;
2359 break;
2360 case '\n':
2361 lineAugment++;
2362 columnAugment = 0;
2363 break;
2364 case '%':
2365 if (expandPE)
2366 break loop;
2367
2368 default:
2369 readBufferPos = i;
2370 if (lineAugment > 0) {
2371 line += lineAugment;
2372 column = columnAugment;
2373 } else {
2374 column += columnAugment;
2375 }
2376 return;
2377 }
2378 }
2379 }
2380
2381
2382 char c = readCh ();
2383 while (isWhitespace (c)) {
2384 c = readCh ();
2385 }
2386 unread (c);
2387 }
2388
2389
2390 /***
2391 * Read a name or (when parsing an enumeration) name token.
2392 * <pre>
2393 * [5] Name ::= (Letter | '_' | ':') (NameChar)*
2394 * [7] Nmtoken ::= (NameChar)+
2395 * </pre>
2396 */
2397 private String readNmtoken (boolean isName)
2398 throws SAXException, IOException
2399 {
2400 char c;
2401
2402 if (USE_CHEATS) {
2403 loop:
2404 for (int i = readBufferPos; i < readBufferLength; i++) {
2405 c = readBuffer [i];
2406 switch (c) {
2407 case '%':
2408 if (expandPE)
2409 break loop;
2410
2411
2412
2413 case '<': case '>': case '&':
2414 case ',': case '|': case '*': case '+': case '?':
2415 case ')':
2416 case '=':
2417 case '\'': case '"':
2418 case '[':
2419 case ' ': case '\t': case '\r': case '\n':
2420 case ';':
2421 case '/':
2422 int start = readBufferPos;
2423 if (i == start)
2424 error ("name expected", readBuffer [i], null);
2425 readBufferPos = i;
2426 return intern (readBuffer, start, i - start);
2427
2428 default:
2429
2430
2431
2432
2433
2434 if (isName && i == readBufferPos){
2435 char c2 = (char) (c & 0x00f0);
2436 switch (c & 0xff00){
2437
2438 case 0x0100:
2439 switch (c2){
2440 case 0x0030:
2441 if (c == 0x0132 || c == 0x0133 || c == 0x013f)
2442 error ("Not a name start character, U+"
2443 + Integer.toHexString (c));
2444 break;
2445 case 0x0040:
2446 if (c == 0x0140 || c == 0x0149)
2447 error ("Not a name start character, U+"
2448 + Integer.toHexString (c));
2449 break;
2450 case 0x00c0:
2451 if (c == 0x01c4 || c == 0x01cc)
2452 error ("Not a name start character, U+"
2453 + Integer.toHexString (c));
2454 break;
2455 case 0x00f0:
2456 if (c == 0x01f1 || c == 0x01f3)
2457 error ("Not a name start character, U+"
2458 + Integer.toHexString (c));
2459 break;
2460 case 0x00b0:
2461 if (c == 0x01f1 || c == 0x01f3)
2462 error ("Not a name start character, U+"
2463 + Integer.toHexString (c));
2464 break;
2465 default:
2466 if (c == 0x017f)
2467 error ("Not a name start character, U+"
2468 + Integer.toHexString (c));
2469 }
2470
2471 break;
2472
2473 case 0x1100:
2474 switch (c2){
2475 case 0x0000:
2476 if (c == 0x1104 || c == 0x1108 ||
2477 c == 0x110a || c == 0x110d)
2478 error ("Not a name start character, U+"
2479 + Integer.toHexString (c));
2480 break;
2481 case 0x0030:
2482 if (c == 0x113b || c == 0x113f)
2483 error ("Not a name start character, U+"
2484 + Integer.toHexString (c));
2485 break;
2486 case 0x0040:
2487 if (c == 0x1141 || c == 0x114d
2488 || c == 0x114f )
2489 error ("Not a name start character, U+"
2490 + Integer.toHexString (c));
2491 break;
2492 case 0x0050:
2493 if (c == 0x1151 || c == 0x1156)
2494 error ("Not a name start character, U+"
2495 + Integer.toHexString (c));
2496 break;
2497 case 0x0060:
2498 if (c == 0x1162 || c == 0x1164
2499 || c == 0x1166 || c == 0x116b
2500 || c == 0x116f)
2501 error ("Not a name start character, U+"
2502 + Integer.toHexString (c));
2503 break;
2504 case 0x00b0:
2505 if (c == 0x11b6 || c == 0x11b9
2506 || c == 0x11bb || c == 0x116f)
2507 error ("Not a name start character, U+"
2508 + Integer.toHexString (c));
2509 break;
2510 default:
2511 if (c == 0x1174 || c == 0x119f
2512 || c == 0x11ac || c == 0x11c3
2513 || c == 0x11f1)
2514 error ("Not a name start character, U+"
2515 + Integer.toHexString (c));
2516 }
2517 break;
2518 default:
2519 if (c == 0x0e46 || c == 0x1011
2520 || c == 0x212f || c == 0x0587
2521 || c == 0x0230 )
2522 error ("Not a name start character, U+"
2523 + Integer.toHexString (c));
2524 }
2525 }
2526
2527
2528 if (i == readBufferPos && isName) {
2529 if (!Character.isUnicodeIdentifierStart (c)
2530 && c != ':' && c != '_')
2531 error ("Not a name start character, U+"
2532 + Integer.toHexString (c));
2533 } else if (!Character.isUnicodeIdentifierPart (c)
2534 && c != '-' && c != ':' && c != '_' && c != '.'
2535 && !isExtender (c))
2536 error ("Not a name character, U+"
2537 + Integer.toHexString (c));
2538 }
2539 }
2540 }
2541
2542 nameBufferPos = 0;
2543
2544
2545 loop:
2546 while (true) {
2547 c = readCh ();
2548 switch (c) {
2549 case '%':
2550 case '<': case '>': case '&':
2551 case ',': case '|': case '*': case '+': case '?':
2552 case ')':
2553 case '=':
2554 case '\'': case '"':
2555 case '[':
2556 case ' ': case '\t': case '\n': case '\r':
2557 case ';':
2558 case '/':
2559 unread (c);
2560 if (nameBufferPos == 0) {
2561 error ("name expected");
2562 }
2563
2564 if (isName
2565 && !Character.isUnicodeIdentifierStart (
2566 nameBuffer [0])
2567 && ":_".indexOf (nameBuffer [0]) == -1)
2568 error ("Not a name start character, U+"
2569 + Integer.toHexString (nameBuffer [0]));
2570 String s = intern (nameBuffer, 0, nameBufferPos);
2571 nameBufferPos = 0;
2572 return s;
2573 default:
2574
2575
2576 if ((nameBufferPos != 0 || !isName)
2577 && !Character.isUnicodeIdentifierPart (c)
2578 && ":-_.".indexOf (c) == -1
2579 && !isExtender (c))
2580 error ("Not a name character, U+"
2581 + Integer.toHexString (c));
2582 if (nameBufferPos >= nameBuffer.length)
2583 nameBuffer =
2584 (char[]) extendArray (nameBuffer,
2585 nameBuffer.length, nameBufferPos);
2586 nameBuffer [nameBufferPos++] = c;
2587 }
2588 }
2589 }
2590
2591 private static boolean isExtender (char c)
2592 {
2593
2594 return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387
2595 || c == 0x0640 || c == 0x0e46 || c == 0x0ec6 || c == 0x3005
2596 || (c >= 0x3031 && c <= 0x3035)
2597 || (c >= 0x309d && c <= 0x309e)
2598 || (c >= 0x30fc && c <= 0x30fe);
2599 }
2600
2601
2602 /***
2603 * Read a literal. With matching single or double quotes as
2604 * delimiters (and not embedded!) this is used to parse:
2605 * <pre>
2606 * [9] EntityValue ::= ... ([^%&] | PEReference | Reference)* ...
2607 * [10] AttValue ::= ... ([^<&] | Reference)* ...
2608 * [11] SystemLiteral ::= ... (URLchar - "'")* ...
2609 * [12] PubidLiteral ::= ... (PubidChar - "'")* ...
2610 * </pre>
2611 * as well as the quoted strings in XML and text declarations
2612 * (for version, encoding, and standalone) which have their
2613 * own constraints.
2614 */
2615 private String readLiteral (int flags)
2616 throws SAXException, IOException
2617 {
2618 char delim, c;
2619 int startLine = line;
2620 boolean saved = expandPE;
2621 boolean savedReport = doReport;
2622
2623
2624 delim = readCh ();
2625 if (delim != '"' && delim != '\'') {
2626 error ("expected '\"' or \"'\"", delim, null);
2627 return null;
2628 }
2629 inLiteral = true;
2630 if ((flags & LIT_DISABLE_PE) != 0)
2631 expandPE = false;
2632 doReport = false;
2633
2634
2635
2636
2637 char ourBuf [] = readBuffer;
2638
2639
2640 try {
2641 c = readCh ();
2642 boolean ampRead = false;
2643 loop:
2644 while (! (c == delim && readBuffer == ourBuf)) {
2645 switch (c) {
2646
2647
2648 case '\n':
2649 case '\r':
2650 if ((flags & (LIT_ATTRIBUTE | LIT_PUBID)) != 0)
2651 c = ' ';
2652 break;
2653 case '\t':
2654 if ((flags & LIT_ATTRIBUTE) != 0)
2655 c = ' ';
2656 break;
2657 case '&':
2658 c = readCh ();
2659
2660
2661 if (c == '#') {
2662 if ((flags & LIT_DISABLE_CREF) != 0) {
2663 dataBufferAppend ('&');
2664 break;
2665 }
2666 parseCharRef (false
2667
2668
2669
2670
2671
2672
2673 } else {
2674 unread (c);
2675
2676 if ((flags & LIT_ENTITY_REF) > 0) {
2677 parseEntityRef (false);
2678 if (String.valueOf (readBuffer).equals("&"))
2679 ampRead = true;
2680
2681 } else if ((flags & LIT_DISABLE_EREF) != 0) {
2682 dataBufferAppend ('&');
2683
2684
2685 } else {
2686 String name = readNmtoken (true);
2687 require (';');
2688 dataBufferAppend ('&');
2689 dataBufferAppend (name);
2690 dataBufferAppend (';');
2691 }
2692 }
2693 c = readCh ();
2694 continue loop;
2695
2696 case '<':
2697
2698
2699 if ((flags & LIT_ATTRIBUTE) != 0)
2700 error ("attribute values may not contain '<'");
2701 break;
2702
2703
2704
2705 default:
2706 break;
2707 }
2708 dataBufferAppend (c);
2709 c = readCh ();
2710 }
2711 } catch (EOFException e) {
2712 error ("end of input while looking for delimiter (started on line "
2713 + startLine + ')', null, new Character (delim).toString ());
2714 }
2715 inLiteral = false;
2716 expandPE = saved;
2717 doReport = savedReport;
2718
2719
2720 if ((flags & LIT_NORMALIZE) > 0) {
2721 dataBufferNormalize ();
2722 }
2723
2724
2725 return dataBufferToString ();
2726 }
2727
2728
2729 /***
2730 * Try reading external identifiers.
2731 * A system identifier is not required for notations.
2732 * @param inNotation Are we parsing a notation decl?
2733 * @param isSubset Parsing external subset decl (may be omitted)?
2734 * @return A three-member String array containing the identifiers,
2735 * or nulls. Order: public, system, baseURI.
2736 */
2737 private String[] readExternalIds (boolean inNotation, boolean isSubset)
2738 throws Exception
2739 {
2740 char c;
2741 String ids[] = new String [3];
2742 int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
2743
2744 if (tryRead ("PUBLIC")) {
2745 requireWhitespace ();
2746 ids [0] = readLiteral (LIT_NORMALIZE | LIT_PUBID | flags);
2747 if (inNotation) {
2748 skipWhitespace ();
2749 c = readCh ();
2750 unread (c);
2751 if (c == '"' || c == '\'') {
2752 ids [1] = readLiteral (flags);
2753 }
2754 } else {
2755 requireWhitespace ();
2756 ids [1] = readLiteral (flags);
2757 }
2758
2759 for (int i = 0; i < ids [0].length (); i++) {
2760 c = ids [0].charAt (i);
2761 if (c >= 'a' && c <= 'z')
2762 continue;
2763 if (c >= 'A' && c <= 'Z')
2764 continue;
2765 if (" \r\n0123456789-' ()+,./:=?;!*#@$_%".indexOf (c) != -1)
2766 continue;
2767 error ("illegal PUBLIC id character U+"
2768 + Integer.toHexString (c));
2769 }
2770 } else if (tryRead ("SYSTEM")) {
2771 requireWhitespace ();
2772 ids [1] = readLiteral (flags);
2773 } else if (!isSubset)
2774 error ("missing SYSTEM or PUBLIC keyword");
2775
2776 if (ids [1] != null) {
2777 if (ids [1].indexOf ('#') != -1)
2778 handler.verror ("SYSTEM id has a URI fragment: " + ids [1]);
2779 ids [2] = handler.getSystemId ();
2780 if (ids [2] == null)
2781 handler.warn ("No base URI; hope URI is absolute: "
2782 + ids [1]);
2783 }
2784
2785 return ids;
2786 }
2787
2788
2789 /***
2790 * Test if a character is whitespace.
2791 * <pre>
2792 * [3] S ::= (#x20 | #x9 | #xd | #xa)+
2793 * </pre>
2794 * @param c The character to test.
2795 * @return true if the character is whitespace.
2796 */
2797 private final boolean isWhitespace (char c)
2798 {
2799 if (c > 0x20)
2800 return false;
2801 if (c == 0x20 || c == 0x0a || c == 0x09 || c == 0x0d)
2802 return true;
2803 return false;
2804 }
2805
2806
2807
2808
2809
2810
2811
2812 /***
2813 * Add a character to the data buffer.
2814 */
2815 private void dataBufferAppend (char c)
2816 {
2817
2818 if (dataBufferPos >= dataBuffer.length)
2819 dataBuffer =
2820 (char[]) extendArray (dataBuffer,
2821 dataBuffer.length, dataBufferPos);
2822 dataBuffer [dataBufferPos++] = c;
2823 }
2824
2825
2826 /***
2827 * Add a string to the data buffer.
2828 */
2829 private void dataBufferAppend (String s)
2830 {
2831 dataBufferAppend (s.toCharArray (), 0, s.length ());
2832 }
2833
2834
2835 /***
2836 * Append (part of) a character array to the data buffer.
2837 */
2838 private void dataBufferAppend (char ch[], int start, int length)
2839 {
2840 dataBuffer = (char[])
2841 extendArray (dataBuffer, dataBuffer.length,
2842 dataBufferPos + length);
2843
2844 System.arraycopy (ch, start, dataBuffer, dataBufferPos, length);
2845 dataBufferPos += length;
2846 }
2847
2848
2849 /***
2850 * Normalise space characters in the data buffer.
2851 */
2852 private void dataBufferNormalize ()
2853 {
2854 int i = 0;
2855 int j = 0;
2856 int end = dataBufferPos;
2857
2858
2859 while (j < end && dataBuffer [j] == ' ') {
2860 j++;
2861 }
2862
2863
2864 while (end > j && dataBuffer [end - 1] == ' ') {
2865 end --;
2866 }
2867
2868
2869 while (j < end) {
2870
2871 char c = dataBuffer [j++];
2872
2873
2874
2875 if (c == ' ') {
2876 while (j < end && dataBuffer [j++] == ' ')
2877 continue;
2878 dataBuffer [i++] = ' ';
2879 dataBuffer [i++] = dataBuffer [j - 1];
2880 } else {
2881 dataBuffer [i++] = c;
2882 }
2883 }
2884
2885
2886 dataBufferPos = i;
2887 }
2888
2889
2890 /***
2891 * Convert the data buffer to a string.
2892 */
2893 private String dataBufferToString ()
2894 {
2895 String s = new String (dataBuffer, 0, dataBufferPos);
2896 dataBufferPos = 0;
2897 return s;
2898 }
2899
2900
2901 /***
2902 * Flush the contents of the data buffer to the handler, as
2903 * appropriate, and reset the buffer for new input.
2904 */
2905 private void dataBufferFlush ()
2906 throws SAXException
2907 {
2908 if (currentElementContent == CONTENT_ELEMENTS
2909 && dataBufferPos > 0
2910 && !inCDATA
2911 ) {
2912
2913
2914 for (int i = 0; i < dataBufferPos; i++) {
2915 if (!isWhitespace (dataBuffer [i])) {
2916 handler.charData (dataBuffer, 0, dataBufferPos);
2917 dataBufferPos = 0;
2918 }
2919 }
2920 if (dataBufferPos > 0) {
2921 handler.ignorableWhitespace (dataBuffer, 0, dataBufferPos);
2922 dataBufferPos = 0;
2923 }
2924 } else if (dataBufferPos > 0) {
2925 handler.charData (dataBuffer, 0, dataBufferPos);
2926 dataBufferPos = 0;
2927 }
2928 }
2929
2930
2931 /***
2932 * Require a string to appear, or throw an exception.
2933 * <p><em>Precondition:</em> Entity expansion is not required.
2934 * <p><em>Precondition:</em> data buffer has no characters that
2935 * will get sent to the application.
2936 */
2937 private void require (String delim)
2938 throws SAXException, IOException
2939 {
2940 int length = delim.length ();
2941 char ch [];
2942
2943 if (length < dataBuffer.length) {
2944 ch = dataBuffer;
2945 delim.getChars (0, length, ch, 0);
2946 } else
2947 ch = delim.toCharArray ();
2948
2949 if (USE_CHEATS
2950 && length <= (readBufferLength - readBufferPos)) {
2951 int offset = readBufferPos;
2952
2953 for (int i = 0; i < length; i++, offset++)
2954 if (ch [i] != readBuffer [offset])
2955 error ("required string", null, delim);
2956 readBufferPos = offset;
2957
2958 } else {
2959 for (int i = 0; i < length; i++)
2960 require (ch [i]);
2961 }
2962 }
2963
2964
2965 /***
2966 * Require a character to appear, or throw an exception.
2967 */
2968 private void require (char delim)
2969 throws SAXException, IOException
2970 {
2971 char c = readCh ();
2972
2973 if (c != delim) {
2974 error ("required character", c, new Character (delim).toString ());
2975 }
2976 }
2977
2978
2979 /***
2980 * Create an interned string from a character array.
2981 * Ælfred uses this method to create an interned version
2982 * of all names and name tokens, so that it can test equality
2983 * with <code>==</code> instead of <code>String.equals ()</code>.
2984 *
2985 * <p>This is much more efficient than constructing a non-interned
2986 * string first, and then interning it.
2987 *
2988 * @param ch an array of characters for building the string.
2989 * @param start the starting position in the array.
2990 * @param length the number of characters to place in the string.
2991 * @return an interned string.
2992 * @see #intern (String)
2993 * @see java.lang.String#intern
2994 */
2995 public String intern (char ch[], int start, int length)
2996 {
2997 int index = 0;
2998 int hash = 0;
2999 Object bucket [];
3000
3001
3002
3003 for (int i = start; i < start + length; i++)
3004 hash = 31 * hash + ch [i];
3005 hash = (hash & 0x7fffffff) % SYMBOL_TABLE_LENGTH;
3006
3007
3008 if ((bucket = symbolTable [hash]) == null) {
3009
3010 bucket = new Object [8];
3011
3012
3013
3014 } else {
3015 while (index < bucket.length) {
3016 char chFound [] = (char []) bucket [index];
3017
3018
3019 if (chFound == null)
3020 break;
3021
3022
3023 if (chFound.length == length) {
3024 for (int i = 0; i < chFound.length; i++) {
3025
3026 if (ch [start + i] != chFound [i]) {
3027 break;
3028 } else if (i == length - 1) {
3029
3030 return (String) bucket [index + 1];
3031 }
3032 }
3033 }
3034 index += 2;
3035 }
3036
3037
3038
3039 bucket = (Object []) extendArray (bucket, bucket.length, index);
3040 }
3041 symbolTable [hash] = bucket;
3042
3043
3044
3045
3046 String s = new String (ch, start, length).intern ();
3047 bucket [index] = s.toCharArray ();
3048 bucket [index + 1] = s;
3049 return s;
3050 }
3051
3052 /***
3053 * Ensure the capacity of an array, allocating a new one if
3054 * necessary. Usually extends only for name hash collisions.
3055 */
3056 private Object extendArray (Object array, int currentSize, int requiredSize)
3057 {
3058 if (requiredSize < currentSize) {
3059 return array;
3060 } else {
3061 Object newArray = null;
3062 int newSize = currentSize * 2;
3063
3064 if (newSize <= requiredSize)
3065 newSize = requiredSize + 1;
3066
3067 if (array instanceof char[])
3068 newArray = new char [newSize];
3069 else if (array instanceof Object[])
3070 newArray = new Object [newSize];
3071 else
3072 throw new RuntimeException ();
3073
3074 System.arraycopy (array, 0, newArray, 0, currentSize);
3075 return newArray;
3076 }
3077 }
3078
3079
3080
3081
3082
3083
3084
3085 boolean isStandalone () { return docIsStandalone; }
3086
3087
3088
3089
3090
3091
3092 private int getContentType (Object element [], int defaultType)
3093 {
3094 int retval;
3095
3096 if (element == null)
3097 return defaultType;
3098 retval = ((Integer) element [0]).intValue ();
3099 if (retval == CONTENT_UNDECLARED)
3100 retval = defaultType;
3101 return retval;
3102 }
3103
3104
3105 /***
3106 * Look up the content type of an element.
3107 * @param name The element type name.
3108 * @return An integer constant representing the content type.
3109 * @see #CONTENT_UNDECLARED
3110 * @see #CONTENT_ANY
3111 * @see #CONTENT_EMPTY
3112 * @see #CONTENT_MIXED
3113 * @see #CONTENT_ELEMENTS
3114 */
3115 public int getElementContentType (String name)
3116 {
3117 Object element [] = (Object []) elementInfo.get (name);
3118 return getContentType (element, CONTENT_UNDECLARED);
3119 }
3120
3121
3122 /***
3123 * Register an element.
3124 * Array format:
3125 * [0] element type name
3126 * [1] content model (mixed, elements only)
3127 * [2] attribute hash table
3128 */
3129 private void setElement (
3130 String name,
3131 int contentType,
3132 String contentModel,
3133 Hashtable attributes
3134 ) throws SAXException
3135 {
3136 if (skippedPE)
3137 return;
3138
3139 Object element [] = (Object []) elementInfo.get (name);
3140
3141
3142 if (element == null) {
3143 element = new Object [3];
3144 element [0] = new Integer (contentType);
3145 element [1] = contentModel;
3146 element [2] = attributes;
3147 elementInfo.put (name, element);
3148 return;
3149 }
3150
3151
3152 if (contentType != CONTENT_UNDECLARED) {
3153
3154 if (((Integer) element [0]).intValue () == CONTENT_UNDECLARED) {
3155 element [0] = new Integer (contentType);
3156 element [1] = contentModel;
3157 } else
3158
3159 handler.verror ("multiple declarations for element type: "
3160 + name);
3161 }
3162
3163
3164 else if (attributes != null)
3165 element [2] = attributes;
3166 }
3167
3168
3169 /***
3170 * Look up the attribute hash table for an element.
3171 * The hash table is the second item in the element array.
3172 */
3173 private Hashtable getElementAttributes (String name)
3174 {
3175 Object element[] = (Object[]) elementInfo.get (name);
3176 if (element == null)
3177 return null;
3178 else
3179 return (Hashtable) element [2];
3180 }
3181
3182
3183
3184
3185
3186
3187
3188 /***
3189 * Get the declared attributes for an element type.
3190 * @param elname The name of the element type.
3191 * @return An Enumeration of all the attributes declared for
3192 * a specific element type. The results will be valid only
3193 * after the DTD (if any) has been parsed.
3194 * @see #getAttributeType
3195 * @see #getAttributeEnumeration
3196 * @see #getAttributeDefaultValueType
3197 * @see #getAttributeDefaultValue
3198 * @see #getAttributeExpandedValue
3199 */
3200 private Enumeration declaredAttributes (Object element [])
3201 {
3202 Hashtable attlist;
3203
3204 if (element == null)
3205 return null;
3206 if ((attlist = (Hashtable) element [2]) == null)
3207 return null;
3208 return attlist.keys ();
3209 }
3210
3211 /***
3212 * Get the declared attributes for an element type.
3213 * @param elname The name of the element type.
3214 * @return An Enumeration of all the attributes declared for
3215 * a specific element type. The results will be valid only
3216 * after the DTD (if any) has been parsed.
3217 * @see #getAttributeType
3218 * @see #getAttributeEnumeration
3219 * @see #getAttributeDefaultValueType
3220 * @see #getAttributeDefaultValue
3221 * @see #getAttributeExpandedValue
3222 */
3223 public Enumeration declaredAttributes (String elname)
3224 {
3225 return declaredAttributes ((Object []) elementInfo.get (elname));
3226 }
3227
3228
3229 /***
3230 * Retrieve the declared type of an attribute.
3231 * @param name The name of the associated element.
3232 * @param aname The name of the attribute.
3233 * @return An interend string denoting the type, or null
3234 * indicating an undeclared attribute.
3235 */
3236 public String getAttributeType (String name, String aname)
3237 {
3238 Object attribute[] = getAttribute (name, aname);
3239 if (attribute == null) {
3240 return null;
3241 } else {
3242 return (String) attribute [0];
3243 }
3244 }
3245
3246
3247 /***
3248 * Retrieve the allowed values for an enumerated attribute type.
3249 * @param name The name of the associated element.
3250 * @param aname The name of the attribute.
3251 * @return A string containing the token list.
3252 */
3253 public String getAttributeEnumeration (String name, String aname)
3254 {
3255 Object attribute[] = getAttribute (name, aname);
3256 if (attribute == null) {
3257 return null;
3258 } else {
3259
3260 return (String) attribute [3];
3261 }
3262 }
3263
3264
3265 /***
3266 * Retrieve the default value of a declared attribute.
3267 * @param name The name of the associated element.
3268 * @param aname The name of the attribute.
3269 * @return The default value, or null if the attribute was
3270 * #IMPLIED or simply undeclared and unspecified.
3271 * @see #getAttributeExpandedValue
3272 */
3273 public String getAttributeDefaultValue (String name, String aname)
3274 {
3275 Object attribute[] = getAttribute (name, aname);
3276 if (attribute == null) {
3277 return null;
3278 } else {
3279 return (String) attribute [1];
3280 }
3281 }
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326 /***
3327 * Retrieve the default value mode of a declared attribute.
3328 * @see #ATTRIBUTE_DEFAULT_SPECIFIED
3329 * @see #ATTRIBUTE_DEFAULT_IMPLIED
3330 * @see #ATTRIBUTE_DEFAULT_REQUIRED
3331 * @see #ATTRIBUTE_DEFAULT_FIXED
3332 */
3333 public int getAttributeDefaultValueType (String name, String aname)
3334 {
3335 Object attribute[] = getAttribute (name, aname);
3336 if (attribute == null) {
3337 return ATTRIBUTE_DEFAULT_UNDECLARED;
3338 } else {
3339 return ((Integer) attribute [2]).intValue ();
3340 }
3341 }
3342
3343
3344 /***
3345 * Register an attribute declaration for later retrieval.
3346 * Format:
3347 * - String type
3348 * - String default value
3349 * - int value type
3350 * - enumeration
3351 * - processed default value
3352 */
3353 private void setAttribute (String elName, String name, String type,
3354 String enumeration,
3355 String value, int valueType)
3356 throws Exception
3357 {
3358 Hashtable attlist;
3359
3360 if (skippedPE)
3361 return;
3362
3363
3364 attlist = getElementAttributes (elName);
3365 if (attlist == null)
3366 attlist = new Hashtable ();
3367
3368
3369 if (attlist.get (name) != null) {
3370
3371 return;
3372 } else {
3373 Object attribute [] = new Object [5];
3374 attribute [0] = type;
3375 attribute [1] = value;
3376 attribute [2] = new Integer (valueType);
3377 attribute [3] = enumeration;
3378 attribute [4] = null;
3379 attlist.put (name, attribute);
3380
3381
3382 setElement (elName, CONTENT_UNDECLARED, null, attlist);
3383 }
3384 }
3385
3386
3387 /***
3388 * Retrieve the array representing an attribute declaration.
3389 */
3390 private Object[] getAttribute (String elName, String name)
3391 {
3392 Hashtable attlist;
3393
3394 attlist = getElementAttributes (elName);
3395 if (attlist == null)
3396 return null;
3397 return (Object[]) attlist.get (name);
3398 }
3399
3400
3401
3402
3403
3404
3405 /***
3406 * Find the type of an entity.
3407 * @returns An integer constant representing the entity type.
3408 * @see #ENTITY_UNDECLARED
3409 * @see #ENTITY_INTERNAL
3410 * @see #ENTITY_NDATA
3411 * @see #ENTITY_TEXT
3412 */
3413 public int getEntityType (String ename)
3414 {
3415 Object entity[] = (Object[]) entityInfo.get (ename);
3416 if (entity == null) {
3417 return ENTITY_UNDECLARED;
3418 } else {
3419 return ((Integer) entity [0]).intValue ();
3420 }
3421 }
3422
3423
3424 /***
3425 * Return an external entity's identifier array.
3426 * @param ename The name of the external entity.
3427 * @return Three element array containing (in order) the entity's
3428 * public identifier, system identifier, and base URI. Null if
3429 * the entity was not declared as an external entity.
3430 * @see #getEntityType
3431 */
3432 public String [] getEntityIds (String ename)
3433 {
3434 Object entity[] = (Object[]) entityInfo.get (ename);
3435 if (entity == null) {
3436 return null;
3437 } else {
3438 return (String []) entity [1];
3439 }
3440 }
3441
3442
3443 /***
3444 * Return an internal entity's replacement text.
3445 * @param ename The name of the internal entity.
3446 * @return The entity's replacement text, or null if
3447 * the entity was not declared as an internal entity.
3448 * @see #getEntityType
3449 */
3450 public String getEntityValue (String ename)
3451 {
3452 Object entity[] = (Object[]) entityInfo.get (ename);
3453 if (entity == null) {
3454 return null;
3455 } else {
3456 return (String) entity [3];
3457 }
3458 }
3459
3460
3461 /***
3462 * Register an entity declaration for later retrieval.
3463 */
3464 private void setInternalEntity (String eName, String value)
3465 throws SAXException
3466 {
3467 if (skippedPE)
3468 return;
3469
3470 if (entityInfo.get (eName) == null) {
3471 Object entity[] = new Object [5];
3472 entity [0] = new Integer (ENTITY_INTERNAL);
3473
3474 entity [3] = value;
3475 entityInfo.put (eName, entity);
3476 }
3477 if (handler.getFeature (SAXDriver.FEATURE + "string-interning")) {
3478 if ("lt" == eName || "gt" == eName || "quot" == eName
3479 || "apos" == eName || "amp" == eName)
3480 return;
3481 } else {
3482 if ("lt".equals(eName) || "gt".equals(eName) || "quot".equals(eName)
3483 || "apos".equals(eName) || "amp".equals(eName))
3484 return;
3485 }
3486 handler.getDeclHandler ()
3487 .internalEntityDecl (eName, value);
3488 }
3489
3490
3491 /***
3492 * Register an external entity declaration for later retrieval.
3493 */
3494 private void setExternalEntity (String eName, int eClass,
3495 String ids [], String nName)
3496 {
3497 if (entityInfo.get (eName) == null) {
3498 Object entity[] = new Object [5];
3499 entity [0] = new Integer (eClass);
3500 entity [1] = ids;
3501
3502 entity [4] = nName;
3503 entityInfo.put (eName, entity);
3504 }
3505 }
3506
3507
3508
3509
3510
3511
3512 /***
3513 * Report a notation declaration, checking for duplicates.
3514 */
3515 private void setNotation (String nname, String ids [])
3516 throws SAXException
3517 {
3518 if (skippedPE)
3519 return;
3520
3521 handler.notationDecl (nname, ids);
3522 if (notationInfo.get (nname) == null)
3523 notationInfo.put (nname, nname);
3524 else
3525
3526 handler.verror ("Duplicate notation name decl: " + nname);
3527 }
3528
3529
3530
3531
3532
3533
3534
3535 /***
3536 * Return the current line number.
3537 */
3538 public int getLineNumber ()
3539 {
3540 return line;
3541 }
3542
3543
3544 /***
3545 * Return the current column number.
3546 */
3547 public int getColumnNumber ()
3548 {
3549 return column;
3550 }
3551
3552
3553
3554
3555
3556
3557
3558 /***
3559 * Read a single character from the readBuffer.
3560 * <p>The readDataChunk () method maintains the buffer.
3561 * <p>If we hit the end of an entity, try to pop the stack and
3562 * keep going.
3563 * <p> (This approach doesn't really enforce XML's rules about
3564 * entity boundaries, but this is not currently a validating
3565 * parser).
3566 * <p>This routine also attempts to keep track of the current
3567 * position in external entities, but it's not entirely accurate.
3568 * @return The next available input character.
3569 * @see #unread (char)
3570 * @see #readDataChunk
3571 * @see #readBuffer
3572 * @see #line
3573 * @return The next character from the current input source.
3574 */
3575 private char readCh ()
3576 throws SAXException, IOException
3577 {
3578
3579
3580
3581
3582 while (readBufferPos >= readBufferLength) {
3583 switch (sourceType) {
3584 case INPUT_READER:
3585 case INPUT_STREAM:
3586 readDataChunk ();
3587 while (readBufferLength < 1) {
3588 popInput ();
3589 if (readBufferLength < 1) {
3590 readDataChunk ();
3591 }
3592 }
3593 break;
3594
3595 default:
3596
3597 popInput ();
3598 break;
3599 }
3600 }
3601
3602 char c = readBuffer [readBufferPos++];
3603
3604 if (c == '\n') {
3605 line++;
3606 column = 0;
3607 } else {
3608 if (c == '<') {
3609
3610 } else if (((c < 0x0020 && (c != '\t') && (c != '\r')) || c > 0xFFFD)
3611 || ((c >= 0x007f) && (c <= 0x009f) && (c != 0x0085)
3612 && xmlVersion == XML_11))
3613 error ("illegal XML character U+"
3614 + Integer.toHexString (c));
3615
3616
3617
3618
3619
3620 else if (c == '%' && expandPE) {
3621 if (peIsError)
3622 error ("PE reference within decl in internal subset.");
3623 parsePEReference ();
3624 return readCh ();
3625 }
3626 column++;
3627 }
3628
3629 return c;
3630 }
3631
3632
3633 /***
3634 * Push a single character back onto the current input stream.
3635 * <p>This method usually pushes the character back onto
3636 * the readBuffer.
3637 * <p>I don't think that this would ever be called with
3638 * readBufferPos = 0, because the methods always reads a character
3639 * before unreading it, but just in case, I've added a boundary
3640 * condition.
3641 * @param c The character to push back.
3642 * @see #readCh
3643 * @see #unread (char[])
3644 * @see #readBuffer
3645 */
3646 private void unread (char c)
3647 throws SAXException
3648 {
3649
3650 if (c == '\n') {
3651 line--;
3652 column = -1;
3653 }
3654 if (readBufferPos > 0) {
3655 readBuffer [--readBufferPos] = c;
3656 } else {
3657 pushString (null, new Character (c).toString ());
3658 }
3659 }
3660
3661
3662 /***
3663 * Push a char array back onto the current input stream.
3664 * <p>NOTE: you must <em>never</em> push back characters that you
3665 * haven't actually read: use pushString () instead.
3666 * @see #readCh
3667 * @see #unread (char)
3668 * @see #readBuffer
3669 * @see #pushString
3670 */
3671 private void unread (char ch[], int length)
3672 throws SAXException
3673 {
3674 for (int i = 0; i < length; i++) {
3675 if (ch [i] == '\n') {
3676 line--;
3677 column = -1;
3678 }
3679 }
3680 if (length < readBufferPos) {
3681 readBufferPos -= length;
3682 } else {
3683 pushCharArray (null, ch, 0, length);
3684 }
3685 }
3686
3687
3688 /***
3689 * Push, or skip, a new external input source.
3690 * The source will be some kind of parsed entity, such as a PE
3691 * (including the external DTD subset) or content for the body.
3692 *
3693 * @param url The java.net.URL object for the entity.
3694 * @see SAXDriver#resolveEntity
3695 * @see #pushString
3696 * @see #sourceType
3697 * @see #pushInput
3698 * @see #detectEncoding
3699 * @see #sourceType
3700 * @see #readBuffer
3701 */
3702 private void pushURL (
3703 boolean isPE,
3704 String ename,
3705 String ids [],
3706 Reader reader,
3707 InputStream stream,
3708 String encoding,
3709 boolean doResolve
3710 ) throws SAXException, IOException
3711 {
3712 boolean ignoreEncoding;
3713 String systemId;
3714 InputSource source;
3715
3716 if (!isPE)
3717 dataBufferFlush ();
3718
3719 scratch.setPublicId (ids [0]);
3720 scratch.setSystemId (ids [1]);
3721
3722
3723
3724
3725 if (doResolve) {
3726
3727 source = handler.resolveEntity (isPE, ename, scratch, ids [2]);
3728 if (source == null) {
3729 handler.warn ("skipping entity: " + ename);
3730 handler.skippedEntity (ename);
3731 if (isPE)
3732 skippedPE = true;
3733 return;
3734 }
3735
3736
3737 systemId = source.getSystemId ();
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747 } else {
3748
3749 scratch.setCharacterStream (reader);
3750 scratch.setByteStream (stream);
3751 scratch.setEncoding (encoding);
3752 source = scratch;
3753 systemId = ids [1];
3754 if (handler.getFeature (SAXDriver.FEATURE + "string-interning")) {
3755 handler.startExternalEntity (ename, systemId,
3756 "[document]" == ename);
3757 } else {
3758 handler.startExternalEntity (ename, systemId,
3759 "[document]".equals(ename));
3760 }
3761 }
3762
3763
3764 if (source.getCharacterStream () != null) {
3765 if (source.getByteStream () != null)
3766 error ("InputSource has two streams!");
3767 reader = source.getCharacterStream ();
3768 } else if (source.getByteStream () != null) {
3769 encoding = source.getEncoding ();
3770 if (encoding == null)
3771 stream = source.getByteStream ();
3772 else try {
3773 reader = new InputStreamReader (
3774 source.getByteStream (),
3775 encoding);
3776 } catch (IOException e) {
3777 stream = source.getByteStream ();
3778 }
3779 } else if (systemId == null)
3780 error ("InputSource has no URI!");
3781 scratch.setCharacterStream (null);
3782 scratch.setByteStream (null);
3783 scratch.setEncoding (null);
3784
3785
3786 pushInput (ename);
3787
3788
3789
3790 readBuffer = new char [READ_BUFFER_MAX + 4];
3791 readBufferPos = 0;
3792 readBufferLength = 0;
3793 readBufferOverflow = -1;
3794 is = null;
3795 line = 1;
3796 column = 0;
3797 currentByteCount = 0;
3798
3799
3800
3801 if (reader != null) {
3802 sourceType = INPUT_READER;
3803 this.reader = reader;
3804 tryEncodingDecl (true);
3805 return;
3806 }
3807
3808
3809
3810 sourceType = INPUT_STREAM;
3811 if (stream != null) {
3812 is = stream;
3813 } else {
3814
3815 URL url = new URL (systemId);
3816
3817 externalEntity = url.openConnection ();
3818 externalEntity.connect ();
3819 is = externalEntity.getInputStream ();
3820 }
3821
3822
3823
3824 if (!is.markSupported ()) {
3825 is = new BufferedInputStream (is);
3826 }
3827
3828
3829 if (encoding == null && externalEntity != null) {
3830
3831
3832
3833 if (!"file".equals (externalEntity.getURL ().getProtocol ())) {
3834 int temp;
3835
3836
3837
3838 encoding = externalEntity.getContentType ();
3839
3840
3841
3842 if (encoding==null) {
3843 temp = -1;
3844 } else {
3845 temp = encoding.indexOf ("charset");
3846 }
3847
3848
3849
3850
3851 if (temp < 0)
3852 encoding = null;
3853 else {
3854
3855 if ((temp = encoding.indexOf (';')) > 0)
3856 encoding = encoding.substring (0, temp);
3857
3858 if ((temp = encoding.indexOf ('=', temp + 7)) > 0) {
3859 encoding = encoding.substring (temp + 1);
3860
3861
3862 if ((temp = encoding.indexOf ('(')) > 0)
3863 encoding = encoding.substring (0, temp);
3864
3865 if ((temp = encoding.indexOf ('"')) > 0)
3866 encoding = encoding.substring (temp + 1,
3867 encoding.indexOf ('"', temp + 2));
3868 encoding.trim ();
3869 } else {
3870 handler.warn ("ignoring illegal MIME attribute: "
3871 + encoding);
3872 encoding = null;
3873 }
3874 }
3875 }
3876 }
3877
3878
3879 if (encoding != null) {
3880 this.encoding = ENCODING_EXTERNAL;
3881 setupDecoding (encoding);
3882 ignoreEncoding = true;
3883
3884
3885 } else {
3886 detectEncoding ();
3887 ignoreEncoding = false;
3888 }
3889
3890
3891
3892 try {
3893 tryEncodingDecl (ignoreEncoding);
3894 } catch (UnsupportedEncodingException x) {
3895 encoding = x.getMessage ();
3896
3897
3898
3899 try {
3900 if (sourceType != INPUT_STREAM)
3901 throw x;
3902
3903 is.reset ();
3904 readBufferPos = 0;
3905 readBufferLength = 0;
3906 readBufferOverflow = -1;
3907 line = 1;
3908 currentByteCount = column = 0;
3909
3910 sourceType = INPUT_READER;
3911 this.reader = new InputStreamReader (is, encoding);
3912 is = null;
3913
3914 tryEncodingDecl (true);
3915
3916 } catch (IOException e) {
3917 error ("unsupported text encoding",
3918 encoding,
3919 null);
3920 }
3921 }
3922 }
3923
3924
3925 /***
3926 * Check for an encoding declaration. This is the second part of the
3927 * XML encoding autodetection algorithm, relying on detectEncoding to
3928 * get to the point that this part can read any encoding declaration
3929 * in the document (using only US-ASCII characters).
3930 *
3931 * <p> Because this part starts to fill parser buffers with this data,
3932 * it's tricky to setup a reader so that Java's built-in decoders can be
3933 * used for the character encodings that aren't built in to this parser
3934 * (such as EUC-JP, KOI8-R, Big5, etc).
3935 *
3936 * @return any encoding in the declaration, uppercased; or null
3937 * @see detectEncoding
3938 */
3939 private String tryEncodingDecl (boolean ignoreEncoding)
3940 throws SAXException, IOException
3941 {
3942
3943 if (tryRead ("<?xml")) {
3944 if (tryWhitespace ()) {
3945 if (inputStack.size () > 0) {
3946 return parseTextDecl (ignoreEncoding);
3947 } else {
3948 return parseXMLDecl (ignoreEncoding);
3949 }
3950 } else {
3951
3952 unread ('l');
3953 unread ('m');
3954 unread ('x');
3955 unread ('?');
3956 unread ('<');
3957 }
3958 }
3959 return null;
3960 }
3961
3962
3963 /***
3964 * Attempt to detect the encoding of an entity.
3965 * <p>The trick here (as suggested in the XML standard) is that
3966 * any entity not in UTF-8, or in UCS-2 with a byte-order mark,
3967 * <b>must</b> begin with an XML declaration or an encoding
3968 * declaration; we simply have to look for "<?xml" in various
3969 * encodings.
3970 * <p>This method has no way to distinguish among 8-bit encodings.
3971 * Instead, it sets up for UTF-8, then (possibly) revises its assumption
3972 * later in setupDecoding (). Any ASCII-derived 8-bit encoding
3973 * should work, but most will be rejected later by setupDecoding ().
3974 * @see #tryEncoding (byte[], byte, byte, byte, byte)
3975 * @see #tryEncoding (byte[], byte, byte)
3976 * @see #setupDecoding
3977 */
3978 private void detectEncoding ()
3979 throws SAXException, IOException
3980 {
3981 byte signature[] = new byte [4];
3982
3983
3984
3985 is.mark (4);
3986 is.read (signature);
3987 is.reset ();
3988
3989
3990
3991
3992 if (tryEncoding (signature, (byte) 0x00, (byte) 0x00,
3993 (byte) 0x00, (byte) 0x3c)) {
3994
3995
3996
3997 encoding = ENCODING_UCS_4_1234;
3998
3999 } else if (tryEncoding (signature, (byte) 0x3c, (byte) 0x00,
4000 (byte) 0x00, (byte) 0x00)) {
4001
4002
4003 encoding = ENCODING_UCS_4_4321;
4004
4005 } else if (tryEncoding (signature, (byte) 0x00, (byte) 0x00,
4006 (byte) 0x3c, (byte) 0x00)) {
4007
4008 encoding = ENCODING_UCS_4_2143;
4009
4010 } else if (tryEncoding (signature, (byte) 0x00, (byte) 0x3c,
4011 (byte) 0x00, (byte) 0x00)) {
4012
4013 encoding = ENCODING_UCS_4_3412;
4014
4015
4016
4017 }
4018
4019
4020
4021
4022
4023
4024
4025 else if (tryEncoding (signature, (byte) 0xfe, (byte) 0xff)) {
4026
4027
4028 encoding = ENCODING_UCS_2_12;
4029 is.read (); is.read ();
4030
4031 } else if (tryEncoding (signature, (byte) 0xff, (byte) 0xfe)) {
4032
4033
4034 encoding = ENCODING_UCS_2_21;
4035 is.read (); is.read ();
4036
4037 } else if (tryEncoding (signature, (byte) 0x00, (byte) 0x3c,
4038 (byte) 0x00, (byte) 0x3f)) {
4039
4040
4041 encoding = ENCODING_UCS_2_12;
4042 error ("no byte-order mark for UCS-2 entity");
4043
4044 } else if (tryEncoding (signature, (byte) 0x3c, (byte) 0x00,
4045 (byte) 0x3f, (byte) 0x00)) {
4046
4047
4048 encoding = ENCODING_UCS_2_21;
4049 error ("no byte-order mark for UCS-2 entity");
4050 }
4051
4052
4053
4054
4055 else if (tryEncoding (signature, (byte) 0x3c, (byte) 0x3f,
4056 (byte) 0x78, (byte) 0x6d)) {
4057
4058
4059 encoding = ENCODING_UTF_8;
4060 prefetchASCIIEncodingDecl ();
4061
4062 } else if (signature [0] == (byte) 0xef
4063 && signature [1] == (byte) 0xbb
4064 && signature [2] == (byte) 0xbf) {
4065
4066
4067
4068
4069 encoding = ENCODING_UTF_8;
4070 is.read (); is.read (); is.read ();
4071
4072 } else {
4073
4074
4075
4076
4077 encoding = ENCODING_UTF_8;
4078 }
4079 }
4080
4081
4082 /***
4083 * Check for a four-byte signature.
4084 * <p>Utility routine for detectEncoding ().
4085 * <p>Always looks for some part of "<?XML" in a specific encoding.
4086 * @param sig The first four bytes read.
4087 * @param b1 The first byte of the signature
4088 * @param b2 The second byte of the signature
4089 * @param b3 The third byte of the signature
4090 * @param b4 The fourth byte of the signature
4091 * @see #detectEncoding
4092 */
4093 private static boolean tryEncoding (
4094 byte sig[], byte b1, byte b2, byte b3, byte b4)
4095 {
4096 return (sig [0] == b1 && sig [1] == b2
4097 && sig [2] == b3 && sig [3] == b4);
4098 }
4099
4100
4101 /***
4102 * Check for a two-byte signature.
4103 * <p>Looks for a UCS-2 byte-order mark.
4104 * <p>Utility routine for detectEncoding ().
4105 * @param sig The first four bytes read.
4106 * @param b1 The first byte of the signature
4107 * @param b2 The second byte of the signature
4108 * @see #detectEncoding
4109 */
4110 private static boolean tryEncoding (byte sig[], byte b1, byte b2)
4111 {
4112 return ((sig [0] == b1) && (sig [1] == b2));
4113 }
4114
4115
4116 /***
4117 * This method pushes a string back onto input.
4118 * <p>It is useful either as the expansion of an internal entity,
4119 * or for backtracking during the parse.
4120 * <p>Call pushCharArray () to do the actual work.
4121 * @param s The string to push back onto input.
4122 * @see #pushCharArray
4123 */
4124 private void pushString (String ename, String s)
4125 throws SAXException
4126 {
4127 char ch[] = s.toCharArray ();
4128 pushCharArray (ename, ch, 0, ch.length);
4129 }
4130
4131
4132 /***
4133 * Push a new internal input source.
4134 * <p>This method is useful for expanding an internal entity,
4135 * or for unreading a string of characters. It creates a new
4136 * readBuffer containing the characters in the array, instead
4137 * of characters converted from an input byte stream.
4138 * @param ch The char array to push.
4139 * @see #pushString
4140 * @see #pushURL
4141 * @see #readBuffer
4142 * @see #sourceType
4143 * @see #pushInput
4144 */
4145 private void pushCharArray (String ename, char ch[], int start, int length)
4146 throws SAXException
4147 {
4148
4149 pushInput (ename);
4150 if (ename != null && doReport) {
4151 dataBufferFlush ();
4152 handler.startInternalEntity (ename);
4153 }
4154 sourceType = INPUT_INTERNAL;
4155 readBuffer = ch;
4156 readBufferPos = start;
4157 readBufferLength = length;
4158 readBufferOverflow = -1;
4159 }
4160
4161
4162 /***
4163 * Save the current input source onto the stack.
4164 * <p>This method saves all of the global variables associated with
4165 * the current input source, so that they can be restored when a new
4166 * input source has finished. It also tests for entity recursion.
4167 * <p>The method saves the following global variables onto a stack
4168 * using a fixed-length array:
4169 * <ol>
4170 * <li>sourceType
4171 * <li>externalEntity
4172 * <li>readBuffer
4173 * <li>readBufferPos
4174 * <li>readBufferLength
4175 * <li>line
4176 * <li>encoding
4177 * </ol>
4178 * @param ename The name of the entity (if any) causing the new input.
4179 * @see #popInput
4180 * @see #sourceType
4181 * @see #externalEntity
4182 * @see #readBuffer
4183 * @see #readBufferPos
4184 * @see #readBufferLength
4185 * @see #line
4186 * @see #encoding
4187 */
4188 private void pushInput (String ename)
4189 throws SAXException
4190 {
4191
4192 if (ename != null) {
4193 Enumeration entities = entityStack.elements ();
4194 while (entities.hasMoreElements ()) {
4195 String e = (String) entities.nextElement ();
4196 if (e != null && e == ename) {
4197 error ("recursive reference to entity", ename, null);
4198 }
4199 }
4200 }
4201 entityStack.push (ename);
4202
4203
4204 if (sourceType == INPUT_NONE) {
4205 return;
4206 }
4207
4208
4209
4210 Object input[] = new Object [12];
4211
4212 input [0] = new Integer (sourceType);
4213 input [1] = externalEntity;
4214 input [2] = readBuffer;
4215 input [3] = new Integer (readBufferPos);
4216 input [4] = new Integer (readBufferLength);
4217 input [5] = new Integer (line);
4218 input [6] = new Integer (encoding);
4219 input [7] = new Integer (readBufferOverflow);
4220 input [8] = is;
4221 input [9] = new Integer (currentByteCount);
4222 input [10] = new Integer (column);
4223 input [11] = reader;
4224
4225
4226 inputStack.push (input);
4227 }
4228
4229
4230 /***
4231 * Restore a previous input source.
4232 * <p>This method restores all of the global variables associated with
4233 * the current input source.
4234 * @exception java.io.EOFException
4235 * If there are no more entries on the input stack.
4236 * @see #pushInput
4237 * @see #sourceType
4238 * @see #externalEntity
4239 * @see #readBuffer
4240 * @see #readBufferPos
4241 * @see #readBufferLength
4242 * @see #line
4243 * @see #encoding
4244 */
4245 private void popInput ()
4246 throws SAXException, IOException
4247 {
4248 String ename = (String) entityStack.pop ();
4249
4250 if (ename != null && doReport)
4251 dataBufferFlush ();
4252 switch (sourceType) {
4253 case INPUT_STREAM:
4254 handler.endExternalEntity (ename);
4255 is.close ();
4256 break;
4257 case INPUT_READER:
4258 handler.endExternalEntity (ename);
4259 reader.close ();
4260 break;
4261 case INPUT_INTERNAL:
4262 if (ename != null && doReport)
4263 handler.endInternalEntity (ename);
4264 break;
4265 }
4266
4267
4268
4269 if (inputStack.isEmpty ()) {
4270 throw new EOFException ("no more input");
4271 }
4272
4273 Object input [] = (Object[]) inputStack.pop ();
4274
4275 sourceType = ((Integer) input [0]).intValue ();
4276 externalEntity = (URLConnection) input [1];
4277 readBuffer = (char[]) input [2];
4278 readBufferPos = ((Integer) input [3]).intValue ();
4279 readBufferLength = ((Integer) input [4]).intValue ();
4280 line = ((Integer) input [5]).intValue ();
4281 encoding = ((Integer) input [6]).intValue ();
4282 readBufferOverflow = ((Integer) input [7]).intValue ();
4283 is = (InputStream) input [8];
4284 currentByteCount = ((Integer) input [9]).intValue ();
4285 column = ((Integer) input [10]).intValue ();
4286 reader = (Reader) input [11];
4287 }
4288
4289
4290 /***
4291 * Return true if we can read the expected character.
4292 * <p>Note that the character will be removed from the input stream
4293 * on success, but will be put back on failure. Do not attempt to
4294 * read the character again if the method succeeds.
4295 * @param delim The character that should appear next. For a
4296 * insensitive match, you must supply this in upper-case.
4297 * @return true if the character was successfully read, or false if
4298 * it was not.
4299 * @see #tryRead (String)
4300 */
4301 private boolean tryRead (char delim)
4302 throws SAXException, IOException
4303 {
4304 char c;
4305
4306
4307 c = readCh ();
4308
4309
4310
4311 if (c == delim) {
4312 return true;
4313 } else {
4314 unread (c);
4315 return false;
4316 }
4317 }
4318
4319
4320 /***
4321 * Return true if we can read the expected string.
4322 * <p>This is simply a convenience method.
4323 * <p>Note that the string will be removed from the input stream
4324 * on success, but will be put back on failure. Do not attempt to
4325 * read the string again if the method succeeds.
4326 * <p>This method will push back a character rather than an
4327 * array whenever possible (probably the majority of cases).
4328 * @param delim The string that should appear next.
4329 * @return true if the string was successfully read, or false if
4330 * it was not.
4331 * @see #tryRead (char)
4332 */
4333 private boolean tryRead (String delim)
4334 throws SAXException, IOException
4335 {
4336 return tryRead (delim.toCharArray ());
4337 }
4338
4339 private boolean tryRead (char ch [])
4340 throws SAXException, IOException
4341 {
4342 char c;
4343
4344
4345
4346
4347 for (int i = 0; i < ch.length; i++) {
4348 c = readCh ();
4349 if (c != ch [i]) {
4350 unread (c);
4351 if (i != 0) {
4352 unread (ch, i);
4353 }
4354 return false;
4355 }
4356 }
4357 return true;
4358 }
4359
4360
4361
4362 /***
4363 * Return true if we can read some whitespace.
4364 * <p>This is simply a convenience method.
4365 * <p>This method will push back a character rather than an
4366 * array whenever possible (probably the majority of cases).
4367 * @return true if whitespace was found.
4368 */
4369 private boolean tryWhitespace ()
4370 throws SAXException, IOException
4371 {
4372 char c;
4373 c = readCh ();
4374 if (isWhitespace (c)) {
4375 skipWhitespace ();
4376 return true;
4377 } else {
4378 unread (c);
4379 return false;
4380 }
4381 }
4382
4383
4384 /***
4385 * Read all data until we find the specified string.
4386 * This is useful for scanning CDATA sections and PIs.
4387 * <p>This is inefficient right now, since it calls tryRead ()
4388 * for every character.
4389 * @param delim The string delimiter
4390 * @see #tryRead (String, boolean)
4391 * @see #readCh
4392 */
4393 private void parseUntil (String delim)
4394 throws SAXException, IOException
4395 {
4396 parseUntil (delim.toCharArray ());
4397 }
4398
4399 private void parseUntil (char delim [])
4400 throws SAXException, IOException
4401 {
4402 char c;
4403 int startLine = line;
4404
4405 try {
4406 while (!tryRead (delim)) {
4407 c = readCh ();
4408 dataBufferAppend (c);
4409 }
4410 } catch (EOFException e) {
4411 error ("end of input while looking for delimiter "
4412 + "(started on line " + startLine
4413 + ')', null, new String (delim));
4414 }
4415 }
4416
4417
4418
4419
4420
4421
4422
4423 /***
4424 * Prefetch US-ASCII XML/text decl from input stream into read buffer.
4425 * Doesn't buffer more than absolutely needed, so that when an encoding
4426 * decl says we need to create an InputStreamReader, we can discard our
4427 * buffer and reset(). Caller knows the first chars of the decl exist
4428 * in the input stream.
4429 */
4430 private void prefetchASCIIEncodingDecl ()
4431 throws SAXException, IOException
4432 {
4433 int ch;
4434 readBufferPos = readBufferLength = 0;
4435
4436 is.mark (readBuffer.length);
4437 while (true) {
4438 ch = is.read ();
4439 readBuffer [readBufferLength++] = (char) ch;
4440 switch (ch) {
4441 case (int) '>':
4442 return;
4443 case -1:
4444 error ("file ends before end of XML or encoding declaration.",
4445 null, "?>");
4446 }
4447 if (readBuffer.length == readBufferLength)
4448 error ("unfinished XML or encoding declaration");
4449 }
4450 }
4451
4452 /***
4453 * Read a chunk of data from an external input source.
4454 * <p>This is simply a front-end that fills the rawReadBuffer
4455 * with bytes, then calls the appropriate encoding handler.
4456 * @see #encoding
4457 * @see #rawReadBuffer
4458 * @see #readBuffer
4459 * @see #filterCR
4460 * @see #copyUtf8ReadBuffer
4461 * @see #copyIso8859_1ReadBuffer
4462 * @see #copyUcs_2ReadBuffer
4463 * @see #copyUcs_4ReadBuffer
4464 */
4465 private void readDataChunk ()
4466 throws SAXException, IOException
4467 {
4468 int count;
4469
4470
4471 if (readBufferOverflow > -1) {
4472 readBuffer [0] = (char) readBufferOverflow;
4473 readBufferOverflow = -1;
4474 readBufferPos = 1;
4475 sawCR = true;
4476 } else {
4477 readBufferPos = 0;
4478 sawCR = false;
4479 }
4480
4481
4482 if (sourceType == INPUT_READER) {
4483 count = reader.read (readBuffer,
4484 readBufferPos, READ_BUFFER_MAX - readBufferPos);
4485 if (count < 0)
4486 readBufferLength = readBufferPos;
4487 else
4488 readBufferLength = readBufferPos + count;
4489 if (readBufferLength > 0)
4490 filterCR (count >= 0);
4491 sawCR = false;
4492 return;
4493 }
4494
4495
4496 count = is.read (rawReadBuffer, 0, READ_BUFFER_MAX);
4497
4498
4499
4500
4501 if (count > 0) {
4502 switch (encoding) {
4503
4504 case ENCODING_ASCII:
4505 copyIso8859_1ReadBuffer (count, (char) 0x0080);
4506 break;
4507 case ENCODING_UTF_8:
4508 copyUtf8ReadBuffer (count);
4509 break;
4510 case ENCODING_ISO_8859_1:
4511 copyIso8859_1ReadBuffer (count, (char) 0);
4512 break;
4513
4514
4515 case ENCODING_UCS_2_12:
4516 copyUcs2ReadBuffer (count, 8, 0);
4517 break;
4518 case ENCODING_UCS_2_21:
4519 copyUcs2ReadBuffer (count, 0, 8);
4520 break;
4521
4522
4523 case ENCODING_UCS_4_1234:
4524 copyUcs4ReadBuffer (count, 24, 16, 8, 0);
4525 break;
4526 case ENCODING_UCS_4_4321:
4527 copyUcs4ReadBuffer (count, 0, 8, 16, 24);
4528 break;
4529 case ENCODING_UCS_4_2143:
4530 copyUcs4ReadBuffer (count, 16, 24, 0, 8);
4531 break;
4532 case ENCODING_UCS_4_3412:
4533 copyUcs4ReadBuffer (count, 8, 0, 24, 16);
4534 break;
4535 }
4536 } else
4537 readBufferLength = readBufferPos;
4538
4539 readBufferPos = 0;
4540
4541
4542
4543 if (sawCR) {
4544 filterCR (count >= 0);
4545 sawCR = false;
4546
4547
4548 if (readBufferLength == 0 && count >= 0)
4549 readDataChunk ();
4550 }
4551
4552 if (count > 0)
4553 currentByteCount += count;
4554 }
4555
4556
4557 /***
4558 * Filter carriage returns in the read buffer.
4559 * CRLF becomes LF; CR becomes LF.
4560 * @param moreData true iff more data might come from the same source
4561 * @see #readDataChunk
4562 * @see #readBuffer
4563 * @see #readBufferOverflow
4564 */
4565 private void filterCR (boolean moreData)
4566 {
4567 int i, j;
4568
4569 readBufferOverflow = -1;
4570
4571 loop:
4572 for (i = j = readBufferPos; j < readBufferLength; i++, j++) {
4573 switch (readBuffer [j]) {
4574 case '\r':
4575 if (j == readBufferLength - 1) {
4576 if (moreData) {
4577 readBufferOverflow = '\r';
4578 readBufferLength--;
4579 } else
4580 readBuffer [i++] = '\n';
4581 break loop;
4582 } else if (readBuffer [j + 1] == '\n') {
4583 j++;
4584 }
4585 readBuffer [i] = '\n';
4586 break;
4587
4588 case '\n':
4589 default:
4590 readBuffer [i] = readBuffer [j];
4591 break;
4592 }
4593 }
4594 readBufferLength = i;
4595 }
4596
/**
 * Convert a buffer of UTF-8-encoded bytes into UTF-16 characters.
 * <p>When readDataChunk () calls this method, the raw bytes are in
 * rawReadBuffer, and the final characters will appear in
 * readBuffer, starting at readBufferPos.  readBufferLength is set to
 * the index just past the last character stored.
 * <p>Note that as of Unicode 3.1, good practice became a requirement,
 * so that each Unicode character has exactly one UTF-8 representation.
 * Sequences that encode a value in more bytes than necessary, encoded
 * surrogates, values above 0x10ffff and five or six byte forms are
 * all reported through encodingError ().
 * <p>sawCR is set whenever a carriage return is stored, so that the
 * caller knows the buffer needs filterCR ().
 * @param count The number of bytes to convert.
 * @see #readDataChunk
 * @see #rawReadBuffer
 * @see #readBuffer
 * @see #getNextUtf8Byte
 */
private void copyUtf8ReadBuffer (int count)
throws SAXException, IOException
{
    // i indexes rawReadBuffer (input), j indexes readBuffer (output).
    int i = 0;
    int j = readBufferPos;
    int b1;
    char c = 0;

    while (i < count) {
	b1 = rawReadBuffer [i++];

	// Bytes are signed, so a negative value means the high bit is
	// set: this is the first byte of a multi-byte sequence.
	if (b1 < 0) {
	    if ((b1 & 0xe0) == 0xc0) {
		// Two byte sequence: 110xxxxx 10xxxxxx
		c = (char) (((b1 & 0x1f) << 6)
			    | getNextUtf8Byte (i++, count));
		// Anything below 0x80 fits in one byte.
		if (c < 0x0080)
		    encodingError ("Illegal two byte UTF-8 sequence",
			    c, 0);

		// Once a CR has been stored, a NEL (0x85) is skipped.
		// NOTE(review): sawCR is reset per chunk by
		// readDataChunk () and stays set after any CR, not just
		// an immediately preceding one, and this test does not
		// look at xmlVersion -- confirm that is intended.
		if ((c == 0x0085 || c == 0x000a) && sawCR)
		    continue;

		// XML 1.1: a '\r' is stored ahead of the NEL; the NEL
		// itself is still stored at the bottom of the loop.
		// NOTE(review): sawCR is not set on this path.
		if(c == 0x0085 && xmlVersion == XML_11)
		    readBuffer[j++] = '\r';
	    } else if ((b1 & 0xf0) == 0xe0) {
		// Three byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
		c = (char) (((b1 & 0x0f) << 12) |
			    (getNextUtf8Byte (i++, count) << 6) |
			    getNextUtf8Byte (i++, count));

		// XML 1.1: a line separator (0x2028) is stored as '\r'.
		if(c == 0x2028 && xmlVersion == XML_11){
		    readBuffer[j++] = '\r';
		    sawCR = true;
		    continue;
		}
		// Anything below 0x800 fits in two bytes, and surrogate
		// code units must not be encoded on their own.
		if (c < 0x0800 || (c >= 0xd800 && c <= 0xdfff))
		    encodingError ("Illegal three byte UTF-8 sequence",
			    c, 0);
	    } else if ((b1 & 0xf8) == 0xf0) {
		// Four byte sequence:
		// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		// The value does not fit in one char, so it is built
		// in an int and stored as a surrogate pair.
		int iso646 = b1 & 07;
		iso646 = (iso646 << 6) + getNextUtf8Byte (i++, count);
		iso646 = (iso646 << 6) + getNextUtf8Byte (i++, count);
		iso646 = (iso646 << 6) + getNextUtf8Byte (i++, count);

		if (iso646 <= 0xffff) {
		    // Would have fit in three bytes or fewer.
		    encodingError ("Illegal four byte UTF-8 sequence",
			    iso646, 0);
		} else {
		    if (iso646 > 0x0010ffff)
			encodingError (
			    "UTF-8 value out of range for Unicode",
			    iso646, 0);
		    iso646 -= 0x010000;
		    readBuffer [j++] = (char) (0xd800 | (iso646 >> 10));
		    readBuffer [j++] = (char) (0xdc00 | (iso646 & 0x03ff));
		    continue;
		}
	    } else {
		// Every other lead byte pattern ends up here.
		encodingError (
			"unsupported five or six byte UTF-8 sequence",
			0xff & b1, i);
		// Only reached if encodingError () returns.
		c = 0;
	    }
	} else {
	    // Single byte (high bit clear): the value is the character.
	    c = (char) b1;
	}
	readBuffer [j++] = c;
	if (c == '\r')
	    sawCR = true;
    }
    // How many characters have we read?
    readBufferLength = j;
}
4711
4712
4713 /***
4714 * Return the next byte value in a UTF-8 sequence.
4715 * If it is not possible to get a byte from the current
4716 * entity, throw an exception.
4717 * @param pos The current position in the rawReadBuffer.
4718 * @param count The number of bytes in the rawReadBuffer
4719 * @return The significant six bits of a non-initial byte in
4720 * a UTF-8 sequence.
4721 * @exception EOFException If the sequence is incomplete.
4722 */
4723 private int getNextUtf8Byte (int pos, int count)
4724 throws SAXException, IOException
4725 {
4726 int val;
4727
4728
4729
4730 if (pos < count) {
4731 val = rawReadBuffer [pos];
4732 } else {
4733 val = is.read ();
4734 if (val == -1) {
4735 encodingError ("unfinished multi-byte UTF-8 sequence at EOF",
4736 -1, pos);
4737 }
4738 }
4739
4740
4741 if ((val & 0xc0) != 0x80) {
4742 encodingError ("bad continuation of multi-byte UTF-8 sequence",
4743 val, pos + 1);
4744 }
4745
4746
4747 return (val & 0x3f);
4748 }
4749
4750
4751 /***
4752 * Convert a buffer of US-ASCII or ISO-8859-1-encoded bytes into
4753 * UTF-16 characters.
4754 *
4755 * <p>When readDataChunk () calls this method, the raw bytes are in
4756 * rawReadBuffer, and the final characters will appear in
4757 * readBuffer.
4758 *
4759 * @param count The number of bytes to convert.
4760 * @param mask For ASCII conversion, 0x7f; else, 0xff.
4761 * @see #readDataChunk
4762 * @see #rawReadBuffer
4763 * @see #readBuffer
4764 */
4765 private void copyIso8859_1ReadBuffer (int count, char mask)
4766 throws IOException
4767 {
4768 int i, j;
4769 for (i = 0, j = readBufferPos; i < count; i++, j++) {
4770 char c = (char) (rawReadBuffer [i] & 0xff);
4771 if ((c & mask) != 0)
4772 throw new CharConversionException ("non-ASCII character U+"
4773 + Integer.toHexString (c));
4774 if (c == 0x0085 && xmlVersion == XML_11)
4775 c = '\r';
4776 readBuffer [j] = c;
4777 if (c == '\r') {
4778 sawCR = true;
4779 }
4780 }
4781 readBufferLength = j;
4782 }
4783
4784
4785 /***
4786 * Convert a buffer of UCS-2-encoded bytes into UTF-16 characters
4787 * (as used in Java string manipulation).
4788 *
4789 * <p>When readDataChunk () calls this method, the raw bytes are in
4790 * rawReadBuffer, and the final characters will appear in
4791 * readBuffer.
4792 * @param count The number of bytes to convert.
4793 * @param shift1 The number of bits to shift byte 1.
4794 * @param shift2 The number of bits to shift byte 2
4795 * @see #readDataChunk
4796 * @see #rawReadBuffer
4797 * @see #readBuffer
4798 */
4799 private void copyUcs2ReadBuffer (int count, int shift1, int shift2)
4800 throws SAXException
4801 {
4802 int j = readBufferPos;
4803
4804 if (count > 0 && (count % 2) != 0) {
4805 encodingError ("odd number of bytes in UCS-2 encoding", -1, count);
4806 }
4807
4808 if (shift1 == 0) {
4809 for (int i = 0; i < count; i += 2) {
4810 char c = (char) (rawReadBuffer [i + 1] << 8);
4811 c |= 0xff & rawReadBuffer [i];
4812 readBuffer [j++] = c;
4813 if (c == '\r')
4814 sawCR = true;
4815 }
4816 } else {
4817 for (int i = 0; i < count; i += 2) {
4818 char c = (char) (rawReadBuffer [i] << 8);
4819 c |= 0xff & rawReadBuffer [i + 1];
4820 readBuffer [j++] = c;
4821 if (c == '\r')
4822 sawCR = true;
4823 }
4824 }
4825 readBufferLength = j;
4826 }
4827
4828
4829 /***
4830 * Convert a buffer of UCS-4-encoded bytes into UTF-16 characters.
4831 *
4832 * <p>When readDataChunk () calls this method, the raw bytes are in
4833 * rawReadBuffer, and the final characters will appear in
4834 * readBuffer.
4835 * <p>Java has Unicode chars, and this routine uses surrogate pairs
4836 * for ISO-10646 values between 0x00010000 and 0x000fffff. An
4837 * exception is thrown if the ISO-10646 character has no Unicode
4838 * representation.
4839 *
4840 * @param count The number of bytes to convert.
4841 * @param shift1 The number of bits to shift byte 1.
4842 * @param shift2 The number of bits to shift byte 2
4843 * @param shift3 The number of bits to shift byte 2
4844 * @param shift4 The number of bits to shift byte 2
4845 * @see #readDataChunk
4846 * @see #rawReadBuffer
4847 * @see #readBuffer
4848 */
4849 private void copyUcs4ReadBuffer (int count, int shift1, int shift2,
4850 int shift3, int shift4)
4851 throws SAXException
4852 {
4853 int j = readBufferPos;
4854
4855 if (count > 0 && (count % 4) != 0) {
4856 encodingError (
4857 "number of bytes in UCS-4 encoding not divisible by 4",
4858 -1, count);
4859 }
4860 for (int i = 0; i < count; i += 4) {
4861 int value = (((rawReadBuffer [i] & 0xff) << shift1) |
4862 ((rawReadBuffer [i + 1] & 0xff) << shift2) |
4863 ((rawReadBuffer [i + 2] & 0xff) << shift3) |
4864 ((rawReadBuffer [i + 3] & 0xff) << shift4));
4865 if (value < 0x0000ffff) {
4866 readBuffer [j++] = (char) value;
4867 if (value == (int) '\r') {
4868 sawCR = true;
4869 }
4870 } else if (value < 0x0010ffff) {
4871 value -= 0x010000;
4872 readBuffer [j++] = (char) (0xd8 | ((value >> 10) & 0x03ff));
4873 readBuffer [j++] = (char) (0xdc | (value & 0x03ff));
4874 } else {
4875 encodingError ("UCS-4 value out of range for Unicode",
4876 value, i);
4877 }
4878 }
4879 readBufferLength = j;
4880 }
4881
4882
4883 /***
4884 * Report a character encoding error.
4885 */
4886 private void encodingError (String message, int value, int offset)
4887 throws SAXException
4888 {
4889 if (value != -1)
4890 message = message + " (character code: 0x" +
4891 Integer.toHexString (value) + ')';
4892 error (message);
4893 }
4894
4895
4896
4897
4898
4899
4900 /***
4901 * Re-initialize the variables for each parse.
4902 */
4903 private void initializeVariables ()
4904 {
4905
4906 line = 1;
4907 column = 0;
4908
4909
4910 dataBufferPos = 0;
4911 dataBuffer = new char [DATA_BUFFER_INITIAL];
4912 nameBufferPos = 0;
4913 nameBuffer = new char [NAME_BUFFER_INITIAL];
4914
4915
4916 elementInfo = new Hashtable ();
4917 entityInfo = new Hashtable ();
4918 notationInfo = new Hashtable ();
4919 skippedPE = false;
4920
4921
4922
4923 currentElement = null;
4924 currentElementContent = CONTENT_UNDECLARED;
4925
4926
4927 sourceType = INPUT_NONE;
4928 inputStack = new Stack ();
4929 entityStack = new Stack ();
4930 externalEntity = null;
4931 tagAttributePos = 0;
4932 tagAttributes = new String [100];
4933 rawReadBuffer = new byte [READ_BUFFER_MAX];
4934 readBufferOverflow = -1;
4935
4936 scratch = new InputSource ();
4937
4938 inLiteral = false;
4939 expandPE = false;
4940 peIsError = false;
4941
4942 doReport = false;
4943
4944 inCDATA = false;
4945
4946 symbolTable = new Object [SYMBOL_TABLE_LENGTH][];
4947 }
4948
4949
4950
4951
4952
//
// The object that receives parsing events (see setHandler ()).
//
private SAXDriver handler;

//
// Current input source.  All of these except inputStack and scratch
// are saved by pushInput () and restored by popInput ().
//
private Reader reader;		// character source, for INPUT_READER
private InputStream is;		// byte source, for INPUT_STREAM
private int line;		// reset to 1 for each parse
private int column;		// reset to 0 for each parse
private int sourceType;		// one of the INPUT_* constants
private Stack inputStack;	// Object[] snapshots made by pushInput ()
private URLConnection externalEntity;	// presumably the connection behind the current external entity -- TODO confirm
private int encoding;		// ENCODING_* constant; selects the converter in readDataChunk ()
private int currentByteCount;	// bytes read so far by readDataChunk () from the current stream
private InputSource scratch;	// re-created for each parse; its use is outside this part of the file

//
// Buffer of characters ready to be parsed.  readBufferPos is the next
// index to read and readBufferLength the index just past the last
// valid character.  readBufferOverflow holds a carriage return cut off
// at the end of a chunk by filterCR (), or -1 when there is none.
//
private char readBuffer [];
private int readBufferPos;
private int readBufferLength;
private int readBufferOverflow;

//
// Buffer of raw bytes read from a stream, before conversion.
// READ_BUFFER_MAX is both its size and the most that is requested
// from a stream or reader in one call.
//
private final static int READ_BUFFER_MAX = 16384;
private byte rawReadBuffer [];

//
// Buffer for accumulated character data (see parseUntil ()), with its
// initial size and current fill position.
//
private static int DATA_BUFFER_INITIAL = 4096;
private char dataBuffer [];
private int dataBufferPos;

//
// Buffer for names, with its initial size and current fill position.
// NOTE(review): inferred from the field names; the users of this
// buffer are outside this part of the file.
//
private static int NAME_BUFFER_INITIAL = 1024;
private char nameBuffer [];
private int nameBufferPos;

//
// Presumably whether the document declared itself standalone; not
// used in this part of the file -- TODO confirm.
//
private boolean docIsStandalone;

//
// Tables re-created for each parse.  Presumably they hold the element,
// entity and notation declarations; skippedPE presumably records that
// a parameter entity was skipped -- TODO confirm against their users.
//
private Hashtable elementInfo;
private Hashtable entityInfo;
private Hashtable notationInfo;
private boolean skippedPE;

//
// Element currently being processed; reset for each parse to null and
// CONTENT_UNDECLARED.
//
private String currentElement;
private int currentElementContent;

//
// Names of the entities whose expansion is in progress (null for
// unnamed input).  Used by pushInput () to detect recursion.
//
private Stack entityStack;

//
// Parsing context flags, all cleared for each parse.
// NOTE(review): their exact meaning is defined by code outside this
// part of the file.
//
private boolean inLiteral;
private boolean expandPE;
private boolean peIsError;

//
// When true, the start and end of named internal entities are
// reported to the handler (see pushCharArray () and popInput ()).
//
private boolean doReport;

//
// Symbol table: a fixed number of buckets, allocated for each parse.
// NOTE(review): how the buckets are filled is outside this part of
// the file.
//
private final static int SYMBOL_TABLE_LENGTH = 2039;

private Object symbolTable [][];

//
// Attribute storage for the current tag (100 slots to start with) and
// its fill position.
//
private String tagAttributes [];
private int tagAttributePos;

//
// Set by the copy routines when a carriage return has been stored in
// the read buffer, so that readDataChunk () knows to run filterCR ().
//
private boolean sawCR;

//
// Presumably true while inside a CDATA section; cleared for each
// parse -- TODO confirm.
//
private boolean inCDATA;

//
// XML version of the document.  The converters treat NEL (0x85) and
// LS (0x2028) as line ends only for XML_11.
//
private static final int XML_10 = 0;
private static final int XML_11 = 1;
private int xmlVersion = XML_10;
5083 }