View Javadoc

1   /*
2    * Copyright (c) 1999-2000 by David Brownell.  All Rights Reserved.
3    *
4    * This program is open source software; you may use, copy, modify, and
5    * redistribute it under the terms of the LICENSE with which it was
6    * originally distributed.
7    *
8    * This program is distributed in the hope that it will be useful,
9    * but WITHOUT ANY WARRANTY; without even the implied warranty of
10   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11   * LICENSE for more details.
12   */
13  
14  //
15  // Copyright (c) 1997, 1998 by Microstar Software Ltd.
16  // From Microstar's README (the entire original license):
17  //
18  // AElfred is free for both commercial and non-commercial use and
19  // redistribution, provided that Microstar's copyright and disclaimer are
20  // retained intact.  You are free to modify AElfred for your own use and
21  // to redistribute AElfred with your modifications, provided that the
22  // modifications are clearly documented.
23  //
24  // This program is distributed in the hope that it will be useful, but
25  // WITHOUT ANY WARRANTY; without even the implied warranty of
26  // merchantability or fitness for a particular purpose.  Please use it AT
27  // YOUR OWN RISK.
28  //
29  
30  
31  package org.dom4j.io.aelfred;
32  
33  import java.io.BufferedInputStream;
34  import java.io.CharConversionException;
35  import java.io.EOFException;
36  import java.io.IOException;
37  import java.io.InputStream;
38  import java.io.InputStreamReader;
39  import java.io.Reader;
40  import java.net.URL;
41  import java.net.URLConnection;
42  import java.util.ArrayList;
43  import java.util.HashMap;
44  import java.util.Iterator;
45  
46  import org.xml.sax.SAXException;
47  
48  
49  // $Id: XmlParser.java,v 1.7 2002/05/24 14:41:55 jstrachan dead $
50  
51  /***
52   * Parse XML documents and return parse events through call-backs.
53   * Use the <code>SAXDriver</code> class as your entry point, as the
54   * internal parser interfaces are subject to change.
55   *
56   * @author Written by David Megginson &lt;dmeggins@microstar.com&gt;
57   *  (version 1.2a with bugfixes)
58   * @author Updated by David Brownell &lt;david-b@pacbell.net&gt;
59   * @version $Date: 2002/05/24 14:41:55 $
60   * @see SAXDriver
61   * @deprecated Use Aelfred2 instead! THIS CLASS WILL BE REMOVED IN dom4j-1.6 !!
62   */
63  final class XmlParser
64  {
65      //
66      // Use special cheats that speed up the code by
67      // avoiding per-character readCh () method calls.
68      //
69      private final static boolean USE_CHEATS = true;
70  
71  
72      //////////////////////////////////////////////////////////////////////
73      // Constructors.
74      ////////////////////////////////////////////////////////////////////////
75  
76  
77      /***
78       * Construct a new parser with no associated handler.
79       * @see #setHandler
80       * @see #parse
81       */
82      // package private
83      XmlParser ()
84      {
85          cleanupVariables ();
86      }
87  
88  
89      /***
90       * Set the handler that will receive parsing events.
91       * @param handler The handler to receive callback events.
92       * @see #parse
93       */
94      // package private
95      void setHandler (SAXDriver handler)
96      {
97          this.handler = handler;
98      }
99  
100 
101     /***
102      * Parse an XML document from the character stream, byte stream, or URI
103      * that you provide (in that order of preference).  Any URI that you
104      * supply will become the base URI for resolving relative URI, and may
105      * be used to acquire a reader or byte stream.
106      *
107      * <p>You may parse more than one document, but that must be done
108      * sequentially.  Only one thread at a time may use this parser.
109      *
110      * @param systemId The URI of the document; should never be null,
111      *  but may be so iff a reader <em>or</em> a stream is provided.
112      * @param publicId The public identifier of the document, or null.
113      * @param reader A character stream; must be null if stream isn't.
114      * @param stream A byte input stream; must be null if reader isn't.
115      * @param encoding The suggested encoding, or null if unknown.
116      * @exception java.lang.Exception Basically SAXException or IOException
117      */
118     // package private 
119     void doParse (
120     String      systemId,
121     String      publicId,
122     Reader      reader,
123     InputStream stream,
124     String      encoding
125     ) throws Exception
126     {
127     if (handler == null)
128         throw new IllegalStateException ("no callback handler");
129 
130     basePublicId = publicId;
131     baseURI = systemId;
132     baseReader = reader;
133     baseInputStream = stream;
134 
135     initializeVariables ();
136 
137     // predeclare the built-in entities here (replacement texts)
138     // we don't need to intern(), since we're guaranteed literals
139     // are always (globally) interned.
140     setInternalEntity ("amp", "&#38;");
141     setInternalEntity ("lt", "&#60;");
142     setInternalEntity ("gt", "&#62;");
143     setInternalEntity ("apos", "&#39;");
144     setInternalEntity ("quot", "&#34;");
145 
146     handler.startDocument ();
147 
148     pushURL ("[document]", basePublicId, baseURI,
149         baseReader, baseInputStream, encoding);
150 
151     try {
152         parseDocument ();
153         handler.endDocument ();
154     } finally {
155         if (baseReader != null)
156         try { baseReader.close ();
157         } catch (IOException e) { /* ignore */ }
158         if (baseInputStream != null)
159         try { baseInputStream.close ();
160         } catch (IOException e) { /* ignore */ }
161         if (is != null)
162         try { is.close ();
163         } catch (IOException e) { /* ignore */ }
164         if (reader != null)
165         try {
166             reader.close ();
167         } catch (IOException e) { /* ignore */
168         }
169         cleanupVariables ();
170     }
171     }
172 
173 
174     ////////////////////////////////////////////////////////////////////////
175     // Constants.
176     ////////////////////////////////////////////////////////////////////////
177 
178     //
179     // Constants for element content type.
180     //
181 
182     /***
183      * Constant: an element has not been declared.
184      * @see #getElementContentType
185      */
186     public final static int CONTENT_UNDECLARED = 0;
187 
188     /***
189      * Constant: the element has a content model of ANY.
190      * @see #getElementContentType
191      */
192     public final static int CONTENT_ANY = 1;
193 
194     /***
195      * Constant: the element has declared content of EMPTY.
196      * @see #getElementContentType
197      */
198     public final static int CONTENT_EMPTY = 2;
199 
200     /***
201      * Constant: the element has mixed content.
202      * @see #getElementContentType
203      */
204     public final static int CONTENT_MIXED = 3;
205 
206     /***
207      * Constant: the element has element content.
208      * @see #getElementContentType
209      */
210     public final static int CONTENT_ELEMENTS = 4;
211 
212 
213     //
214     // Constants for the entity type.
215     //
216 
217     /***
218      * Constant: the entity has not been declared.
219      * @see #getEntityType
220      */
221     public final static int ENTITY_UNDECLARED = 0;
222 
223     /***
224      * Constant: the entity is internal.
225      * @see #getEntityType
226      */
227     public final static int ENTITY_INTERNAL = 1;
228 
229     /***
230      * Constant: the entity is external, non-XML data.
231      * @see #getEntityType
232      */
233     public final static int ENTITY_NDATA = 2;
234 
235     /***
236      * Constant: the entity is external XML data.
237      * @see #getEntityType
238      */
239     public final static int ENTITY_TEXT = 3;
240 
241 
242     //
243     // Constants for attribute type.
244     //
245 
246     /***
247      * Constant: the attribute has not been declared for this element type.
248      * @see #getAttributeType
249      */
250     public final static int ATTRIBUTE_UNDECLARED = 0;
251 
252     /***
253      * Constant: the attribute value is a string value.
254      * @see #getAttributeType
255      */
256     public final static int ATTRIBUTE_CDATA = 1;
257 
258     /***
259      * Constant: the attribute value is a unique identifier.
260      * @see #getAttributeType
261      */
262     public final static int ATTRIBUTE_ID = 2;
263 
264     /***
265      * Constant: the attribute value is a reference to a unique identifier.
266      * @see #getAttributeType
267      */
268     public final static int ATTRIBUTE_IDREF = 3;
269 
270     /***
271      * Constant: the attribute value is a list of ID references.
272      * @see #getAttributeType
273      */
274     public final static int ATTRIBUTE_IDREFS = 4;
275 
276     /***
277      * Constant: the attribute value is the name of an entity.
278      * @see #getAttributeType
279      */
280     public final static int ATTRIBUTE_ENTITY = 5;
281 
282     /***
283      * Constant: the attribute value is a list of entity names.
284      * @see #getAttributeType
285      */
286     public final static int ATTRIBUTE_ENTITIES = 6;
287 
288     /***
289      * Constant: the attribute value is a name token.
290      * @see #getAttributeType
291      */
292     public final static int ATTRIBUTE_NMTOKEN = 7;
293 
294     /***
295      * Constant: the attribute value is a list of name tokens.
296      * @see #getAttributeType
297      */
298     public final static int ATTRIBUTE_NMTOKENS = 8;
299 
300     /***
301      * Constant: the attribute value is a token from an enumeration.
302      * @see #getAttributeType
303      */
304     public final static int ATTRIBUTE_ENUMERATED = 9;
305 
306     /***
307      * Constant: the attribute is the name of a notation.
308      * @see #getAttributeType
309      */
310     public final static int ATTRIBUTE_NOTATION = 10;
311 
312 
313     //
314     // When the class is loaded, populate the hash table of
315     // attribute types.
316     //
317 
318     /***
319      * Hash table of attribute types.
320      */
321     private static HashMap attributeTypeHash;
322     static {
323     attributeTypeHash = new HashMap (13);
324     attributeTypeHash.put ("CDATA", new Integer (ATTRIBUTE_CDATA));
325     attributeTypeHash.put ("ID", new Integer (ATTRIBUTE_ID));
326     attributeTypeHash.put ("IDREF", new Integer (ATTRIBUTE_IDREF));
327     attributeTypeHash.put ("IDREFS", new Integer (ATTRIBUTE_IDREFS));
328     attributeTypeHash.put ("ENTITY", new Integer (ATTRIBUTE_ENTITY));
329     attributeTypeHash.put ("ENTITIES", new Integer (ATTRIBUTE_ENTITIES));
330     attributeTypeHash.put ("NMTOKEN", new Integer (ATTRIBUTE_NMTOKEN));
331     attributeTypeHash.put ("NMTOKENS", new Integer (ATTRIBUTE_NMTOKENS));
332     attributeTypeHash.put ("NOTATION", new Integer (ATTRIBUTE_NOTATION));
333     }
334 
335 
336     //
337     // Constants for supported encodings.  "external" is just a flag.
338     //
339     private final static int ENCODING_EXTERNAL = 0;
340     private final static int ENCODING_UTF_8 = 1;
341     private final static int ENCODING_ISO_8859_1 = 2;
342     private final static int ENCODING_UCS_2_12 = 3;
343     private final static int ENCODING_UCS_2_21 = 4;
344     private final static int ENCODING_UCS_4_1234 = 5;
345     private final static int ENCODING_UCS_4_4321 = 6;
346     private final static int ENCODING_UCS_4_2143 = 7;
347     private final static int ENCODING_UCS_4_3412 = 8;
348     private final static int ENCODING_ASCII = 9;
349 
350 
351     //
352     // Constants for attribute default value.
353     //
354 
355     /***
356      * Constant: the attribute is not declared.
357      * @see #getAttributeDefaultValueType
358      */
359     public final static int ATTRIBUTE_DEFAULT_UNDECLARED = 30;
360 
361     /***
362      * Constant: the attribute has a literal default value specified.
363      * @see #getAttributeDefaultValueType
364      * @see #getAttributeDefaultValue
365      */
366     public final static int ATTRIBUTE_DEFAULT_SPECIFIED = 31;
367 
368     /***
369      * Constant: the attribute was declared #IMPLIED.
370      * @see #getAttributeDefaultValueType
371      */
372     public final static int ATTRIBUTE_DEFAULT_IMPLIED = 32;
373 
374     /***
375      * Constant: the attribute was declared #REQUIRED.
376      * @see #getAttributeDefaultValueType
377      */
378     public final static int ATTRIBUTE_DEFAULT_REQUIRED = 33;
379 
380     /***
381      * Constant: the attribute was declared #FIXED.
382      * @see #getAttributeDefaultValueType
383      * @see #getAttributeDefaultValue
384      */
385     public final static int ATTRIBUTE_DEFAULT_FIXED = 34;
386 
387 
388     //
389     // Constants for input.
390     //
391     private final static int INPUT_NONE = 0;
392     private final static int INPUT_INTERNAL = 1;
393     private final static int INPUT_EXTERNAL = 2;
394     private final static int INPUT_STREAM = 3;
395     private final static int INPUT_BUFFER = 4;
396     private final static int INPUT_READER = 5;
397 
398 
399     //
400     // Flags for reading literals.
401     //
402     // expand general entity refs (attribute values in dtd and content)
403     private final static int LIT_ENTITY_REF = 2;
404     // normalize this value (whitespace etc) (attributes, public ids)
405     private final static int LIT_NORMALIZE = 4;
406     // literal is an attribute value 
407     private final static int LIT_ATTRIBUTE = 8;
408     // don't expand parameter entities
409     private final static int LIT_DISABLE_PE = 16;
410     // don't expand [or parse] character refs
411     private final static int LIT_DISABLE_CREF = 32;
412     // don't parse general entity refs
413     private final static int LIT_DISABLE_EREF = 64;
414     // don't expand general entities, but make sure we _could_
415     private final static int LIT_ENTITY_CHECK = 128;
416 
417 
418     //
419     // Flags affecting PE handling in DTDs (if expandPE is true).
420     // PEs expand with space padding, except inside literals.
421     //
422     private final static int CONTEXT_NORMAL = 0;
423     private final static int CONTEXT_LITERAL = 1;
424 
425 
426     //////////////////////////////////////////////////////////////////////
427     // Error reporting.
428     //////////////////////////////////////////////////////////////////////
429 
430 
431     /***
432      * Report an error.
433      * @param message The error message.
434      * @param textFound The text that caused the error (or null).
435      * @see SAXDriver#error
436      * @see #line
437      */
438     private void error (String message, String textFound, String textExpected)
439     throws SAXException
440     {
441     if (textFound != null) {
442         message = message + " (found \"" + textFound + "\")";
443     }
444     if (textExpected != null) {
445         message = message + " (expected \"" + textExpected + "\")";
446     }
447     String uri = null;
448 
449     if (externalEntity != null) {
450         uri = externalEntity.getURL ().toString ();
451     }
452     handler.error (message, uri, line, column);
453 
454     // "can't happen"
455     throw new SAXException (message);
456     }
457 
458 
459     /***
460      * Report a serious error.
461      * @param message The error message.
462      * @param textFound The text that caused the error (or null).
463      */
464     private void error (String message, char textFound, String textExpected)
465     throws SAXException
466     {
467     error (message, new Character (textFound).toString (), textExpected);
468     }
469 
470     /*** Report typical case fatal errors. */
471     private void error (String message)
472     throws SAXException
473     {
474     error (message, null, null);
475     }
476 
477 
478     //////////////////////////////////////////////////////////////////////
479     // Major syntactic productions.
480     //////////////////////////////////////////////////////////////////////
481 
482 
483     /***
484      * Parse an XML document.
485      * <pre>
486      * [1] document ::= prolog element Misc*
487      * </pre>
488      * <p>This is the top-level parsing function for a single XML
489      * document.  As a minimum, a well-formed document must have
490      * a document element, and a valid document must have a prolog
491      * (one with doctype) as well.
492      */
493     private void parseDocument ()
494     throws Exception
495     {
496         char c;
497         try {                                       // added by MHK
498             parseProlog ();
499             require ('<');
500             parseElement ();
501         } catch (EOFException ee) {                 // added by MHK
502             error("premature end of file", "[EOF]", null);
503         }
504         
505         try {
506             parseMisc ();   //skip all white, PIs, and comments
507             c = readCh ();    //if this doesn't throw an exception...
508             error ("unexpected characters after document end", c, null);
509         } catch (EOFException e) {
510             return;
511         }
512     }
513 
514 
515     /***
516      * Skip a comment.
517      * <pre>
518      * [15] Comment ::= '&lt;!--' ((Char - '-') | ('-' (Char - '-')))* "-->"
519      * </pre>
520      * <p> (The <code>&lt;!--</code> has already been read.)
521      */
522     private void parseComment ()
523     throws Exception
524     {
525     char c;
526     boolean saved = expandPE;
527 
528     expandPE = false;
529     parseUntil ("--");
530     require ('>');
531     expandPE = saved;
532     handler.comment (dataBuffer, 0, dataBufferPos);
533     dataBufferPos = 0;
534     }
535 
536 
537     /***
538      * Parse a processing instruction and do a call-back.
539      * <pre>
540      * [16] PI ::= '&lt;?' PITarget
541      *      (S (Char* - (Char* '?&gt;' Char*)))?
542      *      '?&gt;'
543      * [17] PITarget ::= Name - ( ('X'|'x') ('M'|'m') ('L'|'l') )
544      * </pre>
545      * <p> (The <code>&lt;?</code> has already been read.)
546      */
547     private void parsePI ()
548     throws SAXException, IOException
549     {
550     String name;
551     boolean saved = expandPE;
552 
553     expandPE = false;
554     name = readNmtoken (true);
555     if ("xml".equalsIgnoreCase (name))
556         error ("Illegal processing instruction target", name, null);
557     if (!tryRead ("?>")) {
558         requireWhitespace ();
559         parseUntil ("?>");
560     }
561     expandPE = saved;
562     handler.processingInstruction (name, dataBufferToString ());
563     }
564 
565 
566     /***
567      * Parse a CDATA section.
568      * <pre>
569      * [18] CDSect ::= CDStart CData CDEnd
570      * [19] CDStart ::= '&lt;![CDATA['
571      * [20] CData ::= (Char* - (Char* ']]&gt;' Char*))
572      * [21] CDEnd ::= ']]&gt;'
573      * </pre>
574      * <p> (The '&lt;![CDATA[' has already been read.)
575      */
576     private void parseCDSect ()
577     throws Exception
578     {
579     parseUntil ("]]>");
580     dataBufferFlush ();
581     }
582 
583 
584     /***
585      * Parse the prolog of an XML document.
586      * <pre>
587      * [22] prolog ::= XMLDecl? Misc* (Doctypedecl Misc*)?
588      * </pre>
589      * <p>There are a couple of tricks here.  First, it is necessary to
590      * declare the XML default attributes after the DTD (if present)
591      * has been read. [??]  Second, it is not possible to expand general
592      * references in attribute value literals until after the entire
593      * DTD (if present) has been parsed.
594      * <p>We do not look for the XML declaration here, because it was
595      * handled by pushURL ().
596      * @see #pushURL
597      */
598     private void parseProlog ()
599     throws Exception
600     {
601     parseMisc ();
602 
603     if (tryRead ("<!DOCTYPE")) {
604         parseDoctypedecl ();
605         parseMisc ();
606     }
607     }
608 
609 
610     /***
611      * Parse the XML declaration.
612      * <pre>
613      * [23] XMLDecl ::= '&lt;?xml' VersionInfo EncodingDecl? SDDecl? S? '?&gt;'
614      * [24] VersionInfo ::= S 'version' Eq
615      *      ("'" VersionNum "'" | '"' VersionNum '"' )
616      * [26] VersionNum ::= ([a-zA-Z0-9_.:] | '-')*
617      * [32] SDDecl ::= S 'standalone' Eq
618      *      ( "'" ('yes' | 'no') "'" | '"' ('yes' | 'no') '"' )
619      * [80] EncodingDecl ::= S 'encoding' Eq
620      *      ( '"' EncName '"' | "'" EncName "'" )
621      * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
622      * </pre>
623      * <p> (The <code>&lt;?xml</code> and whitespace have already been read.)
624      * @return the encoding name as written in the declaration (not uppercased), or null if none was declared
625      * @see #parseTextDecl
626      * @see #setupDecoding
627      */
628     private String parseXMLDecl (boolean ignoreEncoding)
629     throws SAXException, IOException
630     {
631     String  version;
632     String  encodingName = null;
633     String  standalone = null;
634     boolean white;
635     int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
636 
637     // Read the version.
638     require ("version");
639     parseEq ();
640     version = readLiteral (flags);
641     if (!version.equals ("1.0")) {
642         error ("unsupported XML version", version, "1.0");
643     }
644 
645     // Try reading an encoding declaration.
646     white = tryWhitespace ();
647     if (tryRead ("encoding")) {
648         if (!white)
649         error ("whitespace required before 'encoding='");
650         parseEq ();
651         encodingName = readLiteral (flags);
652         if (!ignoreEncoding)
653         setupDecoding (encodingName);
654     }
655 
656     // Try reading a standalone declaration
657     if (encodingName != null)
658         white = tryWhitespace ();
659     if (tryRead ("standalone")) {
660         if (!white)
661         error ("whitespace required before 'standalone='");
662         parseEq ();
663         standalone = readLiteral (flags);
664         if (! ("yes".equals (standalone) || "no".equals (standalone)))
665         error ("standalone flag must be 'yes' or 'no'");
666     }
667 
668     skipWhitespace ();
669     require ("?>");
670 
671     return encodingName;
672     }
673 
674 
675     /***
676      * Parse a text declaration.
677      * <pre>
678      * [79] TextDecl ::= '&lt;?xml' VersionInfo? EncodingDecl S? '?&gt;'
679      * [80] EncodingDecl ::= S 'encoding' Eq
680      *      ( '"' EncName '"' | "'" EncName "'" )
681      * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
682      * </pre>
683      * <p> (The <code>&lt;?xml</code>' and whitespace have already been read.)
684      * @return the encoding name as written in the declaration (not uppercased)
685      * @see #parseXMLDecl
686      * @see #setupDecoding
687      */
688     private String parseTextDecl (boolean ignoreEncoding)
689     throws SAXException, IOException
690     {
691     String  encodingName = null;
692     int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
693 
694     // Read an optional version.
695     if (tryRead ("version")) {
696         String version;
697         parseEq ();
698         version = readLiteral (flags);
699         if (!version.equals ("1.0")) {
700         error ("unsupported XML version", version, "1.0");
701         }
702         requireWhitespace ();
703     }
704 
705 
706     // Read the encoding.
707     require ("encoding");
708     parseEq ();
709     encodingName = readLiteral (flags);
710     if (!ignoreEncoding)
711         setupDecoding (encodingName);
712 
713     skipWhitespace ();
714     require ("?>");
715 
716     return encodingName;
717     }
718 
719 
720     /***
721      * Sets up internal state so that we can decode an entity using the
722      * specified encoding.  This is used when we start to read an entity
723      * and we have been given knowledge of its encoding before we start to
724      * read any data (e.g. from a SAX input source or from a MIME type).
725      *
726      * <p> It is also used after autodetection, at which point only very
727      * limited adjustments to the encoding may be used (switching between
728      * related builtin decoders).
729      *
730      * @param encodingName The name of the encoding specified by the user.
731      * @exception IOException if the encoding isn't supported either
732      *  internally to this parser, or by the hosting JVM.
733      * @see #parseXMLDecl
734      * @see #parseTextDecl
735      */
736     private void setupDecoding (String encodingName)
737     throws SAXException, IOException
738     {
739     encodingName = encodingName.toUpperCase ();
740 
741     // ENCODING_EXTERNAL indicates an encoding that wasn't
742     // autodetected ... we can use builtin decoders, or
743     // ones from the JVM (InputStreamReader).
744 
745     // Otherwise we can only tweak what was autodetected, and
746     // only for single byte (ASCII derived) builtin encodings.
747 
748     // ASCII-derived encodings
749     if (encoding == ENCODING_UTF_8 || encoding == ENCODING_EXTERNAL) {
750         if (encodingName.equals ("ISO-8859-1")
751             || encodingName.equals ("8859_1")
752             || encodingName.equals ("ISO8859_1")
753           ) {
754         encoding = ENCODING_ISO_8859_1;
755         return;
756         } else if (encodingName.equals ("US-ASCII")
757             || encodingName.equals ("ASCII")) {
758         encoding = ENCODING_ASCII;
759         return;
760         } else if (encodingName.equals ("UTF-8")
761             || encodingName.equals ("UTF8")) {
762         encoding = ENCODING_UTF_8;
763         return;
764         } else if (encoding != ENCODING_EXTERNAL) {
765         // fatal error
766         error ("unsupported ASCII-derived encoding",
767                encodingName,
768                "UTF-8, US-ASCII, or ISO-8859-1");
769         }
770         // else fallthrough ...
771         // it's ASCII-ish and something other than a builtin
772     }
773 
774     // Unicode and such
775     if (encoding == ENCODING_UCS_2_12 || encoding == ENCODING_UCS_2_21) {
776         if (!(encodingName.equals ("ISO-10646-UCS-2")
777             || encodingName.equals ("UTF-16")
778             || encodingName.equals ("UTF-16BE")
779             || encodingName.equals ("UTF-16LE")))
780         error ("unsupported Unicode encoding",
781                encodingName,
782                "UTF-16");
783         return;
784     }
785 
786     // four byte encodings
787     if (encoding == ENCODING_UCS_4_1234
788         || encoding == ENCODING_UCS_4_4321
789         || encoding == ENCODING_UCS_4_2143
790         || encoding == ENCODING_UCS_4_3412) {
791         if (!encodingName.equals ("ISO-10646-UCS-4"))
792         error ("unsupported 32-bit encoding",
793                encodingName,
794                "ISO-10646-UCS-4");
795         return;
796     }
797 
798     // assert encoding == ENCODING_EXTERNAL
799     // if (encoding != ENCODING_EXTERNAL)
800     //     throw new RuntimeException ("encoding = " + encoding);
801 
802     if (encodingName.equals ("UTF-16BE")) {
803         encoding = ENCODING_UCS_2_12;
804         return;
805     }
806     if (encodingName.equals ("UTF-16LE")) {
807         encoding = ENCODING_UCS_2_21;
808         return;
809     }
810 
811     // We couldn't use the builtin decoders at all.  But we can try to
812     // create a reader, since we haven't messed up buffering.  Tweak
813     // the encoding name if necessary.
814 
815     if (encodingName.equals ("UTF-16")
816         || encodingName.equals ("ISO-10646-UCS-2"))
817         encodingName = "Unicode";
818     // Ignoring all the EBCDIC aliases here
819 
820     reader = new InputStreamReader (is, encodingName);
821     sourceType = INPUT_READER;
822     is = null;
823     }
824 
825 
826     /***
827      * Parse miscellaneous markup outside the document element and DOCTYPE
828      * declaration.
829      * <pre>
830      * [27] Misc ::= Comment | PI | S
831      * </pre>
832      */
833     private void parseMisc ()
834     throws Exception
835     {
836     while (true) {
837         skipWhitespace ();
838         if (tryRead ("<?")) {
839         parsePI ();
840         } else if (tryRead ("<!--")) {
841         parseComment ();
842         } else {
843         return;
844         }
845     }
846     }
847 
848 
849     /***
850      * Parse a document type declaration.
851      * <pre>
852      * [28] doctypedecl ::= '&lt;!DOCTYPE' S Name (S ExternalID)? S?
853      *      ('[' (markupdecl | PEReference | S)* ']' S?)? '&gt;'
854      * </pre>
855      * <p> (The <code>&lt;!DOCTYPE</code> has already been read.)
856      */
857     private void parseDoctypedecl ()
858     throws Exception
859     {
860     char c;
861     String doctypeName, ids[];
862 
863     // Read the document type name.
864     requireWhitespace ();
865     doctypeName = readNmtoken (true);
866 
867     // Read the External subset's IDs
868     skipWhitespace ();
869     ids = readExternalIds (false);
870 
871     // report (a) declaration of name, (b) lexical info (ids)
872     handler.doctypeDecl (doctypeName, ids [0], ids [1]);
873 
874     // Internal subset is parsed first, if present
875     skipWhitespace ();
876     if (tryRead ('[')) {
877 
878         // loop until the subset ends
879         while (true) {
880         expandPE = true;
881         skipWhitespace ();
882         expandPE = false;
883         if (tryRead (']')) {
884             break;      // end of subset
885         } else {
886             // WFC, PEs in internal subset (only between decls)
887             peIsError = expandPE = true;
888             parseMarkupdecl ();
889             peIsError = expandPE = false;
890         }
891         }
892     }
893 
894     // Read the external subset, if any
895     if (ids [1] != null) {
896         pushURL ("[external subset]", ids [0], ids [1], null, null, null);
897 
898         // Loop until we end up back at '>'
899         while (true) {
900         expandPE = true;
901         skipWhitespace ();
902         expandPE = false;
903         if (tryRead ('>')) {
904             break;
905         } else {
906             expandPE = true;
907             parseMarkupdecl ();
908             expandPE = false;
909         }
910         }
911     } else {
912         // No external subset.
913         skipWhitespace ();
914         require ('>');
915     }
916 
917     // done dtd
918     handler.endDoctype ();
919     expandPE = false;
920     }
921 
922 
923     /***
924      * Parse a markup declaration in the internal or external DTD subset.
925      * <pre>
926      * [29] markupdecl ::= elementdecl | Attlistdecl | EntityDecl
927      *      | NotationDecl | PI | Comment
928      * [30] extSubsetDecl ::= (markupdecl | conditionalSect
929      *      | PEReference | S) *
930      * </pre>
931      * <p> Reading toplevel PE references is handled as a lexical issue
932      * by the caller, as is whitespace.
933      */
934     private void parseMarkupdecl ()
935     throws Exception
936     {
937     if (tryRead ("<!ELEMENT")) {
938         parseElementdecl ();
939     } else if (tryRead ("<!ATTLIST")) {
940         parseAttlistDecl ();
941     } else if (tryRead ("<!ENTITY")) {
942         parseEntityDecl ();
943     } else if (tryRead ("<!NOTATION")) {
944         parseNotationDecl ();
945     } else if (tryRead ("<?")) {
946         parsePI ();
947     } else if (tryRead ("<!--")) {
948         parseComment ();
949     } else if (tryRead ("<![")) {
950         if (inputStack.size () > 0)
951         parseConditionalSect ();
952         else
953         error ("conditional sections illegal in internal subset");
954     } else {
955         error ("expected markup declaration");
956     }
957     }
958 
959 
960     /***
961      * Parse an element, with its tags.
962      * <pre>
963      * [39] element ::= EmptyElementTag | STag content ETag
964      * [40] STag ::= '&lt;' Name (S Attribute)* S? '&gt;'
965      * [44] EmptyElementTag ::= '&lt;' Name (S Attribute)* S? '/&gt;'
966      * </pre>
967      * <p> (The '&lt;' has already been read.)
968      * <p>NOTE: this method actually chains onto parseContent (), if necessary,
969      * and parseContent () will take care of calling parseETag ().
970      */
971     private void parseElement ()
972     throws Exception
973     {
974     String  gi;
975     char    c;
976     int oldElementContent = currentElementContent;
977     String  oldElement = currentElement;
978     Object  element [];
979 
980     // This is the (global) counter for the
981     // array of specified attributes.
982     tagAttributePos = 0;
983 
984     // Read the element type name.
985     gi = readNmtoken (true);
986 
987     // Determine the current content type.
988     currentElement = gi;
989     element = (Object []) elementInfo.get (gi);
990     currentElementContent = getContentType (element, CONTENT_ANY);
991 
992     // Read the attributes, if any.
993     // After this loop, "c" is the closing delimiter.
994     boolean white = tryWhitespace ();
995     c = readCh ();
996     while (c != '/' && c != '>') {
997         unread (c);
998         if (!white)
999         error ("need whitespace between attributes");
1000         parseAttribute (gi);
1001         white = tryWhitespace ();
1002         c = readCh ();
1003     }
1004 
1005     // Supply any defaulted attributes.
1006     Iterator atts = declaredAttributes (element);
1007     if (atts != null) {
1008         String aname;
1009 loop:
1010         while (atts.hasNext ()) {
1011         aname = (String) atts.next ();
1012         // See if it was specified.
1013         for (int i = 0; i < tagAttributePos; i++) {
1014             if (tagAttributes [i] == aname) {
1015             continue loop;
1016             }
1017         }
1018         // I guess not...
1019         handler.attribute (aname,
1020                    getAttributeExpandedValue (gi, aname),
1021                    false);
1022         }
1023     }
1024 
1025     // Figure out if this is a start tag
1026     // or an empty element, and dispatch an
1027     // event accordingly.
1028     switch (c) {
1029     case '>':
1030         handler.startElement (gi);
1031         parseContent ();
1032         break;
1033     case '/':
1034         require ('>');
1035         handler.startElement (gi);
1036         handler.endElement (gi);
1037         break;
1038     }
1039 
1040     // Restore the previous state.
1041     currentElement = oldElement;
1042     currentElementContent = oldElementContent;
1043     }
1044 
1045 
1046     /***
1047      * Parse an attribute assignment.
1048      * <pre>
1049      * [41] Attribute ::= Name Eq AttValue
1050      * </pre>
1051      * @param name The name of the attribute's element.
1052      * @see SAXDriver#attribute
1053      */
1054     private void parseAttribute (String name)
1055     throws Exception
1056     {
1057     String aname;
1058     int type;
1059     String value;
1060     int flags = LIT_ATTRIBUTE |  LIT_ENTITY_REF;
1061 
1062     // Read the attribute name.
1063     aname = readNmtoken (true);
1064     type = getAttributeType (name, aname);
1065 
1066     // Parse '='
1067     parseEq ();
1068 
1069     // Read the value, normalizing whitespace
1070     // unless it is CDATA.
1071     if (type == ATTRIBUTE_CDATA || type == ATTRIBUTE_UNDECLARED) {
1072         value = readLiteral (flags);
1073     } else {
1074         value = readLiteral (flags | LIT_NORMALIZE);
1075     }
1076 
1077     // WFC: no duplicate attributes
1078     for (int i = 0; i < tagAttributePos; i++)
1079         if (aname.equals (tagAttributes [i]))
1080         error ("duplicate attribute", aname, null);
1081 
1082     // Inform the handler about the
1083     // attribute.
1084     handler.attribute (aname, value, true);
1085     dataBufferPos = 0;
1086 
1087     // Note that the attribute has been
1088     // specified.
1089     if (tagAttributePos == tagAttributes.length) {
1090         String newAttrib[] = new String [tagAttributes.length * 2];
1091         System.arraycopy (tagAttributes, 0, newAttrib, 0, tagAttributePos);
1092         tagAttributes = newAttrib;
1093     }
1094     tagAttributes [tagAttributePos++] = aname;
1095     }
1096 
1097 
1098     /***
1099      * Parse an equals sign surrounded by optional whitespace.
1100      * <pre>
1101      * [25] Eq ::= S? '=' S?
1102      * </pre>
1103      */
1104     private void parseEq ()
1105     throws SAXException, IOException
1106     {
1107     skipWhitespace ();
1108     require ('=');
1109     skipWhitespace ();
1110     }
1111 
1112 
1113     /***
1114      * Parse an end tag.
1115      * <pre>
1116      * [42] ETag ::= '</' Name S? '>'
1117      * </pre>
1118      * <p>NOTE: parseContent () chains to here, we already read the
1119      * "&lt;/".
1120      */
1121     private void parseETag ()
1122     throws Exception
1123     {
1124     require (currentElement);
1125     skipWhitespace ();
1126     require ('>');
1127     handler.endElement (currentElement);
1128     // not re-reporting any SAXException re bogus end tags,
1129     // even though that diagnostic might be clearer ...
1130     }
1131 
1132 
1133     /***
1134      * Parse the content of an element.
1135      * <pre>
1136      * [43] content ::= (element | CharData | Reference
1137      *      | CDSect | PI | Comment)*
1138      * [67] Reference ::= EntityRef | CharRef
1139      * </pre>
1140      * <p> NOTE: consumes ETtag.
1141      */
1142     private void parseContent ()
1143     throws Exception
1144     {
1145     String data;
1146     char c;
1147 
1148     while (true) {
1149         switch (currentElementContent) {
1150             case CONTENT_ANY:
1151             case CONTENT_MIXED:
1152             case CONTENT_UNDECLARED:    // this line added by MHK 24 May 2000
1153             case CONTENT_EMPTY:         // this line added by MHK 8 Sept 2000
1154                 parseCharData ();
1155                 break;
1156             case CONTENT_ELEMENTS:
1157                 parseWhitespace ();
1158                 break;
1159         }
1160 
1161         // Handle delimiters
1162         c = readCh ();
1163         switch (c) {
1164 
1165         case '&':           // Found "&"
1166 
1167             c = readCh ();
1168             if (c == '#') {
1169                 parseCharRef ();
1170             } else {
1171                 unread (c);
1172                 parseEntityRef (true);
1173             }
1174             break;
1175 
1176         case '<':           // Found "<"
1177             dataBufferFlush ();
1178             c = readCh ();
1179             switch (c) {
1180               case '!':             // Found "<!"
1181                 c = readCh ();
1182                 switch (c) {
1183                   case '-':         // Found "<!-"
1184                     require ('-');
1185                     parseComment ();
1186                     break;
1187                   case '[':         // Found "<!["
1188                     require ("CDATA[");
1189                     handler.startCDATA ();
1190                     inCDATA = true;
1191                     parseCDSect ();
1192                     inCDATA = false;
1193                     handler.endCDATA ();
1194                     break;
1195                   default:
1196                     error ("expected comment or CDATA section", c, null);
1197                     break;
1198                 }
1199                 break;
1200 
1201               case '?':         // Found "<?"
1202                 parsePI ();
1203                 break;
1204 
1205               case '/':         // Found "</"
1206                 parseETag ();
1207                 return;
1208 
1209               default:      // Found "<" followed by something else
1210                 unread (c);
1211                 parseElement ();
1212                 break;
1213             }
1214             }
1215         }
1216     }
1217 
1218 
1219     /***
1220      * Parse an element type declaration.
1221      * <pre>
1222      * [45] elementdecl ::= '&lt;!ELEMENT' S Name S contentspec S? '&gt;'
1223      * </pre>
1224      * <p> NOTE: the '&lt;!ELEMENT' has already been read.
1225      */
1226     private void parseElementdecl ()
1227     throws Exception
1228     {
1229     String name;
1230 
1231     requireWhitespace ();
1232     // Read the element type name.
1233     name = readNmtoken (true);
1234 
1235     requireWhitespace ();
1236     // Read the content model.
1237     parseContentspec (name);
1238 
1239     skipWhitespace ();
1240     require ('>');
1241     }
1242 
1243 
1244     /***
1245      * Content specification.
1246      * <pre>
1247      * [46] contentspec ::= 'EMPTY' | 'ANY' | Mixed | elements
1248      * </pre>
1249      */
1250     private void parseContentspec (String name)
1251     throws Exception
1252     {
1253     if (tryRead ("EMPTY")) {
1254         setElement (name, CONTENT_EMPTY, null, null);
1255         return;
1256     } else if (tryRead ("ANY")) {
1257         setElement (name, CONTENT_ANY, null, null);
1258         return;
1259     } else {
1260         require ('(');
1261         dataBufferAppend ('(');
1262         skipWhitespace ();
1263         if (tryRead ("#PCDATA")) {
1264         dataBufferAppend ("#PCDATA");
1265         parseMixed ();
1266         setElement (name, CONTENT_MIXED, dataBufferToString (), null);
1267         } else {
1268         parseElements ();
1269         setElement (name, CONTENT_ELEMENTS,
1270             dataBufferToString (), null);
1271         }
1272     }
1273     }
1274 
1275 
1276     /***
1277      * Parse an element-content model.
1278      * <pre>
1279      * [47] elements ::= (choice | seq) ('?' | '*' | '+')?
1280      * [49] choice ::= '(' S? cp (S? '|' S? cp)+ S? ')'
1281      * [50] seq ::= '(' S? cp (S? ',' S? cp)* S? ')'
1282      * </pre>
1283      *
1284      * <p> NOTE: the opening '(' and S have already been read.
1285      */
1286     private void parseElements ()
1287     throws Exception
1288     {
1289     char c;
1290     char sep;
1291 
1292     // Parse the first content particle
1293     skipWhitespace ();
1294     parseCp ();
1295 
1296     // Check for end or for a separator.
1297     skipWhitespace ();
1298     c = readCh ();
1299     switch (c) {
1300     case ')':
1301         dataBufferAppend (')');
1302         c = readCh ();
1303         switch (c) {
1304         case '*':
1305         case '+':
1306         case '?':
1307         dataBufferAppend (c);
1308         break;
1309         default:
1310         unread (c);
1311         }
1312         return;
1313     case ',':           // Register the separator.
1314     case '|':
1315         sep = c;
1316         dataBufferAppend (c);
1317         break;
1318     default:
1319         error ("bad separator in content model", c, null);
1320         return;
1321     }
1322 
1323     // Parse the rest of the content model.
1324     while (true) {
1325         skipWhitespace ();
1326         parseCp ();
1327         skipWhitespace ();
1328         c = readCh ();
1329         if (c == ')') {
1330         dataBufferAppend (')');
1331         break;
1332         } else if (c != sep) {
1333         error ("bad separator in content model", c, null);
1334         return;
1335         } else {
1336         dataBufferAppend (c);
1337         }
1338     }
1339 
1340     // Check for the occurrence indicator.
1341     c = readCh ();
1342     switch (c) {
1343     case '?':
1344     case '*':
1345     case '+':
1346         dataBufferAppend (c);
1347         return;
1348     default:
1349         unread (c);
1350         return;
1351     }
1352     }
1353 
1354 
1355     /***
1356      * Parse a content particle.
1357      * <pre>
1358      * [48] cp ::= (Name | choice | seq) ('?' | '*' | '+')?
1359      * </pre>
1360      */
1361     private void parseCp ()
1362     throws Exception
1363     {
1364     char c;
1365 
1366     if (tryRead ('(')) {
1367         dataBufferAppend ('(');
1368         parseElements ();
1369     } else {
1370         dataBufferAppend (readNmtoken (true));
1371         c = readCh ();
1372         switch (c) {
1373         case '?':
1374         case '*':
1375         case '+':
1376         dataBufferAppend (c);
1377         break;
1378         default:
1379         unread (c);
1380         break;
1381         }
1382     }
1383     }
1384 
1385 
1386     /***
1387      * Parse mixed content.
1388      * <pre>
1389      * [51] Mixed ::= '(' S? ( '#PCDATA' (S? '|' S? Name)*) S? ')*'
1390      *        | '(' S? ('#PCDATA') S? ')'
1391      * </pre>
1392      */
1393     private void parseMixed ()
1394     throws Exception
1395     {
1396     char c;
1397 
1398     // Check for PCDATA alone.
1399     skipWhitespace ();
1400     if (tryRead (')')) {
1401         dataBufferAppend (")");
1402         if (tryRead ('*')) {
1403             dataBufferAppend("*");
1404         }
1405         return;
1406     }
1407 
1408     // Parse mixed content.
1409     skipWhitespace ();
1410     while (!tryRead (")*")) {
1411         require ('|');
1412         dataBufferAppend ('|');
1413         skipWhitespace ();
1414         dataBufferAppend (readNmtoken (true));
1415         skipWhitespace ();
1416     }
1417     dataBufferAppend (")*");
1418     }
1419 
1420 
1421     /***
1422      * Parse an attribute list declaration.
1423      * <pre>
1424      * [52] AttlistDecl ::= '&lt;!ATTLIST' S Name AttDef* S? '&gt;'
1425      * </pre>
1426      * <p>NOTE: the '&lt;!ATTLIST' has already been read.
1427      */
1428     private void parseAttlistDecl ()
1429     throws Exception
1430     {
1431     String elementName;
1432 
1433     requireWhitespace ();
1434     elementName = readNmtoken (true);
1435     boolean white = tryWhitespace ();
1436     while (!tryRead ('>')) {
1437         if (!white)
1438         error ("whitespace required before attribute definition");
1439         parseAttDef (elementName);
1440         white = tryWhitespace ();
1441     }
1442     }
1443 
1444 
1445     /***
1446      * Parse a single attribute definition.
1447      * <pre>
1448      * [53] AttDef ::= S Name S AttType S DefaultDecl
1449      * </pre>
1450      */
1451     private void parseAttDef (String elementName)
1452     throws Exception
1453     {
1454     String name;
1455     int type;
1456     String enumer = null;
1457 
1458     // Read the attribute name.
1459     name = readNmtoken (true);
1460 
1461     // Read the attribute type.
1462     requireWhitespace ();
1463     type = readAttType ();
1464 
1465     // Get the string of enumerated values
1466     // if necessary.
1467     if (type == ATTRIBUTE_ENUMERATED || type == ATTRIBUTE_NOTATION) {
1468         enumer = dataBufferToString ();
1469     }
1470 
1471     // Read the default value.
1472     requireWhitespace ();
1473     parseDefault (elementName, name, type, enumer);
1474     }
1475 
1476 
1477     /***
1478      * Parse the attribute type.
1479      * <pre>
1480      * [54] AttType ::= StringType | TokenizedType | EnumeratedType
1481      * [55] StringType ::= 'CDATA'
1482      * [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY'
1483      *      | 'ENTITIES' | 'NMTOKEN' | 'NMTOKENS'
1484      * [57] EnumeratedType ::= NotationType | Enumeration
1485      * </pre>
1486      */
1487     private int readAttType ()
1488     throws Exception
1489     {
1490     String typeString;
1491     Integer type;
1492 
1493     if (tryRead ('(')) {
1494         parseEnumeration (false);
1495         return ATTRIBUTE_ENUMERATED;
1496     } else {
1497         typeString = readNmtoken (true);
1498         if (typeString.equals ("NOTATION")) {
1499         parseNotationType ();
1500         }
1501         type = (Integer) attributeTypeHash.get (typeString);
1502         if (type == null) {
1503         error ("illegal attribute type", typeString, null);
1504         return ATTRIBUTE_UNDECLARED;
1505         } else {
1506         return type.intValue ();
1507         }
1508     }
1509     }
1510 
1511 
1512     /***
1513      * Parse an enumeration.
1514      * <pre>
1515      * [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')'
1516      * </pre>
1517      * <p>NOTE: the '(' has already been read.
1518      */
1519     private void parseEnumeration (boolean isNames)
1520     throws Exception
1521     {
1522     char c;
1523 
1524     dataBufferAppend ('(');
1525 
1526     // Read the first token.
1527     skipWhitespace ();
1528     dataBufferAppend (readNmtoken (isNames));
1529     // Read the remaining tokens.
1530     skipWhitespace ();
1531     while (!tryRead (')')) {
1532         require ('|');
1533         dataBufferAppend ('|');
1534         skipWhitespace ();
1535         dataBufferAppend (readNmtoken (isNames));
1536         skipWhitespace ();
1537     }
1538     dataBufferAppend (')');
1539     }
1540 
1541 
1542     /***
1543      * Parse a notation type for an attribute.
1544      * <pre>
1545      * [58] NotationType ::= 'NOTATION' S '(' S? NameNtoks
1546      *      (S? '|' S? name)* S? ')'
1547      * </pre>
1548      * <p>NOTE: the 'NOTATION' has already been read
1549      */
1550     private void parseNotationType ()
1551     throws Exception
1552     {
1553     requireWhitespace ();
1554     require ('(');
1555 
1556     parseEnumeration (true);
1557     }
1558 
1559 
1560     /***
1561      * Parse the default value for an attribute.
1562      * <pre>
1563      * [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED'
1564      *      | (('#FIXED' S)? AttValue)
1565      * </pre>
1566      */
1567     private void parseDefault (
1568     String elementName,
1569     String name,
1570     int type,
1571     String enumer
1572     ) throws Exception
1573     {
1574     int valueType = ATTRIBUTE_DEFAULT_SPECIFIED;
1575     String  value = null;
1576     int flags = LIT_ATTRIBUTE | LIT_DISABLE_CREF | LIT_ENTITY_CHECK;
1577 
1578     // Note: char refs not checked here, and input not normalized,
1579     // since it's done correctly later when we actually expand any
1580     // entity refs.  We ought to report char ref syntax errors now,
1581     // but don't.  Cost: unused defaults mean unreported WF errs.
1582     
1583     // LIT_ATTRIBUTE forces '<' checks now (ASAP) and turns whitespace
1584     // chars to spaces (doesn't matter when that's done if it doesn't
1585     // interfere with char refs expanding to whitespace).
1586 
1587     if (tryRead ('#')) {
1588         if (tryRead ("FIXED")) {
1589         valueType = ATTRIBUTE_DEFAULT_FIXED;
1590         requireWhitespace ();
1591         value = readLiteral (flags);
1592         } else if (tryRead ("REQUIRED")) {
1593         valueType = ATTRIBUTE_DEFAULT_REQUIRED;
1594         } else if (tryRead ("IMPLIED")) {
1595         valueType = ATTRIBUTE_DEFAULT_IMPLIED;
1596         } else {
1597         error ("illegal keyword for attribute default value");
1598         }
1599     } else
1600         value = readLiteral (flags);
1601     setAttribute (elementName, name, type, enumer, value, valueType);
1602     }
1603 
1604 
1605     /***
1606      * Parse a conditional section.
1607      * <pre>
1608      * [61] conditionalSect ::= includeSect || ignoreSect
1609      * [62] includeSect ::= '&lt;![' S? 'INCLUDE' S? '['
1610      *      extSubsetDecl ']]&gt;'
1611      * [63] ignoreSect ::= '&lt;![' S? 'IGNORE' S? '['
1612      *      ignoreSectContents* ']]&gt;'
1613      * [64] ignoreSectContents ::= Ignore
1614      *      ('&lt;![' ignoreSectContents* ']]&gt;' Ignore )*
1615      * [65] Ignore ::= Char* - (Char* ( '&lt;![' | ']]&gt;') Char* )
1616      * </pre>
1617      * <p> NOTE: the '&gt;![' has already been read.
1618      */
1619     private void parseConditionalSect ()
1620     throws Exception
1621     {
1622     skipWhitespace ();
1623     if (tryRead ("INCLUDE")) {
1624         skipWhitespace ();
1625         require ('[');
1626         skipWhitespace ();
1627         while (!tryRead ("]]>")) {
1628         parseMarkupdecl ();
1629         skipWhitespace ();
1630         }
1631     } else if (tryRead ("IGNORE")) {
1632         skipWhitespace ();
1633         require ('[');
1634         int nesting = 1;
1635         char c;
1636         expandPE = false;
1637         for (int nest = 1; nest > 0;) {
1638         c = readCh ();
1639         switch (c) {
1640         case '<':
1641             if (tryRead ("![")) {
1642             nest++;
1643             }
1644         case ']':
1645             if (tryRead ("]>")) {
1646             nest--;
1647             }
1648         }
1649         }
1650         expandPE = true;
1651     } else {
1652         error ("conditional section must begin with INCLUDE or IGNORE");
1653     }
1654     }
1655 
1656 
1657     /***
1658      * Read and interpret a character reference.
1659      * <pre>
1660      * [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
1661      * </pre>
1662      * <p>NOTE: the '&#' has already been read.
1663      */
1664     private void parseCharRef ()
1665     throws SAXException, IOException
1666     {
1667     int value = 0;
1668     char c;
1669 
1670     if (tryRead ('x')) {
1671 loop1:
1672         while (true) {
1673         c = readCh ();
1674         switch (c) {
1675         case '0':
1676         case '1':
1677         case '2':
1678         case '3':
1679         case '4':
1680         case '5':
1681         case '6':
1682         case '7':
1683         case '8':
1684         case '9':
1685         case 'a':
1686         case 'A':
1687         case 'b':
1688         case 'B':
1689         case 'c':
1690         case 'C':
1691         case 'd':
1692         case 'D':
1693         case 'e':
1694         case 'E':
1695         case 'f':
1696         case 'F':
1697             value *= 16;
1698             value += Integer.parseInt (new Character (c).toString (),
1699                     16);
1700             break;
1701         case ';':
1702             break loop1;
1703         default:
1704             error ("illegal character in character reference", c, null);
1705             break loop1;
1706         }
1707         }
1708     } else {
1709 loop2:
1710         while (true) {
1711         c = readCh ();
1712         switch (c) {
1713         case '0':
1714         case '1':
1715         case '2':
1716         case '3':
1717         case '4':
1718         case '5':
1719         case '6':
1720         case '7':
1721         case '8':
1722         case '9':
1723             value *= 10;
1724             value += Integer.parseInt (new Character (c).toString (),
1725                     10);
1726             break;
1727         case ';':
1728             break loop2;
1729         default:
1730             error ("illegal character in character reference", c, null);
1731             break loop2;
1732         }
1733         }
1734     }
1735 
1736     // check for character refs being legal XML
1737     if ((value < 0x0020
1738         && ! (value == '\n' || value == '\t' || value == '\r'))
1739         || (value >= 0xD800 && value <= 0xDFFF)
1740         || value == 0xFFFE || value == 0xFFFF
1741         || value > 0x0010ffff)
1742         error ("illegal XML character reference U+"
1743             + Integer.toHexString (value));
1744 
1745     // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz
1746     //  (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz:
1747     if (value <= 0x0000ffff) {
1748         // no surrogates needed
1749         dataBufferAppend ((char) value);
1750     } else if (value <= 0x0010ffff) {
1751         value -= 0x10000;
1752         // > 16 bits, surrogate needed
1753         dataBufferAppend ((char) (0xd800 | (value >> 10)));
1754         dataBufferAppend ((char) (0xdc00 | (value & 0x0003ff)));
1755     } else {
1756         // too big for surrogate
1757         error ("character reference " + value + " is too large for UTF-16",
1758            new Integer (value).toString (), null);
1759     }
1760     }
1761 
1762 
1763     /***
1764      * Parse and expand an entity reference.
1765      * <pre>
1766      * [68] EntityRef ::= '&' Name ';'
1767      * </pre>
1768      * <p>NOTE: the '&amp;' has already been read.
1769      * @param externalAllowed External entities are allowed here.
1770      */
1771     private void parseEntityRef (boolean externalAllowed)
1772     throws SAXException, IOException
1773     {
1774     String name;
1775 
1776     name = readNmtoken (true);
1777     require (';');
1778     switch (getEntityType (name)) {
1779     case ENTITY_UNDECLARED:
1780         error ("reference to undeclared entity", name, null);
1781         break;
1782     case ENTITY_INTERNAL:
1783         pushString (name, getEntityValue (name));
1784         break;
1785     case ENTITY_TEXT:
1786         if (externalAllowed) {
1787         pushURL (name, getEntityPublicId (name),
1788              getEntitySystemId (name),
1789              null, null, null);
1790         } else {
1791         error ("reference to external entity in attribute value.",
1792             name, null);
1793         }
1794         break;
1795     case ENTITY_NDATA:
1796         if (externalAllowed) {
1797         error ("unparsed entity reference in content", name, null);
1798         } else {
1799         error ("reference to external entity in attribute value.",
1800             name, null);
1801         }
1802         break;
1803     }
1804     }
1805 
1806 
1807     /***
1808      * Parse and expand a parameter entity reference.
1809      * <pre>
1810      * [69] PEReference ::= '%' Name ';'
1811      * </pre>
1812      * <p>NOTE: the '%' has already been read.
1813      */
1814     private void parsePEReference ()
1815     throws SAXException, IOException
1816     {
1817     String name;
1818 
1819     name = "%" + readNmtoken (true);
1820     require (';');
1821     switch (getEntityType (name)) {
1822     case ENTITY_UNDECLARED:
1823         // this is a validity problem, not a WFC violation ... but
1824         // we should disable handling of all subsequent declarations
1825         // unless this is a standalone document
1826         // warn ("reference to undeclared parameter entity", name, null);
1827 
1828         break;
1829     case ENTITY_INTERNAL:
1830         if (inLiteral)
1831         pushString (name, getEntityValue (name));
1832         else
1833         pushString (name, " " + getEntityValue (name) + ' ');
1834         break;
1835     case ENTITY_TEXT:
1836         if (!inLiteral)
1837         pushString (null, " ");
1838         pushURL (name, getEntityPublicId (name),
1839              getEntitySystemId (name),
1840              null, null, null);
1841         if (!inLiteral)
1842         pushString (null, " ");
1843         break;
1844     }
1845     }
1846 
1847     /***
1848      * Parse an entity declaration.
1849      * <pre>
1850      * [70] EntityDecl ::= GEDecl | PEDecl
1851      * [71] GEDecl ::= '&lt;!ENTITY' S Name S EntityDef S? '&gt;'
1852      * [72] PEDecl ::= '&lt;!ENTITY' S '%' S Name S PEDef S? '&gt;'
1853      * [73] EntityDef ::= EntityValue | (ExternalID NDataDecl?)
1854      * [74] PEDef ::= EntityValue | ExternalID
1855      * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
1856      *         | 'PUBLIC' S PubidLiteral S SystemLiteral
1857      * [76] NDataDecl ::= S 'NDATA' S Name
1858      * </pre>
1859      * <p>NOTE: the '&lt;!ENTITY' has already been read.
1860      */
1861     private void parseEntityDecl ()
1862     throws Exception
1863     {
1864     char c;
1865     boolean peFlag = false;
1866     String name, value, notationName, ids[];
1867 
1868     // Check for a parameter entity.
1869     expandPE = false;
1870     requireWhitespace ();
1871     if (tryRead ('%')) {
1872         peFlag = true;
1873         requireWhitespace ();
1874     }
1875     expandPE = true;
1876 
1877     // Read the entity name, and prepend
1878     // '%' if necessary.
1879     name = readNmtoken (true);
1880     if (peFlag) {
1881         name = "%" + name;
1882     }
1883 
1884     // Read the entity value.
1885     requireWhitespace ();
1886     c = readCh ();
1887     unread (c);
1888     if (c == '"' || c == '\'') {
1889         // Internal entity ... replacement text has expanded refs
1890         // to characters and PEs, but not to general entities
1891         value = readLiteral (0);
1892         setInternalEntity (name, value);
1893     } else {
1894         // Read the external IDs
1895         ids = readExternalIds (false);
1896         if (ids [1] == null) {
1897         error ("system identifer missing", name, null);
1898         }
1899 
1900         // Check for NDATA declaration.
1901         boolean white = tryWhitespace ();
1902         if (!peFlag && tryRead ("NDATA")) {
1903         if (!white)
1904             error ("whitespace required before NDATA");
1905         requireWhitespace ();
1906         notationName = readNmtoken (true);
1907         setExternalDataEntity (name, ids [0], ids [1], notationName);
1908         } else {
1909         setExternalTextEntity (name, ids [0], ids [1]);
1910         }
1911     }
1912 
1913     // Finish the declaration.
1914     skipWhitespace ();
1915     require ('>');
1916     }
1917 
1918 
1919     /***
1920      * Parse a notation declaration.
1921      * <pre>
1922      * [82] NotationDecl ::= '&lt;!NOTATION' S Name S
1923      *      (ExternalID | PublicID) S? '&gt;'
1924      * [83] PublicID ::= 'PUBLIC' S PubidLiteral
1925      * </pre>
1926      * <P>NOTE: the '&lt;!NOTATION' has already been read.
1927      */
1928     private void parseNotationDecl ()
1929     throws Exception
1930     {
1931     String nname, ids[];
1932 
1933 
1934     requireWhitespace ();
1935     nname = readNmtoken (true);
1936 
1937     requireWhitespace ();
1938 
1939     // Read the external identifiers.
1940     ids = readExternalIds (true);
1941     if (ids [0] == null && ids [1] == null) {
1942         error ("external identifer missing", nname, null);
1943     }
1944 
1945     // Register the notation.
1946     setNotation (nname, ids [0], ids [1]);
1947 
1948     skipWhitespace ();
1949     require ('>');
1950     }
1951 
1952 
1953     /***
1954      * Parse character data.
1955      * <pre>
1956      * [14] CharData ::= [^&lt;&amp;]* - ([^&lt;&amp;]* ']]&gt;' [^&lt;&amp;]*)
1957      * </pre>
1958      */
1959     private void parseCharData ()
1960     throws Exception
1961     {
1962     char c;
1963 
1964     // Start with a little cheat -- in most
1965     // cases, the entire sequence of
1966     // character data will already be in
1967     // the readBuffer; if not, fall through to
1968     // the normal approach.
1969     if (USE_CHEATS) {
1970         int lineAugment = 0;
1971         int columnAugment = 0;
1972 
1973 loop:
1974         for (int i = readBufferPos; i < readBufferLength; i++) {
1975         switch (c = readBuffer [i]) {
1976         case '\n':
1977             lineAugment++;
1978             columnAugment = 0;
1979             break;
1980         case '&':
1981         case '<':
1982             int start = readBufferPos;
1983             columnAugment++;
1984             readBufferPos = i;
1985             if (lineAugment > 0) {
1986             line += lineAugment;
1987             column = columnAugment;
1988             } else {
1989             column += columnAugment;
1990             }
1991             dataBufferAppend (readBuffer, start, i - start);
1992             return;
1993         case ']':
1994             // XXX missing two end-of-buffer cases
1995             if ((i + 2) < readBufferLength) {
1996             if (readBuffer [i + 1] == ']'
1997                 && readBuffer [i + 2] == '>') {
1998                 error ("character data may not contain ']]>'");
1999             }
2000             }
2001             columnAugment++;
2002             break;
2003         default:
2004             if (c < 0x0020 || c > 0xFFFD)
2005             error ("illegal XML character U+"
2006                 + Integer.toHexString (c));
2007             // FALLTHROUGH
2008         case '\r':
2009         case '\t':
2010             columnAugment++;
2011         }
2012         }
2013     }
2014 
2015     // OK, the cheat didn't work; start over
2016     // and do it by the book.
2017     while (true) {
2018         c = readCh ();
2019         switch (c) {
2020         case '<':
2021         case '&':
2022         unread (c);
2023         return;
2024         // XXX "]]>" precluded ...
2025         default:
2026         dataBufferAppend (c);
2027         break;
2028         }
2029     }
2030     }
2031 
2032 
2033     //////////////////////////////////////////////////////////////////////
2034     // High-level reading and scanning methods.
2035     //////////////////////////////////////////////////////////////////////
2036 
2037     /***
2038      * Require whitespace characters.
2039      */
2040     private void requireWhitespace ()
2041     throws SAXException, IOException
2042     {
2043     char c = readCh ();
2044     if (isWhitespace (c)) {
2045         skipWhitespace ();
2046     } else {
2047         error ("whitespace required", c, null);
2048     }
2049     }
2050 
2051 
2052     /***
2053      * Parse whitespace characters, and leave them in the data buffer.
2054      */
2055     private void parseWhitespace ()
2056     throws Exception
2057     {
2058     char c = readCh ();
2059     while (isWhitespace (c)) {
2060         dataBufferAppend (c);
2061         c = readCh ();
2062     }
2063     unread (c);
2064     }
2065 
2066 
2067     /***
2068      * Skip whitespace characters.
2069      * <pre>
2070      * [3] S ::= (#x20 | #x9 | #xd | #xa)+
2071      * </pre>
2072      */
2073     private void skipWhitespace ()
2074     throws SAXException, IOException
2075     {
2076     // Start with a little cheat.  Most of
2077     // the time, the white space will fall
2078     // within the current read buffer; if
2079     // not, then fall through.
2080     if (USE_CHEATS) {
2081         int lineAugment = 0;
2082         int columnAugment = 0;
2083 
2084 loop:
2085         for (int i = readBufferPos; i < readBufferLength; i++) {
2086         switch (readBuffer [i]) {
2087         case ' ':
2088         case '\t':
2089         case '\r':
2090             columnAugment++;
2091             break;
2092         case '\n':
2093             lineAugment++;
2094             columnAugment = 0;
2095             break;
2096         case '%':
2097             if (expandPE)
2098             break loop;
2099             // else fall through...
2100         default:
2101             readBufferPos = i;
2102             if (lineAugment > 0) {
2103             line += lineAugment;
2104             column = columnAugment;
2105             } else {
2106             column += columnAugment;
2107             }
2108             return;
2109         }
2110         }
2111     }
2112 
2113     // OK, do it by the book.
2114     char c = readCh ();
2115     while (isWhitespace (c)) {
2116         c = readCh ();
2117     }
2118     unread (c);
2119     }
2120 
2121 
2122     /***
2123      * Read a name or (when parsing an enumeration) name token.
2124      * <pre>
2125      * [5] Name ::= (Letter | '_' | ':') (NameChar)*
2126      * [7] Nmtoken ::= (NameChar)+
2127      * </pre>
2128      */
2129     private String readNmtoken (boolean isName)
2130     throws SAXException, IOException
2131     {
2132     char c;
2133 
2134     if (USE_CHEATS) {
2135 loop:
2136         for (int i = readBufferPos; i < readBufferLength; i++) {
2137         c = readBuffer [i];
2138         switch (c) {
2139           case '%':
2140             if (expandPE)
2141             break loop;
2142             // else fall through...
2143 
2144             // What may legitimately come AFTER a name/nmtoken?
2145           case '<': case '>': case '&':
2146           case ',': case '|': case '*': case '+': case '?':
2147           case ')':
2148           case '=':
2149           case '\'': case '"':
2150           case '[':
2151           case ' ': case '\t': case '\r': case '\n':
2152           case ';':
2153           case '/':
2154             int start = readBufferPos;
2155             if (i == start)
2156             error ("name expected", readBuffer [i], null);
2157             readBufferPos = i;
2158             return intern (readBuffer, start, i - start);
2159 
2160           default:
2161             // punt on exact tests from Appendix A; approximate
2162             // them using the Unicode ID start/part rules
2163             if (i == readBufferPos && isName) {
2164             if (!Character.isUnicodeIdentifierStart (c)
2165                 && c != ':' && c != '_')
2166                 error ("Not a name start character, U+"
2167                   + Integer.toHexString (c));
2168             } else if (!Character.isUnicodeIdentifierPart (c)
2169                 && c != '-' && c != ':' && c != '_' && c != '.'
2170                 && !isExtender (c))
2171             error ("Not a name character, U+"
2172                 + Integer.toHexString (c));
2173         }
2174         }
2175     }
2176 
2177     nameBufferPos = 0;
2178 
2179     // Read the first character.
2180 loop:
2181     while (true) {
2182         c = readCh ();
2183         switch (c) {
2184         case '%':
2185         case '<': case '>': case '&':
2186         case ',': case '|': case '*': case '+': case '?':
2187         case ')':
2188         case '=':
2189         case '\'': case '"':
2190         case '[':
2191         case ' ': case '\t': case '\n': case '\r':
2192         case ';':
2193         case '/':
2194         unread (c);
2195         if (nameBufferPos == 0) {
2196             error ("name expected");
2197         }
2198         // punt on exact tests from Appendix A, but approximate them
2199         if (isName
2200             && !Character.isUnicodeIdentifierStart (
2201                 nameBuffer [0])
2202             && ":_".indexOf (nameBuffer [0]) == -1)
2203             error ("Not a name start character, U+"
2204                   + Integer.toHexString (nameBuffer [0]));
2205         String s = intern (nameBuffer, 0, nameBufferPos);
2206         nameBufferPos = 0;
2207         return s;
2208         default:
2209         // punt on exact tests from Appendix A, but approximate them
2210 
2211         if ((nameBufferPos != 0 || !isName)
2212             && !Character.isUnicodeIdentifierPart (c)
2213             && ":-_.".indexOf (c) == -1
2214             && !isExtender (c))
2215             error ("Not a name character, U+"
2216                 + Integer.toHexString (c));
2217         if (nameBufferPos >= nameBuffer.length)
2218             nameBuffer =
2219             (char[]) extendArray (nameBuffer,
2220                     nameBuffer.length, nameBufferPos);
2221         nameBuffer [nameBufferPos++] = c;
2222         }
2223     }
2224     }
2225 
2226     private static boolean isExtender (char c)
2227     {
2228     // [88] Extender ::= ...
2229     return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387
2230            || c == 0x0640 || c == 0x0e46 || c == 0x0ec6 || c == 0x3005
2231            || (c >= 0x3031 && c <= 0x3035)
2232            || (c >= 0x309d && c <= 0x309e)
2233            || (c >= 0x30fc && c <= 0x30fe);
2234     }
2235 
2236 
2237     /***
2238      * Read a literal.  With matching single or double quotes as
2239      * delimiters (and not embedded!) this is used to parse:
2240      * <pre>
2241      *  [9] EntityValue ::= ... ([^%&amp;] | PEReference | Reference)* ...
2242      *  [10] AttValue ::= ... ([^<&] | Reference)* ...
2243      *  [11] SystemLiteral ::= ... (URLchar - "'")* ...
2244      *  [12] PubidLiteral ::= ... (PubidChar - "'")* ...
2245      * </pre>
2246      * as well as the quoted strings in XML and text declarations
2247      * (for version, encoding, and standalone) which have their
2248      * own constraints.
2249      */
2250     private String readLiteral (int flags)
2251     throws SAXException, IOException
2252     {
2253     char    delim, c;
2254     int startLine = line;
2255     boolean saved = expandPE;
2256 
2257     // Find the first delimiter.
2258     delim = readCh ();
2259     if (delim != '"' && delim != '\'' && delim != (char) 0) {
2260         error ("expected '\"' or \"'\"", delim, null);
2261         return null;
2262     }
2263     inLiteral = true;
2264     if ((flags & LIT_DISABLE_PE) != 0)
2265         expandPE = false;
2266 
2267     // Each level of input source has its own buffer; remember
2268     // ours, so we won't read the ending delimiter from any
2269     // other input source, regardless of entity processing.
2270     char ourBuf [] = readBuffer;
2271 
2272     // Read the literal.
2273     try {
2274         c = readCh ();
2275 loop:
2276         while (! (c == delim && readBuffer == ourBuf)) {
2277         switch (c) {
2278             // Can't escape this normalization for attributes
2279         case '\n':
2280         case '\r':
2281         case '\t':
2282             if ((flags & LIT_ATTRIBUTE) != 0)
2283             c = ' ';
2284             break;
2285         case '&':
2286             c = readCh ();
2287             // Char refs are expanded immediately, except for
2288             // all the cases where it's deferred.
2289             if (c == '#') {
2290             if ((flags & LIT_DISABLE_CREF) != 0) {
2291                 dataBufferAppend ('&');
2292                 dataBufferAppend ('#');
2293                 continue;
2294             }
2295             parseCharRef ();
2296 
2297             // It looks like an entity ref ...
2298             } else {
2299             unread (c);
2300             // Expand it?
2301             if ((flags & LIT_ENTITY_REF) > 0) {
2302                 parseEntityRef (false);
2303 
2304             // Is it just data?
2305             } else if ((flags & LIT_DISABLE_EREF) != 0) {
2306                 dataBufferAppend ('&');
2307 
2308             // OK, it will be an entity ref -- expanded later.
2309             } else {
2310                 String name = readNmtoken (true);
2311                 require (';');
2312                 if ((flags & LIT_ENTITY_CHECK) != 0
2313                     && getEntityType (name) ==
2314                         ENTITY_UNDECLARED) {
2315                 error ("General entity '" + name
2316                     + "' must be declared before use");
2317                 }
2318                 dataBufferAppend ('&');
2319                 dataBufferAppend (name);
2320                 dataBufferAppend (';');
2321             }
2322             }
2323             c = readCh ();
2324             continue loop;
2325 
2326         case '<':
2327             // and why?  Perhaps so "&foo;" expands the same
2328             // inside and outside an attribute?
2329             if ((flags & LIT_ATTRIBUTE) != 0)
2330             error ("attribute values may not contain '<'");
2331             break;
2332 
2333         // We don't worry about case '%' and PE refs, readCh does.
2334 
2335         default:
2336             break;
2337         }
2338         dataBufferAppend (c);
2339         c = readCh ();
2340         }
2341     } catch (EOFException e) {
2342         error ("end of input while looking for delimiter (started on line "
2343            + startLine + ')', null, new Character (delim).toString ());
2344     }
2345     inLiteral = false;
2346     expandPE = saved;
2347 
2348     // Normalise whitespace if necessary.
2349     if ((flags & LIT_NORMALIZE) > 0) {
2350         dataBufferNormalize ();
2351     }
2352 
2353     // Return the value.
2354     return dataBufferToString ();
2355     }
2356 
2357 
2358     /***
2359      * Try reading external identifiers.
2360      * A system identifier is not required for notations.
2361      * @param inNotation Are we in a notation?
2362      * @return A two-member String array containing the identifiers.
2363      */
2364     private String[] readExternalIds (boolean inNotation)
2365     throws Exception
2366     {
2367     char    c;
2368     String  ids[] = new String [2];
2369     int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
2370 
2371     if (tryRead ("PUBLIC")) {
2372         requireWhitespace ();
2373         ids [0] = readLiteral (LIT_NORMALIZE | flags);
2374         if (inNotation) {
2375         skipWhitespace ();
2376         c = readCh ();
2377         unread (c);
2378         if (c == '"' || c == '\'') {
2379             ids [1] = readLiteral (flags);
2380         }
2381         } else {
2382         requireWhitespace ();
2383         ids [1] = readLiteral (flags);
2384         }
2385 
2386         for (int i = 0; i < ids [0].length (); i++) {
2387         c = ids [0].charAt (i);
2388         if (c >= 'a' && c <= 'z')
2389             continue;
2390         if (c >= 'A' && c <= 'Z')
2391             continue;
2392         if (" \r\n0123456789-' ()+,./:=?;!*#@$_%".indexOf (c) != -1)
2393             continue;
2394         error ("illegal PUBLIC id character U+"
2395             + Integer.toHexString (c));
2396         }
2397     } else if (tryRead ("SYSTEM")) {
2398         requireWhitespace ();
2399         ids [1] = readLiteral (flags);
2400     }
2401 
2402     // XXX should normalize system IDs as follows:
2403     // - Convert to UTF-8
2404     // - Map reserved and non-ASCII characters to %HH
2405 
2406     return ids;
2407     }
2408 
2409 
2410     /***
2411      * Test if a character is whitespace.
2412      * <pre>
2413      * [3] S ::= (#x20 | #x9 | #xd | #xa)+
2414      * </pre>
2415      * @param c The character to test.
2416      * @return true if the character is whitespace.
2417      */
2418     private final boolean isWhitespace (char c)
2419     {
2420     if (c > 0x20)
2421         return false;
2422     if (c == 0x20 || c == 0x0a || c == 0x09 || c == 0x0d)
2423         return true;
2424     return false;   // illegal ...
2425     }
2426 
2427 
2428     //////////////////////////////////////////////////////////////////////
2429     // Utility routines.
2430     //////////////////////////////////////////////////////////////////////
2431 
2432 
2433     /***
2434      * Add a character to the data buffer.
2435      */
2436     private void dataBufferAppend (char c)
2437     {
2438     // Expand buffer if necessary.
2439     if (dataBufferPos >= dataBuffer.length)
2440         dataBuffer =
2441         (char[]) extendArray (dataBuffer,
2442             dataBuffer.length, dataBufferPos);
2443     dataBuffer [dataBufferPos++] = c;
2444     }
2445 
2446 
2447     /***
2448      * Add a string to the data buffer.
2449      */
2450     private void dataBufferAppend (String s)
2451     {
2452     dataBufferAppend (s.toCharArray (), 0, s.length ());
2453     }
2454 
2455 
2456     /***
2457      * Append (part of) a character array to the data buffer.
2458      */
2459     private void dataBufferAppend (char ch[], int start, int length)
2460     {
2461     dataBuffer = (char[])
2462         extendArray (dataBuffer, dataBuffer.length,
2463                     dataBufferPos + length);
2464 
2465     System.arraycopy (ch, start, dataBuffer, dataBufferPos, length);
2466     dataBufferPos += length;
2467     }
2468 
2469 
2470     /***
2471      * Normalise whitespace in the data buffer.
2472      */
2473     private void dataBufferNormalize ()
2474     {
2475     int i = 0;
2476     int j = 0;
2477     int end = dataBufferPos;
2478 
2479     // Skip whitespace at the start.
2480     while (j < end && isWhitespace (dataBuffer [j])) {
2481         j++;
2482     }
2483 
2484     // Skip whitespace at the end.
2485     while (end > j && isWhitespace (dataBuffer [end - 1])) {
2486         end --;
2487     }
2488 
2489     // Start copying to the left.
2490     while (j < end) {
2491 
2492         char c = dataBuffer [j++];
2493 
2494         // Normalise all other whitespace to
2495         // a single space.
2496         if (isWhitespace (c)) {
2497         while (j < end && isWhitespace (dataBuffer [j++])) {}
2498 
2499         dataBuffer [i++] = ' ';
2500         dataBuffer [i++] = dataBuffer [j - 1];
2501         } else {
2502         dataBuffer [i++] = c;
2503         }
2504     }
2505 
2506     // The new length is <= the old one.
2507     dataBufferPos = i;
2508     }
2509 
2510 
2511     /***
2512      * Convert the data buffer to a string.
2513      */
2514     private String dataBufferToString ()
2515     {
2516     String s = new String (dataBuffer, 0, dataBufferPos);
2517     dataBufferPos = 0;
2518     return s;
2519     }
2520 
2521 
2522     /***
2523      * Flush the contents of the data buffer to the handler, as
2524      * appropriate, and reset the buffer for new input.
2525      */
2526     private void dataBufferFlush ()
2527     throws SAXException
2528     {
2529     if (currentElementContent == CONTENT_ELEMENTS
2530         && dataBufferPos > 0
2531         && !inCDATA
2532         ) {
2533         // We can't just trust the buffer to be whitespace, there
2534         // are cases when it isn't
2535         for (int i = 0; i < dataBufferPos; i++) {
2536         if (!isWhitespace (dataBuffer [i])) {
2537             handler.charData (dataBuffer, 0, dataBufferPos);
2538             dataBufferPos = 0;
2539         }
2540         }
2541         if (dataBufferPos > 0) {
2542         handler.ignorableWhitespace (dataBuffer, 0, dataBufferPos);
2543         dataBufferPos = 0;
2544         }
2545     } else if (dataBufferPos > 0) {
2546         handler.charData (dataBuffer, 0, dataBufferPos);
2547         dataBufferPos = 0;
2548     }
2549     }
2550 
2551 
2552     /***
2553      * Require a string to appear, or throw an exception.
2554      * <p><em>Precondition:</em> Entity expansion is not required.
2555      * <p><em>Precondition:</em> data buffer has no characters that
2556      * will get sent to the application.
2557      */
2558     private void require (String delim)
2559     throws SAXException, IOException
2560     {
2561     int length = delim.length ();
2562     char    ch [];
2563         
2564     if (length < dataBuffer.length) {
2565         ch = dataBuffer;
2566         delim.getChars (0, length, ch, 0);
2567     } else
2568         ch = delim.toCharArray ();
2569 
2570     if (USE_CHEATS
2571         && length <= (readBufferLength - readBufferPos)) {
2572         int offset = readBufferPos;
2573 
2574         for (int i = 0; i < length; i++, offset++)
2575         if (ch [i] != readBuffer [offset])
2576             error ("required string", null, delim);
2577         readBufferPos = offset;
2578         
2579     } else {
2580         for (int i = 0; i < length; i++)
2581         require (ch [i]);
2582     }
2583     }
2584 
2585 
2586     /***
2587      * Require a character to appear, or throw an exception.
2588      */
2589     private void require (char delim)
2590     throws SAXException, IOException
2591     {
2592     char c = readCh ();
2593 
2594     if (c != delim) {
2595         error ("required character", c, new Character (delim).toString ());
2596     }
2597     }
2598 
2599 
2600     /***
2601      * Create an interned string from a character array.
2602      * &AElig;lfred uses this method to create an interned version
2603      * of all names and name tokens, so that it can test equality
2604      * with <code>==</code> instead of <code>String.equals ()</code>.
2605      *
2606      * <p>This is much more efficient than constructing a non-interned
2607      * string first, and then interning it.
2608      *
2609      * @param ch an array of characters for building the string.
2610      * @param start the starting position in the array.
2611      * @param length the number of characters to place in the string.
2612      * @return an interned string.
2613      * @see #intern (String)
2614      * @see java.lang.String#intern
2615      */
2616     public String intern (char ch[], int start, int length)
2617     {
2618     int index = 0;
2619     int hash = 0;
2620     Object  bucket [];
2621 
2622     // Generate a hash code.
2623     for (int i = start; i < start + length; i++)
2624         hash = 31 * hash + ch [i];
2625     hash = (hash & 0x7fffffff) % SYMBOL_TABLE_LENGTH;
2626 
2627     // Get the bucket -- consists of {array,String} pairs
2628     if ((bucket = symbolTable [hash]) == null) {
2629         // first string in this bucket
2630         bucket = new Object [8];
2631 
2632     // Search for a matching tuple, and
2633     // return the string if we find one.
2634     } else {
2635         while (index < bucket.length) {
2636         char chFound [] = (char []) bucket [index];
2637 
2638         // Stop when we hit a null index.
2639         if (chFound == null)
2640             break;
2641 
2642         // If they're the same length, check for a match.
2643         if (chFound.length == length) {
2644             for (int i = 0; i < chFound.length; i++) {
2645             // continue search on failure
2646             if (ch [start + i] != chFound [i]) {
2647                 break;
2648             } else if (i == length - 1) {
2649                 // That's it, we have a match!
2650                 return (String) bucket [index + 1];
2651             }
2652             }
2653         }
2654         index += 2;
2655         }
2656         // Not found -- we'll have to add it.
2657 
2658         // Do we have to grow the bucket?
2659         bucket = (Object []) extendArray (bucket, bucket.length, index);
2660     }
2661     symbolTable [hash] = bucket;
2662 
2663     // OK, add it to the end of the bucket -- "local" interning.
2664     // Intern "globally" to let applications share interning benefits.
2665     String s = new String (ch, start, length).intern ();
2666     bucket [index] = s.toCharArray ();
2667     bucket [index + 1] = s;
2668     return s;
2669     }
2670 
2671 
2672     /***
2673      * Ensure the capacity of an array, allocating a new one if
2674      * necessary.  Usually called only a handful of times.
2675      */
2676     private Object extendArray (Object array, int currentSize, int requiredSize)
2677     {
2678     if (requiredSize < currentSize) {
2679         return array;
2680     } else {
2681         Object newArray = null;
2682         int newSize = currentSize * 2;
2683 
2684         if (newSize <= requiredSize)
2685         newSize = requiredSize + 1;
2686 
2687         if (array instanceof char[])
2688         newArray = new char [newSize];
2689         else if (array instanceof Object[])
2690         newArray = new Object [newSize];
2691         else
2692         throw new RuntimeException ();
2693 
2694         System.arraycopy (array, 0, newArray, 0, currentSize);
2695         return newArray;
2696     }
2697     }
2698 
2699 
2700     //////////////////////////////////////////////////////////////////////
2701     // XML query routines.
2702     //////////////////////////////////////////////////////////////////////
2703 
2704 
2705     //
2706     // Elements
2707     //
2708 
2709     /***
2710      * Get the declared elements for an XML document.
2711      * <p>The results will be valid only after the DTD (if any) has been
2712      * parsed.
2713      * @return An enumeration of all element types declared for this
2714      *   document (as Strings).
2715      * @see #getElementContentType
2716      * @see #getElementContentModel
2717      */
2718     public Iterator declaredElements ()
2719     {
2720     return elementInfo.keySet().iterator();
2721     }
2722 
2723 
2724     /***
2725      * Look up the content type of an element.
2726      * @param element element info vector
2727      * @param defaultType value for null vector
2728      * @return An integer constant representing the content type.
2729      * @see #CONTENT_UNDECLARED
2730      * @see #CONTENT_ANY
2731      * @see #CONTENT_EMPTY
2732      * @see #CONTENT_MIXED
2733      * @see #CONTENT_ELEMENTS
2734      */
2735     private int getContentType (Object element [], int defaultType)
2736     {
2737     if (element == null)
2738         return defaultType;
2739     else
2740         return ((Integer) element [0]).intValue ();
2741     }
2742 
2743 
2744     /***
2745      * Look up the content type of an element.
2746      * @param name The element type name.
2747      * @return An integer constant representing the content type.
2748      * @see #getElementContentModel
2749      * @see #CONTENT_UNDECLARED
2750      * @see #CONTENT_ANY
2751      * @see #CONTENT_EMPTY
2752      * @see #CONTENT_MIXED
2753      * @see #CONTENT_ELEMENTS
2754      */
2755     public int getElementContentType (String name)
2756     {
2757     Object element [] = (Object []) elementInfo.get (name);
2758     return getContentType (element, CONTENT_UNDECLARED);
2759     }
2760 
2761 
2762     /***
2763      * Look up the content model of an element.
2764      * <p>The result will always be null unless the content type is
2765      * CONTENT_ELEMENTS or CONTENT_MIXED.
2766      * @param name The element type name.
2767      * @return The normalised content model, as a string.
2768      * @see #getElementContentType
2769      */
2770     public String getElementContentModel (String name)
2771     {
2772     Object element[] = (Object[]) elementInfo.get (name);
2773     if (element == null) {
2774         return null;
2775     } else {
2776         return (String) element [1];
2777     }
2778     }
2779 
2780 
2781     /***
2782      * Register an element.
2783      * Array format:
2784      *  element type
2785      *  attribute hash table
2786      */
2787     private void setElement (String name, int contentType,
2788               String contentModel, HashMap attributes)
2789     throws Exception
2790     {
2791     Object element[];
2792 
2793     // Try looking up the element
2794     element = (Object[]) elementInfo.get (name);
2795 
2796     // Make a new one if necessary.
2797     if (element == null) {
2798         element = new Object [3];
2799         element [0] = new Integer (CONTENT_UNDECLARED);
2800         element [1] = null;
2801         element [2] = null;
2802     } else if (contentType != CONTENT_UNDECLARED
2803         && ((Integer) element [0]).intValue () != CONTENT_UNDECLARED
2804         ) {
2805         // warn ("multiple declarations for element type", name, null);
2806         return;
2807     }
2808 
2809     // Insert the content type, if any.
2810     if (contentType != CONTENT_UNDECLARED) {
2811         element [0] = new Integer (contentType);
2812     }
2813 
2814     // Insert the content model, if any.
2815     if (contentModel != null) {
2816         element [1] = contentModel;
2817     }
2818 
2819     // Insert the attributes, if any.
2820     if (attributes != null) {
2821         element [2] = attributes;
2822     }
2823 
2824     // Save the element info.
2825     elementInfo.put (name, element);
2826     }
2827 
2828 
2829     /***
2830      * Look up the attribute hash table for an element.
2831      * The hash table is the second item in the element array.
2832      */
2833     private HashMap getElementAttributes (String name)
2834     {
2835     Object element[] = (Object[]) elementInfo.get (name);
2836     if (element == null) {
2837         return null;
2838     } else {
2839         return (HashMap) element [2];
2840     }
2841     }
2842 
2843 
2844 
2845     //
2846     // Attributes
2847     //
2848 
2849     /***
2850      * Get the declared attributes for an element type.
2851      * @param elname The name of the element type.
2852      * @return An Iterator of all the attributes declared for
2853      *   a specific element type.  The results will be valid only
2854      *   after the DTD (if any) has been parsed.
2855      * @see #getAttributeType
2856      * @see #getAttributeIterator
2857      * @see #getAttributeDefaultValueType
2858      * @see #getAttributeDefaultValue
2859      * @see #getAttributeExpandedValue
2860      */
2861     private Iterator declaredAttributes (Object element [])
2862     {
2863     HashMap attlist;
2864 
2865     if (element == null)
2866         return null;
2867     if ((attlist = (HashMap) element [2]) == null)
2868         return null;
2869     return attlist.keySet().iterator();
2870     }
2871 
2872     /***
2873      * Get the declared attributes for an element type.
2874      * @param elname The name of the element type.
2875      * @return An Iterator of all the attributes declared for
2876      *   a specific element type.  The results will be valid only
2877      *   after the DTD (if any) has been parsed.
2878      * @see #getAttributeType
2879      * @see #getAttributeIterator
2880      * @see #getAttributeDefaultValueType
2881      * @see #getAttributeDefaultValue
2882      * @see #getAttributeExpandedValue
2883      */
2884     public Iterator declaredAttributes (String elname)
2885     {
2886     return declaredAttributes ((Object []) elementInfo.get (elname));
2887     }
2888 
2889 
2890     /***
2891      * Retrieve the declared type of an attribute.
2892      * @param name The name of the associated element.
2893      * @param aname The name of the attribute.
2894      * @return An integer constant representing the attribute type.
2895      * @see #ATTRIBUTE_UNDECLARED
2896      * @see #ATTRIBUTE_CDATA
2897      * @see #ATTRIBUTE_ID
2898      * @see #ATTRIBUTE_IDREF
2899      * @see #ATTRIBUTE_IDREFS
2900      * @see #ATTRIBUTE_ENTITY
2901      * @see #ATTRIBUTE_ENTITIES
2902      * @see #ATTRIBUTE_NMTOKEN
2903      * @see #ATTRIBUTE_NMTOKENS
2904      * @see #ATTRIBUTE_ENUMERATED
2905      * @see #ATTRIBUTE_NOTATION
2906      */
2907     public int getAttributeType (String name, String aname)
2908     {
2909     Object attribute[] = getAttribute (name, aname);
2910     if (attribute == null) {
2911         return ATTRIBUTE_UNDECLARED;
2912     } else {
2913         return ((Integer) attribute [0]).intValue ();
2914     }
2915     }
2916 
2917 
2918     /***
2919      * Retrieve the allowed values for an enumerated attribute type.
2920      * @param name The name of the associated element.
2921      * @param aname The name of the attribute.
2922      * @return A string containing the token list.
2923      * @see #ATTRIBUTE_ENUMERATED
2924      * @see #ATTRIBUTE_NOTATION
2925      */
2926     public String getAttributeIterator (String name, String aname)
2927     {
2928     Object attribute[] = getAttribute (name, aname);
2929     if (attribute == null) {
2930         return null;
2931     } else {
2932         return (String) attribute [3];
2933     }
2934     }
2935 
2936 
2937     /***
2938      * Retrieve the default value of a declared attribute.
2939      * @param name The name of the associated element.
2940      * @param aname The name of the attribute.
2941      * @return The default value, or null if the attribute was
2942      *   #IMPLIED or simply undeclared and unspecified.
2943      * @see #getAttributeExpandedValue
2944      */
2945     public String getAttributeDefaultValue (String name, String aname)
2946     {
2947     Object attribute[] = getAttribute (name, aname);
2948     if (attribute == null) {
2949         return null;
2950     } else {
2951         return (String) attribute [1];
2952     }
2953     }
2954 
2955 
2956     /***
2957      * Retrieve the expanded value of a declared attribute.
2958      * <p>General entities will be expanded (once).
2959      * @param name The name of the associated element.
2960      * @param aname The name of the attribute.
2961      * @return The expanded default value, or null if the attribute was
2962      *   #IMPLIED or simply undeclared
2963      * @see #getAttributeDefaultValue
2964      */
2965     public String getAttributeExpandedValue (String name, String aname)
2966     throws Exception
2967     {
2968     Object attribute[] = getAttribute (name, aname);
2969 
2970     if (attribute == null) {
2971         return null;
2972     } else if (attribute [4] == null && attribute [1] != null) {
2973         // we MUST use the same buf for both quotes else the literal
2974         // can't be properly terminated
2975         char buf [] = new char [1];
2976         int flags = LIT_ENTITY_REF | LIT_ATTRIBUTE;
2977         int type = getAttributeType (name, aname);
2978 
2979         if (type != ATTRIBUTE_CDATA && type != ATTRIBUTE_UNDECLARED)
2980         flags |= LIT_NORMALIZE;
2981         buf [0] = '"';
2982         pushCharArray (null, buf, 0, 1);
2983         pushString (null, (String) attribute [1]);
2984         pushCharArray (null, buf, 0, 1);
2985         attribute [4] = readLiteral (flags);
2986     }
2987     return (String) attribute [4];
2988     }
2989 
2990 
2991     /***
2992      * Retrieve the default value type of a declared attribute.
2993      * @see #ATTRIBUTE_DEFAULT_SPECIFIED
2994      * @see #ATTRIBUTE_DEFAULT_IMPLIED
2995      * @see #ATTRIBUTE_DEFAULT_REQUIRED
2996      * @see #ATTRIBUTE_DEFAULT_FIXED
2997      */
2998     public int getAttributeDefaultValueType (String name, String aname)
2999     {
3000     Object attribute[] = getAttribute (name, aname);
3001     if (attribute == null) {
3002         return ATTRIBUTE_DEFAULT_UNDECLARED;
3003     } else {
3004         return ((Integer) attribute [2]).intValue ();
3005     }
3006     }
3007 
3008 
3009     /***
3010      * Register an attribute declaration for later retrieval.
3011      * Format:
3012      * - String type
3013      * - String default value
3014      * - int value type
3015      */
3016     private void setAttribute (String elName, String name, int type,
3017             String enumeration,
3018             String value, int valueType)
3019     throws Exception
3020     {
3021     HashMap attlist;
3022     Object attribute[];
3023 
3024     // Create a new hashtable if necessary.
3025     attlist = getElementAttributes (elName);
3026     if (attlist == null) {
3027         attlist = new HashMap ();
3028     }
3029 
3030     // ignore multiple attribute declarations!
3031     if (attlist.get (name) != null) {
3032         return;
3033     } else {
3034         attribute = new Object [5];
3035         attribute [0] = new Integer (type);
3036         attribute [1] = value;
3037         attribute [2] = new Integer (valueType);
3038         attribute [3] = enumeration;
3039         attribute [4] = null;
3040         attlist.put (name, attribute);
3041 
3042         // Use CONTENT_UNDECLARED to avoid overwriting
3043         // existing element declaration.
3044         setElement (elName, CONTENT_UNDECLARED, null, attlist);
3045     }
3046     }
3047 
3048 
3049     /***
3050      * Retrieve the three-member array representing an
3051      * attribute declaration.
3052      */
3053     private Object[] getAttribute (String elName, String name)
3054     {
3055     HashMap attlist;
3056     Object attribute[];
3057 
3058     attlist = getElementAttributes (elName);
3059     if (attlist == null) {
3060         return null;
3061     }
3062 
3063     attribute = (Object[]) attlist.get (name);
3064     return attribute;
3065     }
3066 
3067 
3068     //
3069     // Entities
3070     //
3071 
3072     /***
3073      * Get declared entities.
3074      * @return An Iterator of all the entities declared for
3075      *   this XML document.  The results will be valid only
3076      *   after the DTD (if any) has been parsed.
3077      * @see #getEntityType
3078      * @see #getEntityPublicId
3079      * @see #getEntitySystemId
3080      * @see #getEntityValue
3081      * @see #getEntityNotationName
3082      */
3083     public Iterator declaredEntities ()
3084     {
3085     return entityInfo.keySet().iterator();
3086     }
3087 
3088 
3089     /***
3090      * Find the type of an entity.
3091      * @returns An integer constant representing the entity type.
3092      * @see #ENTITY_UNDECLARED
3093      * @see #ENTITY_INTERNAL
3094      * @see #ENTITY_NDATA
3095      * @see #ENTITY_TEXT
3096      */
3097     public int getEntityType (String ename)
3098     {
3099     Object entity[] = (Object[]) entityInfo.get (ename);
3100     if (entity == null) {
3101         return ENTITY_UNDECLARED;
3102     } else {
3103         return ((Integer) entity [0]).intValue ();
3104     }
3105     }
3106 
3107 
3108     /***
3109      * Return an external entity's public identifier, if any.
3110      * @param ename The name of the external entity.
3111      * @return The entity's system identifier, or null if the
3112      *   entity was not declared, if it is not an
3113      *   external entity, or if no public identifier was
3114      *   provided.
3115      * @see #getEntityType
3116      */
3117     public String getEntityPublicId (String ename)
3118     {
3119     Object entity[] = (Object[]) entityInfo.get (ename);
3120     if (entity == null) {
3121         return null;
3122     } else {
3123         return (String) entity [1];
3124     }
3125     }
3126 
3127 
3128     /***
3129      * Return an external entity's system identifier.
3130      * @param ename The name of the external entity.
3131      * @return The entity's system identifier, or null if the
3132      *   entity was not declared, or if it is not an
3133      *   external entity.
3134      * @see #getEntityType
3135      */
3136     public String getEntitySystemId (String ename)
3137     {
3138     Object entity[] = (Object[]) entityInfo.get (ename);
3139     if (entity == null) {
3140         return null;
3141     } else {
3142         return (String) entity [2];
3143     }
3144     }
3145 
3146 
3147     /***
3148      * Return the value of an internal entity.
3149      * @param ename The name of the internal entity.
3150      * @return The entity's value, or null if the entity was
3151      *   not declared, or if it is not an internal entity.
3152      * @see #getEntityType
3153      */
3154     public String getEntityValue (String ename)
3155     {
3156     Object entity[] = (Object[]) entityInfo.get (ename);
3157     if (entity == null) {
3158         return null;
3159     } else {
3160         return (String) entity [3];
3161     }
3162     }
3163 
3164 
3165     /***
3166      * Get the notation name associated with an NDATA entity.
3167      * @param ename The NDATA entity name.
3168      * @return The associated notation name, or null if the
3169      *   entity was not declared, or if it is not an
3170      *   NDATA entity.
3171      * @see #getEntityType
3172      */
3173     public String getEntityNotationName (String eName)
3174     {
3175     Object entity[] = (Object[]) entityInfo.get (eName);
3176     if (entity == null) {
3177         return null;
3178     } else {
3179         return (String) entity [4];
3180     }
3181     }
3182 
3183 
3184     /***
3185      * Register an entity declaration for later retrieval.
3186      */
3187     private void setInternalEntity (String eName, String value)
3188     {
3189     setEntity (eName, ENTITY_INTERNAL, null, null, value, null);
3190     }
3191 
3192 
3193     /***
3194      * Register an external data entity.
3195      */
3196     private void setExternalDataEntity (String eName, String pubid,
3197                  String sysid, String nName)
3198     {
3199     setEntity (eName, ENTITY_NDATA, pubid, sysid, null, nName);
3200     }
3201 
3202 
3203     /***
3204      * Register an external text entity.
3205      */
3206     private void setExternalTextEntity (String eName,
3207             String pubid, String sysid)
3208     {
3209     setEntity (eName, ENTITY_TEXT, pubid, sysid, null, null);
3210     }
3211 
3212 
3213     /***
3214      * Register an entity declaration for later retrieval.
3215      */
3216     private void setEntity (String eName, int eClass,
3217              String pubid, String sysid,
3218              String value, String nName)
3219     {
3220     Object entity[];
3221 
3222     if (entityInfo.get (eName) == null) {
3223         entity = new Object [5];
3224         entity [0] = new Integer (eClass);
3225         entity [1] = pubid;
3226         entity [2] = sysid;
3227         entity [3] = value;
3228         entity [4] = nName;
3229 
3230         entityInfo.put (eName, entity);
3231     }
3232     }
3233 
3234 
3235     //
3236     // Notations.
3237     //
3238 
3239     /***
3240      * Get declared notations.
3241      * @return An Iterator of all the notations declared for
3242      *   this XML document.  The results will be valid only
3243      *   after the DTD (if any) has been parsed.
3244      * @see #getNotationPublicId
3245      * @see #getNotationSystemId
3246      */
3247     public Iterator declaredNotations ()
3248     {
3249     return notationInfo.keySet().iterator();
3250     }
3251 
3252 
3253     /***
3254      * Look up the public identifier for a notation.
3255      * You will normally use this method to look up a notation
3256      * that was provided as an attribute value or for an NDATA entity.
3257      * @param nname The name of the notation.
3258      * @return A string containing the public identifier, or null
3259      *   if none was provided or if no such notation was
3260      *   declared.
3261      * @see #getNotationSystemId
3262      */
3263     public String getNotationPublicId (String nname)
3264     {
3265     Object notation[] = (Object[]) notationInfo.get (nname);
3266     if (notation == null) {
3267         return null;
3268     } else {
3269         return (String) notation [0];
3270     }
3271     }
3272 
3273 
3274     /***
3275      * Look up the system identifier for a notation.
3276      * You will normally use this method to look up a notation
3277      * that was provided as an attribute value or for an NDATA entity.
3278      * @param nname The name of the notation.
3279      * @return A string containing the system identifier, or null
3280      *   if no such notation was declared.
3281      * @see #getNotationPublicId
3282      */
3283     public String getNotationSystemId (String nname)
3284     {
3285     Object notation[] = (Object[]) notationInfo.get (nname);
3286     if (notation == null) {
3287         return null;
3288     } else {
3289         return (String) notation [1];
3290     }
3291     }
3292 
3293 
3294     /***
3295      * Register a notation declaration for later retrieval.
3296      * Format:
3297      * - public id
3298      * - system id
3299      */
3300     private void setNotation (String nname, String pubid, String sysid)
3301     throws Exception
3302     {
3303     Object notation[];
3304 
3305     if (notationInfo.get (nname) == null) {
3306         notation = new Object [2];
3307         notation [0] = pubid;
3308         notation [1] = sysid;
3309         notationInfo.put (nname, notation);
3310     } else {
3311         // VC: Unique Notation Name
3312         // (it's not fatal)
3313     }
3314     }
3315 
3316 
3317     //
3318     // Location.
3319     //
3320 
3321 
3322     /***
3323      * Return the current line number.
3324      */
3325     public int getLineNumber ()
3326     {
3327     return line;
3328     }
3329 
3330 
3331     /***
3332      * Return the current column number.
3333      */
3334     public int getColumnNumber ()
3335     {
3336     return column;
3337     }
3338 
3339 
3340     //////////////////////////////////////////////////////////////////////
3341     // High-level I/O.
3342     //////////////////////////////////////////////////////////////////////
3343 
3344 
3345     /***
3346      * Read a single character from the readBuffer.
3347      * <p>The readDataChunk () method maintains the buffer.
3348      * <p>If we hit the end of an entity, try to pop the stack and
3349      * keep going.
3350      * <p> (This approach doesn't really enforce XML's rules about
3351      * entity boundaries, but this is not currently a validating
3352      * parser).
3353      * <p>This routine also attempts to keep track of the current
3354      * position in external entities, but it's not entirely accurate.
3355      * @return The next available input character.
3356      * @see #unread (char)
3357      * @see #unread (String)
3358      * @see #readDataChunk
3359      * @see #readBuffer
3360      * @see #line
3361      * @return The next character from the current input source.
3362      */
3363     private char readCh ()
3364     throws SAXException, IOException
3365     {
3366     char c;
3367 
3368     // As long as there's nothing in the
3369     // read buffer, try reading more data
3370     // (for an external entity) or popping
3371     // the entity stack (for either).
3372     while (readBufferPos >= readBufferLength) {
3373         switch (sourceType) {
3374         case INPUT_READER:
3375         case INPUT_EXTERNAL:
3376         case INPUT_STREAM:
3377         readDataChunk ();
3378         while (readBufferLength < 1) {
3379             popInput ();
3380             if (readBufferLength < 1) {
3381             readDataChunk ();
3382             }
3383         }
3384         break;
3385 
3386         default:
3387 
3388         popInput ();
3389         break;
3390         }
3391     }
3392 
3393     c = readBuffer [readBufferPos++];
3394 
3395     if (c == '\n') {
3396         line++;
3397         column = 0;
3398     } else {
3399         if (c == '<')
3400         /* favorite return to parseContent () .. NOP */ ;
3401         else if ((c < 0x0020 && (c != '\t') && (c != '\r')) || c > 0xFFFD)
3402         error ("illegal XML character U+"
3403             + Integer.toHexString (c));
3404 
3405         // If we're in the DTD and in a context where PEs get expanded,
3406         // do so ... 1/14/2000 errata identify those contexts.  There
3407         // are also spots in the internal subset where PE refs are fatal
3408         // errors, hence yet another flag.
3409         else if (c == '%' && expandPE) {
3410         if (peIsError)
3411             error ("PE reference within decl in internal subset.");
3412         parsePEReference ();
3413         return readCh ();
3414         }
3415         column++;
3416     }
3417 
3418     return c;
3419     }
3420 
3421 
3422     /***
3423      * Push a single character back onto the current input stream.
3424      * <p>This method usually pushes the character back onto
3425      * the readBuffer, while the unread (String) method treats the
3426      * string as a new internal entity.
3427      * <p>I don't think that this would ever be called with 
3428      * readBufferPos = 0, because the methods always reads a character
3429      * before unreading it, but just in case, I've added a boundary
3430      * condition.
3431      * @param c The character to push back.
3432      * @see #readCh
3433      * @see #unread (String)
3434      * @see #unread (char[])
3435      * @see #readBuffer
3436      */
3437     private void unread (char c)
3438     throws SAXException
3439     {
3440     // Normal condition.
3441     if (c == '\n') {
3442         line--;
3443         column = -1;
3444     }
3445     if (readBufferPos > 0) {
3446         readBuffer [--readBufferPos] = c;
3447     } else {
3448         pushString (null, new Character (c).toString ());
3449     }
3450     }
3451 
3452 
3453     /***
3454      * Push a char array back onto the current input stream.
3455      * <p>NOTE: you must <em>never</em> push back characters that you
3456      * haven't actually read: use pushString () instead.
3457      * @see #readCh
3458      * @see #unread (char)
3459      * @see #unread (String)
3460      * @see #readBuffer
3461      * @see #pushString
3462      */
3463     private void unread (char ch[], int length)
3464     throws SAXException
3465     {
3466     for (int i = 0; i < length; i++) {
3467         if (ch [i] == '\n') {
3468         line--;
3469         column = -1;
3470         }
3471     }
3472     if (length < readBufferPos) {
3473         readBufferPos -= length;
3474     } else {
3475         pushCharArray (null, ch, 0, length);
3476         sourceType = INPUT_BUFFER;
3477     }
3478     }
3479 
3480 
3481     /***
3482      * Push a new external input source.
3483      * The source will be some kind of parsed entity, such as a PE
3484      * (including the external DTD subset) or content for the body.
3485      * <p>TODO: Right now, this method always attempts to autodetect
3486      * the encoding; in the future, it should allow the caller to 
3487      * request an encoding explicitly, and it should also look at the
3488      * headers with an HTTP connection.
3489      * @param url The java.net.URL object for the entity.
3490      * @see SAXDriver#resolveEntity
3491      * @see #pushString
3492      * @see #sourceType
3493      * @see #pushInput
3494      * @see #detectEncoding
3495      * @see #sourceType
3496      * @see #readBuffer
3497      */
3498     private void pushURL (
3499     String      ename,
3500     String      publicId,
3501     String      systemId,
3502     Reader      reader,
3503     InputStream stream,
3504     String      encoding
3505     ) throws SAXException, IOException
3506     {
3507     URL url;
3508     boolean ignoreEncoding = false;
3509 
3510     // Push the existing status.
3511     pushInput (ename);
3512 
3513     // Create a new read buffer.
3514     // (Note the four-character margin)
3515     readBuffer = new char [READ_BUFFER_MAX + 4];
3516     readBufferPos = 0;
3517     readBufferLength = 0;
3518     readBufferOverflow = -1;
3519     is = null;
3520     line = 1;
3521 
3522     currentByteCount = 0;
3523 
3524     // Make any system ID (URI/URL) absolute.  There's one case
3525     // where it may be null:  parser was invoked without providing
3526     // one, e.g. since the XML data came from a memory buffer.
3527 
3528     if (systemId != null && externalEntity != null) {
3529         systemId = new URL (externalEntity.getURL (), systemId).toString ();
3530     } else if (baseURI != null) {
3531         systemId = new URL (new URL (baseURI), systemId).toString ();
3532         // throws IOException if couldn't create new URL
3533     }
3534 
3535     // See if the application wants to
3536     // redirect the system ID and/or
3537     // supply its own character stream.
3538     if (reader == null && stream == null && systemId != null) {
3539         Object input = handler.resolveEntity (publicId, systemId);
3540         if (input != null) {
3541         if (input instanceof String) {
3542             systemId = (String) input;
3543         } else if (input instanceof InputStream) {
3544             stream = (InputStream) input;
3545         } else if (input instanceof Reader) {
3546             reader = (Reader) input;
3547         }
3548         }
3549     }
3550 
3551     // Start the entity.
3552     if (systemId != null) {
3553         handler.startExternalEntity (systemId);
3554     } else {
3555         handler.startExternalEntity ("[unidentified data stream]");
3556     }
3557 
3558     // If there's an explicit character stream, just
3559     // ignore encoding declarations.
3560     if (reader != null) {
3561         sourceType = INPUT_READER;
3562         this.reader = reader;
3563         tryEncodingDecl (true);
3564         return;
3565     }
3566     
3567     // Else we handle the conversion, and need to ensure
3568     // it's done right.
3569     if (stream != null) {
3570         sourceType = INPUT_STREAM;
3571         is = stream;
3572         url = null;
3573     } else {
3574         // We have to open our own stream to the URL.
3575 
3576         // Set the new status
3577         sourceType = INPUT_EXTERNAL;
3578         url = new URL (systemId);
3579 
3580         externalEntity = url.openConnection ();
3581         externalEntity.connect ();
3582         is = externalEntity.getInputStream ();
3583     }
3584 
3585     // If we get to here, there must be
3586     // an InputStream available.
3587     if (!is.markSupported ()) {
3588         is = new BufferedInputStream (is);
3589     }
3590 
3591     // Get any external encoding label.
3592     if (encoding == null && externalEntity != null) {
3593         // External labels can be untrustworthy; filesystems in
3594         // particular often have the wrong default for content
3595         // that wasn't locally originated.  Those we autodetect.
3596         if (!"file".equals (externalEntity.getURL ().getProtocol ())) {
3597         int temp;
3598 
3599         // application/xml;charset=something;otherAttr=...
3600         // ... with many variants on 'something'
3601         encoding = externalEntity.getContentType ();
3602 
3603         // MHK code (fix for Saxon 5.5.1/007): protect against encoding==null
3604         if (encoding==null) {
3605             temp = -1;
3606         } else {
3607             temp = encoding.indexOf ("charset");
3608         }
3609 
3610         // RFC 2376 sez MIME text defaults to ASCII, but since the
3611         // JDK will create a MIME type out of thin air, we always
3612         // autodetect when there's no explicit charset attribute.
3613         if (temp < 0)
3614             encoding = null;    // autodetect
3615         else {
3616             temp = encoding.indexOf ('=', temp + 7);
3617             encoding = encoding.substring (temp + 1);
3618             if ((temp = encoding.indexOf (';')) > 0)
3619             encoding = encoding.substring (0, temp);
3620 
3621             // attributes can have comment fields (RFC 822)
3622             if ((temp = encoding.indexOf ('(')) > 0)
3623             encoding = encoding.substring (0, temp);
3624             // ... and values may be quoted
3625             if ((temp = encoding.indexOf ('"')) > 0)
3626             encoding = encoding.substring (temp + 1,
3627                 encoding.indexOf ('"', temp + 2));
3628             encoding.trim ();
3629         }
3630         }
3631     }
3632 
3633     // if we got an external encoding label, use it ...
3634     if (encoding != null) {
3635         this.encoding = ENCODING_EXTERNAL;
3636         setupDecoding (encoding);
3637         ignoreEncoding = true;
3638     
3639     // ... else autodetect
3640     } else {
3641         detectEncoding ();
3642         ignoreEncoding = false;
3643     }
3644 
3645     // Read any XML or text declaration.
3646     tryEncodingDecl (ignoreEncoding);
3647     }
3648 
3649 
3650     /***
3651      * Check for an encoding declaration.  This is the second part of the
3652      * XML encoding autodetection algorithm, relying on detectEncoding to
3653      * get to the point that this part can read any encoding declaration
3654      * in the document (using only US-ASCII characters).
3655      *
3656      * <p> Because this part starts to fill parser buffers with this data,
3657      * it's tricky to to a reader so that Java's built-in decoders can be
3658      * used for the character encodings that aren't built in to this parser
3659      * (such as EUC-JP, KOI8-R, Big5, etc).
3660      *
3661      * @return any encoding in the declaration, uppercased; or null
3662      * @see detectEncoding
3663      */
3664     private String tryEncodingDecl (boolean ignoreEncoding)
3665     throws SAXException, IOException
3666     {
3667     // Read the XML/text declaration.
3668     if (tryRead ("<?xml")) {
3669         dataBufferFlush ();
3670         if (tryWhitespace ()) {
3671         if (inputStack.size () > 0) {
3672             return parseTextDecl (ignoreEncoding);
3673         } else {
3674             return parseXMLDecl (ignoreEncoding);
3675         }
3676         } else {
3677         unread ("xml".toCharArray (), 3);
3678         parsePI ();
3679         }
3680     }
3681     return null;
3682     }
3683 
3684 
3685     /***
3686      * Attempt to detect the encoding of an entity.
3687      * <p>The trick here (as suggested in the XML standard) is that
3688      * any entity not in UTF-8, or in UCS-2 with a byte-order mark, 
3689      * <b>must</b> begin with an XML declaration or an encoding
3690      * declaration; we simply have to look for "&lt;?xml" in various
3691      * encodings.
3692      * <p>This method has no way to distinguish among 8-bit encodings.
3693      * Instead, it sets up for UTF-8, then (possibly) revises its assumption
3694      * later in setupDecoding ().  Any ASCII-derived 8-bit encoding
3695      * should work, but most will be rejected later by setupDecoding ().
3696      * <p>I don't currently detect EBCDIC, since I'm concerned that it
3697      * could also be a valid UTF-8 sequence; I'll have to do more checking
3698      * later.
3699      * @see #tryEncoding (byte[], byte, byte, byte, byte)
3700      * @see #tryEncoding (byte[], byte, byte)
3701      * @see #setupDecoding
3702      * @see #read8bitEncodingDeclaration
3703      */
3704     private void detectEncoding ()
3705     throws SAXException, IOException
3706     {
3707     byte signature[] = new byte [4];
3708 
3709     // Read the first four bytes for
3710     // autodetection.
3711     is.mark (4);
3712     is.read (signature);
3713     is.reset ();
3714 
3715     //
3716     // FIRST:  four byte encodings (who uses these?)
3717     //
3718     if (tryEncoding (signature, (byte) 0x00, (byte) 0x00,
3719               (byte) 0x00, (byte) 0x3c)) {
3720         // UCS-4 must begin with "<?xml"
3721         // 0x00 0x00 0x00 0x3c: UCS-4, big-endian (1234)
3722         encoding = ENCODING_UCS_4_1234;
3723 
3724     } else if (tryEncoding (signature, (byte) 0x3c, (byte) 0x00,
3725                  (byte) 0x00, (byte) 0x00)) {
3726         // 0x3c 0x00 0x00 0x00: UCS-4, little-endian (4321)
3727         encoding = ENCODING_UCS_4_4321;
3728 
3729     } else if (tryEncoding (signature, (byte) 0x00, (byte) 0x00,
3730                  (byte) 0x3c, (byte) 0x00)) {
3731         // 0x00 0x00 0x3c 0x00: UCS-4, unusual (2143)
3732         encoding = ENCODING_UCS_4_2143;
3733 
3734     } else if (tryEncoding (signature, (byte) 0x00, (byte) 0x3c,
3735                  (byte) 0x00, (byte) 0x00)) {
3736         // 0x00 0x3c 0x00 0x00: UCS-4, unusual (3421)
3737         encoding = ENCODING_UCS_4_3412;
3738 
3739         // 00 00 fe ff UCS_4_1234 (with BOM)
3740         // ff fe 00 00 UCS_4_4321 (with BOM)
3741     }
3742 
3743     //
3744     // SECOND:  two byte encodings
3745     // note ... with 1/14/2000 errata the XML spec identifies some
3746     // more "broken UTF-16" autodetection cases, with no XML decl,
3747     // which we don't handle here (that's legal too).
3748     //
3749     else if (tryEncoding (signature, (byte) 0xfe, (byte) 0xff)) {
3750         // UCS-2 with a byte-order marker. (UTF-16)
3751         // 0xfe 0xff: UCS-2, big-endian (12)
3752         encoding = ENCODING_UCS_2_12;
3753         is.read (); is.read ();
3754 
3755     } else if (tryEncoding (signature, (byte) 0xff, (byte) 0xfe)) {
3756         // UCS-2 with a byte-order marker. (UTF-16)
3757         // 0xff 0xfe: UCS-2, little-endian (21)
3758         encoding = ENCODING_UCS_2_21;
3759         is.read (); is.read ();
3760 
3761     } else if (tryEncoding (signature, (byte) 0x00, (byte) 0x3c,
3762                  (byte) 0x00, (byte) 0x3f)) {
3763         // UTF-16-BE (otherwise, malformed UTF-16)
3764         // 0x00 0x3c 0x00 0x3f: UCS-2, big-endian, no byte-order mark
3765         encoding = ENCODING_UCS_2_12;
3766         error ("no byte-order mark for UCS-2 entity");
3767 
3768     } else if (tryEncoding (signature, (byte) 0x3c, (byte) 0x00,
3769                  (byte) 0x3f, (byte) 0x00)) {
3770         // UTF-16-LE (otherwise, malformed UTF-16)
3771         // 0x3c 0x00 0x3f 0x00: UCS-2, little-endian, no byte-order mark
3772         encoding = ENCODING_UCS_2_21;
3773         error ("no byte-order mark for UCS-2 entity");
3774     }
3775 
3776     //
3777     // THIRD:  ASCII-derived encodings, fixed and variable lengths
3778     //
3779     else if (tryEncoding (signature, (byte) 0x3c, (byte) 0x3f,
3780                    (byte) 0x78, (byte) 0x6d)) {
3781         // ASCII derived
3782         // 0x3c 0x3f 0x78 0x6d: UTF-8 or other 8-bit markup (read ENCODING)
3783         encoding = ENCODING_UTF_8;
3784         read8bitEncodingDeclaration ();
3785 
3786     } else {
3787         // 4c 6f a7 94 ... we don't understand EBCDIC flavors
3788         // ... but we COULD at least kick in some fixed code page
3789 
3790         // (default) UTF-8 without encoding/XML declaration
3791         encoding = ENCODING_UTF_8;
3792     }
3793     }
3794 
3795 
3796     /***
3797      * Check for a four-byte signature.
3798      * <p>Utility routine for detectEncoding ().
3799      * <p>Always looks for some part of "<?XML" in a specific encoding.
3800      * @param sig The first four bytes read.
3801      * @param b1 The first byte of the signature
3802      * @param b2 The second byte of the signature
3803      * @param b3 The third byte of the signature
3804      * @param b4 The fourth byte of the signature
3805      * @see #detectEncoding
3806      */
3807     private static boolean tryEncoding (
3808     byte sig[], byte b1, byte b2, byte b3, byte b4)
3809     {
3810     return (sig [0] == b1 && sig [1] == b2
3811         && sig [2] == b3 && sig [3] == b4);
3812     }
3813 
3814 
3815     /***
3816      * Check for a two-byte signature.
3817      * <p>Looks for a UCS-2 byte-order mark.
3818      * <p>Utility routine for detectEncoding ().
3819      * @param sig The first four bytes read.
3820      * @param b1 The first byte of the signature
3821      * @param b2 The second byte of the signature
3822      * @see #detectEncoding
3823      */
3824     private static boolean tryEncoding (byte sig[], byte b1, byte b2)
3825     {
3826     return ((sig [0] == b1) && (sig [1] == b2));
3827     }
3828 
3829 
3830     /***
3831      * This method pushes a string back onto input.
3832      * <p>It is useful either as the expansion of an internal entity, 
3833      * or for backtracking during the parse.
3834      * <p>Call pushCharArray () to do the actual work.
3835      * @param s The string to push back onto input.
3836      * @see #pushCharArray
3837      */
3838     private void pushString (String ename, String s)
3839     throws SAXException
3840     {
3841     char ch[] = s.toCharArray ();
3842     pushCharArray (ename, ch, 0, ch.length);
3843     }
3844 
3845 
3846     /***
3847      * Push a new internal input source.
3848      * <p>This method is useful for expanding an internal entity,
3849      * or for unreading a string of characters.  It creates a new
3850      * readBuffer containing the characters in the array, instead
3851      * of characters converted from an input byte stream.
3852      * @param ch The char array to push.
3853      * @see #pushString
3854      * @see #pushURL
3855      * @see #readBuffer
3856      * @see #sourceType
3857      * @see #pushInput
3858      */
3859     private void pushCharArray (String ename, char ch[], int start, int length)
3860     throws SAXException
3861     {
3862     // Push the existing status
3863     pushInput (ename);
3864     sourceType = INPUT_INTERNAL;
3865     readBuffer = ch;
3866     readBufferPos = start;
3867     readBufferLength = length;
3868     readBufferOverflow = -1;
3869     }
3870 
3871 
3872     /***
3873      * Save the current input source onto the stack.
3874      * <p>This method saves all of the global variables associated with
3875      * the current input source, so that they can be restored when a new
3876      * input source has finished.  It also tests for entity recursion.
3877      * <p>The method saves the following global variables onto a stack
3878      * using a fixed-length array:
3879      * <ol>
3880      * <li>sourceType
3881      * <li>externalEntity
3882      * <li>readBuffer
3883      * <li>readBufferPos
3884      * <li>readBufferLength
3885      * <li>line
3886      * <li>encoding
3887      * </ol>
3888      * @param ename The name of the entity (if any) causing the new input.
3889      * @see #popInput
3890      * @see #sourceType
3891      * @see #externalEntity
3892      * @see #readBuffer
3893      * @see #readBufferPos
3894      * @see #readBufferLength
3895      * @see #line
3896      * @see #encoding
3897      */
3898     private void pushInput (String ename)
3899     throws SAXException
3900     {
3901     Object input[] = new Object [12];
3902 
3903     // Check for entity recursion.
3904     if (ename != null) {
3905         Iterator entities = entityStack.iterator ();
3906         while (entities.hasNext ()) {
3907         String e = (String) entities.next ();
3908         if (e == ename) {
3909             error ("recursive reference to entity", ename, null);
3910         }
3911         }
3912     }
3913     entityStack.add (ename);
3914 
3915     // Don't bother if there is no current input.
3916     if (sourceType == INPUT_NONE) {
3917         return;
3918     }
3919 
3920     // Set up a snapshot of the current
3921     // input source.
3922     input [0] = new Integer (sourceType);
3923     input [1] = externalEntity;
3924     input [2] = readBuffer;
3925     input [3] = new Integer (readBufferPos);
3926     input [4] = new Integer (readBufferLength);
3927     input [5] = new Integer (line);
3928     input [6] = new Integer (encoding);
3929     input [7] = new Integer (readBufferOverflow);
3930     input [8] = is;
3931     input [9] = new Integer (currentByteCount);
3932     input [10] = new Integer (column);
3933     input [11] = reader;
3934 
3935     // Push it onto the stack.
3936     inputStack.add (input);
3937     }
3938 
3939 
3940     /***
3941      * Restore a previous input source.
3942      * <p>This method restores all of the global variables associated with
3943      * the current input source.
3944      * @exception java.io.EOFException
3945      *    If there are no more entries on the input stack.
3946      * @see #pushInput
3947      * @see #sourceType
3948      * @see #externalEntity
3949      * @see #readBuffer
3950      * @see #readBufferPos
3951      * @see #readBufferLength
3952      * @see #line
3953      * @see #encoding
3954      */
3955     private void popInput ()
3956     throws SAXException, IOException
3957     {
3958     Object input[];
3959 
3960 
3961     switch (sourceType) {
3962 
3963     case INPUT_EXTERNAL:
3964         if (externalEntity != null) {
3965         handler.endExternalEntity (
3966             externalEntity.getURL ().toString ());
3967         }
3968         break;
3969     case INPUT_STREAM:
3970         if (baseURI != null) {
3971         handler.endExternalEntity (baseURI);
3972         }
3973         is.close ();
3974         break;
3975     case INPUT_READER:
3976         if (baseURI != null) {
3977         handler.endExternalEntity (baseURI);
3978         }
3979         reader.close ();
3980         break;
3981     }
3982 
3983     // Throw an EOFException if there
3984     // is nothing else to pop.
3985     if (inputStack.isEmpty ()) {
3986         throw new EOFException ("no more input");
3987     } else {
3988         String s;
3989         input = (Object[]) inputStack.remove ( inputStack.size() - 1 );
3990         s = (String) entityStack.remove ( entityStack.size() - 1 );
3991     }
3992 
3993     sourceType = ((Integer) input [0]).intValue ();
3994     externalEntity = (URLConnection) input [1];
3995     readBuffer = (char[]) input [2];
3996     readBufferPos = ((Integer) input [3]).intValue ();
3997     readBufferLength = ((Integer) input [4]).intValue ();
3998     line = ((Integer) input [5]).intValue ();
3999     encoding = ((Integer) input [6]).intValue ();
4000     readBufferOverflow = ((Integer) input [7]).intValue ();
4001     is = (InputStream) input [8];
4002     currentByteCount = ((Integer) input [9]).intValue ();
4003     column = ((Integer) input [10]).intValue ();
4004     reader = (Reader) input [11];
4005     }
4006 
4007 
4008     /***
4009      * Return true if we can read the expected character.
4010      * <p>Note that the character will be removed from the input stream
4011      * on success, but will be put back on failure.  Do not attempt to
4012      * read the character again if the method succeeds.
4013      * @param delim The character that should appear next.  For a
4014      *        insensitive match, you must supply this in upper-case.
4015      * @return true if the character was successfully read, or false if
4016      *   it was not.
4017      * @see #tryRead (String)
4018      */
4019     private boolean tryRead (char delim)
4020     throws SAXException, IOException
4021     {
4022     char c;
4023 
4024     // Read the character
4025     c = readCh ();
4026 
4027     // Test for a match, and push the character
4028     // back if the match fails.
4029     if (c == delim) {
4030         return true;
4031     } else {
4032         unread (c);
4033         return false;
4034     }
4035     }
4036 
4037 
4038     /***
4039      * Return true if we can read the expected string.
4040      * <p>This is simply a convenience method.
4041      * <p>Note that the string will be removed from the input stream
4042      * on success, but will be put back on failure.  Do not attempt to
4043      * read the string again if the method succeeds.
4044      * <p>This method will push back a character rather than an
4045      * array whenever possible (probably the majority of cases).
4046      * <p><b>NOTE:</b> This method currently has a hard-coded limit
4047      * of 100 characters for the delimiter.
4048      * @param delim The string that should appear next.
4049      * @return true if the string was successfully read, or false if
4050      *   it was not.
4051      * @see #tryRead (char)
4052      */
4053     private boolean tryRead (String delim)
4054     throws SAXException, IOException
4055     {
4056     char ch[] = delim.toCharArray ();
4057     char c;
4058 
4059     // Compare the input, character-
4060     // by character.
4061 
4062     for (int i = 0; i < ch.length; i++) {
4063         c = readCh ();
4064         if (c != ch [i]) {
4065         unread (c);
4066         if (i != 0) {
4067             unread (ch, i);
4068         }
4069         return false;
4070         }
4071     }
4072     return true;
4073     }
4074 
4075 
4076 
4077     /***
4078      * Return true if we can read some whitespace.
4079      * <p>This is simply a convenience method.
4080      * <p>This method will push back a character rather than an
4081      * array whenever possible (probably the majority of cases).
4082      * @return true if whitespace was found.
4083      */
4084     private boolean tryWhitespace ()
4085     throws SAXException, IOException
4086     {
4087     char c;
4088     c = readCh ();
4089     if (isWhitespace (c)) {
4090         skipWhitespace ();
4091         return true;
4092     } else {
4093         unread (c);
4094         return false;
4095     }
4096     }
4097 
4098 
4099     /***
4100      * Read all data until we find the specified string.
4101      * This is useful for scanning CDATA sections and PIs.
4102      * <p>This is inefficient right now, since it calls tryRead ()
4103      * for every character.
4104      * @param delim The string delimiter
4105      * @see #tryRead (String)
4106      * @see #readCh
4107      */
4108     private void parseUntil (String delim)
4109     throws SAXException, IOException
4110     {
4111     char c;
4112     int startLine = line;
4113 
4114     try {
4115         while (!tryRead (delim)) {
4116         c = readCh ();
4117         dataBufferAppend (c);
4118         }
4119     } catch (EOFException e) {
4120         error ("end of input while looking for delimiter "
4121         + "(started on line " + startLine
4122         + ')', null, delim);
4123     }
4124     }
4125 
4126 
4127     /***
4128      * Read just the encoding declaration (or XML declaration) at the 
4129      * start of an external entity.
4130      * When this method is called, we know that the declaration is
4131      * present (or appears to be).  We also know that the entity is
4132      * in some sort of ASCII-derived 8-bit encoding.
4133      * The idea of this is to let us read what the 8-bit encoding is
4134      * before we've committed to converting any more of the file; the
4135      * XML or encoding declaration must be in 7-bit ASCII, so we're
4136      * safe as long as we don't go past it.
4137      */
4138     private void read8bitEncodingDeclaration ()
4139     throws SAXException, IOException
4140     {
4141     int ch;
4142     readBufferPos = readBufferLength = 0;
4143 
4144     while (true) {
4145         ch = is.read ();
4146         readBuffer [readBufferLength++] = (char) ch;
4147         switch (ch) {
4148           case (int) '>':
4149         return;
4150           case - 1:
4151         error ("end of file before end of XML or encoding declaration.",
4152                null, "?>");
4153         }
4154         if (readBuffer.length == readBufferLength)
4155         error ("unfinished XML or encoding declaration");
4156     }
4157     }
4158 
4159 
4160     //////////////////////////////////////////////////////////////////////
4161     // Low-level I/O.
4162     //////////////////////////////////////////////////////////////////////
4163 
4164 
4165     /***
4166      * Read a chunk of data from an external input source.
4167      * <p>This is simply a front-end that fills the rawReadBuffer
4168      * with bytes, then calls the appropriate encoding handler.
4169      * @see #encoding
4170      * @see #rawReadBuffer
4171      * @see #readBuffer
4172      * @see #filterCR
4173      * @see #copyUtf8ReadBuffer
4174      * @see #copyIso8859_1ReadBuffer
4175      * @see #copyUcs_2ReadBuffer
4176      * @see #copyUcs_4ReadBuffer
4177      */
4178     private void readDataChunk ()
4179     throws SAXException, IOException
4180     {
4181     int count, i, j;
4182 
4183     // See if we have any overflow (filterCR sets for CR at end)
4184     if (readBufferOverflow > -1) {
4185         readBuffer [0] = (char) readBufferOverflow;
4186         readBufferOverflow = -1;
4187         readBufferPos = 1;
4188         sawCR = true;
4189     } else {
4190         readBufferPos = 0;
4191         sawCR = false;
4192     }
4193 
4194     // input from a character stream.
4195     if (sourceType == INPUT_READER) {
4196         count = reader.read (readBuffer,
4197                 readBufferPos, READ_BUFFER_MAX - readBufferPos);
4198         if (count < 0)
4199         readBufferLength = readBufferPos;
4200         else
4201         readBufferLength = readBufferPos + count;
4202         if (readBufferLength > 0)
4203         filterCR (count >= 0);
4204         sawCR = false;
4205         return;
4206     }
4207 
4208     // Read as many bytes as possible into the raw buffer.
4209     count = is.read (rawReadBuffer, 0, READ_BUFFER_MAX);
4210 
4211     // Dispatch to an encoding-specific reader method to populate
4212     // the readBuffer.  In most parser speed profiles, these routines
4213     // show up at the top of the CPU usage chart.
4214     if (count > 0) {
4215         switch (encoding) {
4216           // one byte builtins
4217           case ENCODING_ASCII:
4218         copyIso8859_1ReadBuffer (count, (char) 0x0080);
4219         break;
4220           case ENCODING_UTF_8:
4221         copyUtf8ReadBuffer (count);
4222         break;
4223           case ENCODING_ISO_8859_1:
4224         copyIso8859_1ReadBuffer (count, (char) 0);
4225         break;
4226 
4227           // two byte builtins
4228           case ENCODING_UCS_2_12:
4229         copyUcs2ReadBuffer (count, 8, 0);
4230         break;
4231           case ENCODING_UCS_2_21:
4232         copyUcs2ReadBuffer (count, 0, 8);
4233         break;
4234 
4235           // four byte builtins
4236           case ENCODING_UCS_4_1234:
4237         copyUcs4ReadBuffer (count, 24, 16, 8, 0);
4238         break;
4239           case ENCODING_UCS_4_4321:
4240         copyUcs4ReadBuffer (count, 0, 8, 16, 24);
4241         break;
4242           case ENCODING_UCS_4_2143:
4243         copyUcs4ReadBuffer (count, 16, 24, 0, 8);
4244         break;
4245           case ENCODING_UCS_4_3412:
4246         copyUcs4ReadBuffer (count, 8, 0, 24, 16);
4247         break;
4248         }
4249     } else
4250         readBufferLength = readBufferPos;
4251 
4252     readBufferPos = 0;
4253 
4254     // Filter out all carriage returns if we've seen any
4255     // (including any saved from a previous read)
4256     if (sawCR) {
4257         filterCR (count >= 0);
4258         sawCR = false;
4259 
4260         // must actively report EOF, lest some CRs get lost.
4261         if (readBufferLength == 0 && count >= 0)
4262         readDataChunk ();
4263     }
4264 
4265     if (count > 0)
4266         currentByteCount += count;
4267     }
4268 
4269 
4270     /***
4271      * Filter carriage returns in the read buffer.
4272      * CRLF becomes LF; CR becomes LF.
4273      * @param moreData true iff more data might come from the same source
4274      * @see #readDataChunk
4275      * @see #readBuffer
4276      * @see #readBufferOverflow
4277      */
4278     private void filterCR (boolean moreData)
4279     {
4280     int i, j;
4281 
4282     readBufferOverflow = -1;
4283 
4284 loop:
4285     for (i = j = readBufferPos; j < readBufferLength; i++, j++) {
4286         switch (readBuffer [j]) {
4287         case '\r':
4288         if (j == readBufferLength - 1) {
4289             if (moreData) {
4290             readBufferOverflow = '\r';
4291             readBufferLength--;
4292             } else  // CR at end of buffer
4293             readBuffer [i++] = '\n';
4294             break loop;
4295         } else if (readBuffer [j + 1] == '\n') {
4296             j++;
4297         }
4298         readBuffer [i] = '\n';
4299         break;
4300 
4301         case '\n':
4302         default:
4303         readBuffer [i] = readBuffer [j];
4304         break;
4305         }
4306     }
4307     readBufferLength = i;
4308     }
4309 
4310     /***
4311      * Convert a buffer of UTF-8-encoded bytes into UTF-16 characters.
4312      * <p>When readDataChunk () calls this method, the raw bytes are in 
4313      * rawReadBuffer, and the final characters will appear in 
4314      * readBuffer.
4315      * @param count The number of bytes to convert.
4316      * @see #readDataChunk
4317      * @see #rawReadBuffer
4318      * @see #readBuffer
4319      * @see #getNextUtf8Byte
4320      */
4321     private void copyUtf8ReadBuffer (int count)
4322     throws SAXException, IOException
4323     {
4324     int i = 0;
4325     int j = readBufferPos;
4326     int b1;
4327     char    c = 0;
4328 
4329     /*
4330     // check once, so the runtime won't (if it's smart enough)
4331     if (count < 0 || count > rawReadBuffer.length)
4332         throw new ArrayIndexOutOfBoundsException (Integer.toString (count));
4333     */
4334 
4335     while (i < count) {
4336         b1 = rawReadBuffer [i++];
4337 
4338         // Determine whether we are dealing
4339         // with a one-, two-, three-, or four-
4340         // byte sequence.
4341         if (b1 < 0) {
4342         if ((b1 & 0xe0) == 0xc0) {
4343             // 2-byte sequence: 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
4344             c = (char) (((b1 & 0x1f) << 6)
4345                 | getNextUtf8Byte (i++, count));
4346         } else if ((b1 & 0xf0) == 0xe0) {
4347             // 3-byte sequence:
4348             // zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
4349             // most CJKV characters
4350             c = (char) (((b1 & 0x0f) << 12) |
4351                    (getNextUtf8Byte (i++, count) << 6) |
4352                    getNextUtf8Byte (i++, count));
4353         } else if ((b1 & 0xf8) == 0xf0) {
4354             // 4-byte sequence: 11101110wwwwzzzzyy + 110111yyyyxxxxxx
4355             //     = 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
4356             // (uuuuu = wwww + 1)
4357             // "Surrogate Pairs" ... from the "Astral Planes"
4358             int iso646 = b1 & 07;
4359             iso646 = (iso646 << 6) + getNextUtf8Byte (i++, count);
4360             iso646 = (iso646 << 6) + getNextUtf8Byte (i++, count);
4361             iso646 = (iso646 << 6) + getNextUtf8Byte (i++, count);
4362 
4363             if (iso646 <= 0xffff) {
4364             c = (char) iso646;
4365             } else {
4366             if (iso646 > 0x0010ffff)
4367                 encodingError (
4368                 "UTF-8 value out of range for Unicode",
4369                 iso646, 0);
4370             iso646 -= 0x010000;
4371             readBuffer [j++] = (char) (0xd800 | (iso646 >> 10));
4372             readBuffer [j++] = (char) (0xdc00 | (iso646 & 0x03ff));
4373             continue;
4374             }
4375         } else {
4376             // The five and six byte encodings aren't supported;
4377             // they exceed the Unicode (and XML) range.
4378             encodingError (
4379                 "unsupported five or six byte UTF-8 sequence",
4380                 0xff & b1, i);
4381             // NOTREACHED
4382             c = 0;
4383         }
4384         } else {
4385         // 1-byte sequence: 000000000xxxxxxx = 0xxxxxxx
4386         // (US-ASCII character, "common" case, one branch to here)
4387         c = (char) b1;
4388         }
4389         readBuffer [j++] = c;
4390         if (c == '\r')
4391         sawCR = true;
4392     }
4393     // How many characters have we read?
4394     readBufferLength = j;
4395     }
4396 
4397 
4398     /***
4399      * Return the next byte value in a UTF-8 sequence.
4400      * If it is not possible to get a byte from the current
4401      * entity, throw an exception.
4402      * @param pos The current position in the rawReadBuffer.
4403      * @param count The number of bytes in the rawReadBuffer
4404      * @return The significant six bits of a non-initial byte in
4405      *   a UTF-8 sequence.
4406      * @exception EOFException If the sequence is incomplete.
4407      */
4408     private int getNextUtf8Byte (int pos, int count)
4409     throws SAXException, IOException
4410     {
4411     int val;
4412 
4413     // Take a character from the buffer
4414     // or from the actual input stream.
4415     if (pos < count) {
4416         val = rawReadBuffer [pos];
4417     } else {
4418         val = is.read ();
4419         if (val == -1) {
4420         encodingError ("unfinished multi-byte UTF-8 sequence at EOF",
4421             -1, pos);
4422         }
4423     }
4424 
4425     // Check for the correct bits at the start.
4426     if ((val & 0xc0) != 0x80) {
4427         encodingError ("bad continuation of multi-byte UTF-8 sequence",
4428             val, pos + 1);
4429     }
4430 
4431     // Return the significant bits.
4432     return (val & 0x3f);
4433     }
4434 
4435 
4436     /***
4437      * Convert a buffer of US-ASCII or ISO-8859-1-encoded bytes into
4438      * UTF-16 characters.
4439      *
4440      * <p>When readDataChunk () calls this method, the raw bytes are in 
4441      * rawReadBuffer, and the final characters will appear in 
4442      * readBuffer.
4443      *
4444      * @param count The number of bytes to convert.
4445      * @param mask Bits that must be clear in every character: 0x0080 for US-ASCII (rejects any non-ASCII byte); 0 for ISO-8859-1 (accepts all bytes).
4446      * @see #readDataChunk
4447      * @see #rawReadBuffer
4448      * @see #readBuffer
4449      */
4450     private void copyIso8859_1ReadBuffer (int count, char mask)
4451     throws IOException
4452     {
4453     int i, j;
4454     for (i = 0, j = readBufferPos; i < count; i++, j++) {
4455         char c = (char) (rawReadBuffer [i] & 0xff);
4456         if ((c & mask) != 0)
4457         throw new CharConversionException ("non-ASCII character U+"
4458                             + Integer.toHexString (c));
4459         readBuffer [j] = c;
4460         if (c == '\r') {
4461         sawCR = true;
4462         }
4463     }
4464     readBufferLength = j;
4465     }
4466 
4467 
4468     /***
4469      * Convert a buffer of UCS-2-encoded bytes into UTF-16 characters
4470      * (as used in Java string manipulation).
4471      *
4472      * <p>When readDataChunk () calls this method, the raw bytes are in 
4473      * rawReadBuffer, and the final characters will appear in 
4474      * readBuffer.
4475      * @param count The number of bytes to convert.
4476      * @param shift1 The number of bits to shift byte 1.
4477      * @param shift2 The number of bits to shift byte 2
4478      * @see #readDataChunk
4479      * @see #rawReadBuffer
4480      * @see #readBuffer
4481      */
4482     private void copyUcs2ReadBuffer (int count, int shift1, int shift2)
4483     throws SAXException
4484     {
4485     int j = readBufferPos;
4486 
4487     if (count > 0 && (count % 2) != 0) {
4488         encodingError ("odd number of bytes in UCS-2 encoding", -1, count);
4489     }
4490     // The loops are faster with less internal brancing; hence two
4491     if (shift1 == 0) {  // "UTF-16-LE"
4492         for (int i = 0; i < count; i += 2) {
4493         char c = (char) (rawReadBuffer [i + 1] << 8);
4494         c |= 0xff & rawReadBuffer [i];
4495         readBuffer [j++] = c;
4496         if (c == '\r')
4497             sawCR = true;
4498         }
4499     } else {    // "UTF-16-BE"
4500         for (int i = 0; i < count; i += 2) {
4501         char c = (char) (rawReadBuffer [i] << 8);
4502         c |= 0xff & rawReadBuffer [i + 1];
4503         readBuffer [j++] = c;
4504         if (c == '\r')
4505             sawCR = true;
4506         }
4507     }
4508     readBufferLength = j;
4509     }
4510 
4511 
4512     /***
4513      * Convert a buffer of UCS-4-encoded bytes into UTF-16 characters.
4514      *
4515      * <p>When readDataChunk () calls this method, the raw bytes are in 
4516      * rawReadBuffer, and the final characters will appear in 
4517      * readBuffer.
4518      * <p>Java has Unicode chars, and this routine uses surrogate pairs
4519      * for ISO-10646 values between 0x00010000 and 0x000fffff.  An
4520      * exception is thrown if the ISO-10646 character has no Unicode
4521      * representation.
4522      *
4523      * @param count The number of bytes to convert.
4524      * @param shift1 The number of bits to shift byte 1.
4525      * @param shift2 The number of bits to shift byte 2
4526      * @param shift3 The number of bits to shift byte 3
4527      * @param shift4 The number of bits to shift byte 4
4528      * @see #readDataChunk
4529      * @see #rawReadBuffer
4530      * @see #readBuffer
4531      */
4532     private void copyUcs4ReadBuffer (int count, int shift1, int shift2,
4533                   int shift3, int shift4)
4534     throws SAXException
4535     {
4536     int j = readBufferPos;
4537     int value;
4538 
4539     if (count > 0 && (count % 4) != 0) {
4540         encodingError (
4541             "number of bytes in UCS-4 encoding not divisible by 4",
4542             -1, count);
4543     }
4544     for (int i = 0; i < count; i += 4) {
4545         value = (((rawReadBuffer [i] & 0xff) << shift1) |
4546               ((rawReadBuffer [i + 1] & 0xff) << shift2) |
4547               ((rawReadBuffer [i + 2] & 0xff) << shift3) |
4548               ((rawReadBuffer [i + 3] & 0xff) << shift4));
4549         if (value < 0x0000ffff) {
4550         readBuffer [j++] = (char) value;
4551         if (value == (int) '\r') {
4552             sawCR = true;
4553         }
4554         } else if (value < 0x0010ffff) {
4555         value -= 0x010000;
4556         readBuffer [j++] = (char) (0xd8 | ((value >> 10) & 0x03ff));
4557         readBuffer [j++] = (char) (0xdc | (value & 0x03ff));
4558         } else {
4559         encodingError ("UCS-4 value out of range for Unicode",
4560                    value, i);
4561         }
4562     }
4563     readBufferLength = j;
4564     }
4565 
4566 
4567     /***
4568      * Report a character encoding error.
4569      */
4570     private void encodingError (String message, int value, int offset)
4571     throws SAXException
4572     {
4573     String uri;
4574 
4575     if (value != -1) {
4576         message = message + " (character code: 0x" +
4577               Integer.toHexString (value) + ')';
4578     }
4579     if (externalEntity != null) {
4580         uri = externalEntity.getURL ().toString ();
4581     } else {
4582         uri = baseURI;
4583     }
4584     handler.error (message, uri, -1, offset + currentByteCount);
4585     }
4586 
4587 
4588     //////////////////////////////////////////////////////////////////////
4589     // Local Variables.
4590     //////////////////////////////////////////////////////////////////////
4591 
4592     /***
4593      * Re-initialize the variables for each parse.
4594      */
4595     private void initializeVariables ()
4596     {
4597     // First line
4598     line = 1;
4599     column = 0;
4600 
4601     // Set up the buffers for data and names
4602     dataBufferPos = 0;
4603     dataBuffer = new char [DATA_BUFFER_INITIAL];
4604     nameBufferPos = 0;
4605     nameBuffer = new char [NAME_BUFFER_INITIAL];
4606 
4607     // Set up the DTD hash tables
4608     elementInfo = new HashMap ();
4609     entityInfo = new HashMap ();
4610     notationInfo = new HashMap ();
4611 
4612     // Set up the variables for the current
4613     // element context.
4614     currentElement = null;
4615     currentElementContent = CONTENT_UNDECLARED;
4616 
4617     // Set up the input variables
4618     sourceType = INPUT_NONE;
4619     inputStack = new ArrayList ();
4620     entityStack = new ArrayList ();
4621     externalEntity = null;
4622     tagAttributePos = 0;
4623     tagAttributes = new String [100];
4624     rawReadBuffer = new byte [READ_BUFFER_MAX];
4625     readBufferOverflow = -1;
4626 
4627     inLiteral = false;
4628     expandPE = false;
4629     peIsError = false;
4630 
4631     inCDATA = false;
4632 
4633     symbolTable = new Object [SYMBOL_TABLE_LENGTH][];
4634     }
4635 
4636 
4637     /***
4638      * Clean up after the parse to allow some garbage collection.
4639      */
4640     private void cleanupVariables ()
4641     {
4642     dataBuffer = null;
4643     nameBuffer = null;
4644 
4645     elementInfo = null;
4646     entityInfo = null;
4647     notationInfo = null;
4648 
4649     currentElement = null;
4650 
4651     inputStack = null;
4652     entityStack = null;
4653     externalEntity = null;
4654 
4655     tagAttributes = null;
4656     rawReadBuffer = null;
4657 
4658     symbolTable = null;
4659     }
4660 
4661     //
4662     // The current XML handler interface.
4663     //
4664     private SAXDriver   handler;
4665 
4666     //
4667     // I/O information.
4668     //
4669     private Reader  reader;     // current reader
4670     private InputStream is;         // current input stream
4671     private int     line;       // current line number
4672     private int     column;     // current column number
4673     private int     sourceType;     // type of input source
4674     private ArrayList   inputStack;     // stack of input soruces
4675     private URLConnection externalEntity; // current external entity
4676     private int     encoding;   // current character encoding
4677     private int     currentByteCount; // bytes read from current source
4678 
4679     //
4680     // Buffers for decoded but unparsed character input.
4681     //
4682     private char    readBuffer [];
4683     private int     readBufferPos;
4684     private int     readBufferLength;
4685     private int     readBufferOverflow;  // overflow from last data chunk.
4686 
4687 
4688     //
4689     // Buffer for undecoded raw byte input.
4690     //
4691     private final static int READ_BUFFER_MAX = 16384;
4692     private byte    rawReadBuffer [];
4693 
4694 
4695     //
4696     // Buffer for parsed character data.
4697     //
4698     private static int DATA_BUFFER_INITIAL = 4096;
4699     private char    dataBuffer [];
4700     private int     dataBufferPos;
4701 
4702     //
4703     // Buffer for parsed names.
4704     //
4705     private static int NAME_BUFFER_INITIAL = 1024;
4706     private char    nameBuffer [];
4707     private int     nameBufferPos;
4708 
4709 
4710     //
4711     // HashMaps for DTD information on elements, entities, and notations.
4712     //
4713     private HashMap elementInfo;
4714     private HashMap entityInfo;
4715     private HashMap notationInfo;
4716 
4717 
4718     //
4719     // Element type currently in force.
4720     //
4721     private String  currentElement;
4722     private int     currentElementContent;
4723 
4724     //
4725     // Base external identifiers for resolution.
4726     //
4727     private String  basePublicId;
4728     private String  baseURI;
4729     private int     baseEncoding;
4730     private Reader  baseReader;
4731     private InputStream baseInputStream;
4732     private char    baseInputBuffer [];
4733     private int     baseInputBufferStart;
4734     private int     baseInputBufferLength;
4735 
4736     //
4737     // Stack of entity names, to detect recursion.
4738     //
4739     private ArrayList   entityStack;
4740 
4741     //
4742     // PE expansion is enabled in most chunks of the DTD, not all.
4743     // When it's enabled, literals are treated differently.
4744     //
4745     private boolean inLiteral;
4746     private boolean expandPE;
4747     private boolean peIsError;
4748 
4749     //
4750     // Symbol table, for caching interned names.
4751     //
4752     private final static int SYMBOL_TABLE_LENGTH = 1087;
4753     private Object  symbolTable [][];
4754 
4755     //
4756     // Hash table of attributes found in current start tag.
4757     //
4758     private String  tagAttributes [];
4759     private int     tagAttributePos;
4760 
4761     //
4762     // Utility flag: have we noticed a CR while reading the last
4763     // data chunk?  If so, we will have to go back and normalise
4764     // CR or CR/LF line ends.
4765     //
4766     private boolean sawCR;
4767 
4768     //
4769     // Utility flag: are we in CDATA?  If so, whitespace isn't ignorable.
4770     // 
4771     private boolean inCDATA;
4772 }
4773