1
2
3
4
5
6
7
8
9
10 package org.dom4j.io;
11
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.Serializable;
import java.net.URL;

import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.DocumentFactory;
import org.dom4j.ElementHandler;
import org.xml.sax.EntityResolver;
import org.xml.sax.ErrorHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.XMLFilter;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;
33
34 /*** <p><code>SAXReader</code> creates a DOM4J tree from SAX parsing events.</p>
35 *
36 * <p>The actual SAX parser that is used by this class is configurable
37 * so you can use your favourite SAX parser if you wish. DOM4J comes
38 * configured with its own SAX parser so you do not need to worry about
39 * configuring the SAX parser.</p>
40 *
41 * <p>To explicitly configure the SAX parser that is used via Java code you
42 * can use a constructor or use the
43 * {@link #setXMLReader(XMLReader)} or
44 * {@link #setXMLReaderClassName(String)} methods.</p>
45 *
46 * <p>If the parser is not specified explicitly then the standard SAX
47 * policy of using the <code>org.xml.sax.driver</code> system property is
48 * used to determine the implementation class of {@link XMLReader}.</p>
49 *
50 * <p>If the <code>org.xml.sax.driver</code> system property is not defined
51 * then JAXP is used via reflection (so that DOM4J is not explicitly dependent
52 * on the JAXP classes) to load the JAXP configured SAXParser.
53 * If there is any error creating a JAXP SAXParser an informational message is
54 * output and then the default (Aelfred) SAX parser is used instead.</p>
55 *
56 * <p>If you are trying to use JAXP to explicitly set your SAX parser
57 * and are experiencing problems, you can turn on verbose error reporting
58 * by defining the system property <code>org.dom4j.verbose</code> to be "true"
59 * which will output a more detailed description of why JAXP could not find a
60 * SAX parser</p>
61 *
62 * <p>
63 * For more information on JAXP please go to
64 * <a href="http://java.sun.com/xml/">Sun's Java & XML site</a></p>
65 *
66 * @author <a href="mailto:james.strachan@metastuff.com">James Strachan</a>
67 * @version $Revision: 1.55 $
68 */
69 public class SAXReader {
70
71 /*** <code>DocumentFactory</code> used to create new document objects */
72 private DocumentFactory factory;
73
74 /*** <code>XMLReader</code> used to parse the SAX events */
75 private XMLReader xmlReader;
76
77 /*** Whether validation should occur */
78 private boolean validating;
79
80 /*** DispatchHandler to call when each <code>Element</code> is encountered */
81 private DispatchHandler dispatchHandler;
82
83 /*** ErrorHandler class to use */
84 private ErrorHandler errorHandler;
85
86 /*** The entity resolver */
87 private EntityResolver entityResolver;
88
89 /*** Should element & attribute names and namespace URIs be interned? */
90 private boolean stringInternEnabled = true;
91
92 /*** Should internal DTD declarations be expanded into a List in the DTD */
93 private boolean includeInternalDTDDeclarations = false;
94
95 /*** Should external DTD declarations be expanded into a List in the DTD */
96 private boolean includeExternalDTDDeclarations = false;
97
98 /*** Whether adjacent text nodes should be merged */
99 private boolean mergeAdjacentText = false;
100
101 /*** Holds value of property stripWhitespaceText. */
102 private boolean stripWhitespaceText = false;
103
104 /*** Should we ignore comments */
105 private boolean ignoreComments = false;
106
107
108
109
110
111 /*** The SAX filter used to filter SAX events */
112 private XMLFilter xmlFilter;
113
114
115 public SAXReader() {
116 }
117
118 public SAXReader(boolean validating) {
119 this.validating = validating;
120 }
121
122 public SAXReader(DocumentFactory factory) {
123 this.factory = factory;
124 }
125
126 public SAXReader(DocumentFactory factory, boolean validating) {
127 this.factory = factory;
128 this.validating = validating;
129 }
130
131 public SAXReader(XMLReader xmlReader) {
132 this.xmlReader = xmlReader;
133 }
134
135 public SAXReader(XMLReader xmlReader, boolean validating) {
136 this.xmlReader = xmlReader;
137 this.validating = validating;
138 }
139
140 public SAXReader(String xmlReaderClassName) throws SAXException {
141 if (xmlReaderClassName != null) {
142 this.xmlReader = XMLReaderFactory.createXMLReader(xmlReaderClassName);
143 }
144 }
145
146 public SAXReader(String xmlReaderClassName, boolean validating) throws SAXException {
147 if (xmlReaderClassName != null) {
148 this.xmlReader = XMLReaderFactory.createXMLReader(xmlReaderClassName);
149 }
150 this.validating = validating;
151 }
152
153
154
155 /*** Allows a SAX property to be set on the underlying SAX parser.
156 * This can be useful to set parser-specific properties
157 * such as the location of schema or DTD resources.
158 * Though use this method with caution as it has the possibility
159 * of breaking the standard behaviour.
160 * An alternative to calling this method is to correctly configure an
161 * XMLReader object instance and call the {@link #setXMLReader(XMLReader)} method
162 *
163 * @param name is the SAX property name
164 * @param value is the value of the SAX property
165 * @throws SAXException if the XMLReader could not be created or
166 * the property could not be changed.
167 */
168 public void setProperty(String name, Object value) throws SAXException {
169 getXMLReader().setProperty(name, value);
170 }
171
172
173 /*** Sets a SAX feature on the underlying SAX parser.
174 * This can be useful to set parser-specific features.
175 * Though use this method with caution as it has the possibility
176 * of breaking the standard behaviour.
177 * An alternative to calling this method is to correctly configure an
178 * XMLReader object instance and call the {@link #setXMLReader(XMLReader)} method
179 *
180 * @param name is the SAX feature name
181 * @param value is the value of the SAX feature
182 * @throws SAXException if the XMLReader could not be created or
183 * the feature could not be changed.
184 */
185 public void setFeature(String name, boolean value) throws SAXException {
186 getXMLReader().setFeature(name, value);
187 }
188
189
190 /*** <p>Reads a Document from the given <code>File</code></p>
191 *
192 * @param file is the <code>File</code> to read from.
193 * @return the newly created Document instance
194 * @throws DocumentException if an error occurs during parsing.
195 */
196 public Document read(File file) throws DocumentException {
197 try {
198
199
200
201
202
203
204
205
206 InputSource source = new InputSource(new FileInputStream(file));
207 String path = file.getAbsolutePath();
208 if (path != null) {
209
210
211 StringBuffer sb = new StringBuffer("file://");
212
213 if (!path.startsWith(File.separator)) {
214 sb.append("/");
215 }
216
217 path = path.replace('//', '/');
218 sb.append(path);
219
220 source.setSystemId(sb.toString());
221 }
222 return read(source);
223 } catch (FileNotFoundException e) {
224 throw new DocumentException(e.getMessage(), e);
225 }
226 }
227
228 /*** <p>Reads a Document from the given <code>URL</code> using SAX</p>
229 *
230 * @param url <code>URL</code> to read from.
231 * @return the newly created Document instance
232 * @throws DocumentException if an error occurs during parsing.
233 */
234 public Document read(URL url) throws DocumentException {
235 String systemID = url.toExternalForm();
236 return read(new InputSource(systemID));
237 }
238
239 /*** <p>Reads a Document from the given URL or filename using SAX.</p>
240 *
241 * <p>
242 * If the systemId contains a <code>':'</code> character then it is
243 * assumed to be a URL otherwise its assumed to be a file name.
244 * If you want finer grained control over this mechansim then please
245 * explicitly pass in either a {@link URL} or a {@link File} instance
246 * instead of a {@link String} to denote the source of the document.
247 * </p>
248 *
249 * @param systemId is a URL for a document or a file name.
250 * @return the newly created Document instance
251 * @throws DocumentException if an error occurs during parsing.
252 */
253 public Document read(String systemId) throws DocumentException {
254 return read(new InputSource(systemId));
255 }
256
257 /*** <p>Reads a Document from the given stream using SAX</p>
258 *
259 * @param in <code>InputStream</code> to read from.
260 * @return the newly created Document instance
261 * @throws DocumentException if an error occurs during parsing.
262 */
263 public Document read(InputStream in) throws DocumentException {
264 return read(new InputSource(in));
265 }
266
267 /*** <p>Reads a Document from the given <code>Reader</code> using SAX</p>
268 *
269 * @param reader is the reader for the input
270 * @return the newly created Document instance
271 * @throws DocumentException if an error occurs during parsing.
272 */
273 public Document read(Reader reader) throws DocumentException {
274 return read(new InputSource(reader));
275 }
276
277 /*** <p>Reads a Document from the given stream using SAX</p>
278 *
279 * @param in <code>InputStream</code> to read from.
280 * @param systemId is the URI for the input
281 * @return the newly created Document instance
282 * @throws DocumentException if an error occurs during parsing.
283 */
284 public Document read(InputStream in, String systemId) throws DocumentException {
285 InputSource source = new InputSource(in);
286 source.setSystemId(systemId);
287 return read(source);
288 }
289
290 /*** <p>Reads a Document from the given <code>Reader</code> using SAX</p>
291 *
292 * @param reader is the reader for the input
293 * @param systemId is the URI for the input
294 * @return the newly created Document instance
295 * @throws DocumentException if an error occurs during parsing.
296 */
297 public Document read(Reader reader, String systemId) throws DocumentException {
298 InputSource source = new InputSource(reader);
299 source.setSystemId(systemId);
300 return read(source);
301 }
302
303 /*** <p>Reads a Document from the given <code>InputSource</code> using SAX</p>
304 *
305 * @param in <code>InputSource</code> to read from.
306 * @return the newly created Document instance
307 * @throws DocumentException if an error occurs during parsing.
308 */
309 public Document read(InputSource in) throws DocumentException {
310 try {
311 XMLReader xmlReader = getXMLReader();
312
313 xmlReader = installXMLFilter(xmlReader);
314
315 EntityResolver thatEntityResolver = this.entityResolver;
316 if (thatEntityResolver==null) {
317 thatEntityResolver = createDefaultEntityResolver( in.getSystemId() );
318 this.entityResolver=thatEntityResolver;
319 }
320 xmlReader.setEntityResolver( thatEntityResolver );
321
322 SAXContentHandler contentHandler = createContentHandler(xmlReader);
323 contentHandler.setEntityResolver( thatEntityResolver );
324 contentHandler.setInputSource( in );
325 contentHandler.setIncludeInternalDTDDeclarations( isIncludeInternalDTDDeclarations() );
326 contentHandler.setIncludeExternalDTDDeclarations( isIncludeExternalDTDDeclarations() );
327 contentHandler.setMergeAdjacentText( isMergeAdjacentText() );
328 contentHandler.setStripWhitespaceText( isStripWhitespaceText() );
329 contentHandler.setIgnoreComments( isIgnoreComments() );
330 xmlReader.setContentHandler(contentHandler);
331
332 configureReader(xmlReader, contentHandler);
333
334 xmlReader.parse(in);
335 return contentHandler.getDocument();
336 }
337 catch (Exception e) {
338 if (e instanceof SAXParseException) {
339
340 SAXParseException parseException = (SAXParseException) e;
341 String systemId = parseException.getSystemId();
342 if ( systemId == null ) {
343 systemId = "";
344 }
345 String message = "Error on line "
346 + parseException.getLineNumber()
347 + " of document " + systemId
348 + " : " + parseException.getMessage();
349
350 throw new DocumentException(message, e);
351 }
352 else {
353 throw new DocumentException(e.getMessage(), e);
354 }
355 }
356 }
357
358
359
360
361
362
363 /*** @return the validation mode, true if validating will be done
364 * otherwise false.
365 */
366 public boolean isValidating() {
367 return validating;
368 }
369
370 /*** Sets the validation mode.
371 *
372 * @param validating indicates whether or not validation should occur.
373 */
374 public void setValidation(boolean validating) {
375 this.validating = validating;
376 }
377
378 /*** @return whether internal DTD declarations should be expanded into the DocumentType
379 * object or not.
380 */
381 public boolean isIncludeInternalDTDDeclarations() {
382 return includeInternalDTDDeclarations;
383 }
384
385 /*** Sets whether internal DTD declarations should be expanded into the DocumentType
386 * object or not.
387 *
388 * @param includeInternalDTDDeclarations whether or not DTD declarations should be expanded
389 * and included into the DocumentType object.
390 */
391 public void setIncludeInternalDTDDeclarations(boolean includeInternalDTDDeclarations) {
392 this.includeInternalDTDDeclarations = includeInternalDTDDeclarations;
393 }
394
395 /*** @return whether external DTD declarations should be expanded into the DocumentType
396 * object or not.
397 */
398 public boolean isIncludeExternalDTDDeclarations() {
399 return includeExternalDTDDeclarations;
400 }
401
402 /*** Sets whether DTD external declarations should be expanded into the DocumentType
403 * object or not.
404 *
405 * @param includeExternalDTDDeclarations whether or not DTD declarations should be expanded
406 * and included into the DocumentType object.
407 */
408 public void setIncludeExternalDTDDeclarations(boolean includeExternalDTDDeclarations) {
409 this.includeExternalDTDDeclarations = includeExternalDTDDeclarations;
410 }
411
412 /*** Sets whether String interning
413 * is enabled or disabled for element & attribute names and namespace URIs.
414 * This proprety is enabled by default.
415 */
416 public boolean isStringInternEnabled() {
417 return stringInternEnabled;
418 }
419
420 /*** Sets whether String interning
421 * is enabled or disabled for element & attribute names and namespace URIs
422 */
423 public void setStringInternEnabled(boolean stringInternEnabled) {
424 this.stringInternEnabled = stringInternEnabled;
425 }
426
427 /*** Returns whether adjacent text nodes should be merged together.
428 * @return Value of property mergeAdjacentText.
429 */
430 public boolean isMergeAdjacentText() {
431 return mergeAdjacentText;
432 }
433
434 /*** Sets whether or not adjacent text nodes should be merged
435 * together when parsing.
436 * @param mergeAdjacentText New value of property mergeAdjacentText.
437 */
438 public void setMergeAdjacentText(boolean mergeAdjacentText) {
439 this.mergeAdjacentText = mergeAdjacentText;
440 }
441
442 /*** Sets whether whitespace between element start and end tags should be ignored
443 *
444 * @return Value of property stripWhitespaceText.
445 */
446 public boolean isStripWhitespaceText() {
447 return stripWhitespaceText;
448 }
449
450 /*** Sets whether whitespace between element start and end tags should be ignored.
451 *
452 * @param stripWhitespaceText New value of property stripWhitespaceText.
453 */
454 public void setStripWhitespaceText(boolean stripWhitespaceText) {
455 this.stripWhitespaceText = stripWhitespaceText;
456 }
457
458 /***
459 * Returns whether we should ignore comments or not.
460 * @return boolean
461 */
462 public boolean isIgnoreComments() {
463 return ignoreComments;
464 }
465
466 /***
467 * Sets whether we should ignore comments or not.
468 * @param ignoreComments whether we should ignore comments or not.
469 */
470 public void setIgnoreComments(boolean ignoreComments) {
471 this.ignoreComments = ignoreComments;
472 }
473
474
475 /*** @return the <code>DocumentFactory</code> used to create document objects
476 */
477 public DocumentFactory getDocumentFactory() {
478 if (factory == null) {
479 factory = DocumentFactory.getInstance();
480 }
481 return factory;
482 }
483
484 /*** <p>This sets the <code>DocumentFactory</code> used to create new documents.
485 * This method allows the building of custom DOM4J tree objects to be implemented
486 * easily using a custom derivation of {@link DocumentFactory}</p>
487 *
488 * @param factory <code>DocumentFactory</code> used to create DOM4J objects
489 */
490 public void setDocumentFactory(DocumentFactory factory) {
491 this.factory = factory;
492 }
493
494 /*** @return the <code>ErrorHandler</code> used by SAX
495 */
496 public ErrorHandler getErrorHandler() {
497 return errorHandler;
498 }
499
500 /*** Sets the <code>ErrorHandler</code> used by the SAX
501 * <code>XMLReader</code>.
502 *
503 * @param errorHandler is the <code>ErrorHandler</code> used by SAX
504 */
505 public void setErrorHandler(ErrorHandler errorHandler) {
506 this.errorHandler = errorHandler;
507 }
508
509 /*** Returns the current entity resolver used to resolve entities
510 */
511 public EntityResolver getEntityResolver() {
512 return entityResolver;
513 }
514
515 /*** Sets the entity resolver used to resolve entities.
516 */
517 public void setEntityResolver(EntityResolver entityResolver) {
518 this.entityResolver = entityResolver;
519 }
520
521 /*** @return the <code>XMLReader</code> used to parse SAX events
522 */
523 public XMLReader getXMLReader() throws SAXException {
524 if (xmlReader == null) {
525 xmlReader = createXMLReader();
526 }
527 return xmlReader;
528 }
529
530 /*** Sets the <code>XMLReader</code> used to parse SAX events
531 *
532 * @param xmlReader is the <code>XMLReader</code> to parse SAX events
533 */
534 public void setXMLReader(XMLReader xmlReader) {
535 this.xmlReader = xmlReader;
536 }
537
538 /*** Sets the class name of the <code>XMLReader</code> to be used
539 * to parse SAX events.
540 *
541 * @param xmlReaderClassName is the class name of the <code>XMLReader</code>
542 * to parse SAX events
543 */
544 public void setXMLReaderClassName(String xmlReaderClassName) throws SAXException {
545 setXMLReader( XMLReaderFactory.createXMLReader(xmlReaderClassName) );
546 }
547
548
549 /*** Adds the <code>ElementHandler</code> to be called when the
550 * specified path is encounted.
551 *
552 * @param path is the path to be handled
553 * @param handler is the <code>ElementHandler</code> to be called
554 * by the event based processor.
555 */
556 public void addHandler(String path, ElementHandler handler) {
557 getDispatchHandler().addHandler(path, handler);
558 }
559
560 /*** Removes the <code>ElementHandler</code> from the event based
561 * processor, for the specified path.
562 *
563 * @param path is the path to remove the <code>ElementHandler</code> for.
564 */
565 public void removeHandler(String path) {
566 getDispatchHandler().removeHandler(path);
567 }
568
569 /*** When multiple <code>ElementHandler</code> instances have been
570 * registered, this will set a default <code>ElementHandler</code>
571 * to be called for any path which does <b>NOT</b> have a handler
572 * registered.
573 * @param handler is the <code>ElementHandler</code> to be called
574 * by the event based processor.
575 */
576 public void setDefaultHandler(ElementHandler handler) {
577 getDispatchHandler().setDefaultHandler(handler);
578 }
579
580 /***
581 * This method clears out all the existing handlers and default handler
582 * setting things back as if no handler existed. Useful when reusing an
583 * object instance.
584 */
585 public void resetHandlers() {
586 getDispatchHandler().resetHandlers();
587 }
588
589 /*** Returns the SAX filter being used to filter SAX events.
590 *
591 * @return the SAX filter being used or null if no SAX filter is installed
592 */
593 public XMLFilter getXMLFilter() {
594 return xmlFilter;
595 }
596
597 /*** Sets the SAX filter to be used when filtering SAX events
598 *
599 * @param xmlFilter is the SAX filter to use or null to disable filtering
600 */
601 public void setXMLFilter(XMLFilter xmlFilter) {
602 this.xmlFilter = xmlFilter;
603 }
604
605
606
607
608 /*** Installs any XMLFilter objects required to allow the SAX event stream
609 * to be filtered and preprocessed before it gets to dom4j.
610 *
611 * @return the new XMLFilter if applicable or the original XMLReader if no
612 * filter is being used.
613 */
614 protected XMLReader installXMLFilter(XMLReader xmlReader) {
615 XMLFilter xmlFilter = getXMLFilter();
616 if ( xmlFilter != null ) {
617
618 XMLFilter root = xmlFilter;
619 while (true) {
620 XMLReader parent = root.getParent();
621 if ( parent instanceof XMLFilter ) {
622 root = (XMLFilter) parent;
623 }
624 else {
625 break;
626 }
627 }
628 root.setParent(xmlReader);
629 return xmlFilter;
630 }
631 return xmlReader;
632 }
633
634
635 protected DispatchHandler getDispatchHandler() {
636 if (dispatchHandler == null) {
637 dispatchHandler = new DispatchHandler();
638 }
639 return dispatchHandler;
640 }
641
642 protected void setDispatchHandler(DispatchHandler dispatchHandler) {
643 this.dispatchHandler = dispatchHandler;
644 }
645
646 /*** Factory Method to allow alternate methods of
647 * creating and configuring XMLReader objects
648 */
649 protected XMLReader createXMLReader() throws SAXException {
650 return SAXHelper.createXMLReader( isValidating() );
651 }
652
653 /*** Configures the XMLReader before use */
654 protected void configureReader(XMLReader reader, DefaultHandler contentHandler) throws DocumentException {
655
656 SAXHelper.setParserProperty(
657 reader,
658 "http://xml.org/sax/handlers/LexicalHandler",
659 contentHandler
660 );
661
662
663 SAXHelper.setParserProperty(
664 reader,
665 "http://xml.org/sax/properties/lexical-handler",
666 contentHandler
667 );
668
669
670 if ( includeInternalDTDDeclarations || includeExternalDTDDeclarations ) {
671 SAXHelper.setParserProperty(
672 reader,
673 "http://xml.org/sax/properties/declaration-handler",
674 contentHandler
675 );
676 }
677
678
679 SAXHelper.setParserFeature(
680 reader,
681 "http://xml.org/sax/features/namespaces",
682 true
683 );
684
685 SAXHelper.setParserFeature(
686 reader,
687 "http://xml.org/sax/features/namespace-prefixes",
688 false
689 );
690
691
692 SAXHelper.setParserFeature(
693 reader,
694 "http://xml.org/sax/features/string-interning",
695 isStringInternEnabled()
696 );
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712 SAXHelper.setParserFeature(
713 reader,
714 "http://xml.org/sax/features/use-locator2",
715 true
716 );
717
718 try {
719
720 reader.setFeature(
721 "http://xml.org/sax/features/validation",
722 isValidating()
723 );
724 if (errorHandler != null) {
725 reader.setErrorHandler(errorHandler);
726 }
727 else {
728 reader.setErrorHandler(contentHandler);
729 }
730 }
731 catch (Exception e) {
732 if (isValidating()) {
733 throw new DocumentException(
734 "Validation not supported for XMLReader: " + reader,
735 e
736 );
737 }
738
739 }
740 }
741
742 /*** Factory Method to allow user derived SAXContentHandler objects to be used
743 */
744 protected SAXContentHandler createContentHandler(XMLReader reader) {
745 return new SAXContentHandler(
746 getDocumentFactory(), dispatchHandler
747 );
748 }
749
750 protected EntityResolver createDefaultEntityResolver( String documentSystemId ) {
751 String prefix = null;
752 if ( documentSystemId != null && documentSystemId.length() > 0 ) {
753 int idx = documentSystemId.lastIndexOf( '/' );
754 if ( idx > 0 ) {
755 prefix = documentSystemId.substring(0, idx+1);
756
757 }
758 }
759 return new SAXEntityResolver(prefix);
760 }
761
762 protected static class SAXEntityResolver implements EntityResolver, Serializable {
763 String uriPrefix;
764
765 public SAXEntityResolver(String uriPrefix) {
766 this.uriPrefix = uriPrefix;
767 }
768
769 public InputSource resolveEntity(String publicId, String systemId) {
770
771 if ( systemId != null && systemId.length() > 0 ) {
772 if ( uriPrefix != null && systemId.indexOf( ':' ) <= 0 ) {
773 systemId = uriPrefix + systemId;
774 }
775 }
776 return new InputSource(systemId);
777 }
778 }
779
780
781
782 }
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830