1
2
3
4
5
6
7
8
9
10 package org.dom4j.io;
11
12 import java.io.IOException;
13 import java.io.OutputStream;
14 import java.io.StringWriter;
15 import java.io.UnsupportedEncodingException;
16 import java.io.Writer;
17 import java.util.HashSet;
18 import java.util.Iterator;
19 import java.util.Set;
20
21 import org.dom4j.Document;
22 import org.dom4j.DocumentHelper;
23 import org.dom4j.Element;
24 import org.dom4j.Entity;
25 import org.dom4j.Node;
26 import org.xml.sax.SAXException;
27
28 /*** <p><code>HTMLWriter</code> takes a DOM4J tree and formats it to a
29 * stream as HTML.
30 * This formatter is similar to XMLWriter but it outputs the text of CDATA
31 * and Entity sections rather than the serialised format as in XML,
32 * it has an XHTML mode, it retains whitespace in certain elements such as <PRE>,
33 * and it supports certain elements which have no corresponding close tag such
34 * as for <BR> and <P>.
35 *
36 * <p> The OutputFormat passed in to the constructor is checked for isXHTML() and isExpandEmptyElements().
37 * See {@link OutputFormat OutputFormat} for details. Here are the rules for
38 * <b>this class</b> based on an OutputFormat, "format", passed in to the constructor:<br/><br/>
39 * <ul>
40 * <li>If an element is in {@link #getOmitElementCloseSet() getOmitElementCloseSet}, then it is treated specially:</li>
41 * <ul>
42 * <li>It never expands, since some browsers treat this as two separate Horizontal Rules: <HR></HR></li>
43 * <li>If {@link org.dom4j.io.OutputFormat#isXHTML() format.isXHTML()}, then it has a space before the closing single-tag slash, since Netscape 4.x- treats this: <HR /> as
44 * an element named "HR" with an attribute named "/", but that's better than when it refuses to recognize this: <hr/>
45 * which it thinks is an element named "HR/". </li>
46 * </ul>
47 * <li>If {@link org.dom4j.io.OutputFormat#isXHTML() format.isXHTML()}, all elements must have
48 * either a close element, or be a closed single tag.</li>
49 * <li>If {@link org.dom4j.io.OutputFormat#isExpandEmptyElements() format.isExpandEmptyElements()}() is true,
50 * all elements are expanded except as above.</li>
51 * </ul>
52 * <b>Examples</b>
53 *
54 * <table border="1" cellpadding="0" cellspacing="0">
55 * <tr>
56 * <th colspan="3" align="left">isXHTML == true</th>
57 * </tr>
58 * <tr>
59 * <td width="25"> </td>
60 * <th align="left">isExpandEmptyElements == true</th>
61 * <td><code>
62 * <td></td><br />
63 * <br /><br />
64 * <foo></foo></code>
65 * </td>
66 * </tr>
67 * <tr>
68 * <td width="25"> </td>
69 * <th align="left">isExpandEmptyElements == false</th>
70 * <td><code>
71 * <td/><br />
72 * <br /><br />
73 * <foo/></code>
74 * </td>
75 * </tr>
76 * <tr>
77 * <th colspan="3" align="left">isXHTML == false</th>
78 * </tr>
79 * <tr>
80 * <td width="25"> </td>
81 * <th align="left">isExpandEmptyElements == true</th>
82 * <td><code>
83 * <td></td><br />
84 * <br><br />
85 * <foo></foo></code>
86 * </td>
87 * </tr>
88 * <tr>
89 * <td width="25"> </td>
90 * <th align="left">isExpandEmptyElements == false</th>
91 * <td><code>
92 * <td/><br />
93 * <br><br />
94 * <foo/></code>
95 * </td>
96 * </tr>
97 * </table>
98 * <p>
99 * <p>
100 * If isXHTML == true, CDATA sections look like this:
101 * <PRE>
102 * <b><myelement><![CDATA[My data]]></myelement></b>
103 * </PRE>
104 * Otherwise, they look like this:
105 * <PRE>
106 * <b><myelement>My data</myelement></b>
107 * </PRE>
108 * </p>
109 *
110 * Basically, {@link org.dom4j.io.OutputFormat#isXHTML() OutputFormat.isXHTML()} == true will produce valid XML,
111 * while {@link org.dom4j.io.OutputFormat#isExpandEmptyElements() format.isExpandEmptyElements()}
112 * determines whether empty elements are expanded
113 * if isXHTML is true, excepting the special HTML single tags.
114 * </p>
115 *
116 *
117 * <p>Also, HTMLWriter handles tags whose contents should be preformatted, that is, whitespace-preserved.
118 * By default, this set includes the tags <PRE>, <SCRIPT>, <STYLE>, and <TEXTAREA>, case insensitively.
119 * It does not include <IFRAME>.
120 * Other tags, such as <CODE>, <KBD>, <TT>, <VAR>, are usually rendered in a different font in most browsers,
121 * but don't preserve whitespace, so they also don't appear in the default list. HTML Comments
122 * are always whitespace-preserved. However, the parser you use may store comments with linefeed-only
123 * text nodes (\n) even if your platform uses another line.separator character, and HTMLWriter outputs
124 * Comment nodes exactly as the DOM is set up by the parser.
125 * See examples and discussion here: {@link #setPreformattedTags(java.util.Set) setPreformattedTags}</p>
126 *
127 * <p><b>Examples</b></p>
128 * <blockquote>
129 * <p><b>Pretty Printing</b></p>
130 * <p>This example shows how to pretty print a string containing a valid HTML document to a string.
131 * You can also just call the static methods of this class:<br/>
132 * {@link #prettyPrintHTML(String) prettyPrintHTML(String)}
133 * or<br/>
134 * {@link #prettyPrintHTML(String,boolean,boolean,boolean,boolean) prettyPrintHTML(String,boolean,boolean,boolean,boolean)}
135 * or, <br/>
136 * {@link #prettyPrintXHTML(String) prettyPrintXHTML(String)} for XHTML (note the X)
137 * </p>
138 * <pre>
139 * String testPrettyPrint(String html){
140 * StringWriter sw = new StringWriter();
141 * org.dom4j.io.OutputFormat format = org.dom4j.io.OutputFormat.createPrettyPrint();
142 * <font color='green'>//These are the default formats from createPrettyPrint, so you needn't set them:</font>
143 * <font color='green'>// format.setNewlines(true);</font>
144 * <font color='green'>// format.setTrimText(true);</font>
145 * format.setXHTML(true); <font color='green'>//Default is false, this produces XHTML</font>
146 * org.dom4j.io.HTMLWriter writer = new org.dom4j.io.HTMLWriter(sw, format);
147 * org.dom4j.Document document = org.dom4j.DocumentHelper.parseText(html);
148 * writer.write(document);
149 * writer.flush();
150 * return sw.toString();
151 * }
152 * </pre>
153 *
154 * <p>This example shows how to create a "squeezed" document, but one that will work in browsers
155 * even if the browser line length is limited. No newlines are included, no extra whitespace
156 * at all, except where it it required by {@link #setPreformattedTags(java.util.Set) setPreformattedTags}.
157 * </p>
158 * <pre>
159 * String testCrunch(String html){
160 * StringWriter sw = new StringWriter();
161 * org.dom4j.io.OutputFormat format = org.dom4j.io.OutputFormat.createPrettyPrint();
162 * format.setNewlines(false);
163 * format.setTrimText(true);
164 * format.setIndent("");
165 * format.setXHTML(true);
166 * format.setExpandEmptyElements(false);
167 * format.setNewLineAfterNTags(20); <font color='green'>//print a line every so often.</font>
168 * org.dom4j.io.HTMLWriter writer = new org.dom4j.io.HTMLWriter(sw, format);
169 * org.dom4j.Document document = org.dom4j.DocumentHelper.parseText(html);
170 * writer.write(document);
171 * writer.flush();
172 * return sw.toString();
173 * }
174 * </pre>
175 *
176 * </blockquote>
177 *
178 * </p>
179 *
180 * @author <a href="mailto:james.strachan@metastuff.com">James Strachan</a> (james.strachan@metastuff.com)
181 * @author Laramie Crocker
182 * @version $Revision: 1.19 $
183 */
184 public class HTMLWriter extends XMLWriter {
185
186 public HTMLWriter(Writer writer) {
187 super( writer, defaultHtmlFormat );
188 }
189
190 public HTMLWriter(Writer writer, OutputFormat format) {
191 super( writer, format );
192 }
193
194 public HTMLWriter() throws UnsupportedEncodingException {
195 super( defaultHtmlFormat );
196 }
197
198 public HTMLWriter(OutputFormat format) throws UnsupportedEncodingException {
199 super( format );
200 }
201
202 public HTMLWriter(OutputStream out) throws UnsupportedEncodingException {
203 super( out, defaultHtmlFormat );
204 }
205
206 public HTMLWriter(OutputStream out, OutputFormat format) throws UnsupportedEncodingException {
207 super( out, format );
208 }
209
210
211
212 private class FormatState {
213 public FormatState(boolean newLines, boolean trimText, String indent){
214 this.m_Newlines = newLines;
215 this.m_TrimText = trimText;
216 this.m_indent = indent;
217 }
218 private boolean m_Newlines = false;
219 public boolean isNewlines(){return m_Newlines;}
220 private boolean m_TrimText = false;
221 public boolean isTrimText(){return m_TrimText;}
222 private String m_indent = "";
223 public String getIndent(){return m_indent;}
224 }
225
226
227
228 private java.util.Stack m_formatStack = new java.util.Stack();
229
230 private static String m_lineSeparator = System.getProperty("line.separator");
231
232 private String m_lastText = "";
233
234 private int m_tagsOuput = 0;
235
236 private int m_newLineAfterNTags = -1;
237
238 protected static final HashSet defaultPreformattedTags;
239
240 static {
241
242
243 defaultPreformattedTags = new HashSet();
244 defaultPreformattedTags.add("PRE");
245 defaultPreformattedTags.add("SCRIPT");
246 defaultPreformattedTags.add("STYLE");
247 defaultPreformattedTags.add("TEXTAREA");
248 }
249
250 private HashSet preformattedTags = defaultPreformattedTags;
251
252 protected static final OutputFormat defaultHtmlFormat;
253
254 static {
255 defaultHtmlFormat = new OutputFormat( " ", true );
256 defaultHtmlFormat.setTrimText( true );
257 defaultHtmlFormat.setSuppressDeclaration( true );
258 }
259
260 /*** Used to store the qualified element names which
261 * should have no close element tag
262 */
263 private HashSet omitElementCloseSet;
264
265 public void startCDATA() throws SAXException {
266 }
267
268 public void endCDATA() throws SAXException {
269 }
270
271
272
273
274 protected void writeCDATA(String text) throws IOException {
275
276
277 if ( getOutputFormat().isXHTML() ) {
278 super.writeCDATA(text);
279 } else {
280 writer.write( text );
281 }
282 lastOutputNodeType = Node.CDATA_SECTION_NODE;
283 }
284
285 protected void writeEntity(Entity entity) throws IOException {
286 writer.write(entity.getText());
287 lastOutputNodeType = Node.ENTITY_REFERENCE_NODE;
288 }
289
290 protected void writeDeclaration() throws IOException {
291 }
292
293 protected void writeString(String text) throws IOException {
294
295
296
297
298
299
300
301
302 if ( text.equals("\n")){
303 if ( ! m_formatStack.empty() ) {
304 super.writeString(m_lineSeparator);
305 }
306 return;
307 }
308 m_lastText = text;
309 if ( m_formatStack.empty() ) {
310 super.writeString(text.trim());
311 } else {
312 super.writeString(text);
313 }
314 }
315
316 /*** Overriden method to not close certain element names to avoid
317 * wierd behaviour from browsers for versions up to 5.x
318 */
319 protected void writeClose(String qualifiedName) throws IOException {
320 if ( ! omitElementClose( qualifiedName ) ) {
321 super.writeClose(qualifiedName);
322 }
323 }
324
325 protected void writeEmptyElementClose(String qualifiedName) throws IOException {
326 if (getOutputFormat().isXHTML()){
327
328 if ( omitElementClose(qualifiedName) ) {
329
330
331
332 writer.write(" />");
333 } else {
334 super.writeEmptyElementClose(qualifiedName);
335 }
336 } else {
337
338 if ( omitElementClose(qualifiedName) ) {
339
340 writer.write(">");
341 } else {
342
343 super.writeEmptyElementClose(qualifiedName);
344 }
345 }
346 }
347
348 protected boolean omitElementClose( String qualifiedName ) {
349 return internalGetOmitElementCloseSet().contains( qualifiedName.toUpperCase() );
350 }
351
352 private HashSet internalGetOmitElementCloseSet() {
353 if (omitElementCloseSet == null) {
354 omitElementCloseSet = new HashSet();
355 loadOmitElementCloseSet(omitElementCloseSet);
356 }
357 return omitElementCloseSet;
358 }
359
360
361 protected void loadOmitElementCloseSet(Set set) {
362 set.add( "AREA" );
363 set.add( "BASE" );
364 set.add( "BR" );
365 set.add( "COL" );
366 set.add( "HR" );
367 set.add( "IMG" );
368 set.add( "INPUT" );
369 set.add( "LINK" );
370 set.add( "META" );
371 set.add( "P" );
372 set.add( "PARAM" );
373 }
374
375
376 /*** A clone of the Set of elements that can have their close-tags omitted. By default it
377 * should be
378 * "AREA",
379 * "BASE",
380 * "BR",
381 * "COL",
382 * "HR",
383 * "IMG",
384 * "INPUT",
385 * "LINK",
386 * "META",
387 * "P",
388 * "PARAM"
389 * @return A clone of the Set.
390 */
391 public Set getOmitElementCloseSet(){
392 return (Set)(internalGetOmitElementCloseSet().clone());
393 }
394
395 /*** To use the empty set, pass an empty Set, or null:
396 * <pre>
397 * setOmitElementCloseSet(new HashSet());
398 * or
399 * setOmitElementCloseSet(null);
400 * </pre>
401 */
402 public void setOmitElementCloseSet(Set newSet){
403 omitElementCloseSet = new HashSet();
404 if (newSet != null){
405 omitElementCloseSet = new HashSet();
406 Object aTag;
407 Iterator iter = newSet.iterator();
408 while ( iter.hasNext() ) {
409 aTag = iter.next();
410 if (aTag != null){
411 omitElementCloseSet.add(aTag.toString().toUpperCase());
412 }
413 }
414
415 }
416 }
417
418 /*** @see #setPreformattedTags(java.util.Set) setPreformattedTags
419 */
420 public Set getPreformattedTags(){
421 return (Set)(preformattedTags.clone());
422 }
423
424 /***
425 * <p>Override the default set, which includes PRE, SCRIPT, STYLE, and TEXTAREA, case insensitively.</p>
426 *
427 * <p><b>Setting Preformatted Tags</b></p>
428 *
429 *
430 * <p>Pass in a Set of Strings, one for each tag name that should be treated like a PRE tag.
431 * You may pass in null or an empty Set to assign the empty set, in which case no tags
432 * will be treated as preformatted, except that HTML Comments will continue to be preformatted.
433 * If a tag is included in the set of preformatted tags, all whitespace within the tag will be preserved,
434 * including whitespace on the same line preceding the close tag. This will generally make the close tag
435 * not line up with the start tag, but it preserves the intention of the whitespace within the tag.
436 * </p>
437 * <p>The browser considers leading whitespace before the close tag to be significant,
438 * but leading whitespace before the open tag to be insignificant.
439 * For example, if the HTML author doesn't put the close TEXTAREA tag flush to the left margin,
440 * then the TEXTAREA control in the browser will have spaces on the last line inside the control. This may be
441 * the HTML author's intent. Similarly, in a PRE, the browser treats a flushed left close PRE tag as different from
442 * a close tag with leading whitespace. Again, this must be left up to the HTML author.</p>
443 *
444 * <p><b>Examples</b></p>
445 * <blockquote>
446 * <p>
447 * Here is an example of how you can set the PreformattedTags list using setPreformattedTags
448 * to include IFRAME, as well as the default set,
449 * if you have an instance of this class named myHTMLWriter:
450 * <pre>
451 * Set current = myHTMLWriter.getPreformattedTags();
452 * current.add("IFRAME");
453 * myHTMLWriter.setPreformattedTags(current);
454 *
455 * <font color='green'>//The set is now <b>{PRE, SCRIPT, STYLE, TEXTAREA, IFRAME}</b></font>
456 * </pre>
457 *
458 * Similarly, you can simply replace it with your own:
459 * <pre>
460 * HashSet newset = new HashSet();
461 * newset.add("PRE");
462 * newset.add("TEXTAREA");
463 * myHTMLWriter.setPreformattedTags(newset);
464 *
465 * <font color='green'>//The set is now <b>{PRE, TEXTAREA}</b></font>
466 * </pre>
467 *
468 * You can remove all tags from the preformatted tags list, with an empty set, like this:
469 * <pre>
470 * myHTMLWriter.setPreformattedTags(new HashSet());
471 *
472 * <font color='green'>//The set is now <b>{}</b></font>
473 * </pre>
474 *
475 * or with null, like this:
476 * <pre>
477 * myHTMLWriter.setPreformattedTags(null);
478 *
479 * <font color='green'>//The set is now <b>{}</b></font>
480 * </pre>
481 *
482 * </blockquote>
483 *
484 */
485 public void setPreformattedTags(Set newSet){
486
487
488 preformattedTags = new HashSet();
489 if ( newSet != null ) {
490 Object aTag;
491 Iterator iter = newSet.iterator();
492 while ( iter.hasNext() ) {
493 aTag = iter.next();
494 if (aTag != null){
495 preformattedTags.add(aTag.toString().toUpperCase());
496 }
497 }
498 }
499 }
500
501
502 /***
503 * @return true if the qualifiedName passed in matched (case-insensitively)
504 * a tag in the preformattedTags set,
505 * or false if not found or if the set is empty or null.
506 * @see #setPreformattedTags(java.util.Set) setPreformattedTags
507 */
508 public boolean isPreformattedTag(String qualifiedName){
509
510
511 return (preformattedTags != null) && (preformattedTags.contains(qualifiedName.toUpperCase()));
512 }
513
514 /*** This override handles any elements that should not remove whitespace,
515 * such as <PRE>, <SCRIPT>, <STYLE>, and <TEXTAREA>.
516 * Note: the close tags won't line up with the open tag, but we can't alter that.
517 * See javadoc note at setPreformattedTags.
518 *
519 * @see #setPreformattedTags(java.util.Set) setPreformattedTags
520 * @throws java.io.IOException When the stream could not be written to.
521 *
522 */
523 protected void writeElement(Element element) throws IOException {
524 if ( m_newLineAfterNTags == -1 ) {
525 lazyInitNewLinesAfterNTags();
526 }
527 if ( m_newLineAfterNTags > 0 ) {
528 if ( (m_tagsOuput>0) && (m_tagsOuput % m_newLineAfterNTags == 0)) {
529 super.writer.write(m_lineSeparator);
530 }
531 }
532 m_tagsOuput++;
533
534 String qualifiedName = element.getQualifiedName();
535 String saveLastText = m_lastText;
536 int size = element.nodeCount();
537 if ( isPreformattedTag(qualifiedName) ) {
538 OutputFormat currentFormat = getOutputFormat();
539 boolean saveNewlines = currentFormat.isNewlines();
540 boolean saveTrimText = currentFormat.isTrimText();
541 String currentIndent = currentFormat.getIndent();
542
543 m_formatStack.push(new FormatState(saveNewlines, saveTrimText, currentIndent));
544 try {
545 super.writePrintln();
546 if ( saveLastText.trim().length() == 0 && currentIndent != null && currentIndent.length()>0) {
547
548
549
550
551 super.writer.write(justSpaces(saveLastText));
552 }
553 currentFormat.setNewlines(false);
554 currentFormat.setTrimText(false);
555 currentFormat.setIndent("");
556
557 super.writeElement(element);
558 } finally {
559 FormatState state = (FormatState)m_formatStack.pop();
560 currentFormat.setNewlines(state.isNewlines());
561 currentFormat.setTrimText(state.isTrimText());
562 currentFormat.setIndent(state.getIndent());
563 }
564 } else {
565 super.writeElement(element);
566 }
567 }
568
569 private String justSpaces(String text){
570 int size = text.length();
571 StringBuffer res = new StringBuffer(size);
572 char c;
573 for (int i=0; i < size; i++) {
574 c = text.charAt(i);
575 switch ( c ) {
576 case '\r':
577 case '\n':
578 continue;
579 default:
580 res.append(c);
581 }
582 }
583 return res.toString();
584 }
585
586 private void lazyInitNewLinesAfterNTags(){
587 if ( getOutputFormat().isNewlines() ) {
588 m_newLineAfterNTags = 0;
589 } else {
590 m_newLineAfterNTags = getOutputFormat().getNewLineAfterNTags();
591 }
592 }
593
594
595
596 /*** Convenience method to just get a String result.
597 *
598 * @return a pretty printed String from the source string,
599 * preserving whitespace in the defaultPreformattedTags set,
600 * and leaving the close tags off of the default omitElementCloseSet set.
601 *
602 * Use one of the write methods if you want stream output.
603 * @throws java.io.IOException
604 * @throws java.io.UnsupportedEncodingException
605 * @throws org.dom4j.DocumentException
606 */
607 public static String prettyPrintHTML(String html)
608 throws java.io.IOException, java.io.UnsupportedEncodingException, org.dom4j.DocumentException {
609 return prettyPrintHTML(html, true, true, false, true);
610 }
611
612 /*** Convenience method to just get a String result, but <b>As XHTML</b>.
613 *
614 * @return a pretty printed String from the source string,
615 * preserving whitespace in the defaultPreformattedTags set,
616 * but conforming to XHTML: no close tags are omitted (though if empty, they will
617 * be converted to XHTML empty tags: <HR/>
618 *
619 * Use one of the write methods if you want stream output.
620 * @throws java.io.IOException
621 * @throws java.io.UnsupportedEncodingException
622 * @throws org.dom4j.DocumentException
623 */
624 public static String prettyPrintXHTML(String html)
625 throws java.io.IOException, java.io.UnsupportedEncodingException, org.dom4j.DocumentException {
626 return prettyPrintHTML(html, true, true, true, false);
627 }
628
629 /*** @return a pretty printed String from the source string,
630 * preserving whitespace in the defaultPreformattedTags set,
631 * and leaving the close tags off of the default omitElementCloseSet set.
632 * This override allows you to specify various formatter options.
633 * Use one of the write methods if you want stream output.
634 * @throws java.io.IOException
635 * @throws java.io.UnsupportedEncodingException
636 * @throws org.dom4j.DocumentException
637 */
638 public static String prettyPrintHTML(String html,
639 boolean newlines,
640 boolean trim,
641 boolean isXHTML,
642 boolean expandEmpty)
643 throws java.io.IOException, java.io.UnsupportedEncodingException, org.dom4j.DocumentException {
644 StringWriter sw = new StringWriter();
645 OutputFormat format = OutputFormat.createPrettyPrint();
646 format.setNewlines(newlines);
647 format.setTrimText(trim);
648 format.setXHTML(isXHTML);
649 format.setExpandEmptyElements(expandEmpty);
650 HTMLWriter writer = new HTMLWriter(sw, format);
651 Document document = DocumentHelper.parseText(html);
652 writer.write(document);
653 writer.flush();
654 return sw.toString();
655 }
656
657 }
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757