root/trunk/src/java/org/jcoderz/commons/doclet/HtmlCleaner.java

Revision 1011, 5.1 kB (checked in by amandel, 4 years ago)

Aligned svn keyword settings.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
Line 
/*
 * $Id$
 *
 * Copyright 2006, The jCoderZ.org Project. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *    * Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials
 *      provided with the distribution.
 *    * Neither the name of the jCoderZ.org Project nor the names of
 *      its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written
 *      permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS AND CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
33package org.jcoderz.commons.doclet;
34
35import java.io.ByteArrayInputStream;
36import java.io.ByteArrayOutputStream;
37import java.io.InputStream;
38import java.io.PrintWriter;
39import java.io.StringWriter;
40import java.util.logging.Level;
41import java.util.logging.Logger;
42
43import org.w3c.tidy.Configuration;
44import org.w3c.tidy.Tidy;
45
46/**
47 * This class provides an easy interface to jTidy to clean up
48 * html fragments as used within javadoc.
49 *
50 * @author Andreas Mandel
51 */
52public class HtmlCleaner
53{
54   /** The full qualified name of this class. */
55   private static final String CLASSNAME = HtmlCleaner.class.getName();
56
57   /** The logger to use. */
58   private static final Logger logger = Logger.getLogger(CLASSNAME);
59
60   private static final String FIX_HEADER
61         = "<html><head><title>clean</title></head><body>";
62
63   private static final String FIX_FOOTER
64         = "</body></html>";
65
66   private String mWarnings = "";
67   private boolean mHasErrors = false;
68
69   /**
70    * Converts the given HTML fragment string into wellformed xhtml.
71    * @param in the html fragment to be cleaned up.
72    * @return a cleaned up wellformed xhtml version of the in string.
73    */
74   public String clean (CharSequence in)
75   {
76      if (logger.isLoggable(Level.FINER))
77      {
78         logger.entering(CLASSNAME, "clean(CharSequence)", in);
79      }
80      mHasErrors = false;
81      final Tidy tidy = new Tidy();
82      final String inData = FIX_HEADER + in + FIX_FOOTER;
83      final StringWriter err = new StringWriter();
84      String result = null;
85      try
86      {
87         tidy.setCharEncoding(Configuration.UTF8);
88         tidy.setMakeClean(true);
89         tidy.setXmlOut(true);
90         tidy.setRawOut(true);
91         tidy.setNumEntities(true);
92         tidy.setWraplen(0); // do not care about line length
93         // tidy.setOnlyErrors(true);
94         tidy.setErrout(new PrintWriter(err));
95
96         final InputStream inStream = new ByteArrayInputStream(
97               inData.getBytes("utf-8"));
98
99         final ByteArrayOutputStream out = new ByteArrayOutputStream();
100
101         tidy.parse(inStream, out);
102
103         final String resultString = new String(out.toByteArray(), "utf-8");
104
105         final int start = resultString.indexOf("<body>");
106         final int end = resultString.lastIndexOf("</body>");
107
108         if (start != -1 && end != -1)
109         {
110            result = resultString.substring(
111                  start + "<body>\n".length(), end).trim();
112         }
113         else
114         {
115            result = "Invalid HTML could not be parsed.";
116         }
117
118         if (tidy.getParseWarnings() == 0 && tidy.getParseErrors() == 0)
119         {
120            mWarnings = "";
121         }
122         else
123         {
124            mWarnings = err.toString();
125         }
126         mHasErrors = (tidy.getParseErrors() == 0);
127      }
128      catch (Exception ex)
129      {
130         result = "Invalid HTML could not be parsed.";
131         err.write(result);
132         err.write("Got exception:");
133         err.write(ex.toString());
134         ex.printStackTrace(new PrintWriter(err));
135         mWarnings = err.toString();
136         logger.log(Level.FINER,
137               "Could not handle html fragment. '" + in + "'." , ex);
138         mHasErrors = true;
139      }
140      if (logger.isLoggable(Level.FINER))
141      {
142         logger.exiting(CLASSNAME, "clean(CharSequence)", result);
143      }
144      return result;
145   }
146
147   /**
148    * Returns the warnings encountered during last clean.
149    * @return the warnings encountered during last clean.
150    */
151   public String getWarnings ()
152   {
153      return mWarnings;
154   }
155
156   public boolean hasErrors ()
157   {
158      return mHasErrors;
159   }
160}
Note: See TracBrowser for help on using the browser.