//------------------------------------------------------------------------------
// Copyright (c) 2005, 2006 IBM Corporation and others.
// All rights reserved. This program and the accompanying materials
// are made available under the terms of the Eclipse Public License v1.0
// which accompanies this distribution, and is available at
// http://www.eclipse.org/legal/epl-v10.html
//
// Contributors:
// IBM Corporation - initial implementation
//------------------------------------------------------------------------------
package org.eclipse.epf.common.html;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.PrintWriter;
import java.io.StringWriter;

import org.eclipse.epf.common.utils.FileUtil;
import org.w3c.tidy.Configuration;
import org.w3c.tidy.Tidy;

/**
 * Pretty-formats HTML source and makes it XHTML compliant.
 * <p>
 * Formatting is delegated to JTidy. When the input is an HTML fragment (it
 * does not start with a document type declaration or an <code>html</code>
 * tag), only the content found between the generated <code>body</code> tags
 * is returned.
 * 
 * @author Kelvin Low
 * @since 1.0
 */
public class HTMLFormatter {

	/** Start tag of a <code>pre</code> element (without attributes). */
	public static final String PRE_TAG_START = "<pre>"; //$NON-NLS-1$

	/** End tag of a <code>pre</code> element. */
	public static final String PRE_TAG_END = "</pre>"; //$NON-NLS-1$

	/** Length of {@link #PRE_TAG_END}. */
	public static final int PRE_TAG_END_LENGTH = PRE_TAG_END.length();

	private static final String HTML_BODY_START_TAG = "<body>"; //$NON-NLS-1$

	private static final String HTML_BODY_END_TAG = "</body>"; //$NON-NLS-1$

	private static final int HTML_BODY_START_TAG_LENGTH = HTML_BODY_START_TAG
			.length();

	private static final String HTML_DOCTYPE_PREFIX = "<!DOCTYPE"; //$NON-NLS-1$

	private static final String HTML_TAG_PREFIX = "<html"; //$NON-NLS-1$

	private static final String HTML_COPY = "&copy;";//$NON-NLS-1$

	private static final String HTML_EURO = "&euro;";//$NON-NLS-1$

	private static final String HTML_REG = "&reg;";//$NON-NLS-1$

	private static final String HTML_TRADEMARK = "&trade;";//$NON-NLS-1$

	/** Maximum number of characters per line, passed to JTidy as wrap length. */
	private int lineWidth;

	/** Whether tag content is indented. */
	private boolean indent;

	/** Number of spaces used per indent level. */
	private int indentSize;

	/**
	 * Creates a new instance with a line width of 132 characters and
	 * indentation enabled with an indent size of 4.
	 */
	public HTMLFormatter() {
		this(132, true, 4);
	}

	/**
	 * Creates a new instance.
	 * 
	 * @param lineWidth
	 *            The line width (in number of characters).
	 * @param indent
	 *            If true, indent the tags.
	 * @param indentSize
	 *            The indent size (in number of characters).
	 */
	public HTMLFormatter(int lineWidth, boolean indent, int indentSize) {
		this.lineWidth = lineWidth;
		this.indent = indent;
		this.indentSize = indentSize;
	}

	/**
	 * Sets the maximum character width of a line.
	 * 
	 * @param lineWidth
	 *            The line width (in number of characters).
	 */
	public void setLineWidth(int lineWidth) {
		this.lineWidth = lineWidth;
	}

	/**
	 * Enables or disables tags indent.
	 * 
	 * @param indent
	 *            If true, indent the tags.
	 */
	public void setIndent(boolean indent) {
		this.indent = indent;
	}

	/**
	 * Sets the indent size.
	 * 
	 * @param indentSize
	 *            The indent size (in number of characters).
	 */
	public void setIndentSize(int indentSize) {
		this.indentSize = indentSize;
	}

	/**
	 * Formats the given HTML source.
	 * <p>
	 * If the source starts with a document type declaration or an
	 * <code>html</code> tag (compared case-insensitively), the complete
	 * formatted document is returned. Otherwise the source is treated as a
	 * fragment and only the content between the <code>body</code> tags of the
	 * formatted output is returned; an empty string is returned if that
	 * content cannot be located or is empty.
	 * 
	 * @param html
	 *            The HTML source.
	 * @return The pretty-formatted HTML source, or the given source unchanged
	 *         if it is <code>null</code> or empty.
	 * @throws Exception
	 *             if JTidy reports an error that carries a line and column
	 *             position, or if the UTF-8 conversion fails.
	 */
	public String formatHTML(String html) throws Exception {
		if (html == null || html.length() == 0) {
			return html;
		}

		Tidy tidy = new Tidy();
		tidy.setXHTML(true);
		tidy.setDropEmptyParas(false);
		tidy.setDropFontTags(false);
		tidy.setQuiet(true);
		tidy.setShowWarnings(false);
		tidy.setSmartIndent(false);
		tidy.setTidyMark(false);
		tidy.setWraplen(lineWidth);
		tidy.setIndentAttributes(false);
		tidy.setIndentContent(indent);
		tidy.setSpaces(indentSize);
		tidy.setCharEncoding(Configuration.UTF8);

		ByteArrayInputStream input = new ByteArrayInputStream(html
				.getBytes("UTF-8")); //$NON-NLS-1$
		ByteArrayOutputStream output = new ByteArrayOutputStream();

		// Capture JTidy's error output so positional errors can be reported.
		StringWriter sw = new StringWriter();
		PrintWriter pw = new PrintWriter(sw);
		tidy.setErrout(pw);
		tidy.parse(input, output);
		String error = sw.getBuffer().toString();
		if (error != null && error.length() > 0
				&& error.startsWith("line") && error.indexOf("column") > 0) { //$NON-NLS-1$ //$NON-NLS-2$
			throw new Exception(error);
		}

		String formattedHTML = new String(output.toByteArray(), "UTF-8"); //$NON-NLS-1$
		formattedHTML = escapeHTML(formattedHTML);

		if (startsWithIgnoreCase(html, HTML_DOCTYPE_PREFIX)
				|| startsWithIgnoreCase(html, HTML_TAG_PREFIX)) {
			// The caller supplied a complete document; return all of it.
			return formattedHTML;
		}

		// Fragment: return only what lies between <body> and </body>.
		int start = formattedHTML.indexOf(HTML_BODY_START_TAG);
		int end = formattedHTML.indexOf(HTML_BODY_END_TAG);
		if (start == -1 || end == -1) {
			return ""; //$NON-NLS-1$
		}
		start += HTML_BODY_START_TAG_LENGTH;
		if (start >= end) {
			return ""; //$NON-NLS-1$
		}
		// Skip the line separator after <body> and the one before </body>,
		// plus the indentation that precedes </body> when indenting.
		start += FileUtil.LINE_SEP_LENGTH;
		end -= FileUtil.LINE_SEP_LENGTH;
		if (indent && indentSize > 0) {
			end -= indentSize;
		}
		if (start >= end) {
			return ""; //$NON-NLS-1$
		}
		String result = formattedHTML.substring(start, end);
		if (indent && indentSize > 0) {
			// Body content is indented by two levels; remove them.
			result = fixIndentation(result, getIndentStr(indentSize * 2));
		}
		// Return the body content whether or not indentation is enabled.
		return result;
	}

	/**
	 * Tests whether a string starts with the given prefix, ignoring case.
	 */
	private static boolean startsWithIgnoreCase(String text, String prefix) {
		return text.regionMatches(true, 0, prefix, 0, prefix.length());
	}

	/**
	 * Returns a string consisting of the given number of space characters.
	 */
	private static String getIndentStr(int indentLength) {
		if (indentLength <= 0) {
			return ""; //$NON-NLS-1$
		}
		StringBuffer indentStr = new StringBuffer(indentLength);
		for (int i = 0; i < indentLength; i++) {
			indentStr.append(' ');
		}
		return indentStr.toString();
	}

	/**
	 * Escapes HTML special characters that are not handled correctly by JTidy.
	 * <p>
	 * The copyright, registered, euro and trademark characters are replaced
	 * with their named character entities; all other characters are copied
	 * unchanged.
	 * 
	 * @param html
	 *            The HTML source.
	 * @return The HTML source with HTML special characters preserved in escaped
	 *         form, or an empty string if the source is <code>null</code> or
	 *         empty.
	 */
	private static String escapeHTML(String html) {
		if (html == null || html.length() == 0) {
			return ""; //$NON-NLS-1$
		}
		int len = html.length();
		StringBuffer sb = new StringBuffer(len);
		for (int i = 0; i < len; i++) {
			char ch = html.charAt(i);
			switch (ch) {
			case '\u00a9':
				sb.append(HTML_COPY);
				break;
			case '\u00ae':
				sb.append(HTML_REG);
				break;
			case '\u20ac':
				sb.append(HTML_EURO);
				break;
			case '\u2122':
				sb.append(HTML_TRADEMARK);
				break;
			default:
				sb.append(ch);
				break;
			}
		}
		return sb.toString();
	}

	/**
	 * Undoes the JTidy indent, but leaves the content of &lt;pre&gt; elements
	 * untouched.
	 * <p>
	 * Only &lt;pre&gt; start tags without attributes are recognized.
	 * 
	 * @param html
	 *            The formatted HTML whose lines carry the extra indent.
	 * @param indentStr
	 *            The indent (spaces only) to remove from the start of each
	 *            line.
	 * @return The HTML with the indent removed outside &lt;pre&gt; elements.
	 */
	private static String fixIndentation(String html, String indentStr) {
		if (html.startsWith(indentStr)) {
			html = html.substring(indentStr.length());
		}
		StringBuffer strBuf = new StringBuffer(html.length());
		int copyFrom = 0;
		int preStart;
		while ((preStart = html.indexOf(PRE_TAG_START, copyFrom)) != -1) {
			// Text in front of the <pre> element gets its indent removed.
			strBuf.append(removeLineIndent(html.substring(copyFrom, preStart),
					indentStr));
			int preEnd = html.indexOf(PRE_TAG_END, preStart);
			if (preEnd == -1) {
				// found <pre>, but no ending </pre> - shouldn't ever get here
				// append rest of string and return it
				strBuf.append(html.substring(preStart));
				return strBuf.toString();
			}
			copyFrom = preEnd + PRE_TAG_END_LENGTH;
			// The <pre> element itself is copied verbatim.
			strBuf.append(html.substring(preStart, copyFrom));
		}
		strBuf.append(removeLineIndent(html.substring(copyFrom), indentStr));
		return strBuf.toString();
	}

	/**
	 * Removes the given indent where it directly follows a line break. Both
	 * "\r\n" and "\n" line breaks are handled, and the line break that was
	 * found is kept as is.
	 */
	private static String removeLineIndent(String text, String indentStr) {
		return text.replaceAll("(\r?\n)" + indentStr, "$1"); //$NON-NLS-1$ //$NON-NLS-2$
	}
}
