/***********************************************************************************************************************
 * Copyright (c) 2008, 2011 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the
 * accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this
 * distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors: Juergen Schumacher (empolis GmbH) - initial API and implementation Drazen Cindric (Attensity Europe
 * GmbH) - data model improvements
 **********************************************************************************************************************/

package org.eclipse.smila.processing.pipelets.test;

import java.io.InputStream;
import java.util.List;

import javax.xml.bind.JAXBException;

import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.eclipse.smila.datamodel.AnyMap;
import org.eclipse.smila.datamodel.AnySeq;
import org.eclipse.smila.processing.ProcessingException;
import org.eclipse.smila.processing.parameters.ParameterAccessor;
import org.eclipse.smila.processing.pipelets.ATransformationPipelet;
import org.eclipse.smila.processing.pipelets.HtmlToTextPipelet;
import org.eclipse.smila.processing.pipelets.SourceType;
import org.eclipse.smila.utils.config.ConfigUtils;

/**
 * Test for {@link HtmlToTextPipelet}.
 *
 * <p>
 * The tests convert HTML given as record attributes or attachments, check how the character encoding of attachment
 * input is handled, and exercise tag removal, metadata extraction, CDATA handling and a set of HTML files shipped in
 * the configuration bundle.
 * </p>
 */
public class TestHtmlToTextPipelet extends ATransformationPipeletTest {
  /**
   * bundle name for config loading.
   */
  public static final String CONFIG_BUNDLE = "org.eclipse.smila.processing.pipelets";

  /**
   * name of configuration to work on attachments.
   */
  public static final String CONFIG_ATTACHMENT = "html-to-text-by-attachment.xml";

  /**
   * name of configuration to work on attributes.
   */
  public static final String CONFIG_ATTRIBUTE = "html-to-text-by-attribute.xml";

  /**
   * name of configuration that removes header tags.
   */
  public static final String CONFIG_REMOVE = "html-to-text-remove-headers.xml";

  /**
   * name of configuration that extracts metadata.
   */
  public static final String CONFIG_METADATA = "html-to-text-metadata.xml";

  /**
   * name of directory containing test html files.
   */
  public static final String CONFIG_DATADIR = "html";

  /** plain ASCII text expected from the simple conversion tests. */
  private static final String EXPECTED_PLAIN_TEXT = "Hello World!";

  /** text with umlauts expected from the entity and encoding tests. */
  private static final String EXPECTED_UMLAUT_TEXT = "H\u00e4llo W\u00f6rld!";

  /** The log. */
  private final Log _log = LogFactory.getLog(getClass());

  /**
   * create and configure HtmlToText pipelet.
   *
   * @param configuration
   *          {@link AnyMap} with configuration
   * @return configured pipelet.
   * @throws ProcessingException
   *           error configuring pipelet
   * @throws JAXBException
   *           error loading config
   */
  public HtmlToTextPipelet createPipelet(final AnyMap configuration) throws ProcessingException, JAXBException {
    final HtmlToTextPipelet pipelet = new HtmlToTextPipelet();
    pipelet.configure(configuration);
    return pipelet;
  }

  /**
   * a very simple first test with attributes.
   *
   * @throws Exception
   *           test failed
   */
  public void testHelloWorldAttribute() throws Exception {
    final AnyMap config = createAttributesConfiguration();
    final String id = createBlackboardRecord("htmltotext", "hello-world-attribute");
    assertEquals(EXPECTED_PLAIN_TEXT, convertHtmlAttributeToText(config, id, "<html>Hello World!</html>"));
  }

  /**
   * a very simple first test with attributes with HTML entities.
   *
   * @throws Exception
   *           test failed
   */
  public void testHelloWorldUmlautAttribute() throws Exception {
    final AnyMap config = createAttributesConfiguration();
    final String id = createBlackboardRecord("htmltotext", "hello-world-attribute");
    assertEquals(EXPECTED_UMLAUT_TEXT,
      convertHtmlAttributeToText(config, id, "<html>H&auml;llo W&ouml;rld!</html>"));
  }

  /**
   * a very simple first test with attachments.
   *
   * @throws Exception
   *           test failed
   */
  public void testHelloWorldAttachment() throws Exception {
    final AnyMap config = createAttachmentsConfiguration();
    final ParameterAccessor paramAccessor = new ParameterAccessor(getBlackboard(), config);
    final String id = createBlackboardRecord("htmltotext", "hello-world-attachment");
    final byte[] html = "<html>Hello World!</html>".getBytes(ATransformationPipelet.ENCODING_ATTACHMENT);
    assertEquals(EXPECTED_PLAIN_TEXT, convertHtmlAttachmentToText(config, paramAccessor, id, html));
  }

  /**
   * a very simple first test with attachments with HTML entities.
   *
   * @throws Exception
   *           test failed
   */
  public void testHelloWorldUmlautAttachment() throws Exception {
    final AnyMap config = createAttachmentsConfiguration();
    final ParameterAccessor paramAccessor = new ParameterAccessor(getBlackboard(), config);
    final String id = createBlackboardRecord("htmltotext", "hello-world-attachment");
    final byte[] html = "<html>H&auml;llo W&ouml;rld!</html>".getBytes(ATransformationPipelet.ENCODING_ATTACHMENT);
    assertEquals(EXPECTED_UMLAUT_TEXT, convertHtmlAttachmentToText(config, paramAccessor, id, html));
  }

  /**
   * a test with real umlauts: the attachment is encoded in iso-8859-1 and its meta tag declares iso-8859-1. The
   * umlauts must come out correctly with the unchanged attachment configuration as well as with "defaultEncoding" set
   * to utf-8.
   *
   * @throws Exception
   *           test failed
   */
  public void testCorrectMetaEncodingAttachment() throws Exception {
    final AnyMap configuration = createAttachmentsConfiguration();
    final String id = createBlackboardRecord("htmltotext", "default-encoding-attachment");
    final byte[] html =
      "<html><meta http-equiv='Content-Type' content='text/html; charset=iso-8859-1' />H\u00e4llo W\u00f6rld!</html>"
        .getBytes("iso-8859-1");
    final ParameterAccessor paramAccessor = new ParameterAccessor(getBlackboard(), configuration);

    assertEquals(EXPECTED_UMLAUT_TEXT, convertHtmlAttachmentToText(configuration, paramAccessor, id, html));

    configuration.put("defaultEncoding", "utf-8");
    assertEquals(EXPECTED_UMLAUT_TEXT, convertHtmlAttachmentToText(configuration, paramAccessor, id, html));
  }

  /**
   * a test with real umlauts: the attachment is encoded in utf-8 and has no meta tag. The umlauts are expected to be
   * correct only when "defaultEncoding" is set to utf-8; with the unchanged attachment configuration and with
   * iso-8859-1 the result must differ from the expected text.
   *
   * @throws Exception
   *           test failed
   */
  public void testNoMetaEncodingUtfAttachment() throws Exception {
    final AnyMap configuration = createAttachmentsConfiguration();
    final String id = createBlackboardRecord("htmltotext", "default-encoding-attachment");
    final byte[] html = "<html>H\u00e4llo W\u00f6rld!</html>".getBytes("utf-8");
    final ParameterAccessor paramAccessor = new ParameterAccessor(getBlackboard(), configuration);

    assertFalse(EXPECTED_UMLAUT_TEXT.equals(convertHtmlAttachmentToText(configuration, paramAccessor, id, html)));

    configuration.put("defaultEncoding", "utf-8");
    assertEquals(EXPECTED_UMLAUT_TEXT, convertHtmlAttachmentToText(configuration, paramAccessor, id, html));

    configuration.put("defaultEncoding", "iso-8859-1");
    assertFalse(EXPECTED_UMLAUT_TEXT.equals(convertHtmlAttachmentToText(configuration, paramAccessor, id, html)));
  }

  /**
   * a test with real umlauts: the attachment is encoded in iso-8859-1 but its meta tag declares utf-8. The result must
   * differ from the expected text with the unchanged attachment configuration and with "defaultEncoding" set to
   * iso-8859-1 or utf-8.
   *
   * @throws Exception
   *           test failed
   */
  public void testIncorrectMetaEncodingAttachment() throws Exception {
    final AnyMap configuration = createAttachmentsConfiguration();
    final String id = createBlackboardRecord("htmltotext", "default-encoding-attachment");
    final byte[] html =
      "<html><meta http-equiv='Content-Type' content='text/html; charset=utf-8' />H\u00e4llo W\u00f6rld!</html>"
        .getBytes("iso-8859-1");
    final ParameterAccessor paramAccessor = new ParameterAccessor(getBlackboard(), configuration);

    assertFalse(EXPECTED_UMLAUT_TEXT.equals(convertHtmlAttachmentToText(configuration, paramAccessor, id, html)));

    configuration.put("defaultEncoding", "iso-8859-1");
    assertFalse(EXPECTED_UMLAUT_TEXT.equals(convertHtmlAttachmentToText(configuration, paramAccessor, id, html)));

    configuration.put("defaultEncoding", "utf-8");
    assertFalse(EXPECTED_UMLAUT_TEXT.equals(convertHtmlAttachmentToText(configuration, paramAccessor, id, html)));
  }

  /**
   * a test with real umlauts: the attachment is encoded in iso-8859-1 and has no meta tag. The umlauts must be correct
   * with the unchanged attachment configuration and with "defaultEncoding" set to iso-8859-1; with utf-8 the result
   * must differ from the expected text.
   *
   * @throws Exception
   *           test failed
   */
  public void testIsoDefaultEncodingAttachment() throws Exception {
    final AnyMap configuration = createAttachmentsConfiguration();
    final String id = createBlackboardRecord("htmltotext", "default-encoding-attachment");
    final byte[] html = "<html>H\u00e4llo W\u00f6rld!</html>".getBytes("iso-8859-1");
    final ParameterAccessor paramAccessor = new ParameterAccessor(getBlackboard(), configuration);

    assertEquals(EXPECTED_UMLAUT_TEXT, convertHtmlAttachmentToText(configuration, paramAccessor, id, html));

    configuration.put("defaultEncoding", "iso-8859-1");
    assertEquals(EXPECTED_UMLAUT_TEXT, convertHtmlAttachmentToText(configuration, paramAccessor, id, html));

    configuration.put("defaultEncoding", "utf-8");
    assertFalse(EXPECTED_UMLAUT_TEXT.equals(convertHtmlAttachmentToText(configuration, paramAccessor, id, html)));
  }

  /**
   * a test of configurable content removing.
   *
   * @throws Exception
   *           test failed
   */
  public void testRemoveHeaders() throws Exception {
    final String headersFile = CONFIG_DATADIR + "/headers.html";
    final AnyMap configKeep = createAttachmentsConfiguration();
    final HtmlToTextPipelet pipeletKeep = createPipelet(configKeep);
    final ParameterAccessor paramAccessorKeep = new ParameterAccessor(getBlackboard(), configKeep);
    final AnyMap configRemove = createConfigurationForRemoveHeaders();
    final HtmlToTextPipelet pipeletRemove = createPipelet(configRemove);
    final ParameterAccessor paramAccessorRemove = new ParameterAccessor(getBlackboard(), configRemove);

    // without tag removal all texts of the document must show up in the result.
    String id = createBlackboardRecord("htmltotext", "keep-headers");
    setInputAttachmentFromConfigFile(id, pipeletKeep, paramAccessorKeep, headersFile);
    pipeletKeep.process(getBlackboard(), new String[] { id });
    final byte[] textBytes = getBlackboard().getAttachmentAsBytes(id, pipeletKeep.getOutputName(paramAccessorKeep));
    assertNotNull(textBytes);
    final String textString = new String(textBytes, ATransformationPipelet.ENCODING_ATTACHMENT);
    // contains() instead of indexOf() > 0: a match at the very start of the text is a match, too.
    assertTrue("missing 'Hello World!' in: " + textString, textString.contains("Hello World!"));
    assertTrue("missing 'Hello Earth!' in: " + textString, textString.contains("Hello Earth!"));
    assertTrue("missing 'Hello Europe!' in: " + textString, textString.contains("Hello Europe!"));
    assertTrue("missing 'Hello!' in: " + textString, textString.contains("Hello!"));

    // with h1-h4 removed only the remaining text is expected. The input is stored under the name that the
    // removing pipelet itself reads from.
    id = createBlackboardRecord("htmltotext", "remove-headers");
    setInputAttachmentFromConfigFile(id, pipeletRemove, paramAccessorRemove, headersFile);
    pipeletRemove.process(getBlackboard(), new String[] { id });
    final AnyMap anyMap = getBlackboard().getMetadata(id);
    final String remainingText = anyMap.getStringValue(pipeletRemove.getOutputName(paramAccessorRemove));
    assertNotNull("no text extracted", remainingText);
    assertEquals("Hello!", remainingText.trim());
  }

  /**
   * a test of metadata extraction.
   *
   * @throws Exception
   *           test failed
   */
  public void testExtractMetadata() throws Exception {
    final AnyMap config = createConfigurationForMetadata();
    final HtmlToTextPipelet pipelet = createPipelet(config);
    final ParameterAccessor paramAccessor = new ParameterAccessor(getBlackboard(), config);

    final String id = createBlackboardRecord("htmltotext", "extract-metadata");
    setInputAttachmentFromConfigFile(id, pipelet, paramAccessor, CONFIG_DATADIR + "/meta.html");
    pipelet.process(getBlackboard(), new String[] { id });
    final AnyMap anyMap = getBlackboard().getMetadata(id);
    final String text = anyMap.getStringValue(pipelet.getOutputName(paramAccessor));
    assertNotNull("no text extracted", text);
    assertEquals(EXPECTED_PLAIN_TEXT, text.trim());

    final AnySeq keywordsSeq = anyMap.getSeq("keywords");
    assertNotNull(keywordsSeq);
    assertEquals(1, keywordsSeq.size());
    assertEquals("cat", keywordsSeq.getStringValue(0).trim());

    final AnySeq authorsSeq = anyMap.getSeq("authors");
    // fail with an assertion instead of a NullPointerException if no authors were extracted.
    assertNotNull(authorsSeq);
    assertEquals(3, authorsSeq.size());
    assertEquals("me", authorsSeq.getStringValue(0));
    assertEquals("you", authorsSeq.getStringValue(1));
    assertEquals("boo", authorsSeq.getStringValue(2));
  }

  /**
   * test files in configuration/data directory using attributes. no semantic tests, just see if some problematic
   * documents are processed without error.
   *
   * @throws Exception
   *           test failed
   */
  public void testDataDirAttribute() throws Exception {
    final AnyMap config = createAttributesConfiguration();
    final HtmlToTextPipelet pipelet = createPipelet(config);
    final ParameterAccessor paramAccessor = new ParameterAccessor(getBlackboard(), config);
    final List<String> htmlfiles = ConfigUtils.getConfigEntries(CONFIG_BUNDLE, CONFIG_DATADIR);
    for (final String filename : htmlfiles) {
      if (filename.startsWith(".")) { // exclude .svn directory.
        continue;
      }
      final String id = createBlackboardRecord("htmltotext", filename);
      final InputStream htmlStream = ConfigUtils.getConfigStream(CONFIG_BUNDLE, CONFIG_DATADIR + "/" + filename);
      final String htmlString;
      try {
        htmlString = IOUtils.toString(htmlStream, ATransformationPipelet.ENCODING_ATTACHMENT);
      } finally {
        // the stream is owned by this test, release it even if reading fails.
        IOUtils.closeQuietly(htmlStream);
      }
      AnyMap anyMap = getBlackboard().getMetadata(id);
      anyMap.put(pipelet.getInputName(paramAccessor), anyMap.getFactory().createStringValue(htmlString));
      pipelet.process(getBlackboard(), new String[] { id });
      anyMap = getBlackboard().getMetadata(id);
      final String textString = anyMap.getStringValue(pipelet.getOutputName(paramAccessor));
      assertNotNull(filename + ": null result", textString);
      _log.info(filename + ": " + textString);
    }
  }

  /**
   * test files in configuration/data directory using attachments. no semantic tests, just see if some problematic
   * documents are processed without error.
   *
   * @throws Exception
   *           test failed
   */
  public void testDataDirAttachment() throws Exception {
    final AnyMap config = createAttachmentsConfiguration();
    final HtmlToTextPipelet pipelet = createPipelet(config);
    final ParameterAccessor paramAccessor = new ParameterAccessor(getBlackboard(), config);
    final List<String> htmlfiles = ConfigUtils.getConfigEntries(CONFIG_BUNDLE, CONFIG_DATADIR);
    for (final String filename : htmlfiles) {
      if (filename.startsWith(".")) { // exclude .svn directory.
        continue;
      }
      final String id = createBlackboardRecord("htmltotext", filename);
      setInputAttachmentFromConfigFile(id, pipelet, paramAccessor, CONFIG_DATADIR + "/" + filename);
      pipelet.process(getBlackboard(), new String[] { id });
      final byte[] textBytes = getBlackboard().getAttachmentAsBytes(id, pipelet.getOutputName(paramAccessor));
      assertNotNull(filename + ": null result", textBytes);
      final String textString = new String(textBytes, ATransformationPipelet.ENCODING_ATTACHMENT);
      _log.info(filename + ": " + textString);
    }
  }

  /**
   * tests error handling in case of single record failures.
   *
   * @throws Exception
   *           test failed
   */
  public void testRobustness() throws Exception {
    final AnyMap config = createAttributesConfiguration();
    final HtmlToTextPipelet pipelet = createPipelet(config);
    doRobustnessTestStringInput(pipelet, SourceType.ATTRIBUTE, "<html>Hello World!</html>", config);
  }

  /**
   * tests extraction of CDATA section: with "keepCdata" set to false the CDATA content must not appear in the result,
   * with "keepCdata" set to true it must.
   *
   * @throws Exception
   *           test failed
   */
  public void testCData() throws Exception {
    final AnyMap config = createAttributesConfiguration();
    final String id = createBlackboardRecord("htmltotext", "key");
    final String htmlContent = "<html> <![CDATA[this is the cdata text]]> </html>";

    config.put("keepCdata", false);
    final String textWithoutCdata = convertHtmlAttributeToText(config, id, htmlContent);
    assertNotNull("no text extracted", textWithoutCdata);
    assertEquals("", textWithoutCdata.trim());

    config.put("keepCdata", true);
    final String textWithCdata = convertHtmlAttributeToText(config, id, htmlContent);
    assertNotNull("no text extracted", textWithCdata);
    assertEquals("this is the cdata text", textWithCdata.trim());
  }

  /**
   * create a pipelet for the configuration, store the HTML string in the input attribute of the record, process the
   * record and read the output attribute.
   *
   * @param configuration
   *          pipelet configuration in its current state
   * @param id
   *          id of the blackboard record to process
   * @param html
   *          HTML to store in the input attribute
   * @return value of the output attribute after processing
   * @throws Exception
   *           configuring or processing failed
   */
  private String convertHtmlAttributeToText(final AnyMap configuration, final String id, final String html)
    throws Exception {
    final HtmlToTextPipelet pipelet = createPipelet(configuration);
    final ParameterAccessor paramAccessor = new ParameterAccessor(getBlackboard(), configuration);
    final AnyMap input = getBlackboard().getMetadata(id);
    input.put(pipelet.getInputName(paramAccessor), input.getFactory().createStringValue(html));
    pipelet.process(getBlackboard(), new String[] { id });
    final AnyMap output = getBlackboard().getMetadata(id);
    return output.getStringValue(pipelet.getOutputName(paramAccessor));
  }

  /**
   * create a pipelet for the configuration, store the HTML bytes in the input attachment of the record, process the
   * record and decode the output attachment using {@link ATransformationPipelet#ENCODING_ATTACHMENT}.
   *
   * @param configuration
   *          pipelet configuration in its current state
   * @param paramAccessor
   *          accessor used to determine the names of input and output attachment
   * @param id
   *          id of the blackboard record to process
   * @param html
   *          encoded HTML to store in the input attachment
   * @return decoded content of the output attachment
   * @throws Exception
   *           configuring or processing failed
   */
  private String convertHtmlAttachmentToText(final AnyMap configuration, final ParameterAccessor paramAccessor,
    final String id, final byte[] html) throws Exception {
    final HtmlToTextPipelet pipelet = createPipelet(configuration);
    getBlackboard().setAttachment(id, pipelet.getInputName(paramAccessor), html);
    pipelet.process(getBlackboard(), new String[] { id });
    final byte[] text = getBlackboard().getAttachmentAsBytes(id, pipelet.getOutputName(paramAccessor));
    return new String(text, ATransformationPipelet.ENCODING_ATTACHMENT);
  }

  /**
   * copy a file from the configuration bundle into the input attachment of a record and close the file stream
   * afterwards.
   *
   * @param id
   *          id of the blackboard record
   * @param pipelet
   *          pipelet that will process the record
   * @param paramAccessor
   *          accessor used to determine the name of the input attachment
   * @param configPath
   *          path of the file relative to the configuration of {@link #CONFIG_BUNDLE}
   * @throws Exception
   *           opening the file or setting the attachment failed
   */
  private void setInputAttachmentFromConfigFile(final String id, final HtmlToTextPipelet pipelet,
    final ParameterAccessor paramAccessor, final String configPath) throws Exception {
    final InputStream stream = ConfigUtils.getConfigStream(CONFIG_BUNDLE, configPath);
    try {
      getBlackboard().setAttachmentFromStream(id, pipelet.getInputName(paramAccessor), stream);
    } finally {
      // the stream is owned by this test, release it even if setting the attachment fails.
      IOUtils.closeQuietly(stream);
    }
  }

  /**
   * @return a configuration for a remove header test
   */
  private AnyMap createConfigurationForRemoveHeaders() {
    final AnyMap configuration = createAttachmentsAttributesConfiguration();
    configuration.put("removeContentTags", "h1,h2,h3,h4");
    return configuration;
  }

  /**
   * @return a configuration for a metadata test
   */
  private AnyMap createConfigurationForMetadata() {
    final AnyMap configuration = createAttachmentsAttributesConfiguration();
    configuration.put("meta:author", "authors");
    configuration.put("meta:keywords", "keywords");
    return configuration;
  }
}
