/*******************************************************************************
 * Copyright (c) 2008, 2012 Attensity Europe GmbH and brox IT Solutions GmbH.
 * All rights reserved.
 * This program and the accompanying materials are made available under the
 * terms of the Eclipse Public License v1.0 which accompanies this distribution,
 * and is available at http://www.eclipse.org/legal/epl-v10.html
 * 
 * Contributors: Tobias Liefke - initial API and implementation
 *******************************************************************************/
package org.eclipse.smila.processing.pipelets.boilerpipe;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.lang.reflect.Method;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.eclipse.smila.blackboard.Blackboard;
import org.eclipse.smila.blackboard.BlackboardAccessException;
import org.eclipse.smila.datamodel.Any;
import org.eclipse.smila.datamodel.AnyMap;
import org.eclipse.smila.datamodel.Value;
import org.eclipse.smila.processing.ProcessingException;
import org.eclipse.smila.processing.parameters.MissingParameterException;
import org.eclipse.smila.processing.parameters.ParameterAccessor;
import org.eclipse.smila.processing.pipelets.ATransformationPipelet;
import org.eclipse.smila.processing.util.ProcessingConstants;
import org.eclipse.smila.processing.util.ResultCollector;
import org.xml.sax.InputSource;

import de.l3s.boilerpipe.BoilerpipeFilter;
import de.l3s.boilerpipe.document.TextDocument;
import de.l3s.boilerpipe.extractors.ArticleExtractor;
import de.l3s.boilerpipe.sax.BoilerpipeSAXInput;

/**
 * Integrates the <a href="http://code.google.com/p/boilerpipe/">Boilerpipe</a> library into SMILA.
 * 
 * <p>
 * For every record the configured input (attribute or attachment) is parsed into a Boilerpipe {@link TextDocument},
 * all configured filters are applied to it, and - if at least one filter reported a change - the resulting content
 * is stored as the result of the transformation.
 * </p>
 * 
 * @author Tobias Liefke
 */
public class BoilerpipePipelet extends ATransformationPipelet {

  /** Name of the property that contains the attribute with the encoding of the input attachment. */
  public static final String ENCODING_ATTRIBUTE_PROPERTY = "encodingAttribute";

  /**
   * Name of the property that contains the encoding of the input attachment, fallback for
   * {@link #ENCODING_ATTRIBUTE_PROPERTY}.
   */
  public static final String ENCODING_PROPERTY = "defaultEncoding";

  /** Name of the property that contains the selected boilerpipe filter class. */
  public static final String FILTER_PROPERTY = "filter";

  /** The local logger. */
  private final Log _logger = LogFactory.getLog(getClass());

  /** The pipelet configuration. */
  private AnyMap _configuration;

  /** The configured filters. */
  private Collection<? extends BoilerpipeFilter> _filters;

  /**
   * Stores the configuration and resolves the filters named by {@link #FILTER_PROPERTY}. If no filter is configured,
   * {@link ArticleExtractor#INSTANCE} is used.
   * 
   * @param configuration
   *          the pipelet configuration
   * @throws ProcessingException
   *           if one of the configured filters can not be resolved or is not a {@link BoilerpipeFilter}
   */
  @Override
  public void configure(final AnyMap configuration) throws ProcessingException {
    this._configuration = configuration;

    // Find filters
    final ParameterAccessor paramAccessor = new ParameterAccessor(configuration);
    final Collection<String> filterNames = paramAccessor.getParameters(FILTER_PROPERTY);
    if (filterNames.isEmpty()) {
      _logger.debug("Using default boilerpipe filter: de.l3s.boilerpipe.extractors.ArticleExtractor");
      _filters = Collections.singleton(ArticleExtractor.INSTANCE);
      return;
    }

    final Collection<BoilerpipeFilter> filters = new ArrayList<BoilerpipeFilter>();
    for (final String filterName : filterNames) {
      final BoilerpipeFilter filter;
      try {
        filter = resolveFilter(filterName);
      } catch (final Exception e) {
        // Covers reflection problems as well as results that are no BoilerpipeFilter (ClassCastException)
        throw new ProcessingException("Could not access boilerpipe filter: " + filterName, e);
      }
      if (filter != null) {
        _logger.debug("Using boilerpipe filter: " + filter.getClass());
        filters.add(filter);
      } else {
        // A static field or method may legally yield null - do not drop that filter without any trace
        _logger.warn("Ignoring boilerpipe filter, because it resolved to null: " + filterName);
      }
    }
    this._filters = filters;
  }

  /**
   * Resolves one filter expression. Supported forms are a class name (instantiated with its default constructor), a
   * static field ({@code package.Class.FIELD}) or a static no-arg method ({@code package.Class.method()}).
   * 
   * @param filterName
   *          the filter expression from the configuration
   * @return the resolved filter, may be {@code null} if the referenced field or method yields {@code null}
   * @throws Exception
   *           any reflection or cast problem, the caller wraps it into a {@link ProcessingException}
   */
  private BoilerpipeFilter resolveFilter(final String filterName) throws Exception {
    try {
      // Check if the given filter is the class name
      return (BoilerpipeFilter) Class.forName(filterName).newInstance();
    } catch (final ClassNotFoundException e) {
      // Check if the given filter is the field or method name
      final int dot = filterName.lastIndexOf('.');
      if (dot < 0) {
        throw e;
      }
      final Class<?> accessor = Class.forName(filterName.substring(0, dot));
      if (filterName.endsWith("()")) {
        final Method method = accessor.getMethod(filterName.substring(dot + 1, filterName.length() - 2));
        return (BoilerpipeFilter) method.invoke(null);
      }
      return (BoilerpipeFilter) accessor.getField(filterName.substring(dot + 1)).get(null);
    }
  }

  /**
   * Applies the configured filters to the input of each record and stores the extracted content, if any filter
   * changed the document. Failures of single records are reported to the result collector.
   * 
   * @param blackboard
   *          the current blackboard
   * @param recordIds
   *          the ids of the records to process
   * @return the ids reported by the result collector
   * @throws ProcessingException
   *           if the result collector decides to propagate an error
   */
  @Override
  public String[] process(final Blackboard blackboard, final String[] recordIds) throws ProcessingException {
    final ParameterAccessor paramAccessor = new ParameterAccessor(blackboard, _configuration);
    final ResultCollector resultCollector =
      new ResultCollector(paramAccessor, _logger, ProcessingConstants.DROP_ON_ERROR_DEFAULT);
    for (final String id : recordIds) {
      paramAccessor.setCurrentRecord(id);
      _logger.debug("Processing record: " + id);
      try {
        final InputSource source = extractInputSource(blackboard, paramAccessor, id);
        if (source != null) {
          final TextDocument document;
          try {
            document = new BoilerpipeSAXInput(source).getTextDocument();
          } finally {
            // The parser does not own the stream, release it even if parsing failed
            closeQuietly(source);
          }
          boolean changed = false;
          for (final BoilerpipeFilter filter : _filters) {
            // Non-short-circuit: every filter has to run, even if a previous one already changed the document
            changed |= filter.process(document);
          }
          if (changed) {
            storeResult(blackboard, id, document.getContent(), paramAccessor);
          } else {
            _logger.debug("None of the configured filters produced a result");
          }
        } else {
          _logger.debug("No input found");
        }
        resultCollector.addResult(id);
      } catch (final Exception e) {
        resultCollector.addFailedResult(id, e);
      }
    }
    return resultCollector.getResultIds();
  }

  /**
   * Determine the input source for BoilerPipe extraction.
   * 
   * <p>
   * For attachments the encoding is taken from the attribute named by {@link #ENCODING_ATTRIBUTE_PROPERTY}, then
   * from {@link #ENCODING_PROPERTY}. If neither is available, the raw byte stream is handed to the parser.
   * </p>
   * 
   * @param blackboard
   *          the current blackboard
   * @param paramAccessor
   *          the accessor for the parameters of the current record
   * @param id
   *          the id of the current record
   * @return the input source or {@code null} if none was found, the caller is responsible for closing its stream
   * @throws MissingParameterException
   *           if a required parameter of the pipelet is missing
   * @throws BlackboardAccessException
   *           if the record could not be read from the blackboard
   * @throws UnsupportedEncodingException
   *           if the determined encoding is not supported
   */
  private InputSource extractInputSource(final Blackboard blackboard, final ParameterAccessor paramAccessor,
    final String id) throws MissingParameterException, BlackboardAccessException, UnsupportedEncodingException {
    final String inputName = getInputName(paramAccessor);
    if (isReadFromAttribute(getInputType(paramAccessor))) {
      final Any inputAny = blackboard.getMetadata(id).get(inputName);
      if ((inputAny != null) && inputAny.isValue()) {
        final String value = ((Value) inputAny).asString();
        if (value != null) {
          return new InputSource(new StringReader(value));
        }
      }
    } else if (blackboard.hasAttachment(id, inputName)) {
      String encoding = null;
      final String encodingAttribute = paramAccessor.getParameter(ENCODING_ATTRIBUTE_PROPERTY, null);
      if (encodingAttribute != null) {
        encoding = blackboard.getMetadata(id).getStringValue(encodingAttribute);
      }
      if (encoding == null) {
        encoding = paramAccessor.getParameter(ENCODING_PROPERTY, null);
      }
      final InputStream stream = blackboard.getAttachmentAsStream(id, inputName);
      if (encoding == null) {
        return new InputSource(stream);
      }
      _logger.debug("Using encoding: " + encoding);
      try {
        return new InputSource(new InputStreamReader(stream, encoding));
      } catch (final UnsupportedEncodingException e) {
        // Nobody else knows about the already opened stream, so release it before reporting the problem
        try {
          stream.close();
        } catch (final IOException closeException) {
          _logger.debug("Could not close attachment stream", closeException);
        }
        throw e;
      }
    }
    return null;
  }

  /**
   * Closes the character or byte stream of the given input source. Problems during close are only logged, because
   * they must not hide the result (or the failure) of the extraction itself.
   * 
   * @param source
   *          the input source to release
   */
  private void closeQuietly(final InputSource source) {
    try {
      if (source.getCharacterStream() != null) {
        // Closing the reader closes a wrapped attachment stream as well
        source.getCharacterStream().close();
      } else if (source.getByteStream() != null) {
        source.getByteStream().close();
      }
    } catch (final IOException e) {
      _logger.debug("Could not close input of boilerpipe extraction", e);
    }
  }

}
