package org.eclipse.smila.importing.crawler.feed;

import java.util.Collection;
import java.util.UUID;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.eclipse.smila.datamodel.Any;
import org.eclipse.smila.datamodel.Record;
import org.eclipse.smila.importing.ImportingConstants;
import org.eclipse.smila.importing.ImportingException;
import org.eclipse.smila.importing.util.RecordOutputHandler;
import org.eclipse.smila.taskworker.TaskContext;
import org.eclipse.smila.taskworker.Worker;

/**
 * Worker that reads the configured feeds, converts each feed entry to a record (adding id, source and delta hash)
 * and writes the records to the {@link #OUTPUT_SLOT_CRAWLED_RECORDS} output slot.
 *
 * The task succeeds as soon as at least one feed could be crawled; feeds that fail are logged as warnings. If no
 * feed could be crawled at all, the task fails with an {@link ImportingException}.
 */
public class FeedCrawlerWorker implements Worker {

  /** Name of the worker, used in worker description and workflows. */
  public static final String NAME = "feedCrawler";

  /** Name of the output slot containing the crawled records. */
  public static final String OUTPUT_SLOT_CRAWLED_RECORDS = "crawledRecords";

  /** Name of the task parameter that contains the feed URL(s) (may be more than one). */
  public static final String TASK_PARAM_FEED_URL = "feedUrls";

  /** Name of the task parameter for the maximum number of records in one bulk object. */
  public static final String TASK_PARAM_MAX_RECORDS_PER_BULK = "maxRecordsPerBulk";

  /** Name of the task parameter listing the feed entry properties used for the delta hash. */
  public static final String TASK_PARAM_DELTA_PROPERTIES = "deltaProperties";

  /** Default: write up to 1000 records to one output bulk. */
  public static final Long MAX_RECORDS_PER_BULK_DEFAULT = 1000L;

  /** Local logger. */
  private final Log _log = LogFactory.getLog(getClass());

  /** @return the worker name, see {@link #NAME}. */
  @Override
  public String getName() {
    return NAME;
  }

  /**
   * Crawls all feed URLs of the crawling context and writes the resulting records to the output slot.
   *
   * A failure of a single feed is logged (including the stack trace) and does not abort the remaining feeds.
   *
   * @param taskContext
   *          context of the current task, provides parameters and outputs.
   * @throws ImportingException
   *           if not a single feed could be crawled successfully, including the case that there were no feed URLs
   *           at all. The message contains the collected error messages of all failed feeds.
   * @throws Exception
   *           if the crawling context or the record output could not be set up.
   */
  @Override
  public void perform(final TaskContext taskContext) throws Exception {
    final FeedCrawlingContext crawlContext = new FeedCrawlingContext(taskContext);
    final RecordOutputHandler recordOutput =
      new RecordOutputHandler(taskContext.getOutputs(), crawlContext.getMaxRecordsPerBulk(),
        OUTPUT_SLOT_CRAWLED_RECORDS);
    final StringBuilder errorMessages = new StringBuilder();
    boolean success = false;
    for (final Any url : crawlContext.getFeedUrls()) {
      final String feedUrl = url.toString();
      try {
        crawl(crawlContext, feedUrl, recordOutput);
        success = true; // at least one feed url was successfully crawled
      } catch (final Exception e) {
        final String m = "Error while crawling feed '" + feedUrl + "': " + e.getMessage() + ". ";
        // pass the exception to the logger, otherwise the stack trace of the failure is lost.
        _log.warn(m, e);
        errorMessages.append(m);
      }
    }
    if (!success) {
      // none of the feed urls were successfully crawled
      if (errorMessages.length() == 0) {
        // nothing failed, so there was nothing to crawl: do not fail with an empty message.
        errorMessages.append("No feed URLs to crawl.");
      }
      throw new ImportingException(errorMessages.toString());
    }
  }

  /**
   * Reads the given feed, maps the feed entries to records and writes them to the output.
   *
   * @param crawlContext
   *          crawling context providing data source, delta properties and the property name mapper.
   * @param feedUrl
   *          URL of the feed to read.
   * @param recordOutput
   *          handler used to write the records.
   * @throws Exception
   *           if reading the feed, augmenting a record or writing a record fails.
   */
  private void crawl(final FeedCrawlingContext crawlContext, final String feedUrl,
    final RecordOutputHandler recordOutput) throws Exception {
    if (_log.isInfoEnabled()) {
      _log.info("Crawling feed " + feedUrl);
    }
    final RomeFeedReader feedReader = new RomeFeedReader();
    final Collection<Record> results = feedReader.readFeed(feedUrl);
    for (final Record record : results) {
      // id and delta hash are computed from the original feed property names, so augment before mapping names.
      augmentRecord(crawlContext, feedUrl, record);
      crawlContext.getMapper().mapNames(record, FeedProperties.ALL_PROPS);
      recordOutput.writeRecord(record);
    }
  }

  /**
   * Adds id, source and delta hash to the record.
   *
   * @param crawlContext
   *          crawling context providing the data source and the delta properties.
   * @param feedUrl
   *          URL of the feed the record was read from (currently not used).
   * @param record
   *          record to augment, modified in place.
   * @return the given record.
   * @throws ImportingException
   *           declared by the id and delta hash helpers.
   */
  private Record augmentRecord(final FeedCrawlingContext crawlContext, final String feedUrl, final Record record)
    throws ImportingException {
    record.setId(createId(crawlContext, record));
    record.setSource(crawlContext.getDataSource());
    record.getMetadata().put(ImportingConstants.ATTRIBUTE_DELTA_HASH, createDeltaHash(crawlContext, record));
    return record;
  }

  /**
   * Creates the record id as "&lt;data source&gt;:&lt;feed entry URI&gt;".
   *
   * NOTE(review): if the entry has no URI property, the id presumably becomes "&lt;data source&gt;:null", so all
   * such entries would share one id - confirm whether the feed reader guarantees the URI to be set.
   *
   * @param crawlContext
   *          crawling context providing the data source.
   * @param record
   *          record created from a feed entry.
   * @return the record id.
   */
  private String createId(final FeedCrawlingContext crawlContext, final Record record) throws ImportingException {
    final String feedEntryUri = record.getMetadata().getStringValue(FeedProperties.FEED_ENTRY_URI);
    // "How the entry URI maps to a concrete feed type (RSS or Atom) depends on
    // the concrete feed type. This is explained in detail in Rome documentation",
    // see http://wiki.java.net/twiki/bin/view/Javawsxml/Rome05URIMapping
    return crawlContext.getDataSource() + ":" + feedEntryUri;
  }

  /**
   * Creates the delta hash for the record by concatenating the values of the configured delta properties.
   *
   * A random UUID is returned instead (which forces an update) if no delta properties are configured or if none of
   * the configured properties has a simple value in the record. Properties that are missing or have no simple
   * value are skipped with a warning; despite the wording of these warnings, an update is only forced if no
   * property at all could be used.
   *
   * NOTE(review): values are concatenated without a separator, so different value combinations can yield the same
   * hash (e.g. "ab"+"c" and "a"+"bc"). Changing the format would alter hashes created by earlier crawls, so it is
   * left as is here.
   *
   * @param crawlContext
   *          crawling context providing the delta properties.
   * @param record
   *          record to create the hash for.
   * @return the delta hash, or a random UUID to force an update.
   */
  private String createDeltaHash(final FeedCrawlingContext crawlContext, final Record record)
    throws ImportingException {
    final Any deltaProps = crawlContext.getDeltaProperties();
    if (deltaProps == null) {
      // no delta properties configured -> force update.
      return UUID.randomUUID().toString();
    }
    final StringBuilder deltaBuilder = new StringBuilder();
    boolean success = false;
    for (final Any propAny : deltaProps) {
      final String prop = propAny.toString();
      if (record.getMetadata().containsKey(prop)) {
        final Any any = record.getMetadata().getValue(prop);
        if (any.isValue()) {
          deltaBuilder.append(any.asValue().asString());
          success = true; // at least one delta property could be successfully added
        } else {
          _log.warn("Couldn't create delta hash, property '" + prop + "' has no simple value, value type is "
            + any.getValueType() + ". Forcing update.");
        }
      } else {
        _log.warn("Couldn't create delta hash, property '" + prop + "' wasn't set. Forcing update.");
      }
    }
    if (!success) {
      return UUID.randomUUID().toString(); // force update
    }
    return deltaBuilder.toString();
  }

}
