/*********************************************************************************************************************
 * Copyright (c) 2008, 2012 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the
 * accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this
 * distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
 **********************************************************************************************************************/
package org.eclipse.smila.importing.crawler.web;

import java.io.InputStream;
import java.util.Iterator;

import org.eclipse.smila.datamodel.AnyMap;
import org.eclipse.smila.datamodel.Record;
import org.eclipse.smila.importing.ContentFetcher;
import org.eclipse.smila.importing.ImportingConstants;
import org.eclipse.smila.importing.compounds.CompoundExtractor;
import org.eclipse.smila.importing.compounds.CompoundExtractorException;
import org.eclipse.smila.importing.compounds.ExtractorWorkerBase;
import org.eclipse.smila.importing.crawler.web.filter.FilterConfiguration;
import org.eclipse.smila.importing.crawler.web.utils.DeltaHash;
import org.eclipse.smila.importing.util.PropertyNameMapper;
import org.eclipse.smila.taskworker.TaskContext;

/**
 * Worker that extracts compound objects (as handled by a {@link CompoundExtractor}) in web crawling workflows and
 * converts the extracted entries to records using the web crawler's attribute names.
 */
public class WebExtractorWorker extends ExtractorWorkerBase {
  /** name under which this worker is registered. */
  public static final String NAME = "webExtractor";

  /** Fetcher service reference, set and cleared via {@link #setFetcher} and {@link #unsetFetcher}. */
  private Fetcher _fetcher;

  /** @return the worker name {@link #NAME}. */
  @Override
  public String getName() {
    return NAME;
  }

  /**
   * Runs the extractor on the compound content. URL and mime type are read from the compound record's metadata
   * using the mapped attribute names, because the web crawler has already mapped this record.
   *
   * @param extractor
   *          extractor to invoke
   * @param compoundRecord
   *          record describing the compound, with mapped attribute names
   * @param compoundContent
   *          stream providing the compound's content
   * @param taskContext
   *          task context from which the property name mapper is created
   * @return iterator on the records produced by the extractor
   * @throws CompoundExtractorException
   *           if the extractor fails
   */
  @Override
  protected Iterator<Record> invokeExtractor(final CompoundExtractor extractor, final Record compoundRecord,
    final InputStream compoundContent, final TaskContext taskContext) throws CompoundExtractorException {
    final PropertyNameMapper nameMapper = PropertyNameMapper.createFrom(taskContext);
    final String compoundUrl = readMappedValue(compoundRecord, nameMapper, WebCrawlerConstants.ATTRIBUTE_URL);
    final String compoundMimeType =
      readMappedValue(compoundRecord, nameMapper, WebCrawlerConstants.ATTRIBUTE_MIMETYPE);
    return extractor.extract(compoundContent, compoundUrl, compoundMimeType,
      WebCrawlerConstants.ATTACHMENT_CONTENT);
  }

  /**
   * Reads a string value from the record's metadata, using the first name the mapper yields for the given
   * property.
   */
  private static String readMappedValue(final Record record, final PropertyNameMapper nameMapper,
    final String propertyName) {
    return record.getMetadata().getStringValue(nameMapper.get(propertyName).get(0));
  }

  /**
   * Converts a record delivered by the extractor to a record using the web crawler's attribute names. If the
   * extracted record is flagged with {@link CompoundExtractor#KEY_IS_ROOT_COMPOUND_RECORD}, the compound record
   * itself is updated and returned, otherwise a new record is created for the extracted entry.
   *
   * @param compoundRecord
   *          record of the compound, also provides the data source and the fallback for the last modified date
   * @param extractedRecord
   *          record as produced by the extractor
   * @param taskContext
   *          task context, not used here
   * @return the converted record, after the delta hash calculation has been applied to it
   */
  @Override
  protected Record convertRecord(final Record compoundRecord, final Record extractedRecord,
    final TaskContext taskContext) {
    final String sourceName = compoundRecord.getSource();
    final boolean describesCompoundItself =
      extractedRecord.getMetadata().containsKey(CompoundExtractor.KEY_IS_ROOT_COMPOUND_RECORD);
    final Record target =
      describesCompoundItself ? compoundRecord : createEntryRecord(extractedRecord, sourceName);

    copyAttachment(extractedRecord, target, WebCrawlerConstants.ATTACHMENT_CONTENT);
    copyAttribute(extractedRecord, CompoundExtractor.KEY_SIZE, target, WebCrawlerConstants.ATTRIBUTE_SIZE);

    // last modified: the compound's date is copied first as a fallback, the extracted record's own time is
    // copied afterwards to the same attribute.
    copyAttribute(compoundRecord, WebCrawlerConstants.ATTRIBUTE_LASTMODIFIED, target,
      WebCrawlerConstants.ATTRIBUTE_LASTMODIFIED);
    copyAttribute(extractedRecord, CompoundExtractor.KEY_TIME, target,
      WebCrawlerConstants.ATTRIBUTE_LASTMODIFIED);

    DeltaHash.calculate(target);
    return target;
  }

  /**
   * Creates the record for an entry extracted from a compound. The record ID is the data source name and the
   * extracted record's ID joined by ":", the URL attribute is filled from the compounds and the file name.
   */
  private Record createEntryRecord(final Record extractedRecord, final String sourceName) {
    final String entryId = sourceName + ":" + extractedRecord.getId();
    final Record entryRecord = extractedRecord.getFactory().createRecord(entryId, sourceName);
    // use compounds as prefix for URL of extracted record
    copySetToStringAttribute(extractedRecord, CompoundExtractor.KEY_COMPOUNDS, entryRecord,
      WebCrawlerConstants.ATTRIBUTE_URL, "/");
    // add file name to URL of extracted record
    concatAttributeValues(extractedRecord, CompoundExtractor.KEY_FILE_NAME, entryRecord,
      WebCrawlerConstants.ATTRIBUTE_URL, "/");
    return entryRecord;
  }

  /**
   * Filters applied to extracted records:
   * <ul>
   * <li>urlPatterns (to the URL attribute of the record).</li>
   * </ul>
   *
   * @return true if the task has no filter parameters, if the record has no URL attribute, or if the URL pattern
   *         matcher of the filter configuration matches the record's URL; false otherwise.
   */
  @Override
  protected boolean filterRecord(final Record record, final TaskContext taskContext) {
    final AnyMap filters = taskContext.getTaskParameters().getMap(ImportingConstants.TASK_PARAM_FILTERS);
    if (filters == null) {
      return true;
    }
    final FilterConfiguration configuration = new FilterConfiguration(filters);
    if (!record.getMetadata().containsKey(WebCrawlerConstants.ATTRIBUTE_URL)) {
      return true;
    }
    return configuration.getUrlPatternMatcher().matches(
      record.getMetadata().getStringValue(WebCrawlerConstants.ATTRIBUTE_URL));
  }

  /** Maps the names listed in {@link WebCrawlerConstants#PROPERTY_NAMES} using the mapper of the task context. */
  @Override
  protected void mapRecord(final Record record, final TaskContext taskContext) {
    PropertyNameMapper.createFrom(taskContext).mapNames(record, WebCrawlerConstants.PROPERTY_NAMES);
  }

  /** @return the currently set Fetcher service, null if none is set. */
  @Override
  protected ContentFetcher getContentFetcher() {
    return _fetcher;
  }

  /** DS service reference injection method: stores the given fetcher. */
  public void setFetcher(final Fetcher fetcher) {
    _fetcher = fetcher;
  }

  /** DS service reference removal method: clears the reference only if it is the given fetcher. */
  public void unsetFetcher(final Fetcher fetcher) {
    if (_fetcher != fetcher) {
      return;
    }
    _fetcher = null;
  }
}
