/*******************************************************************************
 * Copyright (c) 2008, 2011 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the
 * accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this
 * distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
 * 
 * Contributors: Juergen Schumacher (Attensity Europe GmbH) - initial API and implementation
 *******************************************************************************/

package org.eclipse.smila.importing.crawler.web;

import java.io.IOException;
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.eclipse.smila.datamodel.DataFactory;
import org.eclipse.smila.datamodel.Record;
import org.eclipse.smila.importing.ImportingConstants;
import org.eclipse.smila.importing.VisitedLinksException;
import org.eclipse.smila.importing.VisitedLinksService;
import org.eclipse.smila.objectstore.ObjectStoreException;
import org.eclipse.smila.taskmanager.Task;
import org.eclipse.smila.taskworker.TaskContext;
import org.eclipse.smila.taskworker.Worker;
import org.eclipse.smila.taskworker.input.Inputs;
import org.eclipse.smila.taskworker.input.RecordInput;
import org.eclipse.smila.taskworker.output.Outputs;
import org.eclipse.smila.taskworker.output.RecordOutput;
import org.eclipse.smila.utils.MaybeRecoverableException;

/**
 * Worker for Web crawling.
 * <p>
 * If the task has no input bulk in slot {@link #INPUT_SLOT_LINKS_TO_CRAWL}, crawling is initiated from the start URL
 * given in the task parameters. Otherwise each link record of the input bulk is crawled. For each crawled link the
 * resource is fetched, outgoing links are extracted, filtered and written to slot {@link #OUTPUT_SLOT_LINKS_TO_CRAWL},
 * and the produced records are written to slot {@link #OUTPUT_SLOT_CRAWLED_RECORDS}.
 * </p>
 */
public class WebCrawlerWorker implements Worker {
  /** Name of the worker, used in worker description and workflows. */
  public static final String NAME = "webCrawler";

  /** name of input slot containing the links to crawl. */
  public static final String INPUT_SLOT_LINKS_TO_CRAWL = "linksToCrawl";

  /** name of output slot containing the links to crawl. */
  public static final String OUTPUT_SLOT_LINKS_TO_CRAWL = "linksToCrawl";

  /** name of output slot containing the crawled records. */
  public static final String OUTPUT_SLOT_CRAWLED_RECORDS = "crawledRecords";

  /** number of links to write into a single output bulk. */
  private static final int LINKS_PER_BULK = 10;

  /**
   * number of records to write into a single output bulk. However, usually the worker will produce 1 record output bulk
   * per link input bulk because for each input link at most one output record will be produced.
   */
  private static final int RECORDS_PER_BULK = 100;

  /** dummy input bulk Id used in initial crawl task for marking links as visited. */
  private static final String BULK_ID_FOR_INITIAL_TASK = "initial";

  /** reference to VisitedLinks service. */
  private VisitedLinksService _visitedLinks;

  /** reference to Fetcher service. */
  private Fetcher _fetcher;

  /** reference to LinkExtractor service. */
  private LinkExtractor _linkExtractor;

  /** reference to LinkFilter service. */
  private LinkFilter _linkFilter;

  /** reference to RecordProducer service. */
  private RecordProducer _recordProducer;

  /** local logger. */
  private final Log _log = LogFactory.getLog(getClass());

  /**
   * holds the current output bulks and creates new output bulks if the specified number of links or records have been
   * written. Bulks are created lazily on the first write to the respective slot.
   */
  private static class OutputBulks {

    /** task context outputs manager. */
    private final Outputs _outputs;

    /** maximum number of links to write to a single bulk. */
    private final int _linksPerBulk;

    /** maximum number of records to write to a single bulk. */
    private final int _recordsPerBulk;

    /** current linksToCrawl bulk, null until the first link is written. */
    private RecordOutput _linksToCrawl;

    /** index of current linksToCrawl bulk. */
    private int _linksToCrawlBulkIndex;

    /** current crawledRecords bulk, null until the first record is written. */
    private RecordOutput _crawledRecords;

    /** index of current crawledRecords bulk. */
    private int _crawledRecordsBulkIndex;

    /**
     * initialize instance for processing of one task.
     * 
     * @param outputs
     *          outputs manager of the current task.
     * @param linksPerBulk
     *          maximum number of links to write to a single linksToCrawl bulk.
     * @param recordsPerBulk
     *          maximum number of records to write to a single crawledRecords bulk.
     */
    OutputBulks(final Outputs outputs, final int linksPerBulk, final int recordsPerBulk) {
      _outputs = outputs;
      _linksPerBulk = linksPerBulk;
      _recordsPerBulk = recordsPerBulk;
    }

    /**
     * write a record to the linksToCrawl bulk, start a new bulk if necessary.
     * 
     * @param record
     *          link record to write.
     * @throws WebCrawlerException
     *           wrapping any exception thrown while creating, committing or writing to the bulk.
     */
    void addLinkToCrawl(final Record record) throws WebCrawlerException {
      try {
        if (_linksToCrawl == null) {
          _linksToCrawl = _outputs.getAsRecordOutput(OUTPUT_SLOT_LINKS_TO_CRAWL);
        } else if (_linksToCrawl.getRecordCount() >= _linksPerBulk) {
          // current bulk is full: commit it and continue with the next bulk of the same slot.
          _linksToCrawl.commit();
          _linksToCrawlBulkIndex++;
          _linksToCrawl = _outputs.getAsRecordOutput(OUTPUT_SLOT_LINKS_TO_CRAWL, _linksToCrawlBulkIndex);
        }
        _linksToCrawl.writeRecord(record);
      } catch (final Exception ex) {
        throw new WebCrawlerException("Error writing to linksToCrawl bulk", ex);
      }
    }

    /**
     * write a record to the crawledRecords bulk, start a new bulk if necessary.
     * 
     * @param record
     *          crawled record to write.
     * @throws WebCrawlerException
     *           wrapping any exception thrown while creating, committing or writing to the bulk.
     */
    void addCrawledRecord(final Record record) throws WebCrawlerException {
      try {
        if (_crawledRecords == null) {
          _crawledRecords = _outputs.getAsRecordOutput(OUTPUT_SLOT_CRAWLED_RECORDS);
        } else if (_crawledRecords.getRecordCount() >= _recordsPerBulk) {
          // current bulk is full: commit it and continue with the next bulk of the crawledRecords slot
          // (not the linksToCrawl slot, crawled records must never end up in the link bulks).
          _crawledRecords.commit();
          _crawledRecordsBulkIndex++;
          _crawledRecords = _outputs.getAsRecordOutput(OUTPUT_SLOT_CRAWLED_RECORDS, _crawledRecordsBulkIndex);
        }
        _crawledRecords.writeRecord(record);
      } catch (final Exception ex) {
        throw new WebCrawlerException("Error writing to crawledRecords bulk", ex);
      }
    }
  }

  /** @return the worker name {@link #NAME}. */
  @Override
  public String getName() {
    return NAME;
  }

  /**
   * process one crawl task: initiate crawling from the task parameters if the task has no linksToCrawl input bulk, else
   * crawl the link records contained in the input bulk.
   */
  @Override
  public void perform(final TaskContext taskContext) throws Exception {
    final String source = taskContext.getTaskParameters().getStringValue(ImportingConstants.TASK_PARAM_DATA_SOURCE);
    final String jobRunId = taskContext.getTask().getProperties().get(Task.PROPERTY_JOB_RUN_ID);

    final Inputs inputs = taskContext.getInputs();
    if (inputs.getDataObjectCount(INPUT_SLOT_LINKS_TO_CRAWL) == 0) {
      // no input bulk: this is the initial task of the crawl.
      initiateCrawling(source, taskContext, jobRunId);
    } else {
      final RecordInput linksToCrawl = inputs.getAsRecordInput(INPUT_SLOT_LINKS_TO_CRAWL);
      crawlLinkRecords(source, linksToCrawl, taskContext, jobRunId);
    }
  }

  /**
   * start crawling from task parameters: clears the visited links of the source and crawls the start URL, using
   * {@link #BULK_ID_FOR_INITIAL_TASK} as input bulk Id.
   */
  private void initiateCrawling(final String source, final TaskContext taskContext, final String jobRunId)
    throws MaybeRecoverableException {
    // put each link to an own bulk to improve scaling.
    final OutputBulks outputBulks = new OutputBulks(taskContext.getOutputs(), 1, RECORDS_PER_BULK);
    final Record initialLinkRecord = DataFactory.DEFAULT.createRecord();
    final String startUrl = taskContext.getTaskParameters().getStringValue(ImportingConstants.TASK_PARAM_START_URL);
    setUrl(initialLinkRecord, startUrl);
    _visitedLinks.clearSource(source);
    crawlLinkRecord(source, initialLinkRecord, outputBulks, jobRunId, BULK_ID_FOR_INITIAL_TASK, taskContext);
  }

  /**
   * crawl links from input records. Reads records from the input bulk until none is left and crawls each link that has
   * not been visited yet. URLs crawled in this task are remembered locally so that duplicates inside the same input
   * bulk are skipped.
   */
  private void crawlLinkRecords(final String source, final RecordInput linksToCrawl, final TaskContext taskContext,
    final String jobRunId) throws ObjectStoreException, IOException, MaybeRecoverableException {
    // TODO make linksPerBulk depend on crawl depth?
    final OutputBulks outputBulks = new OutputBulks(taskContext.getOutputs(), LINKS_PER_BULK, RECORDS_PER_BULK);
    final Set<String> urlVisitedInThisTask = new HashSet<String>();
    // the name of the input bulk object is used as bulk Id when checking the visited links.
    final String inputBulkId = linksToCrawl.getObjectName();
    Record record = linksToCrawl.getRecord();
    while (record != null) {
      if (hasNotBeenVisited(source, record, urlVisitedInThisTask, jobRunId, taskContext, inputBulkId)) {
        crawlLinkRecord(source, record, outputBulks, jobRunId, inputBulkId, taskContext);
        urlVisitedInThisTask.add(getUrl(record));
      }
      record = linksToCrawl.getRecord();
    }
  }

  /**
   * check if the URL in a record has already been visited: either in this task by checking local list of URLs visited
   * in this task, or by a worker processing a different crawl task, by checking the global {@link VisitedLinksService}
   * instance.
   * 
   * @return true if the URL is neither contained in the local collection nor reported as visited by the service.
   */
  private boolean hasNotBeenVisited(final String source, final Record record,
    final Collection<String> urlVisitedInThisTask, final String jobRunId, final TaskContext taskContext,
    final String inputBulkId) throws VisitedLinksException {
    final String url = getUrl(record);
    debugLogUrl("Check if visited", record);
    boolean notVisited = false;
    if (!urlVisitedInThisTask.contains(url)) {
      // not yet visited in this task
      notVisited = isNotVisitedTimed(source, record, jobRunId, inputBulkId, taskContext);
      debugLogUrl("Will visit: " + notVisited, record);
    } else {
      debugLogUrl("Duplicate URL in task", record);
    }
    return notVisited;
  }

  /**
   * check if an url in a record is not marked as visited in {@link VisitedLinksService} instance. measure time as
   * "duration...checkVisitedLinks"
   */
  private boolean isNotVisitedTimed(final String source, final Record record, final String jobRunId,
    final String inputBulkId, final TaskContext taskContext) throws VisitedLinksException {
    final String url = getUrl(record);
    debugLogUrl("Check if visited", record);
    final long time = taskContext.getTimestamp();
    try {
      return !_visitedLinks.isVisited(source, url, jobRunId, inputBulkId);
    } finally {
      taskContext.measureTime("checkVisitedLinks", time);
    }
  }

  /**
   * crawl a link represented by one record: fetch metadata and content, extract links, produce record.
   * <p>
   * Error handling: a recoverable {@link MaybeRecoverableException}, or any {@link MaybeRecoverableException} in the
   * initial task, is rethrown. Other {@link MaybeRecoverableException}s and all {@link RuntimeException}s are written
   * as warnings to the task log and the link is skipped.
   * </p>
   */
  private void crawlLinkRecord(final String source, final Record linkRecord, final OutputBulks outputBulks,
    final String jobRunId, final String inputBulkId, final TaskContext taskContext)
    throws MaybeRecoverableException {
    try {
      invokeFetcherTimed(linkRecord, taskContext);
      // Check again after fetching to prevent duplicates when an URL is processed by two workers at the same time.
      // Additionally, this marks the URL as visited in the initial crawl task.
      if (isNotVisitedTimed(source, linkRecord, jobRunId, inputBulkId, taskContext)) {
        extractAndFilterLinks(linkRecord, outputBulks, taskContext);
        produceAndWriteRecords(linkRecord, outputBulks, taskContext);
      }
    } catch (final MaybeRecoverableException ex) {
      if (ex.isRecoverable() || BULK_ID_FOR_INITIAL_TASK.equals(inputBulkId)) {
        throw ex; // abort this task and retry later or fail job run, if initial task fails fatally
      } else {
        logNonRecoverableError(source, linkRecord, taskContext, ex);
      }
    } catch (final RuntimeException ex) {
      logNonRecoverableError(source, linkRecord, taskContext, ex);
    }
  }

  /**
   * produce the record to be processed by SMILA from the crawled link record and write them to the records output bulk.
   */
  private void produceAndWriteRecords(final Record linkRecord, final OutputBulks outputBulks,
    final TaskContext taskContext) throws WebCrawlerException {
    final Collection<Record> crawledRecords = produceRecordsTimed(linkRecord, taskContext);
    for (final Record crawledRecord : crawledRecords) {
      outputBulks.addCrawledRecord(crawledRecord);
    }
  }

  /**
   * extract and filter links from content of the fetched web resource and write them to the links output bulk. Does
   * nothing if the link record has no content attachment.
   */
  private void extractAndFilterLinks(final Record linkRecord, final OutputBulks outputBulks,
    final TaskContext taskContext) throws WebCrawlerException {
    if (linkRecord.hasAttachment(WebCrawlerConstants.ATTACHMENT_CONTENT)) {
      final Collection<Record> extractedLinks = extractLinksTimed(linkRecord, taskContext);
      final Collection<Record> filteredLinks = filterLinksTimed(extractedLinks, linkRecord, taskContext);
      for (final Record outgoingLink : filteredLinks) {
        outputBulks.addLinkToCrawl(outgoingLink);
      }
    }
  }

  /** invoke fetcher and measure time as "duration...fetchResource". */
  private void invokeFetcherTimed(final Record linkRecord, final TaskContext taskContext)
    throws WebCrawlerException {
    debugLogUrl("Call fetcher for ", linkRecord);
    final long time = taskContext.getTimestamp();
    try {
      _fetcher.crawl(linkRecord, taskContext.getTaskParameters(), taskContext.getLog());
    } finally {
      taskContext.measureTime("fetchResource", time);
    }
  }

  /** invoke link extractor and measure time as "duration...extractLinks". */
  private Collection<Record> extractLinksTimed(final Record linkRecord, final TaskContext taskContext)
    throws WebCrawlerException {
    debugLogUrl("Extract links from ", linkRecord);
    final long time = taskContext.getTimestamp();
    try {
      return _linkExtractor.extractLinks(linkRecord, taskContext.getTaskParameters(), taskContext.getLog());
    } finally {
      taskContext.measureTime("extractLinks", time);
    }
  }

  /** invoke link filter and measure time as "duration...filterLink". */
  private Collection<Record> filterLinksTimed(final Collection<Record> extractedLinks, final Record sourceLink,
    final TaskContext taskContext) throws WebCrawlerException {
    if (_log.isDebugEnabled()) {
      _log.debug("Filter links " + extractedLinks + " extracted from " + getUrl(sourceLink));
    }
    final long time = taskContext.getTimestamp();
    try {
      final Collection<Record> filteredLinks =
        _linkFilter.filterLinks(extractedLinks, sourceLink, taskContext.getTaskParameters(), taskContext.getLog());
      if (_log.isDebugEnabled()) {
        _log.debug("Remaining links: " + filteredLinks);
      }
      return filteredLinks;
    } finally {
      taskContext.measureTime("filterLink", time);
    }
  }

  /** invoke record producer and measure time as "duration...produceRecords". */
  private Collection<Record> produceRecordsTimed(final Record crawledRecord, final TaskContext taskContext)
    throws WebCrawlerException {
    debugLogUrl("Produce record for ", crawledRecord);
    final long time = taskContext.getTimestamp();
    try {
      return _recordProducer.produceRecords(crawledRecord, taskContext.getTaskParameters(), taskContext.getLog());
    } finally {
      taskContext.measureTime("produceRecords", time);
    }
  }

  /** log message followed by the URL of the record to the debug log, if debug logging is enabled. */
  private void debugLogUrl(final String message, final Record link) {
    if (_log.isDebugEnabled()) {
      _log.debug(message + " " + getUrl(link));
    }
  }

  /**
   * log a non-recoverable error as a warning to the tasklog instead of throwing it, which would abort the task and
   * cancel the crawl job run.
   */
  private void logNonRecoverableError(final String source, final Record linkRecord, final TaskContext taskContext,
    final Exception ex) {
    taskContext.getLog().warn("Error crawling link " + getUrl(linkRecord) + " in source " + source + ", skipping.",
      ex);
  }

  /** get URL from the URL attribute of the record metadata. */
  private String getUrl(final Record record) {
    return record.getMetadata().getStringValue(WebCrawlerConstants.ATTRIBUTE_URL);
  }

  /** set URL attribute in the record metadata. */
  private void setUrl(final Record record, final String startUrl) {
    record.getMetadata().put(WebCrawlerConstants.ATTRIBUTE_URL, startUrl);
  }

  /** DS service reference injection method. */
  public void setVisitedLinks(final VisitedLinksService visitedLinks) {
    _visitedLinks = visitedLinks;
  }

  /** DS service reference removal method. Only clears the reference if it is the currently bound instance. */
  public void unsetVisitedLinks(final VisitedLinksService visitedLinks) {
    if (_visitedLinks == visitedLinks) {
      _visitedLinks = null;
    }
  }

  /** DS service reference injection method. */
  public void setFetcher(final Fetcher fetcher) {
    _fetcher = fetcher;
  }

  /** DS service reference removal method. Only clears the reference if it is the currently bound instance. */
  public void unsetFetcher(final Fetcher fetcher) {
    if (_fetcher == fetcher) {
      _fetcher = null;
    }
  }

  /** DS service reference injection method. */
  public void setLinkExtractor(final LinkExtractor linkExtractor) {
    _linkExtractor = linkExtractor;
  }

  /** DS service reference removal method. Only clears the reference if it is the currently bound instance. */
  public void unsetLinkExtractor(final LinkExtractor linkExtractor) {
    if (_linkExtractor == linkExtractor) {
      _linkExtractor = null;
    }
  }

  /** DS service reference injection method. */
  public void setLinkFilter(final LinkFilter linkFilter) {
    _linkFilter = linkFilter;
  }

  /** DS service reference removal method. Only clears the reference if it is the currently bound instance. */
  public void unsetLinkFilter(final LinkFilter linkFilter) {
    if (_linkFilter == linkFilter) {
      _linkFilter = null;
    }
  }

  /** DS service reference injection method. */
  public void setRecordProducer(final RecordProducer recordProducer) {
    _recordProducer = recordProducer;
  }

  /** DS service reference removal method. Only clears the reference if it is the currently bound instance. */
  public void unsetRecordProducer(final RecordProducer recordProducer) {
    if (_recordProducer == recordProducer) {
      _recordProducer = null;
    }
  }
}
