/***********************************************************************************************************************
 * Copyright (c) 2008,2011 empolis GmbH and brox IT Solutions GmbH. All rights reserved. This program and the 
 * accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this 
 * distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
 * 
 * Contributors: Andreas Weber (Attensity Europe GmbH) - initial API and implementation               
 **********************************************************************************************************************/
package org.eclipse.smila.importing.crawler.web;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.eclipse.smila.datamodel.Record;
import org.eclipse.smila.taskworker.TaskContext;
import org.eclipse.smila.taskworker.Worker;
import org.eclipse.smila.taskworker.input.RecordInput;
import org.eclipse.smila.taskworker.output.RecordOutput;

/**
 * Worker that reads records from its input slot, asks the {@link Fetcher} service to fetch content for every record
 * that does not yet carry a content attachment, and writes each record to its output slot.
 */
public class WebFetcherWorker implements Worker {

  /** name under which this worker is known. */
  private static final String WORKER_NAME = "webFetcher";

  /** name of the slot the records to process are read from. */
  private static final String INPUT_SLOT = "linksToFetch";

  /** name of the slot the processed records are written to. */
  private static final String OUTPUT_SLOT = "fetchedLinks";

  /** logger of this worker instance. */
  private final Log _log = LogFactory.getLog(getClass());

  /** the Fetcher service used to fetch content, set and cleared via the DS methods below. */
  private Fetcher _fetcher;

  /** DS service reference injection method. */
  public void setFetcher(final Fetcher fetcher) {
    _fetcher = fetcher;
  }

  /** DS service reference removal method, only clears the reference if it is the given instance. */
  public void unsetFetcher(final Fetcher fetcher) {
    if (_fetcher != fetcher) {
      return;
    }
    _fetcher = null;
  }

  /** @return the name of this worker. */
  @Override
  public String getName() {
    return WORKER_NAME;
  }

  /**
   * Reads records from the input slot until no more are delivered. Content is fetched for records without a content
   * attachment, and every record read is written to the output slot.
   */
  @Override
  public void perform(final TaskContext taskContext) throws Exception {
    final RecordInput linksToFetch = taskContext.getInputs().getAsRecordInput(INPUT_SLOT);
    final RecordOutput fetchedLinks = taskContext.getOutputs().getAsRecordOutput(OUTPUT_SLOT);
    for (Record current = linksToFetch.getRecord(); current != null; current = linksToFetch.getRecord()) {
      final boolean contentAvailable = current.hasAttachment(WebCrawlerConstants.ATTACHMENT_CONTENT);
      if (!contentAvailable) {
        // no content attachment present yet: fetch it before passing the record on.
        fetchContent(current, taskContext);
      }
      fetchedLinks.writeRecord(current);
      if (_log.isDebugEnabled()) {
        _log.debug("added record " + current.getId());
      }
    }
  }

  /**
   * Lets the Fetcher service fetch the content for the given record and reports the elapsed time to the task context
   * as "fetchContent". Failures are logged as warnings to the task log. Only recoverable ones are rethrown.
   */
  private void fetchContent(final Record record, final TaskContext taskContext) throws WebCrawlerException {
    final long startTimestamp = taskContext.getTimestamp();
    try {
      if (_log.isDebugEnabled()) {
        _log.debug("fetching content for record " + record.getId());
      }
      _fetcher.fetch(record, taskContext.getTaskParameters(), taskContext.getLog());
    } catch (final WebCrawlerException fetchError) {
      taskContext.getLog().warn("Failed to fetch link for record " + record.getId(), fetchError);
      if (fetchError.isRecoverable()) {
        // recoverable errors abort the task so that it is repeated as a whole,
        // non-recoverable ones are only logged and processing continues.
        throw fetchError;
      }
    } finally {
      // time is reported for successful and failed fetches alike.
      taskContext.measureTime("fetchContent", startTimestamp);
    }
  }

}
