/*********************************************************************************************************************
 * Copyright (c) 2008, 2011 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the
 * accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this
 * distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
 **********************************************************************************************************************/
package org.eclipse.smila.importing.crawler.file;

import java.io.File;
import java.util.Collection;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.eclipse.smila.datamodel.AnyMap;
import org.eclipse.smila.datamodel.DataFactory;
import org.eclipse.smila.datamodel.Record;
import org.eclipse.smila.importing.ImportingConstants;
import org.eclipse.smila.taskworker.TaskContext;
import org.eclipse.smila.taskworker.Worker;
import org.eclipse.smila.taskworker.input.Inputs;
import org.eclipse.smila.taskworker.input.RecordInput;
import org.eclipse.smila.taskworker.output.Outputs;
import org.eclipse.smila.taskworker.output.RecordOutput;

/**
 * Worker implementation that performs file crawling.
 * 
 * @author stuc07
 * 
 */
public class FileCrawlerWorker implements Worker {

  /** Name of the worker, used in worker description and workflows. */
  public static final String NAME = "fileCrawler";

  public static final String INPUT_SLOT_DIRS_TO_CRAWL = "directoriesToCrawl";

  public static final String OUTPUT_SLOT_DIRS_TO_CRAWL = "directoriesToCrawl";

  public static final String OUTPUT_SLOT_FILES_TO_CRAWL = "filesToCrawl";

  public static final int MAX_FILES_PER_BULK = 1000;

  protected final Log _log = LogFactory.getLog(getClass());

  private final FileToRecordConverter _converter = new FileToRecordConverter(DataFactory.DEFAULT);

  private FileCrawlerService _fileCrawler;

  public void setFileCrawlerService(final FileCrawlerService fileCrawler) {
    _fileCrawler = fileCrawler;
  }

  public void unsetFileCrawlerService(final FileCrawlerService fileCrawler) {
    if (_fileCrawler == fileCrawler) {
      _fileCrawler = null;
    }
  }

  /**
   * {@inheritDoc}
   */
  @Override
  public void perform(final TaskContext taskContext) throws Exception {
    try {
      final Inputs inputs = taskContext.getInputs();
      if (inputs.getDataObjectCount(INPUT_SLOT_DIRS_TO_CRAWL) == 0) {
        // crawler is triggered by task generator, read input from task instead of
        crawl(taskContext);
      } else {
        final RecordInput directoryInput = inputs.getAsRecordInput(INPUT_SLOT_DIRS_TO_CRAWL);
        Record record = directoryInput.getRecord();
        while (record != null) {
          try {
            final String dirName = record.getMetadata().getStringValue(FileToRecordConverter.PROPERTY_FILE_FOLDER);
            if (dirName == null || dirName.trim().isEmpty()) {
              taskContext.getLog().error(
                "Failed to crawl directory. Attribute '" + FileToRecordConverter.PROPERTY_FILE_FOLDER
                  + "' of record " + record.getId() + " is null or empty");
            } else {
              crawl(dirName, taskContext.getOutputs(), record.getSource());
            }
          } catch (final Exception e) {
            taskContext.getLog().error(
              "Failed to crawl directory '"
                + record.getMetadata().getStringValue(FileToRecordConverter.PROPERTY_FILE_FOLDER) + "' of record "
                + record.getId(), e);
          }
          record = directoryInput.getRecord();
        }
      }
    } catch (final Exception e) {
      _log.error(e);
      throw e;
    }
  }

  /**
   * {@inheritDoc}
   */
  @Override
  public String getName() {
    return NAME;
  }

  private void crawl(final TaskContext taskContext) throws Exception {
    final AnyMap taskParams = taskContext.getTaskParameters();
    final String dataSource = taskParams.getStringValue(ImportingConstants.TASK_PARAM_DATA_SOURCE);
    if (dataSource == null || dataSource.trim().length() == 0) {
      throw new IllegalArgumentException("Parameter '" + ImportingConstants.TASK_PARAM_DATA_SOURCE + "' of task "
        + taskContext.getTask().getTaskId() + " is null or empty");
    }
    final String dirName = taskParams.getStringValue(ImportingConstants.TASK_PARAM_ROOT_FOLDER);
    if (dirName == null || dirName.trim().length() == 0) {
      throw new IllegalArgumentException("Parameter '" + ImportingConstants.TASK_PARAM_ROOT_FOLDER + "' of task "
        + taskContext.getTask().getTaskId() + " is null or empty");
    }
    crawl(dirName, taskContext.getOutputs(), dataSource);
  }

  private void crawl(final String dirName, final Outputs outputs, final String dataSource) throws Exception {
    final File dir = new File(dirName);
    if (!dir.isDirectory()) {
      throw new IllegalArgumentException(FileToRecordConverter.PROPERTY_FILE_FOLDER + " '" + dirName
        + "' is not a directory");
    }

    int subDirCount = 0;
    final Collection<File> subDirectories = _fileCrawler.listDirectories(dir);
    for (final File subDir : subDirectories) {
      final RecordOutput directoryOutput = outputs.getAsRecordOutput(OUTPUT_SLOT_DIRS_TO_CRAWL, subDirCount);
      final Record directoryRecord = _converter.dirToRecord(subDir, dataSource);
      directoryOutput.writeRecord(directoryRecord);
      directoryOutput.commit();
      subDirCount++;
      _log.debug("added directory " + subDir.getAbsolutePath());
    }

    int fileCount = 0;
    int recordOutputIndex = 0;
    RecordOutput recordOutput = outputs.getAsRecordOutput(OUTPUT_SLOT_FILES_TO_CRAWL, recordOutputIndex);
    final Collection<File> files = _fileCrawler.listFiles(dir);
    for (final File file : files) {
      final Record fileRecord = _converter.fileToRecord(file, dataSource, false);
      recordOutput.writeRecord(fileRecord);
      fileCount++;
      _log.debug("added file " + file.getAbsolutePath());
      if (fileCount % MAX_FILES_PER_BULK == 0) {
        recordOutput.commit();
        recordOutputIndex++;
        recordOutput = outputs.getAsRecordOutput(OUTPUT_SLOT_FILES_TO_CRAWL, recordOutputIndex);
      }
    }
    _log.info("directory " + dir.getAbsolutePath() + " contained " + fileCount + " files and " + subDirCount
      + " directories.");
  }
}
