/*********************************************************************************************************************
 * Copyright (c) 2008, 2011 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the
 * accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this
 * distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
 **********************************************************************************************************************/
package org.eclipse.smila.importing.crawler.file;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.eclipse.smila.datamodel.Record;
import org.eclipse.smila.importing.ImportingConstants;
import org.eclipse.smila.importing.ImportingException;
import org.eclipse.smila.importing.VisitedLinksService;
import org.eclipse.smila.importing.compounds.CompoundExtractor;
import org.eclipse.smila.importing.util.RecordOutputHandler;
import org.eclipse.smila.objectstore.ObjectStoreException;
import org.eclipse.smila.taskworker.TaskContext;
import org.eclipse.smila.taskworker.TaskLog;
import org.eclipse.smila.taskworker.Worker;
import org.eclipse.smila.taskworker.input.Inputs;
import org.eclipse.smila.taskworker.input.RecordInput;

/**
 * Worker implementation that performs file crawling.
 * <p>
 * If the input slot {@link #INPUT_SLOT_DIRS_TO_CRAWL} contains no data objects, the root folder taken from the crawling
 * context is crawled. Otherwise the directories named in the records of the input bulk are crawled. Records for files
 * are written to the output slot {@link #OUTPUT_SLOT_CRAWLED_RECORDS}, records for sub-directories are written to the
 * output slot {@link #OUTPUT_SLOT_DIRS_TO_CRAWL}.
 * 
 * @author stuc07
 * 
 */
public class FileCrawlerWorker implements Worker {

  /** Name of the worker, used in worker description and workflows. */
  public static final String NAME = "fileCrawler";

  /** name of input slot containing records with directories to crawl. */
  public static final String INPUT_SLOT_DIRS_TO_CRAWL = "directoriesToCrawl";

  /** name of output slot taking the directories to crawl in follow-up tasks. */
  public static final String OUTPUT_SLOT_DIRS_TO_CRAWL = "directoriesToCrawl";

  /** name of output slot taking the file records to process in ETL. */
  public static final String OUTPUT_SLOT_CRAWLED_RECORDS = "crawledRecords";

  /** Name of the task parameter that contains the root folder for crawling. */
  public static final String TASK_PARAM_ROOT_FOLDER = "rootFolder";

  /** Maximum number of files in one bulk object. */
  public static final String TASK_PARAM_MAX_FILES_PER_BULK = "maxFilesPerBulk";

  /** Minimum number of files in one bulk object. */
  public static final String TASK_PARAM_MIN_FILES_PER_BULK = "minFilesPerBulk";

  /** number of directories to write to one bulk object. */
  public static final String TASK_PARAM_DIRS_PER_BULK = "directoriesPerBulk";

  /** default: write up to 1000 files to one file bulk. */
  public static final Long MAX_FILES_PER_BULK_DEFAULT = 1000L;

  /** default: don't add files from subdirectories, if current folder has too few files. */
  public static final Long MIN_FILES_PER_BULK_DEFAULT = 0L;

  /** default: one directory per follow-up task. */
  public static final Long DIRS_PER_BULK_DEFAULT = 1L;

  /** dummy input bulk Id used in initial crawl task for marking links as visited. */
  private static final String BULK_ID_FOR_INITIAL_TASK = "initial";

  /** local logger. */
  private final Log _log = LogFactory.getLog(getClass());

  /** service used to list directories and to convert files and directories to records. */
  private FileCrawlerService _fileCrawler;

  /** service used to detect directories that have already been visited. */
  private VisitedLinksService _visitedLinks;

  /** service used to decide if a crawled file is a compound. */
  private CompoundExtractor _compoundExtractor;

  @Override
  public String getName() {
    return NAME;
  }

  /**
   * Crawl either the configured root folder (no input data objects) or the directories given in the input bulk.
   * 
   * @param taskContext
   *          context of the task to process.
   * @throws Exception
   *           any error that occurred while crawling. It is logged and rethrown unchanged.
   */
  @Override
  public void perform(final TaskContext taskContext) throws Exception {
    try {
      final FileCrawlingContext crawlContext = new FileCrawlingContext(taskContext);
      final RecordOutputHandler fileBulks =
        new RecordOutputHandler(taskContext.getOutputs(), crawlContext.getMaxFilesPerBulk(),
          OUTPUT_SLOT_CRAWLED_RECORDS);
      final Inputs inputs = taskContext.getInputs();
      if (inputs.getDataObjectCount(INPUT_SLOT_DIRS_TO_CRAWL) == 0) {
        // crawler is triggered by task generator: there is no input bulk, so crawl the root folder provided by the
        // crawling context instead of reading directories from input records.
        crawlRootFolder(crawlContext, fileBulks);
      } else {
        crawlInputFolders(crawlContext, fileBulks);
      }
    } catch (final Exception e) {
      // pass the exception as throwable argument so that the stack trace is logged, not only e.toString().
      _log.error("Error while performing file crawler task", e);
      throw e;
    }
  }

  /**
   * crawl initial root folder as configured in task parameters. Create follow-up tasks for each single subdirectory,
   * don't go into subdirectories to increase file bulk size.
   * 
   * @param crawlContext
   *          crawling context of the current task.
   * @param fileBulks
   *          output handler taking the file records.
   * @throws Exception
   *           error clearing the visited links or crawling the root folder.
   */
  private void crawlRootFolder(final FileCrawlingContext crawlContext, final RecordOutputHandler fileBulks)
    throws Exception {
    // a new crawl of this data source starts here, so forget what has been visited before.
    _visitedLinks.clearSource(crawlContext.getDataSource());
    // one directory per bulk, minFilesPerBulk 0: sub-directories are never crawled inline in the initial task.
    final RecordOutputHandler directoryBulks =
      new RecordOutputHandler(crawlContext.getTaskContext().getOutputs(), 1, OUTPUT_SLOT_DIRS_TO_CRAWL);
    crawl(crawlContext.getRootFolder(), crawlContext, fileBulks, directoryBulks, BULK_ID_FOR_INITIAL_TASK, 0);
  }

  /**
   * crawl folders specified in input bulk records. Apply all output bulk size parameters. A failure to crawl a single
   * directory is written as a warning to the task log and does not stop the processing of the remaining records.
   * 
   * @param crawlContext
   *          crawling context of the current task.
   * @param fileBulks
   *          output handler taking the file records.
   * @throws ObjectStoreException
   *           error accessing the input bulk.
   * @throws IOException
   *           error reading records from the input bulk.
   */
  private void crawlInputFolders(final FileCrawlingContext crawlContext, final RecordOutputHandler fileBulks)
    throws ObjectStoreException, IOException {
    final RecordOutputHandler directoryBulks =
      new RecordOutputHandler(crawlContext.getTaskContext().getOutputs(), crawlContext.getDirectoriesPerBulk(),
        OUTPUT_SLOT_DIRS_TO_CRAWL);
    final RecordInput directoryInput =
      crawlContext.getTaskContext().getInputs().getAsRecordInput(INPUT_SLOT_DIRS_TO_CRAWL);
    Record record = directoryInput.getRecord();
    while (record != null && !crawlContext.getTaskContext().isCanceled()) {
      final String dirName = record.getMetadata().getStringValue(FileCrawlerService.PROPERTY_FILE_FOLDER);
      final TaskLog taskLog = crawlContext.getTaskContext().getLog();
      try {
        if (dirName == null || dirName.trim().isEmpty()) {
          taskLog.error("Failed to crawl directory. Attribute '" + FileCrawlerService.PROPERTY_FILE_FOLDER
            + "' of record " + record.getId() + " is null or empty");
        } else {
          crawl(dirName, crawlContext, fileBulks, directoryBulks, directoryInput.getObjectName(),
            crawlContext.getMinFilesPerBulk());
        }
      } catch (final Exception e) {
        // deliberate best effort: one broken directory must not fail the complete bulk.
        taskLog.warn("Failed to crawl directory '" + dirName + "' of record " + record.getId(), e);
      }
      record = directoryInput.getRecord();
    }
  }

  /**
   * crawl directory and write files and subdirectories to given output handlers.
   * 
   * @param dirName
   *          name of the directory to crawl.
   * @param crawlContext
   *          crawling context of the current task.
   * @param fileBulks
   *          output handler taking the file records.
   * @param directoryBulks
   *          output handler taking the records for directories to crawl in follow-up tasks.
   * @param inputBulkId
   *          Id of the input bulk, passed on to the visited links check.
   * @param minFilesPerBulk
   *          as long as the file output handler reports less files than this, sub-directories are crawled in this
   *          task, too.
   * @throws Exception
   *           dirName is not a directory ({@link IllegalArgumentException}), or error while crawling or writing.
   */
  private void crawl(final String dirName, final FileCrawlingContext crawlContext,
    final RecordOutputHandler fileBulks, final RecordOutputHandler directoryBulks, final String inputBulkId,
    final long minFilesPerBulk) throws Exception {
    final Path dir = Paths.get(dirName);
    if (!Files.isDirectory(dir)) {
      throw new IllegalArgumentException(FileCrawlerService.PROPERTY_FILE_FOLDER + " '" + dirName
        + "' is not a directory");
    }
    if (checkAndMarkVisited(dir, crawlContext, inputBulkId)) {
      _log.info("Not crawling into directory " + dir + ", because it has been visited before.");
      return;
    }

    final Collection<Path> subDirectories = createFileOutputBulks(dir, crawlContext, fileBulks);

    if (fileBulks.getFileCount() < minFilesPerBulk) {
      // iterate over sub directories and try to reach minFilesPerBulk
      // NOTE(review): sub-directories crawled here are not passed to checkAndMarkVisited() - confirm that this is
      // intended when symbolic links are followed.
      final Iterator<Path> it = subDirectories.iterator();
      while (it.hasNext() && !crawlContext.getTaskContext().isCanceled()
        && fileBulks.getFileCount() < minFilesPerBulk) {
        final Path subDir = it.next();
        final Collection<Path> subDirDirectories = createFileOutputBulks(subDir, crawlContext, fileBulks);
        createDirectoryOutputBulks(crawlContext.getDataSource(), subDirDirectories, directoryBulks);
        // this sub-directory is done, it must not be written to the directory output below.
        it.remove();
      }
    }
    // create output bulks for remaining directories
    createDirectoryOutputBulks(crawlContext.getDataSource(), subDirectories, directoryBulks);
  }

  /**
   * write files from records to output bulks, return names of sub-directories.
   * 
   * @param dir
   *          directory to list.
   * @param crawlContext
   *          crawling context of the current task, provides filters, mapper and data source.
   * @param fileBulks
   *          output handler taking the file records.
   * @return the sub-directories of dir that have been accepted by the folder filters.
   * @throws Exception
   *           error listing the directory, creating a record or writing it to the output.
   */
  private Collection<Path> createFileOutputBulks(final Path dir, final FileCrawlingContext crawlContext,
    final RecordOutputHandler fileBulks) throws Exception {
    final Collection<Path> subDirectories = new ArrayList<>();
    final Collection<Path> filesAndFolders = _fileCrawler.list(dir);
    int fileCount = 0;
    for (final Path file : filesAndFolders) {
      // check the type of each entry only once: every check is an access to the file system.
      if (Files.isRegularFile(file)) {
        if (crawlContext.getFilterEvaluator().applyFiltersForCrawledFile(dir, file)) {
          final Record fileRecord = _fileCrawler.fileToRecord(file, crawlContext.getDataSource(), false);
          if (isCompoundRecord(fileRecord)) {
            setIsCompound(fileRecord);
          }
          crawlContext.getMapper().mapNames(fileRecord, _fileCrawler.getFilePropertyNames());
          fileBulks.writeRecord(fileRecord);
          fileCount++;
        }
      } else if (Files.isDirectory(file)) {
        if (crawlContext.getFilterEvaluator().applyFiltersForCrawledFolder(file, crawlContext.getRootFolder())) {
          subDirectories.add(file);
        }
      } else {
        // should not happen
        _log.warn("directory " + pathForLog(dir) + " contains object " + file.getFileName()
          + " which is neither a file nor a directory or cannot be accessed.");
      }
    }
    if (_log.isInfoEnabled()) {
      _log.info("directory " + pathForLog(dir) + " contained " + fileCount + " files and " + subDirectories.size()
        + " directories.");
    }

    return subDirectories;
  }

  /**
   * write sub-directories as records to output.
   * 
   * @param dataSource
   *          data source name used to create the directory records.
   * @param subDirectories
   *          directories to write.
   * @param directoryBulks
   *          output handler taking the directory records.
   * @throws IOException
   *           error creating or writing a record.
   * @throws ObjectStoreException
   *           error writing a record to the output bulk.
   */
  private void createDirectoryOutputBulks(final String dataSource, final Collection<Path> subDirectories,
    final RecordOutputHandler directoryBulks) throws IOException, ObjectStoreException {
    for (final Path subDir : subDirectories) {
      final Record directoryRecord = _fileCrawler.dirToRecord(subDir, dataSource);
      directoryBulks.writeRecord(directoryRecord);
      if (_log.isDebugEnabled()) {
        _log.debug("added bulk for directory " + pathForLog(subDir));
      }
    }
  }

  /**
   * if we are following symbolic links: check if the given directory has been crawled by someone else.
   * 
   * @param directory
   *          directory to check.
   * @param context
   *          crawling context of the current task.
   * @param inputBulkId
   *          Id of the input bulk, passed on to the visited links service.
   * @return result of the visited links service, if symbolic links are followed, else always false.
   * @throws ImportingException
   *           the real path of the directory could not be resolved, or error in the visited links service.
   */
  private boolean checkAndMarkVisited(final Path directory, final FileCrawlingContext context,
    final String inputBulkId) throws ImportingException {
    if (context.getFilterConfig() != null && context.getFilterConfig().followSymbolicLinks()) {
      try {
        final String canonicalPath = getCanonicalPath(directory);
        return _visitedLinks.checkAndMarkVisited(context.getDataSource(), canonicalPath, context.getJobRunId(),
          inputBulkId);
      } catch (final IOException ex) {
        throw new ImportingException("Error checking " + directory + " for symbolic-link cycles", ex);
      }
    }
    // if we don't follow symbolic links, re-visits cannot happen.
    return false;
  }

  /** resolve all symbolic links in the path. */
  private String getCanonicalPath(final Path file) throws IOException {
    return file.toRealPath().toString();
  }

  /**
   * get a path description for log messages only. Uses the real path if it can be resolved, else the path as given, so
   * that writing a log message can never make the crawl fail.
   */
  private String pathForLog(final Path path) {
    try {
      return getCanonicalPath(path);
    } catch (final IOException | SecurityException ex) {
      // e.g. the directory has been removed in the meantime: the unresolved path is good enough for logging.
      return path.toString();
    }
  }

  /** checks if the crawled record is a compound record. */
  private boolean isCompoundRecord(final Record record) {
    return _compoundExtractor.canExtract(
      record.getMetadata().getStringValue(FileCrawlerService.PROPERTY_FILE_NAME), null);
  }

  /** mark record as compound. */
  private void setIsCompound(final Record record) {
    record.getMetadata().put(ImportingConstants.ATTRIBUTE_COMPOUNDFLAG, true);
  }

  /** DS service reference bind method. */
  public void setFileCrawlerService(final FileCrawlerService fileCrawler) {
    _fileCrawler = fileCrawler;
  }

  /** DS service reference unbind method. */
  public void unsetFileCrawlerService(final FileCrawlerService fileCrawler) {
    if (_fileCrawler == fileCrawler) {
      _fileCrawler = null;
    }
  }

  /** DS service reference bind method. */
  public void setCompoundExtractor(final CompoundExtractor compoundExtractor) {
    _compoundExtractor = compoundExtractor;
  }

  /** DS service reference unbind method. */
  public void unsetCompoundExtractor(final CompoundExtractor compoundExtractor) {
    if (_compoundExtractor == compoundExtractor) {
      _compoundExtractor = null;
    }
  }

  /** DS service reference bind method. */
  public void setVisitedLinks(final VisitedLinksService visitedLinks) {
    _visitedLinks = visitedLinks;
  }

  /** DS service reference unbind method. */
  public void unsetVisitedLinks(final VisitedLinksService visitedLinks) {
    if (_visitedLinks == visitedLinks) {
      _visitedLinks = null;
    }
  }

}
