/*********************************************************************************************************************
 * Copyright (c) 2008, 2014 Attensity Europe GmbH, Empolis Information Management GmbH and brox IT Solutions GmbH. All
 * rights reserved. This program and the accompanying materials are made available under the terms of the Eclipse Public
 * License v1.0 which accompanies this distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
 **********************************************************************************************************************/
package org.eclipse.smila.importing.crawler.file.test;

import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

import org.apache.commons.io.IOUtils;
import org.eclipse.smila.datamodel.AnyMap;
import org.eclipse.smila.datamodel.AnySeq;
import org.eclipse.smila.datamodel.DataFactory;
import org.eclipse.smila.datamodel.Record;
import org.eclipse.smila.datamodel.ipc.BinaryObjectStreamIterator;
import org.eclipse.smila.importing.ImportingConstants;
import org.eclipse.smila.importing.VisitedLinksService;
import org.eclipse.smila.importing.crawler.file.FileCrawlerService;
import org.eclipse.smila.importing.crawler.file.FileCrawlerWorker;
import org.eclipse.smila.importing.crawler.file.filter.FilterConfiguration;
import org.eclipse.smila.importing.util.PropertyNameMapper;
import org.eclipse.smila.jobmanager.JobRunDataProvider;
import org.eclipse.smila.jobmanager.JobRunEngine;
import org.eclipse.smila.jobmanager.JobState;
import org.eclipse.smila.jobmanager.definitions.DefinitionPersistence;
import org.eclipse.smila.jobmanager.definitions.JobDefinition;
import org.eclipse.smila.jobmanager.definitions.JobManagerConstants;
import org.eclipse.smila.objectstore.ObjectStoreException;
import org.eclipse.smila.objectstore.ObjectStoreService;
import org.eclipse.smila.objectstore.StoreObject;
import org.eclipse.smila.test.DeclarativeServiceTestCase;
import org.eclipse.smila.utils.config.ConfigUtils;
import org.eclipse.smila.utils.workspace.WorkspaceHelper;

public class TestFileCrawlerWorker extends DeclarativeServiceTestCase {

  private static final int DEFAULT_WAIT_TIME = 20000;

  private static final String DATA_SOURCE = "crawlCompounds";

  private static final String STORE = "records";

  private static final String BUCKET = "crawledCompounds";

  private static final String WORKFLOWNAME = "crawlCompounds";

  private JobRunEngine _jobRunEngine;

  private JobRunDataProvider _jobRunDataProvider;

  private ObjectStoreService _objectStore;

  private DefinitionPersistence _definitionPersistence;

  private Path _rootFolder;

  private Path _rootFolderDeep;

  private final PropertyNameMapper _mapper = new PropertyNameMapper(initMapping());

  @Override
  public void setUp() throws Exception {
    _jobRunEngine = getService(JobRunEngine.class);
    _jobRunDataProvider = getService(JobRunDataProvider.class);
    _objectStore = getService(ObjectStoreService.class);
    _objectStore.ensureStore(STORE);
    _objectStore.clearStore(STORE);
    _definitionPersistence = getService(DefinitionPersistence.class);
    _rootFolder = FilesystemHelper.initComplexTestFilesystem();
    _rootFolderDeep = FilesystemHelper.initComplexTestFilesystemDeepFolderStructure();
  }

  @Override
  protected void tearDown() throws Exception {
    try {
      FilesystemHelper.deleteDirectory(_rootFolder);
    } catch (final IOException ex) {
      ex.printStackTrace();
    }
    try {
      FilesystemHelper.deleteDirectory(_rootFolderDeep);
    } catch (final IOException ex) {
      ex.printStackTrace();
    }
    super.tearDown();
  }

  /** test crawling with a simple configuration. */
  public void testSimpleCrawling() throws Exception {
    final AnyMap jobDefinition = createJob(_rootFolder, null);
    defineJob(jobDefinition);
    final String jobRunId = _jobRunEngine.startJob(getName());
    waitForJobRunCompleted(getName(), jobRunId, DEFAULT_WAIT_TIME);
    checkBulks(4);
  }

  /** test crawling with a default max files bulk sizes. */
  public void testCrawlingWithDefaults() throws Exception {
    final AnyMap jobDefinition = createJob(_rootFolder, null);
    jobDefinition.getMap(JobDefinition.KEY_PARAMETERS).put(FileCrawlerWorker.TASK_PARAM_MAX_FILES_PER_BULK, 1000);
    jobDefinition.getMap(JobDefinition.KEY_PARAMETERS).put(FileCrawlerWorker.TASK_PARAM_MIN_FILES_PER_BULK, 0);
    jobDefinition.getMap(JobDefinition.KEY_PARAMETERS).put(FileCrawlerWorker.TASK_PARAM_DIRS_PER_BULK, 1);
    defineJob(jobDefinition);
    final String jobRunId = _jobRunEngine.startJob(getName());
    waitForJobRunCompleted(getName(), jobRunId, DEFAULT_WAIT_TIME);
    checkBulks(31);
  }

  public void testCrawlingMaxFilesPerBulk() throws Exception {
    final AnyMap jobDefinition = createJob(_rootFolder, null);
    jobDefinition.getMap(JobDefinition.KEY_PARAMETERS).put(FileCrawlerWorker.TASK_PARAM_MAX_FILES_PER_BULK, 5);
    jobDefinition.getMap(JobDefinition.KEY_PARAMETERS).put(FileCrawlerWorker.TASK_PARAM_MIN_FILES_PER_BULK, 0);
    jobDefinition.getMap(JobDefinition.KEY_PARAMETERS).put(FileCrawlerWorker.TASK_PARAM_DIRS_PER_BULK, 1);
    defineJob(jobDefinition);
    final String jobRunId = _jobRunEngine.startJob(getName());
    waitForJobRunCompleted(getName(), jobRunId, DEFAULT_WAIT_TIME);
    checkBulks(62);
  }

  public void testCrawlingMinFilesPerBulk20() throws Exception {
    final AnyMap jobDefinition = createJob(_rootFolder, null);
    jobDefinition.getMap(JobDefinition.KEY_PARAMETERS).put(FileCrawlerWorker.TASK_PARAM_DIRS_PER_BULK, 1);
    jobDefinition.getMap(JobDefinition.KEY_PARAMETERS).put(FileCrawlerWorker.TASK_PARAM_MIN_FILES_PER_BULK, 20);
    defineJob(jobDefinition);
    final String jobRunId = _jobRunEngine.startJob(getName());
    waitForJobRunCompleted(getName(), jobRunId, DEFAULT_WAIT_TIME);
    checkBulks(26);
  }

  public void testCrawlingMinFilesPerBulk10() throws Exception {
    final AnyMap jobDefinition = createJob(_rootFolder, null);
    jobDefinition.getMap(JobDefinition.KEY_PARAMETERS).put(FileCrawlerWorker.TASK_PARAM_MIN_FILES_PER_BULK, 10);
    defineJob(jobDefinition);
    final String jobRunId = _jobRunEngine.startJob(getName());
    waitForJobRunCompleted(getName(), jobRunId, DEFAULT_WAIT_TIME);
    checkBulks(5);
  }

  /** tests that 'minFilesPerBulk' is also effective in deep sub-dir-structures. */
  public void testCrawlingMinFilesPerBulkWithSubdirs() throws Exception {
    final AnyMap jobDefinition = createJob(_rootFolderDeep, null);
    jobDefinition.getMap(JobDefinition.KEY_PARAMETERS).put(FileCrawlerWorker.TASK_PARAM_MIN_FILES_PER_BULK, 999);
    defineJob(jobDefinition);
    final String jobRunId = _jobRunEngine.startJob(getName());
    waitForJobRunCompleted(getName(), jobRunId, DEFAULT_WAIT_TIME);
    // 1 task for root-folder crawls it all
    checkBulks(1);
  }

  public void testCrawlingMinMaxFilesPerBulk_25_30() throws Exception {
    final AnyMap jobDefinition = createJob(_rootFolder, null);
    jobDefinition.getMap(JobDefinition.KEY_PARAMETERS).put(FileCrawlerWorker.TASK_PARAM_DIRS_PER_BULK, 1);
    jobDefinition.getMap(JobDefinition.KEY_PARAMETERS).put(FileCrawlerWorker.TASK_PARAM_MIN_FILES_PER_BULK, 25);
    jobDefinition.getMap(JobDefinition.KEY_PARAMETERS).put(FileCrawlerWorker.TASK_PARAM_MAX_FILES_PER_BULK, 30);
    defineJob(jobDefinition);
    final String jobRunId = _jobRunEngine.startJob(getName());
    waitForJobRunCompleted(getName(), jobRunId, DEFAULT_WAIT_TIME);
    checkBulks(23);
  }

  public void testCrawlingMinMaxFilesPerBulk_21_25() throws Exception {
    final AnyMap jobDefinition = createJob(_rootFolder, null);
    jobDefinition.getMap(JobDefinition.KEY_PARAMETERS).put(FileCrawlerWorker.TASK_PARAM_DIRS_PER_BULK, 1);
    jobDefinition.getMap(JobDefinition.KEY_PARAMETERS).put(FileCrawlerWorker.TASK_PARAM_MIN_FILES_PER_BULK, 21);
    jobDefinition.getMap(JobDefinition.KEY_PARAMETERS).put(FileCrawlerWorker.TASK_PARAM_MAX_FILES_PER_BULK, 25);
    defineJob(jobDefinition);
    final String jobRunId = _jobRunEngine.startJob(getName());
    waitForJobRunCompleted(getName(), jobRunId, DEFAULT_WAIT_TIME);
    checkBulks(27);
  }

  public void testCrawlingMinMaxFilesPerBulkExceptions() throws Exception {
    AnyMap jobDefinition = createJob(_rootFolder, null);
    jobDefinition.getMap(JobDefinition.KEY_PARAMETERS).put(FileCrawlerWorker.TASK_PARAM_MIN_FILES_PER_BULK, 25);
    jobDefinition.getMap(JobDefinition.KEY_PARAMETERS).put(FileCrawlerWorker.TASK_PARAM_MAX_FILES_PER_BULK, -1);
    defineJob(jobDefinition);
    String jobRunId = _jobRunEngine.startJob(getName());
    waitForJobRun(getName(), jobRunId, DEFAULT_WAIT_TIME, JobState.FAILED);

    Thread.sleep(100);

    jobDefinition = createJob(_rootFolder, null);
    jobDefinition.getMap(JobDefinition.KEY_PARAMETERS).put(FileCrawlerWorker.TASK_PARAM_MIN_FILES_PER_BULK, 25);
    jobDefinition.getMap(JobDefinition.KEY_PARAMETERS).put(FileCrawlerWorker.TASK_PARAM_MAX_FILES_PER_BULK, 20);
    final String jobName = getName() + "_2";
    jobDefinition.put(JobDefinition.KEY_NAME, jobName);
    defineJob(jobDefinition);
    jobRunId = _jobRunEngine.startJob(jobName);
    waitForJobRun(jobName, jobRunId, DEFAULT_WAIT_TIME, JobState.FAILED);
  }

  public void testDirectoriesPerBulk() throws Exception {
    final AnyMap jobDefinition = createJob(_rootFolder, null);
    jobDefinition.getMap(JobDefinition.KEY_PARAMETERS).put(FileCrawlerWorker.TASK_PARAM_MIN_FILES_PER_BULK, 0);
    jobDefinition.getMap(JobDefinition.KEY_PARAMETERS).put(FileCrawlerWorker.TASK_PARAM_DIRS_PER_BULK, 5);
    defineJob(jobDefinition);
    final String jobRunId = _jobRunEngine.startJob(getName());
    waitForJobRunCompleted(getName(), jobRunId, DEFAULT_WAIT_TIME);
    checkBulks(7); // 1 root dir, 1 task for 5 subdirs, 5 tasks for 5x5 sub-subdirs (5 each)
  }

  public void testInvalidDirectoriesPerBulk() throws Exception {
    final AnyMap jobDefinition = createJob(_rootFolder, null);
    jobDefinition.getMap(JobDefinition.KEY_PARAMETERS).put(FileCrawlerWorker.TASK_PARAM_DIRS_PER_BULK, -1);
    defineJob(jobDefinition);
    final String jobRunId = _jobRunEngine.startJob(getName());
    waitForJobRun(getName(), jobRunId, DEFAULT_WAIT_TIME, JobState.FAILED);
  }

  /** test that file permission info (Access Control List) is returned. */
  public void testACL() throws Exception {
    final AnyMap jobDefinition = createJob(_rootFolder, null);
    jobDefinition.getMap(JobDefinition.KEY_PARAMETERS).getMap(ImportingConstants.TASK_PARAM_MAPPING)
      .put(FileCrawlerService.PROPERTY_FILE_READ_ACL, "ReadAccess");
    jobDefinition.getMap(JobDefinition.KEY_PARAMETERS).getMap(ImportingConstants.TASK_PARAM_MAPPING)
      .put(FileCrawlerService.PROPERTY_FILE_WRITE_ACL, "WriteAccess");
    defineJob(jobDefinition);
    final String jobRunId = _jobRunEngine.startJob(getName());
    waitForJobRun(getName(), jobRunId, DEFAULT_WAIT_TIME, JobState.SUCCEEDED);

    // check that crawled records contain ACL infos.
    final Collection<StoreObject> objects = _objectStore.getStoreObjectInfos(STORE, BUCKET);
    assertFalse(objects.isEmpty());
    for (final StoreObject objectInfo : objects) {
      final InputStream bulkStream = _objectStore.readObject(STORE, objectInfo.getId());
      try (final BinaryObjectStreamIterator records = new BinaryObjectStreamIterator(bulkStream)) {
        while (records.hasNext()) {
          final Record record = records.next();
          assertTrue(record.getMetadata().containsKey("ReadAccess"));
          assertTrue(record.getMetadata().containsKey("WriteAccess"));
        }
      }
    }
  }

  private void checkBulks(final int expectedNumberOfBulks) throws Exception {
    final Collection<StoreObject> objects = _objectStore.getStoreObjectInfos(STORE, BUCKET);
    assertEquals(expectedNumberOfBulks, objects.size());
  }

  /** test crawling of a compound and a normal file. */
  public void testZipWithUmlautsAndCompounds() throws Exception {
    final AnySeq expectedResults = DataFactory.DEFAULT.createAnySeq();
    final AnyMap multiZip = DataFactory.DEFAULT.createAnyMap();
    expectedResults.add(multiZip);
    final Path compoundDir = copyConfigFilesToWorkspace("crawlCompounds", getName());
    final Path compoundFile = compoundDir.resolve("multizip.zip");
    final AnyMap jobDefinition = createJob(compoundDir, null);
    defineJob(jobDefinition);
    final String jobRunId = _jobRunEngine.startJob(getName());
    waitForJobRunCompleted(getName(), jobRunId, DEFAULT_WAIT_TIME);
    checkCrawledRecords(compoundFile, "jobs.json");
  }

  /** test if directories are marked as visited when crawling with followSymbolicLinks option. */
  public void testVisitedLinksUsedWhenCrawlingWithFollowingSymLinks() throws Exception {
    final VisitedLinksService _visitedLinks = getService(VisitedLinksService.class);
    _visitedLinks.clearAll();
    final Path crawlDir = copyConfigFilesToWorkspace("../filter", getName());
    final AnyMap filterParams = DataFactory.DEFAULT.createAnyMap();
    filterParams.put("followSymbolicLinks", true);
    defineJob(createJob(crawlDir, filterParams));
    final String jobRunId = _jobRunEngine.startJob(getName());
    waitForJobRunCompleted(getName(), jobRunId, DEFAULT_WAIT_TIME);
    assertTrue(_visitedLinks.getSourceIds().contains(DATA_SOURCE));
    assertEquals(7, _visitedLinks.countEntries(DATA_SOURCE, true));
  }

  /** test if directories are not marked as visited when crawling without followSymbolicLinks option. */
  public void testVisitedLinksNotUsedWhenCrawlingWithoutFollowingSymLinks() throws Exception {
    final VisitedLinksService _visitedLinks = getService(VisitedLinksService.class);
    _visitedLinks.clearAll();
    final Path crawlDir = copyConfigFilesToWorkspace("../filter", getName());
    final AnyMap filterParams = DataFactory.DEFAULT.createAnyMap();
    filterParams.put("followSymbolicLinks", false);
    defineJob(createJob(crawlDir, filterParams));
    final String jobRunId = _jobRunEngine.startJob(getName());
    waitForJobRunCompleted(getName(), jobRunId, DEFAULT_WAIT_TIME);
    assertFalse(_visitedLinks.getSourceIds().contains(DATA_SOURCE));
  }

  /** copy a directory from configuration to workspace, skip .svn directories. */
  private Path copyConfigFilesToWorkspace(final String configDirName, final String workspaceDirName)
    throws IOException {
    final Path configDir = ConfigUtils.getConfigFile(AllTests.BUNDLE_ID, configDirName).toPath();
    final Path workspaceDir = WorkspaceHelper.createWorkingDir(AllTests.BUNDLE_ID, workspaceDirName).toPath();
    FilesystemHelper.copyDirectory(configDir, workspaceDir, "**.svn");
    return workspaceDir;
  }

  /** checks the crawled files. If no compound file is expected, compounds should be null. */
  private void checkCrawledRecords(final Path compound, final String... nonCompounds) throws ObjectStoreException,
    IOException {
    final List<Record> crawledRecords = new ArrayList<Record>();
    final Collection<StoreObject> objects = _objectStore.getStoreObjectInfos(STORE, BUCKET);
    int recordCount = 0;
    assertFalse(objects.isEmpty());
    for (final StoreObject objectInfo : objects) {
      final InputStream bulkStream = _objectStore.readObject(STORE, objectInfo.getId());
      try (final BinaryObjectStreamIterator records = new BinaryObjectStreamIterator(bulkStream)) {
        while (records.hasNext()) {
          final Record record = records.next();
          assertNotNull(record);
          recordCount++;
          assertNotNull(record.getId());
          assertEquals(DATA_SOURCE, record.getSource());
          final AnyMap metadata = record.getMetadata();
          if (compound != null
            && compound.getFileName().toString()
              .equals(metadata.getStringValue(_mapper.get(FileCrawlerService.PROPERTY_FILE_NAME).get(0)))) {
            assertEquals(Files.size(compound),
              metadata.getLongValue(_mapper.get(FileCrawlerService.PROPERTY_FILE_SIZE).get(0)).longValue());
            assertEquals(Files.getLastModifiedTime(compound).toMillis(),
              metadata.getDateTimeValue(_mapper.get(FileCrawlerService.PROPERTY_FILE_LAST_MODIFIED).get(0))
                .getTime());
            assertTrue(metadata.getBooleanValue(ImportingConstants.ATTRIBUTE_COMPOUNDFLAG));
            for (final String attName : _mapper.get(FileCrawlerService.ATTACHMENT_FILE_CONTENT)) {
              assertFalse(record.hasAttachment(attName));
            }
          } else {
            assertTrue(metadata.containsKey(ImportingConstants.ATTRIBUTE_DELTA_HASH));
            assertTrue(metadata.containsKey(_mapper.get(FileCrawlerService.PROPERTY_FILE_NAME).get(0)));
            assertTrue(metadata.containsKey(_mapper.get(FileCrawlerService.PROPERTY_FILE_PATH).get(0)));
            assertTrue(metadata.containsKey(_mapper.get(FileCrawlerService.PROPERTY_FILE_FOLDER).get(0)));
            assertTrue(metadata.get(_mapper.get(FileCrawlerService.PROPERTY_FILE_LAST_MODIFIED).get(0))
              .isDateTime());
            assertTrue(metadata.get(_mapper.get(FileCrawlerService.PROPERTY_FILE_SIZE).get(0)).isLong());
            assertFalse(metadata.containsKey(ImportingConstants.ATTRIBUTE_COMPOUNDPATH));
            assertFalse(metadata.containsKey(ImportingConstants.ATTRIBUTE_COMPOUNDFLAG));
            // crawled file has no attachments...
            assertFalse(record.hasAttachments());
          }
          crawledRecords.add(record);
        }
      } finally {
        IOUtils.closeQuietly(bulkStream);
      }
    }
    assertEquals(compound == null ? 0 : 1 + nonCompounds.length, recordCount); // eventually one compound
  }

  private AnyMap createJob(final Path rootFolder, final AnyMap filterParams) throws Exception {
    final AnyMap job = DataFactory.DEFAULT.createAnyMap();
    job.put(JobDefinition.KEY_NAME, getName());
    job.put(JobDefinition.KEY_WORKFLOW, WORKFLOWNAME);
    final AnyMap parameters = job.getMap(JobDefinition.KEY_PARAMETERS, true);
    parameters.put("tempStore", STORE);
    parameters.put("store", STORE);
    parameters.put(ImportingConstants.TASK_PARAM_DATA_SOURCE, DATA_SOURCE);
    parameters.put(FileCrawlerWorker.TASK_PARAM_ROOT_FOLDER, rootFolder.toRealPath().toString());
    parameters.put(ImportingConstants.TASK_PARAM_MAPPING, initMapping());
    if (filterParams == null) {
      parameters.put(ImportingConstants.TASK_PARAM_FILTERS, initFilters());
    } else {
      parameters.put(ImportingConstants.TASK_PARAM_FILTERS, filterParams);
    }
    return job;
  }

  private void defineJob(final AnyMap jobDefinition) throws Exception {
    _definitionPersistence.addJob(new JobDefinition(jobDefinition));
  }

  protected void waitForJobRunCompleted(final String jobName, final String jobId, final long maxWaitTime)
    throws Exception {
    waitForJobRun(jobName, jobId, maxWaitTime, JobState.SUCCEEDED);
  }

  /** Waits for a job to be completed. */
  protected void waitForJobRun(final String jobName, final String jobId, final long maxWaitTime,
    final JobState expectedJobState) throws Exception {
    final long sleepTime = 500L;
    final long millisStarted = System.currentTimeMillis();
    while (true) {
      final AnyMap runData = _jobRunDataProvider.getJobRunData(jobName, jobId);
      final String jobRunState = runData.getStringValue(JobManagerConstants.DATA_JOB_STATE);
      if (jobRunState != null) {
        final JobState state = JobState.valueOf(jobRunState);
        if (state == expectedJobState) {
          return; // finally found what we're waiting for.
        }
        if (expectedJobState == JobState.SUCCEEDED) {
          assertNotSame(JobState.FAILED, state);
        }
      }
      assertTrue("Waited too long for job to complete", System.currentTimeMillis() - millisStarted <= maxWaitTime);
      Thread.sleep(sleepTime);
    }
  }

  /** initializes mapping. */
  private AnyMap initMapping() {
    final AnyMap map = DataFactory.DEFAULT.createAnyMap();
    map.put(FileCrawlerService.PROPERTY_FILE_EXTENSION, "my-file-extension");
    map.put(FileCrawlerService.PROPERTY_FILE_FOLDER, "my-file-folder");
    map.put(FileCrawlerService.PROPERTY_FILE_LAST_MODIFIED, "my-file-last-modified");
    map.put(FileCrawlerService.PROPERTY_FILE_NAME, "my-file-name");
    map.put(FileCrawlerService.PROPERTY_FILE_PATH, "my-file-path");
    map.put(FileCrawlerService.PROPERTY_FILE_SIZE, "my-file-size");
    map.put(FileCrawlerService.ATTACHMENT_FILE_CONTENT, "my-file-content");
    return map;
  }

  /** @return an initial filter with max size set to 100000 bytes, and excluding xls(x) files. */
  private AnyMap initFilters() {
    final AnyMap map = DataFactory.DEFAULT.createAnyMap();
    map.put(FilterConfiguration.MAX_SIZE, "1000");
    final AnyMap filePatterns = DataFactory.DEFAULT.createAnyMap();
    filePatterns.add(FilterConfiguration.EXCLUDE_PATTERNS, DataFactory.DEFAULT.createStringValue(".*\\.xlsx?$"));
    map.put(FilterConfiguration.FILE_PATTERNS, filePatterns);
    return map;
  }
}
