/*********************************************************************************************************************
 * Copyright (c) 2008, 2011 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the
 * accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this
 * distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
 **********************************************************************************************************************/
package org.eclipse.smila.importing.crawler.file.test;

import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;

import org.apache.commons.io.IOUtils;
import org.eclipse.smila.bulkbuilder.BulkbuilderService;
import org.eclipse.smila.common.definitions.DefinitionBase;
import org.eclipse.smila.datamodel.AnyMap;
import org.eclipse.smila.datamodel.DataFactory;
import org.eclipse.smila.datamodel.Record;
import org.eclipse.smila.datamodel.ipc.BinaryObjectStreamIterator;
import org.eclipse.smila.importing.ImportingConstants;
import org.eclipse.smila.importing.crawler.file.FileCrawlerService;
import org.eclipse.smila.importing.util.FilePathNormalizer;
import org.eclipse.smila.importing.util.PropertyNameMapper;
import org.eclipse.smila.jobmanager.JobRunDataProvider;
import org.eclipse.smila.jobmanager.JobRunEngine;
import org.eclipse.smila.jobmanager.JobState;
import org.eclipse.smila.jobmanager.definitions.DefinitionPersistence;
import org.eclipse.smila.jobmanager.definitions.JobDefinition;
import org.eclipse.smila.jobmanager.definitions.JobManagerConstants;
import org.eclipse.smila.objectstore.ObjectStoreException;
import org.eclipse.smila.objectstore.ObjectStoreService;
import org.eclipse.smila.objectstore.StoreObject;
import org.eclipse.smila.test.DeclarativeServiceTestCase;
import org.eclipse.smila.utils.config.ConfigUtils;

/**
 * Test cases for the FileExtractorWorker. Also tests the filtering applied by the worker.
 * 
 */
public class TestFileExtractorWorker extends DeclarativeServiceTestCase {

  /** Name of the job started by {@link #testExtractWithMinimalMapping()}. */
  private static final String JOBNAME_MINIMAL_MAPPING = "extractCompoundsMinimalMapping";

  /** Source value every extracted record is expected to report (compared with {@code Record.getSource()}). */
  private static final String DATA_SOURCE = "compounds";

  /** Name of the object store that is ensured and cleared in {@link #setUp()} and read when collecting results. */
  private static final String STORE = "records";

  /** Second argument of {@code getStoreObjectInfos} when the resulting bulk is looked up in {@link #STORE}. */
  private static final String BUCKET = "extractedRecords";

  /** Name of the job definition that is copied under the test name by several tests. */
  private static final String JOBNAME = "extractCompounds";

  /** Name of the job started by {@link #testExtractWithUnlimitedFilter()}. */
  private static final String JOBNAME_ALL = "extractCompoundsAll";

  /** Name of the job started by {@link #testExtractWithFolderPathFilter()}. */
  private static final String JOBNAME_FOLDER_FILTER = "extractCompoundsFolderPath";

  /** Used to start and finish job runs; looked up in {@link #setUp()}. */
  private JobRunEngine _jobRunEngine;

  /** Used to poll the state of a job run; looked up in {@link #setUp()}. */
  private JobRunDataProvider _jobRunDataProvider;

  /** Used to push the compound record into a job and commit it; looked up in {@link #setUp()}. */
  private BulkbuilderService _bulkBuilder;

  /** Used to prepare the store and to read the resulting bulk; looked up in {@link #setUp()}. */
  private ObjectStoreService _objectStore;

  /** Used to read and add job definitions when copying a job; looked up in {@link #setUp()}. */
  private DefinitionPersistence _defPersistence;

  /** Maps file crawler property names to the "my-..." attribute names defined in {@link #initMapping()}. */
  private final PropertyNameMapper _mapper = new PropertyNameMapper(initMapping());

  /**
   * Looks up all services used by the tests and calls {@code ensureStore} and {@code clearStore} on the record store
   * before each test.
   */
  @Override
  public void setUp() throws Exception {
    _jobRunEngine = getService(JobRunEngine.class);
    _jobRunDataProvider = getService(JobRunDataProvider.class);
    _bulkBuilder = getService(BulkbuilderService.class);
    _defPersistence = getService(DefinitionPersistence.class);

    final ObjectStoreService objectStore = getService(ObjectStoreService.class);
    _objectStore = objectStore;
    // every test reads its result bulk from this store, so prepare it up front.
    objectStore.ensureStore(STORE);
    objectStore.clearStore(STORE);
  }

  /** Extracts test.zip (a zip file containing umlauts) and expects seven records in the resulting bulk. */
  public void testZipWithUmlautsAndCompounds() throws Exception {
    // run the test in its own copy of the job definition, named after the test.
    final String jobName = getName();
    copyJob(JOBNAME, jobName);
    final String jobRunId = _jobRunEngine.startJob(jobName);

    // the compound record uses the real path both as record id and as mapped file path attribute.
    final Path compoundFile = ConfigUtils.getConfigFile("compounds", "test.zip").toPath();
    final String compoundPath = compoundFile.toRealPath().toString();
    final Record compound = DataFactory.DEFAULT.createRecord(compoundPath, "compounds");
    final String pathAttribute = _mapper.get(FileCrawlerService.PROPERTY_FILE_PATH).get(0);
    compound.getMetadata().put(pathAttribute, compoundPath);

    _bulkBuilder.addRecord(jobName, compound);
    _bulkBuilder.commitJob(jobName);
    _jobRunEngine.finishJob(jobName, jobRunId);
    waitForJobRunCompleted(jobName, jobRunId, 10000);

    final Collection<Record> extracted = checkExtractedRecordsBulk(compound);
    assertEquals(7, extracted.size());
  }

  /**
   * Extracts filter-test.zip with the filtering job and checks that the filtered files are missing and that every
   * record's path starts with the normalized path of the compound.
   */
  public void testExtractWithFilter() throws Exception {
    // run the test in its own copy of the job definition, named after the test.
    final String jobName = getName();
    copyJob(JOBNAME, jobName);
    final String jobRunId = _jobRunEngine.startJob(jobName);

    final Path compoundFile = ConfigUtils.getConfigFile("compounds", "filter-test.zip").toPath();
    final String compoundPath = compoundFile.toRealPath().toString();
    final Record compound = DataFactory.DEFAULT.createRecord(compoundPath, "compounds");
    final String pathAttribute = _mapper.get(FileCrawlerService.PROPERTY_FILE_PATH).get(0);
    compound.getMetadata().put(pathAttribute, compoundPath);
    // normalized prefix that each record's path must start with.
    final String expectedPrefix = FilePathNormalizer.getNormalizedPath(compoundFile);

    _bulkBuilder.addRecord(jobName, compound);
    _bulkBuilder.commitJob(jobName);
    _jobRunEngine.finishJob(jobName, jobRunId);
    waitForJobRunCompleted(jobName, jobRunId, 10000);

    // seven records remain: too-large.txt and the xlsx file are filtered out.
    final Collection<Record> extracted = checkExtractedRecordsBulk(compound);
    assertEquals(7, extracted.size());

    final Collection<String> names = new HashSet<>();
    for (final Record extractedRecord : extracted) {
      final AnyMap attributes = extractedRecord.getMetadata();
      names.add(attributes.getStringValue("my-file-name"));
      final String filePath = extractedRecord.getMetadata().getStringValue("my-file-path");
      final String normalizedPath = FilePathNormalizer.getNormalizedPath(filePath);
      assertTrue(filePath + " has wrong prefix", normalizedPath.startsWith(expectedPrefix));
    }

    // links are ignored here.
    assertFalse(names.contains("not-allowed.xlsx"));
    assertFalse(names.contains("too-large.txt"));
    assertTrue(names.contains("pdf.pdf"));
    assertTrue(names.contains("text.txt"));
    assertTrue(names.contains("text-within-folders.txt"));
  }

  /**
   * Extracts the gz compressed file not-allowed.xlsx.gz with the filtering job and expects that only the compound
   * record itself ends up in the bulk.
   */
  public void testExtractWithFilterAndGzip() throws Exception {
    // run the test in its own copy of the job definition, named after the test.
    final String jobName = getName();
    copyJob(JOBNAME, jobName);
    final String jobRunId = _jobRunEngine.startJob(jobName);

    final Path compoundFile = ConfigUtils.getConfigFile("compounds", "not-allowed.xlsx.gz").toPath();
    final String compoundPath = compoundFile.toRealPath().toString();
    final Record compound = DataFactory.DEFAULT.createRecord(compoundPath, "compounds");
    final String pathAttribute = _mapper.get(FileCrawlerService.PROPERTY_FILE_PATH).get(0);
    compound.getMetadata().put(pathAttribute, compoundPath);

    _bulkBuilder.addRecord(jobName, compound);
    _bulkBuilder.commitJob(jobName);
    _jobRunEngine.finishJob(jobName, jobRunId);
    waitForJobRunCompleted(jobName, jobRunId, 10000);

    // exactly one record is expected in the bulk.
    final Collection<Record> extracted = checkExtractedRecordsBulk(compound);
    assertEquals(1, extracted.size());

    final Collection<String> names = new HashSet<>();
    for (final Record extractedRecord : extracted) {
      names.add(extractedRecord.getMetadata().getStringValue("my-file-name"));
    }

    // the contained xlsx file must not be listed, only the compound record itself.
    assertFalse(names.contains("not-allowed.xlsx"));
    assertTrue(names.contains("not-allowed.xlsx.gz"));
  }

  /** Extracts filter-test.zip with the unlimited filter job and expects all nine records to be present. */
  public void testExtractWithUnlimitedFilter() throws Exception {
    final String jobRunId = _jobRunEngine.startJob(JOBNAME_ALL);

    final Path compoundFile = ConfigUtils.getConfigFile("compounds", "filter-test.zip").toPath();
    final String compoundPath = compoundFile.toRealPath().toString();
    final Record compound = DataFactory.DEFAULT.createRecord(compoundPath, "compounds");
    final String pathAttribute = _mapper.get(FileCrawlerService.PROPERTY_FILE_PATH).get(0);
    compound.getMetadata().put(pathAttribute, compoundPath);

    _bulkBuilder.addRecord(JOBNAME_ALL, compound);
    _bulkBuilder.commitJob(JOBNAME_ALL);
    _jobRunEngine.finishJob(JOBNAME_ALL, jobRunId);
    waitForJobRunCompleted(JOBNAME_ALL, jobRunId, 10000);

    // nothing is filtered, so no files are missing.
    final Collection<Record> extracted = checkExtractedRecordsBulk(compound);
    assertEquals(9, extracted.size());

    final Collection<String> names = new HashSet<>();
    for (final Record extractedRecord : extracted) {
      names.add(extractedRecord.getMetadata().getStringValue("my-file-name"));
    }

    // links are ignored here.
    assertTrue(names.contains("not-allowed.xlsx"));
    assertTrue(names.contains("too-large.txt"));
    assertTrue(names.contains("pdf.pdf"));
    assertTrue(names.contains("text.txt"));
    assertTrue(names.contains("text-within-folders.txt"));
  }

  /**
   * Extracts filter-test.zip with the folder path filter job and expects eight records, with
   * text-within-folders.txt being the file that is left out.
   */
  public void testExtractWithFolderPathFilter() throws Exception {
    final String jobRunId = _jobRunEngine.startJob(JOBNAME_FOLDER_FILTER);

    final Path compoundFile = ConfigUtils.getConfigFile("compounds", "filter-test.zip").toPath();
    final String compoundPath = compoundFile.toRealPath().toString();
    final Record compound = DataFactory.DEFAULT.createRecord(compoundPath, "compounds");
    final String pathAttribute = _mapper.get(FileCrawlerService.PROPERTY_FILE_PATH).get(0);
    compound.getMetadata().put(pathAttribute, compoundPath);

    _bulkBuilder.addRecord(JOBNAME_FOLDER_FILTER, compound);
    _bulkBuilder.commitJob(JOBNAME_FOLDER_FILTER);
    _jobRunEngine.finishJob(JOBNAME_FOLDER_FILTER, jobRunId);
    waitForJobRunCompleted(JOBNAME_FOLDER_FILTER, jobRunId, 10000);

    final Collection<Record> extracted = checkExtractedRecordsBulk(compound);
    assertEquals(8, extracted.size());

    final Collection<String> names = new HashSet<>();
    for (final Record extractedRecord : extracted) {
      names.add(extractedRecord.getMetadata().getStringValue("my-file-name"));
    }

    // links are ignored here.
    assertTrue(names.contains("not-allowed.xlsx"));
    assertTrue(names.contains("too-large.txt"));
    assertTrue(names.contains("pdf.pdf"));
    assertTrue(names.contains("text.txt"));
    assertFalse(names.contains("text-within-folders.txt"));
  }

  /**
   * Extracts filter-test.zip with the minimal mapping job. Expects nine records with distinct file paths, of which
   * the eight records not flagged as compound carry a non-empty content attachment.
   */
  public void testExtractWithMinimalMapping() throws Exception {
    final String jobRunId = _jobRunEngine.startJob(JOBNAME_MINIMAL_MAPPING);

    final Path compoundFile = ConfigUtils.getConfigFile("compounds", "filter-test.zip").toPath();
    final String compoundPath = compoundFile.toRealPath().toString();
    final Record compound = DataFactory.DEFAULT.createRecord(compoundPath, "compounds");
    final String pathAttribute = _mapper.get(FileCrawlerService.PROPERTY_FILE_PATH).get(0);
    compound.getMetadata().put(pathAttribute, compoundPath);

    _bulkBuilder.addRecord(JOBNAME_MINIMAL_MAPPING, compound);
    _bulkBuilder.commitJob(JOBNAME_MINIMAL_MAPPING);
    _jobRunEngine.finishJob(JOBNAME_MINIMAL_MAPPING, jobRunId);
    waitForJobRunCompleted(JOBNAME_MINIMAL_MAPPING, jobRunId, 10000);

    // the records are read without the per-record attribute checks here.
    final Collection<Record> extracted = getExtractedRecordsBulk(compound);
    assertEquals(9, extracted.size());

    final String contentAttachment = _mapper.get(FileCrawlerService.ATTACHMENT_FILE_CONTENT).get(0);
    final Collection<String> paths = new HashSet<>();
    final Collection<byte[]> contents = new ArrayList<>();
    for (final Record extractedRecord : extracted) {
      final AnyMap attributes = extractedRecord.getMetadata();
      paths.add(attributes.getStringValue("my-file-path"));
      final boolean flaggedAsCompound =
        attributes.containsKey(ImportingConstants.ATTRIBUTE_COMPOUNDFLAG)
          && attributes.getBooleanValue(ImportingConstants.ATTRIBUTE_COMPOUNDFLAG);
      if (flaggedAsCompound) {
        continue;
      }
      // every record that is not flagged as compound must carry non-empty content.
      assertTrue(extractedRecord.hasAttachment(contentAttachment));
      final byte[] bytes = extractedRecord.getAttachmentAsBytes(contentAttachment);
      assertNotNull(bytes);
      assertTrue(bytes.length > 0);
      contents.add(bytes);
    }
    assertEquals(8, contents.size());
    assertEquals(9, paths.size());
  }

  /**
   * Stores a copy of an existing job definition under a new name.
   *
   * @param sourceJob
   *          name of the job definition to copy
   * @param targetJob
   *          name to give the copy
   */
  private void copyJob(final String sourceJob, final String targetJob) throws Exception {
    final AnyMap definition = _defPersistence.getJob(sourceJob).toAny(true);
    // only the name differs between the original and the copy.
    definition.put(DefinitionBase.KEY_NAME, targetJob);
    final JobDefinition renamedJob = new JobDefinition(definition);
    _defPersistence.addJob(renamedJob);
  }

  /**
   * Reads the extracted records from the bulk and checks id, source and mapped attributes of each one.
   *
   * @param compound
   *          the compound record that was pushed into the job
   * @return the records read from the bulk, in the order they were read
   */
  private Collection<Record> checkExtractedRecordsBulk(final Record compound) throws ObjectStoreException,
    IOException {
    final Collection<Record> extracted = getExtractedRecordsBulk(compound);

    // 1-based position: checkRecord treats position 1 as the compound itself.
    int position = 0;
    for (final Record current : extracted) {
      position++;
      assertNotNull(current.getId());
      assertEquals(DATA_SOURCE, current.getSource());
      checkRecord(compound, position, current, current.getMetadata());
    }
    return extracted;
  }

  /**
   * Reads all records from the single bulk object expected in the result bucket.
   *
   * @param compound
   *          not used by this method; kept so that existing callers stay unchanged
   * @return the records contained in the bulk, in stream order
   * @throws ObjectStoreException
   *           if the object store cannot list or read the bulk
   * @throws IOException
   *           if reading or closing the bulk stream fails
   */
  private Collection<Record> getExtractedRecordsBulk(final Record compound) throws ObjectStoreException,
    IOException {
    final Collection<StoreObject> objects = _objectStore.getStoreObjectInfos(STORE, BUCKET);
    assertEquals("unexpected number of bulks in " + STORE + "/" + BUCKET, 1, objects.size());

    final Collection<Record> records = new ArrayList<Record>();
    // try-with-resources closes the stream on every path and reports close failures instead of swallowing them.
    try (InputStream bulkStream = _objectStore.readObject(STORE, objects.iterator().next().getId())) {
      final BinaryObjectStreamIterator streamIterator = new BinaryObjectStreamIterator(bulkStream);
      while (streamIterator.hasNext()) {
        final Record record = streamIterator.next();
        assertNotNull(record);
        records.add(record);
      }
    }
    return records;
  }

  /**
   * Checks the mapped attributes and attachments of one extracted record.
   *
   * @param compound
   *          the compound record that was pushed into the job
   * @param recordCount
   *          1-based position of the record in the bulk; position 1 is expected to be the enriched compound itself
   * @param record
   *          the extracted record to check
   * @param metadata
   *          the metadata of {@code record}
   */
  private void checkRecord(final Record compound, final int recordCount, final Record record, final AnyMap metadata) {
    if (recordCount == 1) {
      // the first record should be the enriched original: same id and source, flagged as compound, and without
      // references to a parent compound.
      assertEquals(compound.getId(), record.getId());
      assertEquals(compound.getSource(), record.getSource());
      assertMappedAttributePresent(metadata, FileCrawlerService.PROPERTY_FILE_PATH, true);
      assertTrue(metadata.getBooleanValue(ImportingConstants.ATTRIBUTE_COMPOUNDFLAG));
      assertFalse(metadata.containsKey(ImportingConstants.ATTRIBUTE_COMPOUNDID));
      assertFalse(metadata.containsKey(ImportingConstants.ATTRIBUTE_COMPOUNDPATH));
      return;
    }

    // all other records are elements of the compound: check the file attributes first.
    assertTrue(metadata.containsKey(ImportingConstants.ATTRIBUTE_DELTA_HASH));
    assertMappedAttributePresent(metadata, FileCrawlerService.PROPERTY_FILE_NAME, true);
    assertMappedAttributePresent(metadata, FileCrawlerService.PROPERTY_FILE_PATH, true);
    assertMappedAttributePresent(metadata, FileCrawlerService.PROPERTY_FILE_FOLDER, true);
    final String lastModifiedAttribute = _mapper.get(FileCrawlerService.PROPERTY_FILE_LAST_MODIFIED).get(0);
    assertTrue(metadata.get(lastModifiedAttribute).isDateTime());
    final String sizeAttribute = _mapper.get(FileCrawlerService.PROPERTY_FILE_SIZE).get(0);
    assertTrue(metadata.get(sizeAttribute).isLong());

    // the compound path must start with the path of the compound, and the compound id must be its id.
    assertTrue(metadata.get(ImportingConstants.ATTRIBUTE_COMPOUNDPATH).isSeq());
    final String pathAttribute = _mapper.get(FileCrawlerService.PROPERTY_FILE_PATH).get(0);
    assertEquals(compound.getMetadata().getStringValue(pathAttribute),
      metadata.getSeq(ImportingConstants.ATTRIBUTE_COMPOUNDPATH).getStringValue(0));
    assertEquals(compound.getId(), metadata.getStringValue(ImportingConstants.ATTRIBUTE_COMPOUNDID));

    if (metadata.containsKey(ImportingConstants.ATTRIBUTE_COMPOUNDFLAG)) {
      // elements flagged as compound must not carry any content attachment.
      assertTrue(metadata.getBooleanValue(ImportingConstants.ATTRIBUTE_COMPOUNDFLAG));
      for (final String attachmentName : _mapper.get(FileCrawlerService.ATTACHMENT_FILE_CONTENT)) {
        assertFalse(record.hasAttachment(attachmentName));
      }
      return;
    }

    // other elements must carry non-empty content; all mapped attachments must hold the same bytes as the first.
    byte[] referenceContent = null;
    for (final String attachmentName : _mapper.get(FileCrawlerService.ATTACHMENT_FILE_CONTENT)) {
      assertTrue(record.hasAttachment(attachmentName));
      final byte[] content = record.getAttachmentAsBytes(attachmentName);
      assertNotNull(content);
      assertTrue(content.length > 0);
      if (referenceContent == null) {
        referenceContent = content;
      } else {
        assertTrue(Arrays.equals(referenceContent, content));
      }
    }
    // the mapped size attribute must match the size of the first mapped content attachment.
    final long expectedSize = metadata.getLongValue(sizeAttribute).longValue();
    assertEquals(expectedSize,
      record.getAttachment(_mapper.get(FileCrawlerService.ATTACHMENT_FILE_CONTENT).get(0)).size());
  }

  /**
   * Asserts that every attribute name mapped from the given property is present in (or absent from) the metadata.
   *
   * @param metadata
   *          the record metadata to inspect
   * @param attributeName
   *          the property name whose mapped attribute names are checked
   * @param present
   *          {@code true} if the mapped attributes must be present, {@code false} if they must be absent
   */
  private void assertMappedAttributePresent(final AnyMap metadata, final String attributeName, final boolean present) {
    // the previous message read "should be not be present." / "should be  be present."
    final String expectation = present ? " should be present." : " should not be present.";
    for (final String mappedAttName : _mapper.get(attributeName)) {
      assertEquals("mapped attribute " + mappedAttName + expectation, present, metadata.containsKey(mappedAttName));
    }
  }

  /**
   * Polls the job run data every 500 ms until the job run has state SUCCEEDED. Fails if the state becomes FAILED, or
   * if more than {@code maxWaitTime} milliseconds have elapsed when a poll did not find the job succeeded.
   *
   * @param jobName
   *          name of the job
   * @param jobId
   *          id of the job run to wait for
   * @param maxWaitTime
   *          maximum time to wait, in milliseconds
   */
  protected void waitForJobRunCompleted(final String jobName, final String jobId, final long maxWaitTime)
    throws Exception {
    final long pollInterval = 500L;
    final long startedAt = System.currentTimeMillis();
    for (;;) {
      final String stateName =
        _jobRunDataProvider.getJobRunData(jobName, jobId).getStringValue(JobManagerConstants.DATA_JOB_STATE);
      if (stateName != null) {
        final JobState currentState = JobState.valueOf(stateName);
        assertNotSame(JobState.FAILED, currentState);
        if (currentState == JobState.SUCCEEDED) {
          // the job run is done, stop polling.
          return;
        }
      }
      final long elapsed = System.currentTimeMillis() - startedAt;
      assertTrue("Waited too long for job to complete", elapsed <= maxWaitTime);
      Thread.sleep(pollInterval);
    }
  }

  /**
   * Creates the mapping from file crawler property and attachment names to the "my-..." names used by the test.
   *
   * @return a new map with one entry per mapped property
   */
  private AnyMap initMapping() {
    // property name -> mapped name, in the order the entries are added to the map.
    final String[][] entries = {
      { FileCrawlerService.PROPERTY_FILE_EXTENSION, "my-file-extension" },
      { FileCrawlerService.PROPERTY_FILE_FOLDER, "my-file-folder" },
      { FileCrawlerService.PROPERTY_FILE_LAST_MODIFIED, "my-file-last-modified" },
      { FileCrawlerService.PROPERTY_FILE_NAME, "my-file-name" },
      { FileCrawlerService.PROPERTY_FILE_PATH, "my-file-path" },
      { FileCrawlerService.PROPERTY_FILE_SIZE, "my-file-size" },
      { FileCrawlerService.ATTACHMENT_FILE_CONTENT, "my-file-content" } };

    final AnyMap mapping = DataFactory.DEFAULT.createAnyMap();
    for (final String[] entry : entries) {
      mapping.put(entry[0], entry[1]);
    }
    return mapping;
  }
}
