/***********************************************************************************************************************
 * Copyright (c) 2008,2012 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the
 * accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this
 * distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
 * 
 * Contributors: Andreas Schank (Attensity Europe GmbH) - initial API and implementation
 **********************************************************************************************************************/
package org.eclipse.smila.importing.compounds.simple;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Properties;
import java.util.UUID;
import java.util.zip.GZIPInputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.eclipse.smila.common.mimetype.MimeTypeIdentifier;
import org.eclipse.smila.common.mimetype.MimeTypeParseException;
import org.eclipse.smila.datamodel.DataFactory;
import org.eclipse.smila.datamodel.Record;
import org.eclipse.smila.datamodel.util.AnyUtil;
import org.eclipse.smila.importing.compounds.CompoundExtractor;
import org.eclipse.smila.importing.compounds.CompoundExtractorException;
import org.eclipse.smila.utils.config.ConfigUtils;

/**
 * Simple compound extractor that extracts only zip archives and gzip files.
 * <p>
 * The content of a compound is unpacked into temporary files below a root directory. The root directory and the charset
 * used for zip entry names can be configured with the optional {@code extractor.properties} file of this bundle
 * (properties {@code tmp.dir} and {@code zip.encoding}). Compounds found inside a compound are extracted recursively.
 * </p>
 */
public class SimpleCompoundExtractorService implements CompoundExtractor {

  /** mime type for ZIP. */
  protected static final String APPLICATION_ZIP = "application/zip";

  /** mime types for ZIP and GZIP. */
  protected static final Collection<String> SUPPORTED_MIME_TYPES = Arrays.asList(APPLICATION_ZIP,
    "application/x-gunzip", "application/x-gzip");

  /** key for temporary file name. */
  protected static final String KEY_TMP_FILE_NAME = "tmpFileName";

  /** could be everything, have a look at the file suffix. */
  private static final String OCTET_STREAM = "application/octet-stream";

  /** default encoding to use for zip files. */
  private static final Charset DEFAULT_CHARSET = StandardCharsets.UTF_8;

  /** bundle ID for configuration area access. */
  private static final String BUNDLE_ID = "org.eclipse.smila.importing.compounds.simple";

  /** name of the (optional) configuration file of this bundle. */
  private static final String CONFIG_FILE = "extractor.properties";

  /** configuration property: charset name used to decode zip entry names. */
  private static final String PROP_ZIP_ENCODING = "zip.encoding";

  /** configuration property: root directory for temporary uncompressed files. */
  private static final String PROP_TMP_DIR = "tmp.dir";

  /** mime type identifier service. */
  protected MimeTypeIdentifier _mimeTypeIdentifier;

  /** log. */
  protected final Log _log = LogFactory.getLog(getClass());

  /** encoding. */
  protected Charset _charset = DEFAULT_CHARSET;

  /** the directory where to store the temporary uncompressed files. */
  private File _rootTmpDir;

  /**
   * service activation: reads the optional configuration, determines the root temp directory and removes everything a
   * previous run may have left in it. A missing configuration or an unusable {@code zip.encoding} value is logged and
   * replaced by the default settings instead of failing the activation.
   */
  protected void activate() {
    Properties props;
    try {
      props = ConfigUtils.getConfigProperties(BUNDLE_ID, CONFIG_FILE);
    } catch (final Exception ex) {
      _log.info("No configuration " + BUNDLE_ID + "/" + CONFIG_FILE + " found, using default settings.");
      props = new Properties();
    }
    final String encoding = props.getProperty(PROP_ZIP_ENCODING);
    if (encoding != null) {
      try {
        _charset = Charset.forName(encoding.trim());
      } catch (final IllegalArgumentException e) {
        // covers both illegal and unsupported charset names
        _log.warn("Invalid value '" + encoding + "' for property '" + PROP_ZIP_ENCODING + "', using default "
          + DEFAULT_CHARSET + ".", e);
        _charset = DEFAULT_CHARSET;
      }
    }
    final String rootTmpDirName = props.getProperty(PROP_TMP_DIR);
    if (rootTmpDirName != null) {
      _rootTmpDir = new File(rootTmpDirName);
    } else {
      _rootTmpDir = new File(FileUtils.getTempDirectoryPath(), BUNDLE_ID);
    }
    // check if there are files left from any prior run:
    try {
      FileUtils.deleteDirectory(_rootTmpDir);
    } catch (final IOException e) {
      _log.warn("Could not delete old temporary files from previous invocation", e);
    }
  }

  /** service deactivation: removes the root temp directory, if there is one. */
  protected void deactivate() {
    // _rootTmpDir is only set in activate(), so it may still be null here.
    if (_rootTmpDir != null && _rootTmpDir.exists()) {
      try {
        FileUtils.deleteDirectory(_rootTmpDir);
      } catch (final IOException e) {
        _log.warn("Could not clean up temp extraction directory.", e);
      }
    }
  }

  /** {@inheritDoc} */
  @Override
  public boolean canExtract(final File file) {
    if (file == null) {
      return false;
    }
    return canExtract(file.getName());
  }

  /** {@inheritDoc} */
  @Override
  public boolean canExtract(final URL url, final String mimeType) {
    if (url == null) {
      return false;
    }
    return canExtract(url.getFile(), mimeType);
  }

  /**
   * check if we can handle this. The given mime type is used if it is meaningful, otherwise the mime type is derived
   * from the file name's extension.
   */
  @Override
  public boolean canExtract(final String fileName, final String mimeType) {
    if (fileName == null) {
      return false;
    }
    return canHandleMimeType(getMimeType(fileName, mimeType));
  }

  /**
   * {@inheritDoc}
   * 
   * @throws CompoundExtractorException
   *           declared by the interface, see {@link #extract(InputStream, String, String, String)}.
   */
  @Override
  public Iterator<Record> extract(final InputStream compoundInputStream, final String fileName,
    final String contentAttachmentName) throws CompoundExtractorException {
    return extract(compoundInputStream, fileName, null, contentAttachmentName);
  }

  /**
   * {@inheritDoc}
   * <p>
   * If the (given or detected) mime type is not supported, an empty iterator is returned. Otherwise the compound is
   * unpacked into a new, uniquely named directory below the root temp directory and the created records are handed to
   * an {@code AttachmentSettingIterator} together with the attachment name and that directory. As stated by the
   * original contract of this method, callers should not close the input stream before the last record has been
   * consumed from the iterator.
   * </p>
   * 
   * @throws CompoundExtractorException
   *           declared by the interface; I/O problems during extraction are logged by this implementation.
   */
  @Override
  public Iterator<Record> extract(final InputStream compoundInputStream, final String fileName,
    final String mimeType, final String contentAttachmentName) throws CompoundExtractorException {
    final String extractedMimeType = getMimeType(fileName, mimeType);
    if (!canHandleMimeType(extractedMimeType)) {
      return new ArrayList<Record>().iterator();
    }

    final File tmpDir = new File(_rootTmpDir, UUID.randomUUID().toString());
    final List<Record> records =
      extractCompressedStream(compoundInputStream, fileName, extractedMimeType, tmpDir, new ArrayList<String>(),
        null);

    return new AttachmentSettingIterator(records, contentAttachmentName, tmpDir);
  }

  /**
   * @param mimeTypeIdentifier
   *          the mimeTypeIdentifier to set
   */
  public void setMimeTypeIdentifier(final MimeTypeIdentifier mimeTypeIdentifier) {
    this._mimeTypeIdentifier = mimeTypeIdentifier;
  }

  /**
   * @param mimeTypeIdentifier
   *          the mimeTypeIdentifier to remove, ignored if it is not the one currently set
   */
  public void unsetMimeTypeIdentifier(final MimeTypeIdentifier mimeTypeIdentifier) {
    if (this._mimeTypeIdentifier == mimeTypeIdentifier) {
      this._mimeTypeIdentifier = null;
    }
  }

  /**
   * get mimetype for file with hints. A missing, empty or "octet-stream" mime type hint is ignored and the mime type is
   * derived from the file name instead.
   */
  private String getMimeType(final String fileName, final String mimeType) {
    if (mimeType == null || mimeType.isEmpty() || mimeType.toLowerCase(Locale.ENGLISH).contains(OCTET_STREAM)) {
      return getMimeType(fileName);
    }
    return mimeType;
  }

  /**
   * get mimetype for file, using the text after the last period as extension.
   * 
   * @return the identified mime type, or null if there is no file name, no (non-empty) extension, no mime type
   *         identifier service, or the identification failed.
   */
  private String getMimeType(final String fileName) {
    // read the field once, it can be unset by unsetMimeTypeIdentifier()
    final MimeTypeIdentifier identifier = _mimeTypeIdentifier;
    if (fileName == null || identifier == null) {
      return null;
    }
    final int indexOfLastPeriod = fileName.lastIndexOf('.');
    if (indexOfLastPeriod < 0 || indexOfLastPeriod == fileName.length() - 1) {
      // no extension at all, or the name ends with the period
      return null;
    }
    try {
      return identifier.identify(fileName.substring(indexOfLastPeriod + 1));
    } catch (final MimeTypeParseException e) {
      _log.warn("Cannot detect mime type for '" + fileName + "'.", e);
      return null;
    }
  }

  /** decides if the service can extract the files depending on the file name. */
  private boolean canExtract(final String fileName) {
    return canHandleMimeType(getMimeType(fileName));
  }

  /** check the mimetype for zip/gzip. */
  private boolean canHandleMimeType(final String mimeType) {
    if (mimeType == null) {
      return false;
    }
    return SUPPORTED_MIME_TYPES.contains(mimeType.toLowerCase(Locale.ENGLISH));
  }

  /** builds a record id from the names of the enclosing compounds and the element name, separated by '/'. */
  private String createRecordId(final List<String> compoundNames, final String elementName) {
    final StringBuilder id = new StringBuilder();
    for (final String compoundName : compoundNames) {
      id.append(compoundName).append('/');
    }
    id.append(elementName);
    return id.toString();
  }

  /**
   * extracts compressed stream, zip or gzip. The first record of the result is the record for the compound itself:
   * either the given record or, if none is given, a newly created root compound record.
   */
  private List<Record> extractCompressedStream(final InputStream compoundInputStream, final String fileName,
    final String extractedMimeType, final File tmpDir, final List<String> compoundNames, final Record record)
    throws CompoundExtractorException {
    final List<Record> records = new ArrayList<Record>();
    // we need the name of this zip for the first compound
    final List<String> newCompoundNames = new ArrayList<String>(compoundNames);
    if (newCompoundNames.isEmpty()) {
      newCompoundNames.add(fileName);
    }
    if (record == null) {
      final Record newRecord = DataFactory.DEFAULT.createRecord(fileName);
      newRecord.getMetadata().put(KEY_IS_COMPOUND, true);
      newRecord.getMetadata().put(KEY_FILE_NAME, fileName);
      newRecord.getMetadata().put(KEY_IS_ROOT_COMPOUND_RECORD, true);
      records.add(newRecord);
    } else {
      records.add(record);
    }
    if (extractedMimeType.equalsIgnoreCase(APPLICATION_ZIP)) {
      records.addAll(extractZipContent(compoundInputStream, fileName, tmpDir, newCompoundNames));
    } else {
      records.addAll(extractGzipContent(compoundInputStream, fileName, tmpDir, newCompoundNames, record));
    }
    return records;
  }

  /**
   * extracts gzip content. The name of the uncompressed element is the gzip file's name without its last extension.
   * I/O errors are logged and lead to an empty result.
   */
  private Collection<Record> extractGzipContent(final InputStream compoundInputStream, final String fileName,
    final File tmpDir, final List<String> compoundNames, final Record compoundRecord)
    throws CompoundExtractorException {
    final Collection<Record> records = new ArrayList<Record>();
    try {
      final String tmpName = new File(fileName).getName();
      final int indexOfLastPeriod = tmpName.lastIndexOf('.');
      // the name may have no extension if the mime type was given explicitly: keep the complete name then.
      final String gunzippedFileName = indexOfLastPeriod >= 0 ? tmpName.substring(0, indexOfLastPeriod) : tmpName;
      final File destFile = gunzip(compoundInputStream, gunzippedFileName, tmpDir);
      final Record record = DataFactory.DEFAULT.createRecord(createRecordId(compoundNames, gunzippedFileName));
      record.getMetadata().put(KEY_FILE_NAME, gunzippedFileName);
      record.getMetadata().put(KEY_SIZE, destFile.length());
      // the element has no timestamp of its own here, so it takes over the one of the compound record, if any
      if (compoundRecord != null && compoundRecord.getMetadata().containsKey(KEY_TIME)) {
        record.getMetadata().put(KEY_TIME, compoundRecord.getMetadata().get(KEY_TIME));
      }
      if (!compoundNames.isEmpty()) {
        record.getMetadata().put(KEY_COMPOUNDS, AnyUtil.objectToAny(compoundNames));
      }
      checkAndHandleCompoundFile(gunzippedFileName, tmpDir, compoundNames, records, destFile, record);
    } catch (final IOException e) {
      _log.warn("Cannot access zipped stream for '" + fileName + "'.", e);
    }
    return records;
  }

  /**
   * check if a file within a compound is a compound itself and handle it. A nested compound is extracted recursively
   * into a new sub directory and its temporary file is deleted afterwards; for any other file the path of the temporary
   * file is stored in the record and the record is added to the result.
   */
  private void checkAndHandleCompoundFile(final String fileName, final File tmpDir,
    final List<String> compoundNames, final Collection<Record> records, final File destFile, final Record record)
    throws CompoundExtractorException, IOException {
    // check if the new file is a compressed file, too:
    final String mimeType = getMimeType(destFile.getName());
    if (canHandleMimeType(mimeType)) {
      record.getMetadata().put(KEY_IS_COMPOUND, true);
      final FileInputStream compressedFileStream = new FileInputStream(destFile);
      try {
        final List<String> newCompoundNames = new ArrayList<String>(compoundNames);
        newCompoundNames.add(fileName);
        records.addAll(extractCompressedStream(compressedFileStream, destFile.getName(), mimeType, new File(tmpDir,
          UUID.randomUUID().toString()), newCompoundNames, record));
      } finally {
        IOUtils.closeQuietly(compressedFileStream);
      }
      if (!destFile.delete()) {
        destFile.deleteOnExit();
      }
    } else {
      record.getMetadata().put(KEY_TMP_FILE_NAME, destFile.getCanonicalPath());
      records.add(record);
    }
  }

  /**
   * extracts zip content. Directory entries are skipped. An I/O error for a single entry is logged and the entry is
   * skipped, an I/O error on the zip stream itself is logged and ends the extraction with the records created so far.
   */
  private Collection<Record> extractZipContent(final InputStream compoundInputStream, final String fileName,
    final File tmpDir, final List<String> compoundNames) throws CompoundExtractorException {
    final Collection<Record> records = new ArrayList<Record>();
    // not closed here: closing it would also close the stream, which is owned by the caller.
    final ZipInputStream zin = new ZipInputStream(compoundInputStream, _charset);
    try {
      ZipEntry entry = zin.getNextEntry();
      while (entry != null) {
        if (!entry.isDirectory()) {
          final String entryName = entry.getName();
          final Record record = DataFactory.DEFAULT.createRecord(createRecordId(compoundNames, entryName));
          record.getMetadata().put(KEY_FILE_NAME, entryName);
          record.getMetadata().put(KEY_SIZE, entry.getSize());
          if (entry.getComment() != null) {
            record.getMetadata().put(KEY_COMMENT, entry.getComment());
          }
          record.getMetadata().put(KEY_COMPRESSED_SIZE, entry.getCompressedSize());
          if (entry.getTime() != -1) {
            record.getMetadata().put(KEY_TIME, DataFactory.DEFAULT.createDateTimeValue(new Date(entry.getTime())));
          }
          if (!compoundNames.isEmpty()) {
            record.getMetadata().put(KEY_COMPOUNDS, AnyUtil.objectToAny(compoundNames));
          }
          try {
            final File destFile = resolveZipEntryFile(tmpDir, entryName);
            final File parentDir = destFile.getParentFile();
            if (!parentDir.exists()) {
              parentDir.mkdirs();
            }
            // try-with-resources: a failing close (i.e. a failed write) must not go unnoticed.
            try (FileOutputStream fos = new FileOutputStream(destFile)) {
              IOUtils.copy(zin, fos);
            }
            zin.closeEntry();
            updateUnknownSizes(record, entry, destFile);

            checkAndHandleCompoundFile(entryName, tmpDir, compoundNames, records, destFile, record);
          } catch (final IOException e) {
            _log.warn("Some error occurred while trying to access zip entry for zip file '" + fileName + "'.", e);
          }
        }
        entry = zin.getNextEntry();
      }
    } catch (final IOException e) {
      _log.warn("Cannot access zipped stream for '" + fileName + "'.", e);
    }
    return records;
  }

  /**
   * determines the temporary file for a zip entry and ensures that it is located below the extraction directory. Entry
   * names come from the archive and may contain elements like ".." that point outside of it.
   * 
   * @throws IOException
   *           if the entry would be written outside of the extraction directory or the paths cannot be resolved.
   */
  private File resolveZipEntryFile(final File tmpDir, final String entryName) throws IOException {
    final File destFile = new File(tmpDir, entryName);
    String rootPath = tmpDir.getCanonicalPath();
    if (!rootPath.endsWith(File.separator)) {
      rootPath += File.separator;
    }
    if (!destFile.getCanonicalPath().startsWith(rootPath)) {
      throw new IOException("Zip entry '" + entryName + "' would be extracted outside of the temporary directory '"
        + rootPath + "', skipping it.");
    }
    return destFile;
  }

  /**
   * replaces sizes that were not known when the zip entry header was read. The sizes of an entry can be unknown (-1)
   * until its data has been read from the stream, so they are checked again after the extraction.
   */
  private void updateUnknownSizes(final Record record, final ZipEntry entry, final File destFile) {
    if (record.getMetadata().containsKey(KEY_SIZE) && entry.getSize() >= 0) {
      record.getMetadata().put(KEY_SIZE, entry.getSize());
    } else {
      // fall back to the size of the extracted file
      record.getMetadata().put(KEY_SIZE, destFile.length());
    }
    final long compressedSize = entry.getCompressedSize();
    if (compressedSize >= 0) {
      record.getMetadata().put(KEY_COMPRESSED_SIZE, compressedSize);
    }
  }

  /**
   * gunzip a file to a temp file. The temp file is created in the given directory (which is created if necessary) with
   * prefix "tmp" and the given file name as suffix. The gzip stream, and with it the given input stream, is closed
   * afterwards.
   * 
   * @return the file containing the uncompressed content.
   * @throws IOException
   *           if the stream is no gzip stream or the content cannot be read or written.
   */
  private File gunzip(final InputStream compoundInputStream, final String gunzippedFileName, final File tmpDir)
    throws CompoundExtractorException, IOException {
    GZIPInputStream zin = null;
    try {
      zin = new GZIPInputStream(compoundInputStream);
      if (!tmpDir.exists()) {
        tmpDir.mkdirs();
      }
      final File destFile = File.createTempFile("tmp", gunzippedFileName, tmpDir);
      // try-with-resources: a failing close (i.e. a failed write) must not go unnoticed.
      try (FileOutputStream fos = new FileOutputStream(destFile)) {
        IOUtils.copy(zin, fos);
      }
      return destFile;
    } finally {
      IOUtils.closeQuietly(zin);
    }
  }

}
