package org.eclipse.smila.importing.crawler.jdbc;

import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.eclipse.smila.datamodel.Any;
import org.eclipse.smila.datamodel.AnyMap;
import org.eclipse.smila.datamodel.Record;
import org.eclipse.smila.importing.ImportingConstants;
import org.eclipse.smila.importing.ImportingException;
import org.eclipse.smila.importing.util.RecordOutputHandler;
import org.eclipse.smila.taskworker.TaskContext;
import org.eclipse.smila.taskworker.Worker;
import org.eclipse.smila.utils.digest.DigestHelper;

/**
 * Worker that executes a configured SQL statement via the bound {@link DbAccessService} and writes one record per
 * result to the {@link #OUTPUT_SLOT_CRAWLED_RECORDS} output slot.
 *
 * <p>
 * Before being written, each record gets an id (built from the data source name and the configured id columns), its
 * source, and a delta hash attribute (built from the configured delta columns). Afterwards the crawling context's
 * mapper is applied to the record's column names.
 * </p>
 */
public class JdbcCrawlerWorker implements Worker {

  /** Name of the worker, used in worker description and workflows. */
  public static final String NAME = "jdbcCrawler";

  /** Name of the output slot that the crawled records are written to. */
  public static final String OUTPUT_SLOT_CRAWLED_RECORDS = "crawledRecords";

  /** Task parameter: maximum number of records in one bulk object. */
  public static final String TASK_PARAM_MAX_RECORDS_PER_BULK = "maxRecordsPerBulk";

  /** Task parameter: database URL. */
  public static final String TASK_PARAM_DB_URL = "dbUrl";

  /** Task parameter: database properties, e.g. user, password. */
  public static final String TASK_PARAM_DB_PROPS = "dbProps";

  /** Task parameter: SQL statement used for crawling. */
  public static final String TASK_PARAM_CRAWL_SQL = "crawlSql";

  /** Task parameter: names of the columns used to build the record id. */
  public static final String TASK_PARAM_ID_COLUMNS = "idColumns";

  /** Task parameter: names of the columns used to build the delta hash. */
  public static final String TASK_PARAM_DELTA_COLUMNS = "deltaColumns";

  /** Task parameter: max. size of binary content (BLOB etc.) in bytes. */
  public static final String TASK_PARAM_MAX_ATTACHMENT_SIZE = "maxAttachmentSize";

  /** Default: write up to 1000 records to one bulk. */
  public static final Long MAX_RECORDS_PER_BULK_DEFAULT = 1000L;

  /** Default max. size of binary content: 10^9 bytes (1 GB). */
  public static final Long MAX_ATTACHMENT_SIZE_DEFAULT = 1000L * 1000L * 1000L;

  /** Separator placed between column values during id and delta hash generation. */
  private static final String COLUMN_SEPARATOR = "-";

  /** Placeholder appended for a column that is neither a metadata attribute nor an attachment of the record. */
  private static final String MISSING_VALUE = "NULL";

  /** Metadata string values longer than this many characters are replaced by their digest. */
  private static final int MAX_PLAIN_VALUE_LENGTH = 100;

  private final Log _log = LogFactory.getLog(getClass());

  /** Database access service, set and cleared via the DS bind/unbind methods. */
  private DbAccessService _dbAccess;

  /** @return the worker name {@link #NAME}. */
  @Override
  public String getName() {
    return NAME;
  }

  /**
   * Creates the crawling context and the record output handler for the given task and runs the crawl.
   *
   * @param taskContext
   *          context of the task to process.
   * @throws Exception
   *           any error that occurred during crawling; it is logged (including stack trace) and rethrown unchanged.
   */
  @Override
  public void perform(final TaskContext taskContext) throws Exception {
    try {
      final JdbcCrawlingContext crawlContext = new JdbcCrawlingContext(taskContext);
      final RecordOutputHandler recordBulks =
        new RecordOutputHandler(taskContext.getOutputs(), crawlContext.getMaxRecordsPerBulk(),
          OUTPUT_SLOT_CRAWLED_RECORDS);

      // crawler is currently only triggered by task generator, read input from task
      // TODO: this will change if grouping is supported
      crawl(crawlContext, recordBulks);
    } catch (final Exception e) {
      // pass the exception as throwable argument so that the stack trace is logged, not just e.toString()
      _log.error("Error while performing task of worker '" + NAME + "'", e);
      throw e;
    }
  }

  /** DS service reference bind method. */
  public void setDbAccessService(final DbAccessService dbAccess) {
    _dbAccess = dbAccess;
  }

  /** DS service reference unbind method. Only clears the reference if it is the currently bound service. */
  public void unsetDbAccessService(final DbAccessService dbAccess) {
    if (_dbAccess == dbAccess) {
      _dbAccess = null;
    }
  }

  /**
   * Executes the crawl SQL statement and writes each resulting record, after augmenting it and mapping its column
   * names, to the output handler.
   *
   * @param crawlContext
   *          supplies database URL, properties, SQL statement, attachment size limit, messages and the mapper.
   * @param recordBulks
   *          receives the records.
   */
  private void crawl(final JdbcCrawlingContext crawlContext, final RecordOutputHandler recordBulks)
    throws Exception {
    final Collection<Record> results =
      _dbAccess.executeSql(crawlContext.getDbUrl(), crawlContext.getDbProperties(), crawlContext.getCrawlSql(),
        crawlContext.getMaxAttachmentSize(), crawlContext.getMessages());
    for (final Record record : results) {
      augmentRecord(record, crawlContext);
      crawlContext.getMapper().mapNames(record, getColumnNames(record));
      recordBulks.writeRecord(record);
    }
  }

  /**
   * Sets id, source and delta hash attribute on the given record.
   *
   * @return the same record instance that was passed in.
   */
  private Record augmentRecord(final Record record, final JdbcCrawlingContext crawlContext)
    throws ImportingException {
    record.setId(createId(crawlContext, record));
    record.setSource(crawlContext.getDataSource());
    record.getMetadata().put(ImportingConstants.ATTRIBUTE_DELTA_HASH, createDeltaHash(crawlContext, record));
    return record;
  }

  /** @return record id of the form {@code <dataSource>:<concatenated id column values>}. */
  private String createId(final JdbcCrawlingContext crawlContext, final Record record) throws ImportingException {
    return crawlContext.getDataSource() + ":" + concatColumnValues(crawlContext.getIdColumns(), record);
  }

  /** @return the concatenated values of the configured delta columns. */
  private String createDeltaHash(final JdbcCrawlingContext crawlContext, final Record record)
    throws ImportingException {
    return concatColumnValues(crawlContext.getDeltaColumns(), record);
  }

  /**
   * Concatenates the record's values for the given columns, separated by {@link #COLUMN_SEPARATOR}.
   *
   * <p>
   * For each column name: a metadata value is appended as string (or as its digest if it is longer than
   * {@link #MAX_PLAIN_VALUE_LENGTH} characters); otherwise, if an attachment of that name exists, the digest of its
   * bytes is appended; otherwise {@link #MISSING_VALUE} is appended.
   * </p>
   *
   * @param columnNames
   *          the column names to use, may be null.
   * @param record
   *          the record to read the values from.
   * @return the concatenated values; an empty string if {@code columnNames} is null or contains no elements.
   */
  private String concatColumnValues(final Any columnNames, final Record record) throws ImportingException {
    final StringBuilder concatenated = new StringBuilder();
    if (columnNames != null) {
      final AnyMap metadata = record.getMetadata();
      boolean firstColumn = true;
      for (final Any column : columnNames) {
        // write the separator between values instead of stripping a trailing one afterwards: this also works for an
        // empty column list and does not depend on the separator's length.
        if (!firstColumn) {
          concatenated.append(COLUMN_SEPARATOR);
        }
        firstColumn = false;

        final String columnName = column.asValue().asString();
        if (metadata.containsKey(columnName)) {
          String value = metadata.getStringValue(columnName);
          if (value.length() > MAX_PLAIN_VALUE_LENGTH) {
            value = DigestHelper.calculateDigest(value);
          }
          concatenated.append(value);
        } else if (record.hasAttachment(columnName)) {
          concatenated.append(DigestHelper.calculateDigest(record.getAttachmentAsBytes(columnName)));
        } else {
          concatenated.append(MISSING_VALUE);
        }
      }
    }
    return concatenated.toString();
  }

  /**
   * Collects the names of the record's metadata attributes (without record id, source and delta hash attribute) and
   * of its attachments.
   *
   * @return a new, modifiable set of names.
   */
  private Set<String> getColumnNames(final Record resultRecord) {
    final Set<String> columnNames = new HashSet<String>(resultRecord.getMetadata().keySet());
    // remove keys with special meanings
    columnNames.remove(Record.RECORD_ID);
    columnNames.remove(Record.SOURCE);
    columnNames.remove(ImportingConstants.ATTRIBUTE_DELTA_HASH);
    final Iterator<String> attachmentNames = resultRecord.getAttachmentNames();
    while (attachmentNames.hasNext()) {
      columnNames.add(attachmentNames.next());
    }
    return columnNames;
  }
}
