/*********************************************************************************************************************
 * Copyright (c) 2008, 2011 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the
 * accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this
 * distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
 **********************************************************************************************************************/
package org.eclipse.smila.importing.crawler.web.producer;

import java.util.Arrays;
import java.util.Collection;
import java.util.UUID;

import org.eclipse.smila.datamodel.Any;
import org.eclipse.smila.datamodel.AnyMap;
import org.eclipse.smila.datamodel.Record;
import org.eclipse.smila.importing.ImportingConstants;
import org.eclipse.smila.importing.crawler.web.RecordProducer;
import org.eclipse.smila.importing.crawler.web.WebCrawlerConstants;
import org.eclipse.smila.importing.crawler.web.WebCrawlerException;
import org.eclipse.smila.taskworker.TaskLog;
import org.eclipse.smila.utils.digest.DigestHelper;

/**
 * Simple implementation of RecordProducer: the input record is returned as the only element of the result, after its
 * ID, source and delta hash attribute have been set.
 */
public class SimpleRecordProducer implements RecordProducer {

  /**
   * Sets the record ID to {@code dataSource + ":" + url} and the record source to the data source task parameter,
   * attaches a delta hash and returns the record wrapped in a one-element collection.
   */
  @Override
  public Collection<Record> produceRecords(final Record record, final AnyMap parameters, final TaskLog log)
    throws WebCrawlerException {
    final String pageUrl = record.getMetadata().getStringValue(WebCrawlerConstants.ATTRIBUTE_URL);
    final String dataSource = parameters.getStringValue(ImportingConstants.TASK_PARAM_DATA_SOURCE);
    final String recordId = dataSource + ":" + pageUrl;
    record.setId(recordId);
    record.setSource(dataSource);

    calculateDeltaHash(record);
    return Arrays.asList(record);
  }

  /**
   * Stores the delta hash chosen by {@link #selectDeltaHash(Record, AnyMap)} in the record metadata. Nothing is
   * written if no hash value was produced.
   */
  private void calculateDeltaHash(final Record record) {
    final AnyMap attributes = record.getMetadata();
    final String hash = selectDeltaHash(record, attributes);
    if (hash == null) {
      return;
    }
    attributes.put(ImportingConstants.ATTRIBUTE_DELTA_HASH, hash);
  }

  /**
   * Picks the delta hash value from the first available source, in this order:
   * <ol>
   * <li>a digest of the content attachment, if the record has one,</li>
   * <li>the string form of the last-modified attribute,</li>
   * <li>the string form of the size attribute, followed by "-" and the content-type attribute,</li>
   * <li>a random UUID, so that the hash differs on every call and an update is forced.</li>
   * </ol>
   */
  private String selectDeltaHash(final Record record, final AnyMap attributes) {
    if (record.hasAttachment(WebCrawlerConstants.ATTACHMENT_CONTENT)) {
      final byte[] content = record.getAttachmentAsBytes(WebCrawlerConstants.ATTACHMENT_CONTENT);
      return DigestHelper.calculateDigest(content);
    }

    final Any modified = attributes.get(WebCrawlerConstants.ATTRIBUTE_LASTMODIFIED);
    if (modified != null) {
      return modified.toString();
    }

    final Any size = attributes.get(WebCrawlerConstants.ATTRIBUTE_SIZE);
    if (size != null) {
      // a missing content type is appended as the text "null", so the value stays stable between crawls.
      return size.toString() + "-" + attributes.get(WebCrawlerConstants.ATTRIBUTE_CONTENTTYPE);
    }

    // nothing usable for delta detection was found -> random value forces an update.
    return UUID.randomUUID().toString();
  }
}
