/*******************************************************************************
 * Copyright (c) 2008, 2013 Empolis Information Management GmbH and brox IT Solutions GmbH. All rights reserved. This
 * program and the accompanying materials are made available under the terms of the Eclipse Public License v1.0 which
 * accompanies this distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
 * 
 * Contributors: Andreas Weber (Empolis Information Management GmbH) - initial API and implementation
 *******************************************************************************/
package org.eclipse.smila.tika.test.manual;

import java.io.BufferedInputStream;

import org.apache.commons.io.IOUtils;
import org.eclipse.smila.datamodel.Any;
import org.eclipse.smila.datamodel.AnyMap;
import org.eclipse.smila.datamodel.AnySeq;
import org.eclipse.smila.datamodel.DataFactory;
import org.eclipse.smila.datamodel.Record;
import org.eclipse.smila.tika.TikaPipelet;
import org.eclipse.smila.tika.internal.PageBreakWriteOutContentHandler;
import org.eclipse.smila.tika.test.AllTests;
import org.eclipse.smila.tika.test.ConverterPipelineTestBase;
import org.eclipse.smila.utils.config.ConfigUtils;

/**
 * Test for the TikaPipelet producing a multi-part record when the feature is enabled via the parts attribute
 * parameter.
 * 
 * This test should be run after removing the standard tika.deps bundle and updating the downloaded Tika dependencies
 * in the SMILA extensions.
 */
public class TestMultiParts extends ConverterPipelineTestBase {

  // Name of the attribute the pipeline writes the extracted text to;
  // see TikaPipeline.bpel: <rec:Val key="outputName">Text</rec:Val>
  private static final String TEXT_ATTRIBUTE = "Text";

  /** Attribute name passed as {@link TikaPipelet#PROP_PAGE_NUMBER_ATTRIBUTE}; checked on each part. */
  private static final String PAGE_NO_ATTRIBUTE = "page";

  /** Attribute name passed as {@link TikaPipelet#PROP_PARTS_ATTRIBUTE}; expected to hold the sequence of parts. */
  private static final String PARTS_ATTRIBUTE = "parts";

  /**
   * Runs the same PDF through the pipeline three times with different parameter combinations:
   * <ol>
   * <li>page break enabled, no parts attribute: two result records are expected,</li>
   * <li>page break enabled and parts attribute set: one record with a two-element parts sequence is expected,</li>
   * <li>parts attribute set but page break disabled: one record with the text on top level is expected.</li>
   * </ol>
   * The parameter map is mutated between the phases, so the phases depend on their order.
   * 
   * @throws Exception
   *           if reading the test file or calling the pipeline fails.
   */
  public void test() throws Exception {
    final String fileName = "PDF_FROM_MSWORD_2010.pdf";
    // record id as asserted against the pipeline result below
    final String id = "key:" + fileName;
    final AnyMap additionalRecordParams = DataFactory.DEFAULT.createAnyMap();
    additionalRecordParams.put(TikaPipelet.PROP_EXPORT_AS_HTML, false);
    additionalRecordParams.put(TikaPipelet.PROP_PAGE_BREAK, true);
    additionalRecordParams.put(TikaPipelet.PROP_PAGE_NUMBER_ATTRIBUTE, PAGE_NO_ATTRIBUTE);

    // first test without enabling multi-parts -> more than one record in result
    BufferedInputStream input = null;
    try {
      input = new BufferedInputStream(ConfigUtils.getConfigStream(AllTests.BUNDLE_ID, fileName));
      final String[] result = callPipeline(fileName, input, additionalRecordParams);
      assertEquals(2, result.length);
    } finally {
      IOUtils.closeQuietly(input);
    }

    // second test with enabling multi-parts -> one (multi-parts) record in result
    additionalRecordParams.put(TikaPipelet.PROP_PARTS_ATTRIBUTE, PARTS_ATTRIBUTE);
    try {
      input = new BufferedInputStream(ConfigUtils.getConfigStream(AllTests.BUNDLE_ID, fileName));
      final String[] result = callPipeline(fileName, input, additionalRecordParams);
      assertEquals(1, result.length);
      assertEquals(id, result[0]);

      final AnyMap metadata = _blackboard.getMetadata(id);
      assertFalse("No Text attribute expected on top level", metadata.containsKey(TEXT_ATTRIBUTE));
      assertFalse("No page attribute expected on top level", metadata.containsKey(PAGE_NO_ATTRIBUTE));
      assertTrue("Parts attribute expected", metadata.containsKey(PARTS_ATTRIBUTE));
      final AnySeq parts = metadata.getSeq(PARTS_ATTRIBUTE);
      assertEquals(2, parts.size());
      // parts are expected in page order, with page numbers starting at 1
      long partNo = 0;
      for (Any partAny : parts) {
        partNo++;
        final AnyMap part = partAny.asMap();
        assertFalse("No parts attribute expected on part level", part.containsKey(PARTS_ATTRIBUTE));
        final String partText = part.getStringValue(TEXT_ATTRIBUTE);
        assertNotNull(partText);
        // the page markers must not leak into the text of a part
        assertFalse(partText.contains(PageBreakWriteOutContentHandler.PAGE_START_TAG));
        assertFalse(partText.contains(PageBreakWriteOutContentHandler.PAGE_END_TAG));
        if (partNo == 1) {
          assertTrue(partText.contains("Test"));
        } else {
          assertTrue(partText.contains("Datenreihe"));
        }
        assertTrue("Page attribute expected on part level", part.containsKey(PAGE_NO_ATTRIBUTE));
        assertEquals(partNo, part.getLongValue(PAGE_NO_ATTRIBUTE).longValue());
      }
      assertTopLevelAttributes(metadata, id, fileName, additionalRecordParams);
    } finally {
      IOUtils.closeQuietly(input);
    }

    // third test with multi-parts set but pageBreak=false -> no multi-parts
    additionalRecordParams.put(TikaPipelet.PROP_PAGE_BREAK, false);
    try {
      input = new BufferedInputStream(ConfigUtils.getConfigStream(AllTests.BUNDLE_ID, fileName));
      final String[] result = callPipeline(fileName, input, additionalRecordParams);
      assertEquals(1, result.length);
      assertEquals(id, result[0]);

      final AnyMap metadata = _blackboard.getMetadata(id);
      assertFalse("No parts attribute expected", metadata.containsKey(PARTS_ATTRIBUTE));
      final String text = metadata.getStringValue(TEXT_ATTRIBUTE);
      // fail with an assertion instead of a NullPointerException if the text is missing
      assertNotNull("Text attribute expected on top level", text);
      assertTrue(text.contains("Test"));
      assertTrue(text.contains("Datenreihe"));
      assertTopLevelAttributes(metadata, id, fileName, additionalRecordParams);
    } finally {
      IOUtils.closeQuietly(input);
    }
  }

  /**
   * Asserts the top-level attributes that are checked identically for each single-record result in this test: record
   * id, source, file name and the parameter map.
   * 
   * @param metadata
   *          metadata of the result record as read from the blackboard.
   * @param id
   *          expected record id.
   * @param fileName
   *          expected value of the file name attribute.
   * @param expectedParams
   *          parameter map expected under the "_parameters" key.
   */
  private void assertTopLevelAttributes(final AnyMap metadata, final String id, final String fileName,
    final AnyMap expectedParams) {
    assertEquals(id, metadata.getStringValue(Record.RECORD_ID));
    assertEquals("source", metadata.getStringValue(Record.SOURCE));
    assertEquals(fileName, metadata.getStringValue(ConverterPipelineTestBase.FILENAME_ATTRIBUTE));
    assertEquals(expectedParams, metadata.getMap("_parameters"));
  }
}
