/*******************************************************************************
 * Copyright (c) 2008, 2013 Empolis Information Management GmbH and brox IT Solutions GmbH. All rights reserved. This
 * program and the accompanying materials are made available under the terms of the Eclipse Public License v1.0 which
 * accompanies this distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
 * 
 * Contributors: Andreas Weber (Empolis Information Management GmbH) - initial API and implementation
 *******************************************************************************/
package org.eclipse.smila.tika.test;

import org.eclipse.smila.datamodel.AnyMap;
import org.eclipse.smila.datamodel.DataFactory;
import org.eclipse.smila.tika.TikaPipelet;

/**
 * Tests the 'maxLength' parameter ({@link TikaPipelet#PROP_MAX_LENGTH}) for limiting the extracted text length.
 * Each test first runs the extraction without a limit and then with a limit, comparing the results.
 */
public class TestMaxLength extends ConverterPipelineTestBase {

  /**
   * Runs the text extraction for the given file with optional 'maxLength' and 'exportAsHtml' parameters.
   * 
   * @param fileName
   *          name of the test file, passed on unchanged to {@code executeTest}.
   * @param maxLength
   *          value for {@link TikaPipelet#PROP_MAX_LENGTH}; if {@code null} the parameter is not set at all.
   * @param asHtml
   *          if {@code true}, {@link TikaPipelet#PROP_EXPORT_AS_HTML} is set; if {@code null} or {@code false}
   *          the parameter is not set at all.
   * @return the result of {@code executeTest} for the file and the assembled parameters.
   * @throws Exception
   *           if the test execution fails.
   */
  protected String doTextExtraction(final String fileName, final Integer maxLength, final Boolean asHtml)
    throws Exception {
    final AnyMap additionalRecordParams = DataFactory.DEFAULT.createAnyMap();
    if (maxLength != null) {
      additionalRecordParams.put(TikaPipelet.PROP_MAX_LENGTH, maxLength);
    }
    // only an explicit 'true' is passed on, null and false both leave the parameter unset.
    if (asHtml != null && asHtml) {
      additionalRecordParams.put(TikaPipelet.PROP_EXPORT_AS_HTML, asHtml);
    }
    return executeTest(fileName, additionalRecordParams);
  }

  /**
   * Test txt: the unlimited result starts with "SMILA", with maxLength=3 exactly "SMI" is expected.
   * 
   * @throws Exception
   *           if the extraction fails.
   */
  public void testTXT() throws Exception {
    final String fileName = "test.txt";
    String result = doTextExtraction(fileName, null, null);
    // report the actual text on failure, like the other tests of this class do.
    assertTrue("was: " + result, result.startsWith("SMILA"));
    result = doTextExtraction(fileName, 3, null);
    assertEquals("SMI", result);
  }

  /**
   * Test utf-8 txt: the expected text starts with the non-ASCII character U+00EA, with maxLength=5 exactly the
   * first five characters are expected.
   * 
   * @throws Exception
   *           if the extraction fails.
   */
  public void testUTF8TXT() throws Exception {
    final String fileName = "utf-8.txt";
    String result = doTextExtraction(fileName, null, null);
    assertTrue("was: " + result, result.startsWith("\u00ea SMILA"));
    result = doTextExtraction(fileName, 5, null);
    assertEquals("\u00ea SMI", result);
  }

  /**
   * Test docx 2010: checks plain text extraction without and with limit, and HTML export with limit.
   * 
   * @throws Exception
   *           if the extraction fails.
   */
  public void testMSOFFICE2010DOCX() throws Exception {
    final String fileName = "SMILA_DOCX_2010.docx";
    String result = doTextExtraction(fileName, null, null);
    assertTrue("was: " + result, result.contains("SMILA"));
    // we need maxLength=6 because Tika will also extract 3 leading whitespaces
    result = doTextExtraction(fileName, 6, null);
    assertFalse("was: " + result, result.contains("SMILA"));
    assertTrue("was: " + result, result.contains("SMI"));
    // html tags are not taken into account for maxLength parameter,
    // nevertheless we have to set a higher maxLength parameter here.
    final String htmlResult = doTextExtraction(fileName, 11, true);
    assertFalse("was: " + htmlResult, htmlResult.contains("SMILA"));
    assertTrue("was: " + htmlResult, htmlResult.contains("SMI"));
    assertTrue("was: " + htmlResult, htmlResult.contains("<html"));
    // the html result carries markup in addition to the text, so it must be longer than the plain text result.
    assertTrue("html result (length " + htmlResult.length() + ") should be longer than text result (length "
      + result.length() + "), html was: " + htmlResult, htmlResult.length() > result.length());
  }
}
