/**********************************************************************************************************************
 * Copyright (c) 2008, 2013 Empolis Information Management GmbH and brox IT Solutions GmbH. All rights reserved. This
 * program and the accompanying materials are made available under the terms of the Eclipse Public License v1.0 which
 * accompanies this distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
 * 
 * Contributors: Andreas Schank (Empolis Information Management GmbH) - initial implementation
 **********************************************************************************************************************/
package org.eclipse.smila.tika;

import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;

import javax.xml.transform.OutputKeys;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerFactoryConfigurationError;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.WriteOutContentHandler;
import org.eclipse.smila.blackboard.Blackboard;
import org.eclipse.smila.blackboard.Blackboard.Get;
import org.eclipse.smila.blackboard.BlackboardAccessException;
import org.eclipse.smila.common.mimetype.MimeTypeIdentifier;
import org.eclipse.smila.common.mimetype.MimeTypeParseException;
import org.eclipse.smila.datamodel.Any;
import org.eclipse.smila.datamodel.AnyMap;
import org.eclipse.smila.datamodel.AnySeq;
import org.eclipse.smila.datamodel.DataFactory;
import org.eclipse.smila.datamodel.Record;
import org.eclipse.smila.processing.ProcessingException;
import org.eclipse.smila.processing.parameters.ParameterAccessor;
import org.eclipse.smila.processing.pipelets.ATransformationPipelet;
import org.eclipse.smila.processing.util.ProcessingConstants;
import org.eclipse.smila.processing.util.ResultCollector;
import org.eclipse.smila.tika.internal.PageBreakWriteOutContentHandler;
import org.eclipse.smila.tika.internal.TemporaryResourcesHelper;
import org.eclipse.smila.utils.service.ServiceUtils;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

/**
 * The TikaPipelet can be used for converting various document formats to text and extracting metadata.
 * <p>
 * For every input record the pipelet reads the input stream, lets the Tika {@link Parser} service convert it to plain
 * text or (X)HTML, optionally splits the result on page breaks, optionally removes end-of-line hyphens and copies the
 * configured Tika metadata entries to record attributes.
 */
public class TikaPipelet extends ATransformationPipelet {

  /**
   * (Optional) parameter referencing the attribute that contains the content type (e.g. "text/plain") of the attachment
   * content. The parameter (resp. attribute) may not be set (null/empty) and then a content type detection is
   * performed.
   */
  public static final String PROP_ATTACHMENT_CONTENT_TYPE_ATTRIBUTE = "contentTypeAttribute";

  /**
   * (Optional) parameter referencing the attribute that contains the file name, which may give the Tika parser a hint
   * about how to parse the file.
   */
  public static final String PROP_FILE_NAME_ATTRIBUTE = "fileNameAttribute";

  /**
   * (Optional) parameter that defines what to extract from input and copy into record attributes with the name of the
   * extracted properties. This is a list of maps or one single map (if only one mapping is needed), each describing
   * the mapping of one metadata attribute. The metadata keys will be matched case-insensitively.
   */
  public static final String PROP_EXTRACT_PROPERTIES = "extractProperties";

  /**
   * (Optional) parameter that defines if the content should be transformed to (X)HTML (true) or plain text (false).
   * Default: false.
   */
  public static final String PROP_EXPORT_AS_HTML = "exportAsHtml";

  /**
   * (Optional) parameter that defines whether page breaks should be marked with a {@code <pagebreak/>} in the result.
   * Default: false.
   */
  public static final String PROP_PAGE_BREAK = "pageBreak";

  /**
   * (Optional) parameter referencing the attribute that contains the page number. This is only evaluated if the page
   * break parameter is set to true. If not specified the attribute is not set. Default: not set.
   */
  public static final String PROP_PAGE_NUMBER_ATTRIBUTE = "pageNumberAttribute";

  /**
   * (Optional) If the parameter is set, the content is split into multiple parts using page-breaks and the parts,
   * represented by a sequence of maps, are stored to an output record. The parameter defines the key for this sequence.
   * Default: not set. The parameter is evaluated only if the page break parameter is set to true.
   */
  public static final String PROP_PARTS_ATTRIBUTE = "partsAttribute";

  /**
   * (Optional) parameter that defines whether the hyphens should be kept in the output (as in the input) (true) or
   * whether the software should try to remove the hyphens by a heuristic approach. Default: true.
   */
  public static final String PROP_KEEP_HYPHENS = "keepHyphens";

  /**
   * (Optional) parameter that defines how many characters of the content should be extracted, to guard against running
   * out of memory. Defaults to -1 ("unlimited").
   */
  public static final String PROP_MAX_LENGTH = "maxLength";

  /** default length. -1 is "unlimited". */
  public static final int DEFAULT_MAX_LENGTH = -1;

  /** name of the metadata field (will be matched case-insensitively). */
  public static final String PROP_MAPPING_METADATA_NAME = "metadataName";

  /**
   * name of the target attribute for the metadata entry (optional, default: the value of
   * {@link TikaPipelet#PROP_MAPPING_METADATA_NAME} with its original case).
   */
  public static final String PROP_MAPPING_TARGET_ATTRIBUTE = "targetAttribute";

  /**
   * whether only one result (true) or multiple results, if available, (false) should be considered. Default: false.
   */
  public static final String PROP_MAPPING_SINGLE_RESULT = "singleResult";

  /** (Optional) parameter that defines how the extracted properties are stored in the target attribute. */
  public static final String PROP_MAPPING_STORE_MODE = "storeMode";

  /** separator used to create the record id of records split when pageBreak=true. */
  public static final String SPLIT_ID_SEPARATOR = "###";

  /** possible values for parameter 'storeMode'. If the parameter is not set, {@link #add} is used. */
  public enum StoreMode {
    leave, overwrite, add
  }

  /** Page start tag after the content was exported as (X)HTML. */
  public static final String DIV_PAGE_START_TAG = "<div class=\"page\">";

  /** Page end tag after the content was exported as (X)HTML. */
  public static final String DIV_PAGE_END_TAG = "</div>";

  /**
   * Matches a hyphen (optionally followed by one blank) at the end of a line, but only between two letters. The
   * letters are checked with zero-width positive lookbehind/lookahead. EOLs can be line-feed, carriage-return,
   * next-line, line-separator, paragraph-separator. Compiled once because it is applied to every extracted text.
   */
  private static final Pattern EOL_HYPHEN_PATTERN = Pattern
    .compile("(?<=\\p{L})- ?(?:[\r\n\u0085\u2028\u2029]+)(?=\\p{L})");

  /** local logger. */
  private final Log _log = LogFactory.getLog(getClass());

  /** Tika parser service, looked up lazily by {@link #getParser()}. */
  private Parser _parser;

  /** mime type identifier service, looked up lazily by {@link #getMimeTypeIdentifier()}. */
  private MimeTypeIdentifier _identifier;

  /** {@inheritDoc} */
  @Override
  public void configure(final AnyMap configuration) throws ProcessingException {
    super.configure(configuration);
  }

  /**
   * Processes each record independently: a record with readable input data is converted and its result record ids are
   * collected; a record without input data is passed through unchanged; a record whose processing throws an exception
   * is reported to the {@link ResultCollector} as failed.
   * 
   * {@inheritDoc}
   */
  @Override
  public String[] process(final Blackboard blackboard, final String[] recordIds) throws ProcessingException {
    final ParameterAccessor paramAccessor = new ParameterAccessor(blackboard, _config);
    final ResultCollector resultCollector =
      new ResultCollector(paramAccessor, _log, ProcessingConstants.DROP_ON_ERROR_DEFAULT);
    // Classloading Workaround:
    // Some dependency bundles of Tika (e.g. Rome) use the TCCL to initialize. By setting the TCCL to the classloader
    // of this bundle, we can add these dependency bundles as Required-Bundle to the manifest of this pipelet bundle
    // to make it work.
    final ClassLoader tcclBackup = Thread.currentThread().getContextClassLoader();
    Thread.currentThread().setContextClassLoader(getClass().getClassLoader());
    try {
      for (final String id : recordIds) {
        // read runtime parameters
        paramAccessor.setCurrentRecord(id);
        try (final InputStream dataStream = getInputStream(blackboard, id, paramAccessor)) {
          if (dataStream != null && dataStream.available() > 0) {
            final List<String> resultIds = convertAndExtract(blackboard, id, dataStream, paramAccessor);
            for (final String resultId : resultIds) {
              resultCollector.addResult(resultId);
            }
          } else {
            // nothing to convert: keep the record in the result unchanged
            resultCollector.addResult(id);
          }
        } catch (final Exception e) {
          resultCollector.addFailedResult(id, e);
        }
      }
    } finally {
      // always restore the caller's context classloader
      Thread.currentThread().setContextClassLoader(tcclBackup);
    }
    return resultCollector.getResultIds();
  }

  /**
   * Reads and prepares the extract properties (keys will be made case-insensitive).
   * <p>
   * Each valid mapping entry is cloned, gets a default target attribute (the metadata name in its original case) if
   * none is configured, and is registered under the lower-cased metadata name. Entries that are not maps or that lack
   * a metadata name are logged as errors and ignored.
   * 
   * @param extractPropsAny
   *          the configured mappings, may be null
   * @return map from lower-cased metadata name to the prepared mapping entry, never null
   */
  private AnyMap readAndPrepareExtractProperties(final Any extractPropsAny) {
    final AnyMap result = DataFactory.DEFAULT.createAnyMap();
    if (extractPropsAny == null) {
      return result;
    }
    for (final Any mappingEntry : extractPropsAny) {
      if (!mappingEntry.isMap()) {
        _log.error("TikaPipelet configuration " + PROP_EXTRACT_PROPERTIES
          + " must contain maps describing the mapping.");
        continue;
      }
      final AnyMap newEntry = DataFactory.DEFAULT.cloneAnyMap(mappingEntry.asMap());
      final String metadataName = newEntry.getStringValue(PROP_MAPPING_METADATA_NAME);
      if (metadataName == null) {
        // without a metadata name there is nothing to match against, so skip instead of failing with an NPE
        _log.error("TikaPipelet configuration " + PROP_EXTRACT_PROPERTIES + " contains a mapping without '"
          + PROP_MAPPING_METADATA_NAME + "', the mapping is ignored.");
        continue;
      }
      // create default target with original name
      if (!newEntry.containsKey(PROP_MAPPING_TARGET_ATTRIBUTE)) {
        newEntry.put(PROP_MAPPING_TARGET_ATTRIBUTE, metadataName);
      }
      final String newName = metadataName.toLowerCase(Locale.ENGLISH);
      newEntry.put(PROP_MAPPING_METADATA_NAME, newName);
      result.put(newName, newEntry);
    }
    return result;
  }

  /**
   * Converts the dataStream contained in the record, extracts plain or HTML text, eventually inserts pagebreaks,
   * removes hyphens, extracts metadata and stores the result in the record.
   * <p>
   * If the parser aborts because the configured maximum length was reached, the text extracted so far is stored. Any
   * other failure during parsing or result handling is logged and rethrown wrapped in an {@link IOException}.
   * 
   * @param blackboard
   *          blackboard holding the record
   * @param id
   *          id of the record to convert
   * @param dataStream
   *          the content to parse; it is not closed by this method
   * @param paramAccessor
   *          accessor for the pipelet parameters of the current record
   * @return the ids of the records that contain the results
   */
  private List<String> convertAndExtract(final Blackboard blackboard, final String id,
    final InputStream dataStream, final ParameterAccessor paramAccessor) throws TransformerConfigurationException,
    TikaException, ProcessingException, SAXException, BlackboardAccessException, IOException,
    MimeTypeParseException {
    // read parameters from config or input record
    final String contentTypeAttribute = paramAccessor.getParameter(PROP_ATTACHMENT_CONTENT_TYPE_ATTRIBUTE, null);
    final String fileNameAttribute = paramAccessor.getParameter(PROP_FILE_NAME_ATTRIBUTE, null);
    final String pageNumberAttribute = paramAccessor.getParameter(PROP_PAGE_NUMBER_ATTRIBUTE, null);
    final boolean exportAsHtml = paramAccessor.getBooleanParameter(PROP_EXPORT_AS_HTML, false);
    final boolean pageBreak = paramAccessor.getBooleanParameter(PROP_PAGE_BREAK, false);
    // hyphens are kept unless removal is explicitly requested, as documented for PROP_KEEP_HYPHENS
    final boolean keepHyphens = paramAccessor.getBooleanParameter(PROP_KEEP_HYPHENS, true);
    final int maxLength = paramAccessor.getIntParameter(PROP_MAX_LENGTH, DEFAULT_MAX_LENGTH);
    final Any extractPropsAny = paramAccessor.getParameterAny(PROP_EXTRACT_PROPERTIES);
    final AnyMap extractProperties = readAndPrepareExtractProperties(extractPropsAny);
    // both attribute names are optional, so only look them up in the record if they are configured
    final AnyMap recordMetadata = blackboard.getMetadata(id);
    final String inputFileName = fileNameAttribute == null ? null : recordMetadata.getStringValue(fileNameAttribute);
    final String inputContentType =
      contentTypeAttribute == null ? null : recordMetadata.getStringValue(contentTypeAttribute);

    final Metadata metadata = new Metadata();
    if (inputFileName != null && !inputFileName.isEmpty()) {
      metadata.set(Metadata.RESOURCE_NAME_KEY, inputFileName);
    }

    try {
      if (_log.isDebugEnabled()) {
        _log.debug("Start parsing document " + id + ".");
      }
      final Writer writer = new StringWriter();
      final WriteOutContentHandler writeOutHandler =
        createWriteOutHandler(writer, exportAsHtml, pageBreak, maxLength);
      final ContentHandler contentHandler = createContentHandler(writeOutHandler, exportAsHtml);

      final TemporaryResources tmp = new TemporaryResources();
      try {
        final Parser p = getParser();
        final ParseContext context = new ParseContext();
        context.set(Parser.class, p);

        final InputStream tis = TikaInputStream.get(dataStream, tmp);

        final String contentType = determineContentType(inputContentType, inputFileName, tis);
        metadata.set(Metadata.CONTENT_TYPE, contentType);
        checkIfContentTypeIsSupported(id, p, context, metadata.get(Metadata.CONTENT_TYPE));

        p.parse(tis, contentHandler, metadata, context);

        handleMetadata(blackboard, id, contentTypeAttribute, extractProperties, metadata);
        return handleParseResult(blackboard, id, paramAccessor, pageBreak, pageNumberAttribute, keepHyphens, writer);
      } catch (final SAXException e) {
        // check if size limit was reached and return limited data
        if (writeOutHandler.isWriteLimitReached(e)) {
          handleMetadata(blackboard, id, contentTypeAttribute, extractProperties, metadata);
          return handleParseResult(blackboard, id, paramAccessor, pageBreak, pageNumberAttribute, keepHyphens,
            writer);
        } else {
          throw e;
        }
      } finally {
        TemporaryResourcesHelper.disposeQuietly(tmp);
      }
    } catch (final NoClassDefFoundError e) {
      _log.warn("Could not find java class needed to parse document " + id + ".", e);
      throw new IOException(e);
    } catch (final Throwable t) {
      _log.warn("Could not parse document " + id + ".", t);
      throw new IOException(t);
    } finally {
      _log.debug("Parsing with Tika done.");
    }
  }

  /**
   * Creates the handler that writes the extracted content to the given writer, limited to maxLength characters.
   * HTML export takes precedence over page break handling: the page break handler is only used for plain text.
   */
  private WriteOutContentHandler createWriteOutHandler(final Writer writer, final boolean exportAsHtml,
    final boolean pageBreak, final int maxLength) throws TransformerFactoryConfigurationError,
    TransformerConfigurationException {
    if (exportAsHtml) {
      return createHtmlExtractWriteOutHandler(maxLength, writer);
    }
    if (pageBreak) {
      return new PageBreakWriteOutContentHandler(writer, maxLength);
    }
    return new WriteOutContentHandler(writer, maxLength);
  }

  /**
   * Creates the content handler passed to the parser: for HTML export the write-out handler is used directly, for
   * plain text it is wrapped in a {@link BodyContentHandler}.
   */
  private ContentHandler createContentHandler(final WriteOutContentHandler writeOutHandler,
    final boolean exportAsHtml) {
    if (exportAsHtml) {
      return writeOutHandler;
    }
    return new BodyContentHandler(writeOutHandler);
  }

  /**
   * Returns the given content type if it is set, otherwise lets the {@link MimeTypeIdentifier} service detect it from
   * the stream and the file name extension. After a detection attempt the stream is reset so that it can be parsed.
   */
  private String determineContentType(final String inputContentType, final String inputFileName,
    final InputStream tis) throws ProcessingException, MimeTypeParseException, IOException {
    if (inputContentType != null && !inputContentType.isEmpty()) {
      return inputContentType;
    }
    final MimeTypeIdentifier d = getMimeTypeIdentifier();
    try {
      return d.identify(tis, getExtension(inputFileName));
    } finally {
      tis.reset();
    }
  }

  /**
   * Logs a warning if the content type (without parameters such as a charset) is not among the types supported by the
   * parser. This is a diagnostic only, processing continues in any case.
   */
  private void checkIfContentTypeIsSupported(final String id, final Parser p, final ParseContext context,
    final String contentType) {
    boolean isContentTypeSupported = false;
    if (contentType != null) {
      final String[] contentTypeParts = StringUtils.split(contentType, '/');
      if (contentTypeParts.length == 2) {
        final String type = contentTypeParts[0];
        String subtype = contentTypeParts[1];
        // cut off parameters like "; charset=..."
        final int parametersStart = subtype.indexOf(';');
        if (parametersStart > 0) {
          subtype = subtype.substring(0, parametersStart);
        }
        final MediaType mtContentType = new MediaType(type, subtype);
        isContentTypeSupported = p.getSupportedTypes(context).contains(mtContentType);
      }
    }
    if (!isContentTypeSupported) {
      _log.warn("Content type '" + contentType + "' is not supported. Document '" + id
        + "' may not be processed properly.");
    }
  }

  /**
   * Post-processes the extracted text and stores it. With page breaks enabled the text is split into pages, which are
   * stored either as parts of the input record (if the parts attribute is configured) or as separate records.
   * 
   * @return the ids of the records that contain the results
   */
  private List<String> handleParseResult(final Blackboard blackboard, final String id,
    final ParameterAccessor paramAccessor, final boolean pageBreak, final String pageNumberAttribute,
    final boolean keepHyphens, final Writer writer) throws BlackboardAccessException, ProcessingException {
    final String text = handleExtractedText(keepHyphens, writer.toString());
    if (!pageBreak) {
      storeResult(blackboard, id, text, paramAccessor);
      return Collections.singletonList(id);
    }
    final String partsAttribute = paramAccessor.getParameter(PROP_PARTS_ATTRIBUTE, "");
    if (!partsAttribute.isEmpty()) {
      return splitOnPageBreaksAsParts(blackboard, id, partsAttribute, pageNumberAttribute, text, paramAccessor);
    }
    return splitOnPageBreaks(blackboard, id, pageNumberAttribute, text, paramAccessor);
  }

  /**
   * @return the part of the file name after the last dot, or null if the name is null, has no dot or ends with the
   *         dot.
   */
  private String getExtension(final String inputFileName) {
    if (inputFileName != null) {
      final int dotIndex = inputFileName.lastIndexOf('.');
      if (dotIndex >= 0 && dotIndex < inputFileName.length() - 1) {
        return inputFileName.substring(dotIndex + 1);
      }
    }
    return null;
  }

  /** Creates and returns a {@link WriteOutContentHandler} suited for extracting HTML text. */
  private WriteOutContentHandler createHtmlExtractWriteOutHandler(final int maxLength, final Writer writer)
    throws TransformerFactoryConfigurationError, TransformerConfigurationException {
    final SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
    final TransformerHandler transformerHandler = factory.newTransformerHandler();
    transformerHandler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
    transformerHandler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
    transformerHandler.getTransformer().setOutputProperty(OutputKeys.ENCODING, "UTF-8");
    transformerHandler.getTransformer().setOutputProperty("{http://xml.apache.org/xalan}line-separator", "\n");
    transformerHandler.setResult(new StreamResult(writer));
    return new WriteOutContentHandler(transformerHandler, maxLength);
  }

  /** extract metadata according to extract configuration, also fills the contentType-Attribute, if set. */
  private void handleMetadata(final Blackboard blackboard, final String id, final String contentTypeAttribute,
    final AnyMap extractProperties, final Metadata metadata) throws BlackboardAccessException {
    // fill the contentType attribute
    if (contentTypeAttribute != null) {
      blackboard.getMetadata(id).put(contentTypeAttribute, metadata.get(Metadata.CONTENT_TYPE));
    }

    mapMetadata(blackboard, id, extractProperties, metadata);
  }

  /**
   * Maps the metadata according to the mapping configuration.
   * <p>
   * For every Tika metadata key with a configured mapping the value (a sequence for multi-valued entries unless a
   * single result is requested, otherwise a single string) is written to the target attribute: store mode
   * {@code overwrite} replaces the attribute, {@code add} (the default) appends to it, and for {@code leave} this
   * method writes nothing.
   */
  private void mapMetadata(final Blackboard blackboard, final String id, final AnyMap extractProperties,
    final Metadata metadata) throws BlackboardAccessException {
    // iterate over the metadata and check if we've got a mapping defined (case-insensitive!)
    for (final String metaDataKey : metadata.names()) {
      final AnyMap mappingEntry = extractProperties.getMap(metaDataKey.toLowerCase(Locale.ENGLISH));
      if (mappingEntry == null) {
        continue;
      }
      final boolean multiResultAllowed =
        !mappingEntry.containsKey(PROP_MAPPING_SINGLE_RESULT)
          || !mappingEntry.getBooleanValue(PROP_MAPPING_SINGLE_RESULT);
      StoreMode storeMode = StoreMode.add;
      if (mappingEntry.containsKey(PROP_MAPPING_STORE_MODE)) {
        storeMode = StoreMode.valueOf(mappingEntry.getStringValue(PROP_MAPPING_STORE_MODE));
      }
      Any value;
      if (metadata.isMultiValued(metaDataKey) && multiResultAllowed) {
        value = DataFactory.DEFAULT.createAnySeq();
        for (final String metaDataStringValue : metadata.getValues(metaDataKey)) {
          value.asSeq().add(metaDataStringValue);
        }
      } else {
        value = DataFactory.DEFAULT.createStringValue(metadata.get(metaDataKey));
      }

      final String targetAttribute = mappingEntry.getStringValue(PROP_MAPPING_TARGET_ATTRIBUTE);
      if (storeMode == StoreMode.overwrite) {
        blackboard.getMetadata(id).put(targetAttribute, value);
      } else if (storeMode == StoreMode.add) {
        if (value.isSeq()) {
          for (final Any v : value.asSeq()) {
            blackboard.getMetadata(id).add(targetAttribute, v);
          }
        } else if (value.isValue()) {
          if (blackboard.getMetadata(id).get(targetAttribute) == null) {
            blackboard.getMetadata(id).put(targetAttribute, value);
          } else {
            blackboard.getMetadata(id).add(targetAttribute, value);
          }
        }
      } // else: leave old value
    }
  }

  /**
   * Trims the extracted text and, if keepHyphens is false, removes hyphens at the end of a line that stand between
   * two letters (together with the line break).
   */
  private String handleExtractedText(final boolean keepHyphens, final String inputText) {
    final String trimmed = inputText.trim();
    if (keepHyphens) {
      return trimmed;
    }
    return EOL_HYPHEN_PATTERN.matcher(trimmed).replaceAll("");
  }

  /**
   * @return a Parser service.
   * 
   * @throws ProcessingException
   *           could not find a service
   */
  private synchronized Parser getParser() throws ProcessingException {
    if (_parser == null) {
      try {
        _parser = ServiceUtils.getService(Parser.class);
      } catch (final Exception ex) {
        _log.warn("Error while waiting for Tika Parser service to come up.", ex);
      }
      if (_parser == null) {
        throw new ProcessingException("No Tika Parser service available, giving up");
      }
    }
    return _parser;
  }

  /**
   * @return a {@link MimeTypeIdentifier} service.
   * 
   * @throws ProcessingException
   *           could not find a service
   */
  private synchronized MimeTypeIdentifier getMimeTypeIdentifier() throws ProcessingException {
    if (_identifier == null) {
      try {
        _identifier = ServiceUtils.getService(MimeTypeIdentifier.class);
      } catch (final Exception ex) {
        _log.warn("Error while waiting for MimeTypeIdentifier service to come up.", ex);
      }
      if (_identifier == null) {
        throw new ProcessingException("No MimeTypeIdentifier service available, giving up");
      }
    }
    return _identifier;
  }

  /**
   * Splits the text into pages and stores each page in a new record of its own. The new records get the id of the
   * input record extended by {@link #SPLIT_ID_SEPARATOR} and the 1-based page number, a copy of the input record's
   * metadata and, if configured, the page number attribute. If the text yields at most one page, the complete text is
   * stored in the input record instead.
   * 
   * @return the ids of the records that contain the results
   */
  private List<String> splitOnPageBreaks(final Blackboard blackboard, final String id,
    final String pageNumberAttribute, final String text, final ParameterAccessor paramAccessor)
    throws ProcessingException, BlackboardAccessException {
    final List<String> ids = new ArrayList<>();
    final boolean exportAsHtml = paramAccessor.getBooleanParameter(PROP_EXPORT_AS_HTML, false);
    final String[] pages = splitTextToPages(text, exportAsHtml);
    if (pages.length <= 1) {
      storeResult(blackboard, id, text, paramAccessor);
      ids.add(id);
    } else {
      for (int i = 0; i < pages.length; i++) {
        final int page = i + 1;
        final String generatedid = id + SPLIT_ID_SEPARATOR + page;
        final Record record = blackboard.getRecord(generatedid, Get.NEW);
        final AnyMap metaData = DataFactory.DEFAULT.cloneAnyMap(blackboard.getMetadata(id));
        // the new record must keep its own generated id
        metaData.remove(Record.RECORD_ID);
        if (pageNumberAttribute != null) {
          metaData.put(pageNumberAttribute, page);
        }
        record.getMetadata().putAll(metaData);
        storeResult(blackboard, generatedid, pages[i], paramAccessor);
        ids.add(generatedid);
      }
    }
    return ids;
  }

  /**
   * Splits the text into pages and stores them as a sequence of maps in the given parts attribute of the input
   * record. Each map contains the page text under the output name and, if configured, the 1-based page number.
   * 
   * @return a list containing only the id of the input record
   */
  private List<String> splitOnPageBreaksAsParts(final Blackboard blackboard, final String id,
    final String partsAttribute, final String pageNumberAttribute, final String text,
    final ParameterAccessor paramAccessor) throws ProcessingException, BlackboardAccessException {
    final String outputName = getOutputName(paramAccessor);
    final boolean exportAsHtml = paramAccessor.getBooleanParameter(PROP_EXPORT_AS_HTML, false);
    final String[] pages = splitTextToPages(text, exportAsHtml);
    final AnySeq parts = DataFactory.DEFAULT.createAnySeq();
    for (int i = 0; i < pages.length; i++) {
      final AnyMap part = DataFactory.DEFAULT.createAnyMap();
      part.put(outputName, pages[i]);
      if (pageNumberAttribute != null) {
        part.put(pageNumberAttribute, i + 1);
      }
      parts.add(part);
    }
    blackboard.getMetadata(id).put(partsAttribute, parts);
    final List<String> ids = new ArrayList<>();
    ids.add(id);
    return ids;
  }

  /**
   * Returns the texts enclosed by the page start and end tags (the (X)HTML page div tags for HTML export, otherwise
   * the tags of the {@link PageBreakWriteOutContentHandler}). If no page is found, the complete text is returned as
   * one single page.
   */
  private String[] splitTextToPages(final String text, final boolean exportAsHtml) {
    final String pageStartTag = exportAsHtml ? DIV_PAGE_START_TAG : PageBreakWriteOutContentHandler.PAGE_START_TAG;
    final String pageEndTag = exportAsHtml ? DIV_PAGE_END_TAG : PageBreakWriteOutContentHandler.PAGE_END_TAG;
    final String[] pages = StringUtils.substringsBetween(text, pageStartTag, pageEndTag);
    if (pages == null) { // no pages at all? -> pageBreak not supported on this document type, return as one page
      return new String[] { text };
    }
    return pages;
  }
}
