/*******************************************************************************
 * Copyright (c) 2008, 2011 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the
 * accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this
 * distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
 * 
 * Contributors: Andreas Weber (Attensity Europe GmbH) - initial API and implementation
 *******************************************************************************/
package org.eclipse.smila.importing.crawler.web.extractor;

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;

import org.apache.commons.httpclient.URI;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.eclipse.smila.datamodel.AnyMap;
import org.eclipse.smila.datamodel.DataFactory;
import org.eclipse.smila.datamodel.Record;
import org.eclipse.smila.importing.crawler.web.LinkExtractor;
import org.eclipse.smila.importing.crawler.web.WebCrawlerConstants;
import org.eclipse.smila.importing.crawler.web.WebCrawlerException;
import org.eclipse.smila.taskworker.TaskLog;

/**
 * Simple {@link LinkExtractor} implementation using an HTML extractor.
 */
public class SimpleLinkExtractor implements LinkExtractor {

  /** default HTML extractor implementation to use. */
  private LinkExtractorHtml _extractorHtml = new LinkExtractorHtmlSoup();

  /** local logger. */
  private final Log _log = LogFactory.getLog(getClass());

  @Override
  public Collection<Record> extractLinks(final Record inputRecord, final AnyMap parameters, final TaskLog taskLog)
    throws WebCrawlerException {
    final Collection<Record> result = new ArrayList<Record>();
    InputStream contentStream = null;
    final String baseUri = inputRecord.getMetadata().getStringValue(WebCrawlerConstants.ATTRIBUTE_URL);
    try {
      final byte[] htmlContent = inputRecord.getAttachmentAsBytes(WebCrawlerConstants.ATTACHMENT_CONTENT);
      contentStream = new ByteArrayInputStream(htmlContent);
      final Collection<String> links = _extractorHtml.extractLinks(contentStream, parameters);
      for (final String link : links) {
        final String absLink = getAbsoluteUri(baseUri, link);
        if (absLink != null) {
          final Record r = DataFactory.DEFAULT.createRecord();
          r.getMetadata().put(WebCrawlerConstants.ATTRIBUTE_URL, absLink);
          result.add(r);
        }
      }
      return result;
    } catch (final Exception e) {
      throw new WebCrawlerException("Error while extracting links from record with base URI " + baseUri);
    } finally {
      IOUtils.closeQuietly(contentStream);
    }
  }

  /** @return absolute URI from given URI by using base URI. */
  public String getAbsoluteUri(final String baseUri, final String uri) throws URIException {
    try {
      URI linkUri = new URI(uri.trim(), false); // 'false' because extracted link is not escaped 
      if (linkUri.isRelativeURI()) {
        if (baseUri == null) {
          return null;
        }
        linkUri = new URI(new URI(baseUri, true), linkUri); // 'true' because base URI is expected to be escaped
      }
      return linkUri.getEscapedURI(); // return escaped URI (could become new base URI in follow up task) 
    } catch (final Exception e) {
      _log.warn("Error while creating result uri from link '" + uri + "' and baseURI '" + baseUri + "'");
      return null;
    }
  }

  /** sets the HTML extractor implementation to use. */
  public void setLinkExtractorHtml(final LinkExtractorHtml linkExtractorHtml) {
    _extractorHtml = linkExtractorHtml;
  }

}
