/*******************************************************************************
 * Copyright (c) 2008, 2011 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the
 * accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this
 * distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
 * 
 * Contributors: Andreas Weber (Attensity Europe GmbH) - initial API and implementation
 *******************************************************************************/
package org.eclipse.smila.importing.crawler.web.extractor;

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;

import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.eclipse.smila.datamodel.DataFactory;
import org.eclipse.smila.datamodel.Record;
import org.eclipse.smila.importing.crawler.web.LinkExtractor;
import org.eclipse.smila.importing.crawler.web.WebCrawlerConstants;
import org.eclipse.smila.importing.crawler.web.WebCrawlerException;
import org.eclipse.smila.importing.crawler.web.WebCrawlingContext;
import org.eclipse.smila.importing.crawler.web.utils.UriHelper;
import org.eclipse.smila.taskworker.TaskLogWarn;

/**
 * Simple {@link LinkExtractor} implementation using an HTML extractor.
 */
public class DefaultLinkExtractor implements LinkExtractor {

  /** default HTML extractor implementation to use. */
  private LinkExtractorHtml _extractorHtml = new LinkExtractorHtmlSoup();

  /** local logger. */
  private final Log _log = LogFactory.getLog(getClass());

  @Override
  public Collection<Record> extractLinks(final Record inputRecord, final WebCrawlingContext context)
    throws WebCrawlerException {
    final Collection<Record> result = new ArrayList<Record>();
    InputStream contentStream = null;
    final String baseUri = inputRecord.getMetadata().getStringValue(WebCrawlerConstants.ATTRIBUTE_URL);
    try {
      final byte[] htmlContent = inputRecord.getAttachmentAsBytes(WebCrawlerConstants.ATTACHMENT_CONTENT);
      contentStream = new ByteArrayInputStream(htmlContent);
      final Collection<String> links =
        _extractorHtml.extractLinks(contentStream, context.getTaskParameters(),
          new TaskLogWarn(context.getTaskLog()));
      for (final String link : links) {
        final String normalizedLink = normalizeLink(baseUri, link);
        if (normalizedLink != null) {
          final Record r = DataFactory.DEFAULT.createRecord();
          r.getMetadata().put(WebCrawlerConstants.ATTRIBUTE_URL, normalizedLink);
          result.add(r);
        } else {
          _log.info("Couldn't normalize link: " + link);
        }
      }
      return result;
    } catch (final Exception e) {
      throw new WebCrawlerException("Error while extracting links from record with base URI " + baseUri, e);
    } finally {
      IOUtils.closeQuietly(contentStream);
    }
  }

  /** @return normalized URL. */
  private String normalizeLink(final String baseUri, final String link) {
    try {
      return UriHelper.normalizeUrl(baseUri, link);
    } catch (final Exception e) {
      _log.warn("Error while normalizing link '" + link + "': " + e.getLocalizedMessage());
      return null;
    }
  }

  /** sets the HTML extractor implementation to use. */
  public void setLinkExtractorHtml(final LinkExtractorHtml linkExtractorHtml) {
    _extractorHtml = linkExtractorHtml;
  }

}
