/*******************************************************************************
 * Copyright (c) 2008, 2011 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the
 * accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this
 * distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
 * 
 * Contributors: Juergen Schumacher (Attensity Europe GmbH) - initial API and implementation
 *******************************************************************************/

package org.eclipse.smila.importing.crawler.web.fetcher;

import java.io.IOException;
import java.io.InputStream;
import java.util.Date;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HeaderElement;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpClientParams;
import org.apache.commons.httpclient.util.DateParseException;
import org.apache.commons.httpclient.util.DateUtil;
import org.apache.commons.io.IOUtils;
import org.eclipse.smila.datamodel.AnyMap;
import org.eclipse.smila.datamodel.Record;
import org.eclipse.smila.importing.crawler.web.Fetcher;
import org.eclipse.smila.importing.crawler.web.WebCrawlerConstants;
import org.eclipse.smila.importing.crawler.web.WebCrawlerException;
import org.eclipse.smila.taskworker.TaskLog;

/**
 * Example implementation of a Fetcher service. It uses the GET method to access the resource.
 * <ul>
 * <li>During crawling it reads metadata for content-length, content-type and last-modified from the HTTP header to
 * attributes and attaches the content of resources that are reported as mime type "text/html" (compared
 * case-insensitively).
 * <li>During fetching it just attaches the content of any resource.
 * </ul>
 * It does not (yet) follow any redirects and does not (yet) support authentication. It is based on Apache HttpClient
 * 3.1.
 * 
 */
public class SimpleFetcher implements Fetcher {

  /** name of HTTP header for last-modified date. */
  private static final String HEADER_LASTMODIFIED = "Last-Modified";

  /** name of HTTP header for content-type and charset. */
  private static final String HEADER_CONTENTTYPE = "Content-Type";

  /** name of Content-type header parameter for charset. */
  private static final String HEADER_PARAM_CHARSET = "charset";

  /** mime type of resources whose content is attached during crawling. */
  private static final String MIMETYPE_HTML = "text/html";

  /** default setting for the connection manager: maximum number of connections per host. */
  private static final int DEFAULT_MAX_CONNECTIONS_PER_HOST = 32;

  /** default setting for the connection manager: maximum number of connections in total. */
  private static final int DEFAULT_MAX_TOTAL_CONNECTIONS = 128;

  /** client for all http operations. */
  private final HttpClient _httpClient;

  /**
   * initialize HttpClient with the maximum number of redirects set to 0 and a multi-threaded connection manager
   * configured with the default connection limits defined in this class.
   */
  public SimpleFetcher() {
    final HttpClientParams params = new HttpClientParams();
    params.setIntParameter(HttpClientParams.MAX_REDIRECTS, 0);

    final MultiThreadedHttpConnectionManager connectionManager = new MultiThreadedHttpConnectionManager();
    connectionManager.getParams().setDefaultMaxConnectionsPerHost(DEFAULT_MAX_CONNECTIONS_PER_HOST);
    connectionManager.getParams().setMaxTotalConnections(DEFAULT_MAX_TOTAL_CONNECTIONS);

    _httpClient = new HttpClient(params, connectionManager);
  }

  /**
   * GET the resource named by the URL attribute of the record, copy header metadata to the record and attach the
   * content if the resource is reported as HTML.
   * 
   * @param linkRecord
   *          record containing the URL to crawl; metadata and (for HTML) the content attachment are added to it.
   * @param parameters
   *          not used by this implementation.
   * @param taskLog
   *          log used to report problems when releasing the connection.
   * @throws WebCrawlerException
   *           if the server does not respond with 200 (OK), or on HTTP or IO errors. NOTE(review): the third
   *           constructor argument is false for HTTP errors and true for IO errors - presumably a "recoverable"
   *           flag, confirm against WebCrawlerException.
   */
  @Override
  public void crawl(final Record linkRecord, final AnyMap parameters, final TaskLog taskLog)
    throws WebCrawlerException {
    final AnyMap metadata = linkRecord.getMetadata();
    final String url = metadata.getStringValue(WebCrawlerConstants.ATTRIBUTE_URL);
    GetMethod request = null;
    try {
      request = getResource(url, taskLog);
      readMetadata(metadata, request);
      readHtmlContent(linkRecord, request);
    } catch (final HttpException ex) {
      throw new WebCrawlerException("Http error while getting web resource " + url + ": " + ex.getMessage(), ex,
        false);
    } catch (final IOException ex) {
      throw new WebCrawlerException("IO error while getting web resource " + url + ": " + ex.getMessage(), ex, true);
    } finally {
      releaseQuietly(request, taskLog);
    }
  }

  /**
   * GET the resource named by the URL attribute of the record and attach its content, regardless of the mime type.
   * 
   * @param crawledRecord
   *          record containing the URL to fetch; the content attachment is added to it.
   * @param parameters
   *          not used by this implementation.
   * @param taskLog
   *          log used to report problems when releasing the connection.
   * @throws WebCrawlerException
   *           if the server does not respond with 200 (OK), or on HTTP or IO errors.
   */
  @Override
  public void fetch(final Record crawledRecord, final AnyMap parameters, final TaskLog taskLog)
    throws WebCrawlerException {
    final AnyMap metadata = crawledRecord.getMetadata();
    final String url = metadata.getStringValue(WebCrawlerConstants.ATTRIBUTE_URL);
    GetMethod request = null;
    try {
      request = getResource(url, taskLog);
      readContent(crawledRecord, request);
    } catch (final HttpException ex) {
      throw new WebCrawlerException("Http error while getting web resource " + url + ": " + ex.getMessage(), ex,
        false);
    } catch (final IOException ex) {
      throw new WebCrawlerException("IO error while getting web resource " + url + ": " + ex.getMessage(), ex, true);
    } finally {
      releaseQuietly(request, taskLog);
    }
  }

  /**
   * create GET request to given resource, and return it if the response code was 200 (OK). On any other outcome
   * (other response code or exception during execution) the request is released before the exception leaves this
   * method, because the caller never gets hold of the request object in that case.
   */
  private GetMethod getResource(final String url, final TaskLog log) throws WebCrawlerException, IOException {
    final GetMethod request = new GetMethod(url);
    boolean success = false;
    try {
      final int responseCode = _httpClient.executeMethod(request);
      if (responseCode != HttpStatus.SC_OK) {
        throw new WebCrawlerException("GET " + url + ": server responded with " + responseCode + ".");
      }
      success = true;
      return request;
    } finally {
      if (!success) {
        releaseQuietly(request, log);
      }
    }
  }

  /**
   * extract metadata from HTTP response: size from the response content length, content-type (complete header value),
   * mime type and charset from the Content-Type header, and last-modified date from the Last-Modified header. If the
   * last-modified date cannot be parsed, the raw header value is stored instead.
   */
  private void readMetadata(final AnyMap metadata, final GetMethod request) {
    metadata.put(WebCrawlerConstants.ATTRIBUTE_SIZE, request.getResponseContentLength());
    final Header contentType = request.getResponseHeader(HEADER_CONTENTTYPE);
    if (contentType != null) {
      metadata.put(WebCrawlerConstants.ATTRIBUTE_CONTENTTYPE, contentType.getValue());
      final HeaderElement[] elements = contentType.getElements();
      if (elements.length > 0) {
        // the name of the first header element is the mime type, the charset is one of its parameters.
        final String mimetype = elements[0].getName();
        if (mimetype != null) {
          metadata.put(WebCrawlerConstants.ATTRIBUTE_MIMETYPE, mimetype);
        }
        final NameValuePair charset = elements[0].getParameterByName(HEADER_PARAM_CHARSET);
        if (charset != null) {
          metadata.put(WebCrawlerConstants.ATTRIBUTE_CHARSET, charset.getValue());
        }
      }
    }
    final Header date = request.getResponseHeader(HEADER_LASTMODIFIED);
    if (date != null) {
      try {
        final Date parsedDate = DateUtil.parseDate(date.getValue());
        metadata.put(WebCrawlerConstants.ATTRIBUTE_LASTMODIFIED,
          metadata.getFactory().createDateTimeValue(parsedDate));
      } catch (final DateParseException ex) {
        // keep the unparsed value rather than dropping the information.
        metadata.put(WebCrawlerConstants.ATTRIBUTE_LASTMODIFIED, date.getValue());
      }
    }
  }

  /**
   * get content from response, if it is HTML. The mime type attribute is compared case-insensitively, because media
   * type names are not case-sensitive.
   */
  private void readHtmlContent(final Record record, final GetMethod request) throws IOException {
    final String mimetype = record.getMetadata().getStringValue(WebCrawlerConstants.ATTRIBUTE_MIMETYPE);
    if (MIMETYPE_HTML.equalsIgnoreCase(mimetype)) {
      readContent(record, request);
    }
  }

  /**
   * get content from response, regardless of mimetype. If content could be read and size attribute is not set or
   * negative, adapt it to the actual size of the content. If the response has no body stream, the record is left
   * unchanged.
   */
  private void readContent(final Record record, final GetMethod request) throws IOException {
    final InputStream contentStream = request.getResponseBodyAsStream();
    if (contentStream == null) {
      // no response body available, nothing to attach.
      return;
    }
    // stream will be closed in releaseQuietly
    final byte[] content = IOUtils.toByteArray(contentStream);
    record.setAttachment(WebCrawlerConstants.ATTACHMENT_CONTENT, content);
    final Long size = record.getMetadata().getLongValue(WebCrawlerConstants.ATTRIBUTE_SIZE);
    if (size == null || size < 0) {
      record.getMetadata().put(WebCrawlerConstants.ATTRIBUTE_SIZE, content.length);
    }
  }

  /**
   * close the connection without throwing exceptions. Does nothing if the request is null, runtime exceptions are
   * only written to the task log.
   */
  private void releaseQuietly(final HttpMethod request, final TaskLog log) {
    if (request != null) {
      try {
        consumeContent(request, log);
        request.releaseConnection();
      } catch (final RuntimeException ex) {
        log.info("Error releasing connection after Http request done.", ex);
      }
    }
  }

  /**
   * read everything from the response body of the request so that it can be released correctly. Errors are only
   * written to the task log, the stream is closed in any case.
   */
  private void consumeContent(final HttpMethod request, final TaskLog log) {
    InputStream content = null;
    try {
      content = request.getResponseBodyAsStream();
      if (content != null) {
        IOUtils.skip(content, Long.MAX_VALUE);
      }
    } catch (final Exception ex) {
      log.info("Error consuming the resource stream", ex);
    } finally {
      IOUtils.closeQuietly(content);
    }
  }
}
