/*******************************************************************************
 * Copyright (c) 2008, 2011 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the
 * accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this
 * distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
 * 
 * Contributors: Juergen Schumacher (Attensity Europe GmbH) - initial API and implementation
 *******************************************************************************/

package org.eclipse.smila.importing.crawler.web.fetcher;

import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.ProxySelector;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.Date;

import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.Header;
import org.apache.http.HeaderElement;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.NameValuePair;
import org.apache.http.client.HttpClient;
import org.apache.http.client.RedirectException;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.params.HttpClientParams;
import org.apache.http.conn.ClientConnectionManager;
import org.apache.http.conn.params.ConnRouteParams;
import org.apache.http.impl.client.AbstractHttpClient;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.ProxySelectorRoutePlanner;
import org.apache.http.impl.cookie.DateParseException;
import org.apache.http.impl.cookie.DateUtils;
import org.apache.http.params.HttpConnectionParams;
import org.apache.http.params.HttpParams;
import org.apache.http.params.HttpProtocolParams;
import org.eclipse.smila.datamodel.AnyMap;
import org.eclipse.smila.datamodel.Record;
import org.eclipse.smila.http.client.util.HttpClientUtil;
import org.eclipse.smila.importing.ImportingException;
import org.eclipse.smila.importing.VisitedLinksException;
import org.eclipse.smila.importing.VisitedLinksService;
import org.eclipse.smila.importing.crawler.web.Fetcher;
import org.eclipse.smila.importing.crawler.web.LinkFilter;
import org.eclipse.smila.importing.crawler.web.WebCrawlerConstants;
import org.eclipse.smila.importing.crawler.web.WebCrawlerException;
import org.eclipse.smila.importing.crawler.web.WebCrawlingContext;
import org.eclipse.smila.importing.crawler.web.filter.FilterConfiguration;
import org.eclipse.smila.importing.crawler.web.utils.RobotsTxt;
import org.eclipse.smila.importing.crawler.web.utils.UriHelper;
import org.eclipse.smila.importing.crawler.web.utils.WebCrawlerConfiguration;
import org.eclipse.smila.importing.util.PropertyNameMapper;
import org.eclipse.smila.jobmanager.JobRunDataProvider;
import org.eclipse.smila.taskworker.TaskContext;

/**
 * Example implementation of a Fetcher service. It uses GET method to access the resource.
 * <ul>
 * <li>During crawling it reads metadata for content-length, content-type and last-modified from the HTTP header to
 * attributes and attaches the content of resources that are reported as mime type "text/html".
 * <li>During fetching it just attaches the content of any resource.
 * </ul>
 * It does not (yet) support authentication. It is based on Apache HttpClient 4.1.
 * 
 */
public class DefaultFetcher implements Fetcher {

  /** name of HTTP header for last-modified date. */
  private static final String HEADER_LASTMODIFIED = "Last-Modified";

  /** name of HTTP header for content-type and charset. */
  private static final String HEADER_CONTENTTYPE = "Content-Type";

  /** name of Content-type header parameter for charset. */
  private static final String HEADER_PARAM_CHARSET = "charset";

  /** default setting for the connection manager: maximum number of connections per host. */
  private static final int DEFAULT_MAX_CONNECTIONS_PER_HOST = 32;

  /** default setting for the connection manager: maximum total number of connections. */
  private static final int DEFAULT_MAX_TOTAL_CONNECTIONS = 128;

  /** reference to VisitedLinks service. */
  private VisitedLinksService _visitedLinks;

  /** reference to LinkFilter service. */
  private LinkFilter _linkFilter;

  /** reference to JobRunDataProvider service. */
  private JobRunDataProvider _jobRunDataProvider;

  /** client for all http operations. */
  private final HttpClient _httpClient;

  /** content of webcrawler.properties file. */
  private final WebCrawlerConfiguration _configuration;

  /** local logger. */
  private final Log _log = LogFactory.getLog(getClass());

  /**
   * Creates the fetcher: loads the web crawler configuration and then builds the HTTP client from it. The client is
   * set up with automatic redirect handling switched off (see {@link #createAndConfigureClient()}).
   */
  public DefaultFetcher() {
    this._configuration = new WebCrawlerConfiguration();
    // must run after the configuration has been assigned, the client setup reads from it.
    this._httpClient = createAndConfigureClient();
  }

  /**
   * Creates the HTTP client used for all requests of this fetcher and configures it from the crawler configuration.
   * <ul>
   * <li>The client uses a thread-safe connection manager limited by {@link #DEFAULT_MAX_TOTAL_CONNECTIONS} and
   * {@link #DEFAULT_MAX_CONNECTIONS_PER_HOST}.
   * <li>Automatic redirect following is disabled, redirects are handled by this class itself.
   * <li>The user agent is taken from the configuration.
   * <li>If a proxy host is configured it is set as default proxy, otherwise a route planner based on the JVM default
   * {@link ProxySelector} is installed.
   * <li>A socket timeout is only set if the configured value is present and greater than zero.
   * </ul>
   * 
   * @return the configured client.
   */
  private HttpClient createAndConfigureClient() {
    final ClientConnectionManager connectionManager =
      HttpClientUtil.createThreadSafeConnectionManager(DEFAULT_MAX_TOTAL_CONNECTIONS,
        DEFAULT_MAX_CONNECTIONS_PER_HOST);
    // keep the AbstractHttpClient type locally so that the route planner can be set without a downcast.
    final AbstractHttpClient httpClient = new DefaultHttpClient(connectionManager);
    final HttpParams params = httpClient.getParams();
    HttpClientParams.setRedirecting(params, false);
    HttpProtocolParams.setUserAgent(params, _configuration.getUserAgent());
    final HttpHost proxyHost = _configuration.getProxyHost();
    if (proxyHost != null) {
      ConnRouteParams.setDefaultProxy(params, proxyHost);
    } else {
      httpClient.setRoutePlanner(new ProxySelectorRoutePlanner(httpClient.getConnectionManager()
        .getSchemeRegistry(), ProxySelector.getDefault()));
    }
    final Integer socketTimeout = _configuration.getSocketTimeout();
    // null check first: comparing a null Integer with 0 would fail with a NullPointerException on unboxing.
    if (socketTimeout != null && socketTimeout > 0) {
      HttpConnectionParams.setSoTimeout(params, socketTimeout);
    }
    return httpClient;
  }

  /**
   * Requests the given URL with robots.txt checking and visited-links checking for redirect targets enabled. On
   * success the URL attribute(s) of the record are adapted if the resource was reached via a redirect, metadata is
   * read from the response headers, and the content is attached if the resulting mime type attribute is "text/html".
   * The response is closed when the method returns.
   * 
   * @throws WebCrawlerException
   *           wrapping redirect, visited-links, IO and runtime errors. Exceptions of this type thrown while getting
   *           the resource are passed on unchanged.
   */
  @Override
  public void crawl(final String url, final Record linkRecord, final WebCrawlingContext context)
    throws WebCrawlerException {
    final boolean checkRobotsTxt = true;
    final boolean checkVisitedLinksInRedirects = true;
    try (HttpResponseInputStream resource =
      getResource(url, context, checkRobotsTxt, checkVisitedLinksInRedirects)) {
      resetUrlAttributeOnRedirect(linkRecord, resource, context.getMapper());
      readMetadata(linkRecord, resource);
      readHtmlContent(linkRecord, resource);
    } catch (final RedirectException redirectError) {
      final String message =
        "Error while handling redirects for web resource " + url + ": " + redirectError.getMessage();
      throw new WebCrawlerException(message, redirectError, false);
    } catch (final VisitedLinksException visitedLinksError) {
      final String message =
        "Error while handling redirects for web resource " + url + ": " + visitedLinksError.getMessage();
      throw new WebCrawlerException(message, visitedLinksError);
    } catch (final IOException ioError) {
      final String message = "IO error while getting web resource " + url + ": " + ioError.getMessage();
      throw new WebCrawlerException(message, ioError);
    } catch (final RuntimeException runtimeError) {
      final String message = "Error while getting web resource " + url + ": " + runtimeError.getMessage();
      throw new WebCrawlerException(message, runtimeError, false);
    }
  }

  /**
   * Requests the given URL without robots.txt checking and without visited-links checking for redirect targets. On
   * success the URL attribute(s) of the record are adapted if the resource was reached via a redirect, metadata is
   * read from the response headers, and the content is attached regardless of its mime type. The response is closed
   * when the method returns.
   * 
   * @throws WebCrawlerException
   *           wrapping redirect, visited-links, IO and runtime errors. Exceptions of this type thrown while getting
   *           the resource are passed on unchanged.
   */
  @Override
  public void fetch(final String url, final Record crawledRecord, final WebCrawlingContext context)
    throws WebCrawlerException {
    final boolean checkRobotsTxt = false;
    final boolean checkVisitedLinksInRedirects = false;
    try (HttpResponseInputStream resource =
      getResource(url, context, checkRobotsTxt, checkVisitedLinksInRedirects)) {
      resetUrlAttributeOnRedirect(crawledRecord, resource, context.getMapper());
      readMetadata(crawledRecord, resource);
      readContent(crawledRecord, resource);
    } catch (final RedirectException redirectError) {
      final String message =
        "Error while handling redirects for web resource " + url + ": " + redirectError.getMessage();
      throw new WebCrawlerException(message, redirectError, false);
    } catch (final VisitedLinksException visitedLinksError) {
      final String message =
        "Error while handling redirects for web resource " + url + ": " + visitedLinksError.getMessage();
      throw new WebCrawlerException(message, visitedLinksError);
    } catch (final IOException ioError) {
      final String message = "IO error while getting web resource " + url + ": " + ioError.getMessage();
      throw new WebCrawlerException(message, ioError);
    } catch (final RuntimeException runtimeError) {
      final String message = "Error while getting web resource " + url + ": " + runtimeError.getMessage();
      throw new WebCrawlerException(message, runtimeError, false);
    }
  }

  /**
   * {@inheritDoc}
   * 
   * <p>
   * <b>Please note: a mapped record (at least URL must be mapped) is expected here!</b>
   * </p>
   * 
   * <p>
   * The URL is read from the first mapped URL attribute, falling back to the unmapped URL attribute name if the
   * mapped one has no value. The resource is requested without robots.txt and visited-links checks. The returned
   * stream is the open HTTP response, so the caller is responsible for closing it. If adapting the URL attribute
   * after a redirect fails, the response is closed here before the error is reported, so that the underlying
   * connection is not leaked.
   * </p>
   * 
   * @throws ImportingException
   *           wrapping any error that occurred while requesting the resource.
   */
  @Override
  public InputStream getContent(final Record crawledRecord, final TaskContext taskContext)
    throws ImportingException {
    final PropertyNameMapper mapper = PropertyNameMapper.createFrom(taskContext);
    String url = crawledRecord.getMetadata().getStringValue(mapper.get(WebCrawlerConstants.ATTRIBUTE_URL).get(0));
    if (url == null) {
      url = crawledRecord.getMetadata().getStringValue(WebCrawlerConstants.ATTRIBUTE_URL);
    }
    try {
      final HttpResponseInputStream response = getResource(url, new WebCrawlingContext(taskContext), false, false);
      try {
        resetUrlAttributeOnRedirect(crawledRecord, response, mapper);
      } catch (final RuntimeException ex) {
        // the stream is not handed out in this case, so nobody else could close it.
        IOUtils.closeQuietly(response);
        throw ex;
      }
      return response;
    } catch (final RedirectException ex) {
      throw new ImportingException("Error while handling redirects for web resource " + url + ": "
        + ex.getMessage(), ex, false);
    } catch (final VisitedLinksException ex) {
      throw new ImportingException("Error while handling redirects for web resource " + url + ": "
        + ex.getMessage(), ex, true);
    } catch (final IOException ex) {
      throw new ImportingException("IO error while getting web resource " + url + ": " + ex.getMessage(), ex, true);
    } catch (final Exception ex) {
      throw new ImportingException("Http error while getting web resource " + url + ": " + ex.getMessage(), ex,
        false);
    }
  }

  /**
   * Entry point for requesting a resource that was not reached via a redirect: delegates to the variant with an
   * explicit redirect level, starting at level 0.
   */
  private HttpResponseInputStream getResource(final String url, final WebCrawlingContext context,
    final boolean checkRobotsTxt, final boolean checkVisitedLinksInRedirects) throws WebCrawlerException,
    VisitedLinksException, RedirectException, IOException {
    final int initialRedirectLevel = 0;
    return getResource(url, context, checkRobotsTxt, checkVisitedLinksInRedirects, initialRedirectLevel);
  }

  /**
   * Sends a GET request for the given URL and returns the response stream if the server answered with 200 (OK).
   * <ul>
   * <li>If robots.txt checking is requested and the URL is disallowed, no request is sent and an exception is thrown.
   * <li>Redirect responses are followed via {@link #handleRedirects} if the filter configuration of the context
   * exists and enables following redirects, otherwise a {@link RedirectException} is thrown.
   * <li>Any other status code results in a {@link WebCrawlerException}.
   * </ul>
   * In all failure cases handled here the response is closed before the exception is thrown.
   * 
   * @param redirectLevel
   *          number of redirects already followed to get to this URL. A value greater than 0 marks the returned
   *          stream as result of a redirect.
   */
  private HttpResponseInputStream getResource(final String url, final WebCrawlingContext context,
    final boolean checkRobotsTxt, final boolean checkVisitedLinksInRedirects, final int redirectLevel)
    throws WebCrawlerException, VisitedLinksException, RedirectException, IOException {
    if (checkRobotsTxt && disallowedByRobotsTxt(url, context)) {
      throw new WebCrawlerException("Crawling " + url + " is not allowed by robots.txt", false);
    }
    final FilterConfiguration redirectConfig = context.getFilterConfiguration();
    final HttpResponse httpResponse = _httpClient.execute(new HttpGet(url));
    final boolean reachedViaRedirect = redirectLevel > 0;
    final HttpResponseInputStream stream = new HttpResponseInputStream(url, httpResponse, reachedViaRedirect);
    final int status = httpResponse.getStatusLine().getStatusCode();

    if (status == HttpStatus.SC_OK) {
      return stream;
    }
    if (!isRedirect(status)) {
      IOUtils.closeQuietly(stream);
      throw new WebCrawlerException("GET " + url + ": server responded with " + status + ".");
    }
    if (redirectConfig == null || !redirectConfig.followRedirects()) {
      IOUtils.closeQuietly(stream);
      throw new RedirectException("Follow redirects not configured, skipping link " + url);
    }
    return handleRedirects(stream, context, checkRobotsTxt, checkVisitedLinksInRedirects, redirectLevel, url);
  }

  /**
   * Checks if the given HTTP status code is a redirect that should be followed by reading the location header.
   * 
   * @param statusCode
   *          status code of a response.
   * @return true for 301, 302, 303, 307 and 308, false for everything else.
   */
  private boolean isRedirect(final int statusCode) {
    switch (statusCode) {
      case HttpStatus.SC_MOVED_PERMANENTLY:
      case HttpStatus.SC_MOVED_TEMPORARILY:
      case HttpStatus.SC_SEE_OTHER:
      case HttpStatus.SC_TEMPORARY_REDIRECT:
      case 308: // Permanent Redirect (RFC 7538), written as literal to not depend on a newer HttpStatus constant.
        return true;
      default:
        return false;
    }
  }

  /**
   * Follows the redirect described by the given response. The target is read from the "location" header and made
   * absolute relative to the URL of the redirecting response. It is only requested if
   * <ul>
   * <li>the maximum number of redirects from the filter configuration is not yet reached,
   * <li>the link filter allows the redirect from the original URL to the target, and
   * <li>the target was not visited before (only if checking visited links is requested).
   * </ul>
   * Otherwise a {@link RedirectException} is thrown. The redirecting response is always closed before this method
   * returns or throws.
   * 
   * @param redirectLevel
   *          number of redirects followed so far, the request to the target is done with this value plus one.
   * @param originalUrl
   *          the URL that returned the redirect response.
   */
  private HttpResponseInputStream handleRedirects(final HttpResponseInputStream responseStream,
    final WebCrawlingContext context, final boolean checkRobotsTxt, final boolean checkVisitedLinksInRedirects,
    final int redirectLevel, final String originalUrl) throws WebCrawlerException, VisitedLinksException,
    IOException, RedirectException {
    try {
      final int maxRedirects = context.getFilterConfiguration().getMaxRedirects();
      if (redirectLevel >= maxRedirects) {
        throw new RedirectException("Reached maximum number of redirects");
      }

      // the location header tells where to go next
      final HttpResponse redirectResponse = responseStream.getResponse();
      final Header target = redirectResponse.getFirstHeader("location");
      if (target == null) {
        throw new RedirectException("Received redirect response " + redirectResponse.getStatusLine()
          + " but no location header");
      }

      final String location = target.getValue();
      try {
        final String redirectUrl = UriHelper.normalizeUrl(responseStream.getUrl(), location);
        if (redirectUrl == null) {
          throw new RedirectException("Couldn't create absolute link from baseUri " + responseStream.getUrl()
            + " and link " + location);
        }
        if (!_linkFilter.allowRedirectLink(redirectUrl, originalUrl, context)) {
          throw new RedirectException("Redirect to URL '" + redirectUrl
            + "' is not allowed by filter configuration");
        }
        if (checkVisitedLinksInRedirects && checkIfLinkIsVisited(context, redirectUrl)) {
          throw new RedirectException("Redirect to URL '" + redirectUrl
            + "' is not allowed. URL was already visited");
        }
        context.getVisitedUrls().add(redirectUrl);
        final int nextLevel = redirectLevel + 1;
        return getResource(redirectUrl, context, checkRobotsTxt, checkVisitedLinksInRedirects, nextLevel);
      } catch (final URISyntaxException syntaxError) {
        throw new RedirectException("Invalid Redirect location '" + location + "'", syntaxError);
      }
    } finally {
      // the redirecting response itself is never handed out, so release it in any case.
      IOUtils.closeQuietly(responseStream);
    }
  }

  /**
   * Checks if a redirect target was visited already. The URLs collected in the crawling context are checked first.
   * Only if the URL is not contained there, the VisitedLinks service is asked via its check-and-mark operation,
   * using data source, job run ID and current input bulk ID from the context.
   * 
   * @return true if the URL is known as visited by the context or reported as visited by the service.
   */
  private boolean checkIfLinkIsVisited(final WebCrawlingContext context, final String redirectUrl)
    throws VisitedLinksException {
    if (context.getVisitedUrls().contains(redirectUrl)) {
      return true;
    }
    return _visitedLinks.checkAndMarkVisited(context.getDataSource(), redirectUrl, context.getJobRunId(),
      context.getCurrentInputBulkId());
  }

  /**
   * If the response was reached via a redirect, replaces the URL stored in the record by the URL of the response.
   * This is done for every attribute name the mapper yields for the URL property and for the unmapped URL attribute
   * name itself, but only for attributes that already exist in the record metadata. Nothing happens for responses
   * that are not the result of a redirect.
   */
  private void resetUrlAttributeOnRedirect(final Record record, final HttpResponseInputStream response,
    final PropertyNameMapper mapper) {
    if (!response.isRedirect()) {
      return;
    }
    final AnyMap metadata = record.getMetadata();
    for (final String mappedName : mapper.get(WebCrawlerConstants.ATTRIBUTE_URL)) {
      if (metadata.containsKey(mappedName)) {
        metadata.put(mappedName, response.getUrl());
      }
    }
    if (metadata.containsKey(WebCrawlerConstants.ATTRIBUTE_URL)) {
      metadata.put(WebCrawlerConstants.ATTRIBUTE_URL, response.getUrl());
    }
  }

  /**
   * Extracts metadata from the HTTP response. Already existing record metadata values are not overwritten.
   * <ul>
   * <li>size: content length reported by the response entity, if there is an entity.
   * <li>content type: complete value of the Content-Type header. Mime type and charset are derived from the first
   * element of this header, but only if the content type attribute itself was not set before.
   * <li>last modified: value of the Last-Modified header as date-time value. If the header cannot be parsed as a
   * date, the raw header string is stored instead.
   * </ul>
   * 
   * @param record
   *          record to add the metadata to.
   * @param response
   *          response to read entity and headers from.
   */
  private void readMetadata(final Record record, final HttpResponseInputStream response) {
    final AnyMap metadata = record.getMetadata();
    final HttpEntity entity = response.getResponseEntity();
    if (entity != null && metadata.get(WebCrawlerConstants.ATTRIBUTE_SIZE) == null) {
      metadata.put(WebCrawlerConstants.ATTRIBUTE_SIZE, entity.getContentLength());
    }
    final Header contentType = response.getResponse().getFirstHeader(HEADER_CONTENTTYPE);
    if (contentType != null && metadata.get(WebCrawlerConstants.ATTRIBUTE_CONTENTTYPE) == null) {
      metadata.put(WebCrawlerConstants.ATTRIBUTE_CONTENTTYPE, contentType.getValue());
      final HeaderElement[] elements = contentType.getElements();
      if (elements.length > 0) {
        final String mimetype = elements[0].getName();
        if (mimetype != null && metadata.get(WebCrawlerConstants.ATTRIBUTE_MIMETYPE) == null) {
          metadata.put(WebCrawlerConstants.ATTRIBUTE_MIMETYPE, mimetype);
        }
        final NameValuePair charset = elements[0].getParameterByName(HEADER_PARAM_CHARSET);
        if (charset != null && metadata.get(WebCrawlerConstants.ATTRIBUTE_CHARSET) == null) {
          metadata.put(WebCrawlerConstants.ATTRIBUTE_CHARSET, charset.getValue());
        }
      }
    }
    final Header date = response.getResponse().getFirstHeader(HEADER_LASTMODIFIED);
    // check for an existing value once up front, so that the fallback below cannot overwrite it either.
    if (date != null && metadata.get(WebCrawlerConstants.ATTRIBUTE_LASTMODIFIED) == null) {
      try {
        final Date parsedDate = DateUtils.parseDate(date.getValue());
        if (parsedDate != null) {
          metadata.put(WebCrawlerConstants.ATTRIBUTE_LASTMODIFIED,
            metadata.getFactory().createDateTimeValue(parsedDate));
        }
      } catch (final DateParseException ex) {
        // not a parseable date: keep at least the raw header value.
        metadata.put(WebCrawlerConstants.ATTRIBUTE_LASTMODIFIED, date.getValue());
      }
    }
  }

  /**
   * Attaches the content of the stream to the record, but only if the mime type attribute of the record is
   * "text/html". The comparison ignores case, because media type names are case-insensitive. Records without a mime
   * type attribute get no content.
   * 
   * @param record
   *          record to read the mime type from and to attach the content to.
   * @param contentStream
   *          stream to read the content from.
   * @throws IOException
   *           error reading from the stream.
   */
  private void readHtmlContent(final Record record, final InputStream contentStream) throws IOException {
    final String mimetype = record.getMetadata().getStringValue(WebCrawlerConstants.ATTRIBUTE_MIMETYPE);
    if ("text/html".equalsIgnoreCase(mimetype)) {
      readContent(record, contentStream);
    }
  }

  /**
   * Reads the stream completely and attaches the bytes as content attachment to the record, regardless of the mime
   * type. Afterwards the size attribute is set to the number of bytes read, if it was missing or negative before.
   * 
   * @throws IOException
   *           error reading from the stream.
   */
  private void readContent(final Record record, final InputStream contentStream) throws IOException {
    final byte[] bytes = IOUtils.toByteArray(contentStream);
    if (bytes == null) {
      return;
    }
    record.setAttachment(WebCrawlerConstants.ATTACHMENT_CONTENT, bytes);
    final Long knownSize = record.getMetadata().getLongValue(WebCrawlerConstants.ATTRIBUTE_SIZE);
    final boolean sizeUnknown = knownSize == null || knownSize < 0;
    if (sizeUnknown) {
      record.getMetadata().put(WebCrawlerConstants.ATTRIBUTE_SIZE, bytes.length);
    }
  }

  /**
   * Checks the given URL against the robots.txt of its host. The file part of the URL is what is matched against
   * the robots.txt rules.
   * 
   * @return true if a robots.txt is available for the host and it does not allow the file part of the URL, false
   *         otherwise.
   * @throws WebCrawlerException
   *           the URL is malformed, or the robots.txt could not be determined.
   */
  private boolean disallowedByRobotsTxt(final String recordUrl, final WebCrawlingContext context)
    throws WebCrawlerException {
    try {
      final URL parsedUrl = new URL(recordUrl);
      final RobotsTxt rules = getRobotsTxt(parsedUrl, context);
      if (rules == null) {
        return false;
      }
      return !rules.isAllowed(parsedUrl.getFile());
    } catch (final MalformedURLException malformed) {
      throw new WebCrawlerException(malformed, false);
    }
  }

  /**
   * Gets the robots.txt for host and port of the given URL. The crawling context is asked first. If it does not
   * provide one, the robots.txt is fetched from the server using the protocol of the URL, and the result is put into
   * the context under the same host-and-port key.
   */
  private RobotsTxt getRobotsTxt(final URL url, final WebCrawlingContext context) throws WebCrawlerException {
    final String protocol = url.getProtocol();
    final String hostKey = UriHelper.getHostAndPort(url);
    final RobotsTxt known = context.getRobotsTxt(hostKey, _jobRunDataProvider);
    if (known != null) {
      return known;
    }
    final RobotsTxt fetched = fetchRobotsTxt(protocol, hostKey);
    context.putRobotsTxt(hostKey, fetched, _jobRunDataProvider);
    return fetched;
  }

  /**
   * Fetches "/robots.txt" from the given host and creates the rules from the response:
   * <ul>
   * <li>status 200 up to and including 300: the response body is parsed for the configured user agent.
   * <li>status 404 or 410: no robots.txt exists, access is allowed.
   * <li>status 500 and above: reported as IO error, so that the caller gets an exception created with flag
   * <code>true</code> (the condition may be temporary).
   * <li>any other status: access to the site is forbidden.
   * </ul>
   * If following the redirects for robots.txt fails with a {@link WebCrawlerException}, access is allowed. The
   * response is closed before this method returns.
   */
  private RobotsTxt fetchRobotsTxt(final String protocol, final String hostAndPort) throws WebCrawlerException {
    _log.info("Fetching robots.txt for " + hostAndPort);
    final String robotsUrl = protocol + "://" + hostAndPort + "/robots.txt";
    try (HttpResponseInputStream robotsTxtStream = fetchRobotsTxtStream(robotsUrl)) {
      final int status = robotsTxtStream.getResponse().getStatusLine().getStatusCode();
      final boolean contentAvailable = status >= HttpStatus.SC_OK && status <= HttpStatus.SC_MULTIPLE_CHOICES;
      if (contentAvailable) {
        return new RobotsTxt(_configuration.getUserAgent(), robotsTxtStream);
      }
      if (status == HttpStatus.SC_NOT_FOUND || status == HttpStatus.SC_GONE) {
        return new RobotsTxt(false);
      }
      if (status >= HttpStatus.SC_INTERNAL_SERVER_ERROR) {
        throw new IOException("Failed to accesss robots.txt for " + hostAndPort
          + ", server responded with status " + status + ", condition may be temporarily, will retry later.");
      }
      // remaining status codes, e.g. other 40x errors: access to site forbidden.
      return new RobotsTxt(true);
    } catch (final IOException ioError) {
      throw new WebCrawlerException(ioError, true);
    } catch (final WebCrawlerException redirectProblem) {
      // redirects do not lead to a valid file: allow access.
      return new RobotsTxt(false);
    }
  }

  /**
   * Requests the robots.txt at the given URL and follows redirect responses, doing at most 10 requests in total.
   * The first response that is not a redirect is returned as open stream, whatever its status code is. The
   * distinction between status codes must be handled by the caller, who must also close the stream.
   * 
   * <p>
   * Redirect locations are made absolute relative to the URL that was just requested, in the same way as it is done
   * for redirects of crawled resources, so that relative location headers are followed correctly, too.
   * </p>
   * 
   * @param robotsUrl
   *          URL of the robots.txt to start with.
   * @return stream for the first non-redirect response.
   * @throws WebCrawlerException
   *           a redirect response had no or an unusable location header, or too many redirects occurred.
   * @throws IOException
   *           error executing a request.
   */
  private HttpResponseInputStream fetchRobotsTxtStream(final String robotsUrl) throws WebCrawlerException,
    IOException {
    int redirectCount = 0;
    final int maxRedirects = 10;
    String currentUrl = robotsUrl;
    while (redirectCount < maxRedirects) {
      _log.info("Fetching robots.txt at " + currentUrl);
      final HttpGet request = new HttpGet(currentUrl);
      final HttpResponse response = _httpClient.execute(request);
      final HttpResponseInputStream responseStream = new HttpResponseInputStream(currentUrl, response, false);
      final int responseCode = response.getStatusLine().getStatusCode();
      if (!isRedirect(responseCode)) {
        return responseStream; // distinction between other status codes must be handled by caller
      }
      // the redirect response itself is not needed anymore, only its headers are read below.
      IOUtils.closeQuietly(responseStream);
      redirectCount++;
      final Header locationHeader = response.getFirstHeader("location");
      if (locationHeader == null) {
        throw new WebCrawlerException("Received redirect response " + response.getStatusLine()
          + " but no location header set.");
      }
      final String location = locationHeader.getValue();
      try {
        // a location header may be relative, the next request needs an absolute URL.
        final String redirectUrl = UriHelper.normalizeUrl(currentUrl, location);
        if (redirectUrl == null) {
          throw new WebCrawlerException("Couldn't create absolute link from baseUri " + currentUrl + " and link "
            + location, false);
        }
        currentUrl = redirectUrl;
      } catch (final URISyntaxException ex) {
        throw new WebCrawlerException("Invalid redirect location '" + location + "'", ex, false);
      }
      _log.info("... redirected to " + currentUrl);
    }
    throw new WebCrawlerException("Got more than " + maxRedirects + " redirects for fetching " + robotsUrl
      + ", giving up.", false);
  }

  /** DS service reference injection method: stores the given VisitedLinks service for use by this fetcher. */
  public void setVisitedLinks(final VisitedLinksService visitedLinks) {
    this._visitedLinks = visitedLinks;
  }

  /**
   * DS service reference removal method: clears the stored VisitedLinks service, but only if the given instance is
   * the one currently stored.
   */
  public void unsetVisitedLinks(final VisitedLinksService visitedLinks) {
    if (_visitedLinks != visitedLinks) {
      return;
    }
    _visitedLinks = null;
  }

  /** DS service reference injection method: stores the given LinkFilter service for use by this fetcher. */
  public void setLinkFilter(final LinkFilter linkFilter) {
    this._linkFilter = linkFilter;
  }

  /**
   * DS service reference removal method: clears the stored LinkFilter service, but only if the given instance is the
   * one currently stored.
   */
  public void unsetLinkFilter(final LinkFilter linkFilter) {
    if (_linkFilter != linkFilter) {
      return;
    }
    _linkFilter = null;
  }

  /** DS service reference injection method: stores the given JobRunDataProvider service for use by this fetcher. */
  public void setJobRunDataProvider(final JobRunDataProvider jobRunDataProvider) {
    this._jobRunDataProvider = jobRunDataProvider;
  }

  /**
   * DS service reference removal method: clears the stored JobRunDataProvider service, but only if the given
   * instance is the one currently stored.
   */
  public void unsetJobRunDataProvider(final JobRunDataProvider jobRunDataProvider) {
    if (_jobRunDataProvider != jobRunDataProvider) {
      return;
    }
    _jobRunDataProvider = null;
  }

}
