/*********************************************************************************************************************
 * Copyright (c) 2008, 2013 Empolis Information Management GmbH and brox IT Solutions GmbH. All rights reserved.
 * This program and the accompanying materials are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
 *********************************************************************************************************************/
package org.eclipse.smila.importing.crawler.web.filter;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.eclipse.smila.datamodel.Record;
import org.eclipse.smila.importing.crawler.web.LinkFilter;
import org.eclipse.smila.importing.crawler.web.WebCrawlerConstants;
import org.eclipse.smila.importing.crawler.web.WebCrawlerException;
import org.eclipse.smila.importing.crawler.web.WebCrawlingContext;
import org.eclipse.smila.importing.crawler.web.utils.RobotsTxt;
import org.eclipse.smila.importing.crawler.web.utils.UriHelper;
import org.eclipse.smila.jobmanager.JobRunDataProvider;

/**
 * Default {@link LinkFilter} implementation.
 *
 * <p>
 * Extracted links are accepted only if they pass, in this order: the stay-on-host/stay-on-domain restriction of the
 * {@link FilterConfiguration}, the robots.txt provided by the crawling context for the link's host, a duplicate check
 * within the current invocation, and the configured URL pattern matcher. Links whose URL cannot be parsed are dropped.
 * </p>
 */
public class DefaultLinkFilter implements LinkFilter {
  /** reference to JobRunDataProvider service. */
  private JobRunDataProvider _jobRunDataProvider;

  private final Log _log = LogFactory.getLog(getClass());

  /**
   * Filters the links extracted from one source document.
   *
   * @param extractedLinks
   *          link records, each expected to carry its URL in {@link WebCrawlerConstants#ATTRIBUTE_URL}. May be null,
   *          in which case an empty collection is returned.
   * @param sourceUrl
   *          URL of the document the links were extracted from. If it is malformed, no link is accepted.
   * @param context
   *          crawling context providing the filter configuration and robots.txt data.
   * @return a new collection containing the accepted link records, in iteration order of the input.
   */
  @Override
  public Collection<Record> filterExtractedLinks(final Collection<Record> extractedLinks, final String sourceUrl,
    final WebCrawlingContext context) throws WebCrawlerException {
    if (extractedLinks == null) {
      return new ArrayList<Record>();
    }
    // URLs already seen in this invocation, used for duplicate elimination.
    final Set<String> links = new HashSet<String>();
    final Collection<Record> filteredLinks = new ArrayList<Record>(extractedLinks.size());
    try {
      final String sourceHost = UriHelper.getHost(sourceUrl);
      final String sourceDomain = UriHelper.getDomain(sourceHost);
      for (final Record link : extractedLinks) {
        final String urlString = link.getMetadata().getStringValue(WebCrawlerConstants.ATTRIBUTE_URL);
        try {
          // new URL(null) also ends up in the MalformedURLException handler below.
          final URL url = new URL(urlString);
          if (checkStayOn(url, sourceHost, sourceDomain, context) //
            && allowedByRobotsTxt(url, context) //
            && isLinkAllowed(context.getFilterConfiguration(), urlString, links)) {
            filteredLinks.add(link);
          }
        } catch (final MalformedURLException e) {
          // best effort: a single unparsable link must not abort filtering of the remaining links.
          if (_log.isDebugEnabled()) {
            _log.debug("Ignoring extracted link with malformed URL '" + urlString + "'", e);
          }
        }
      }
    } catch (final MalformedURLException e) {
      // without a valid source URL the stay-on checks cannot be evaluated, so no link is accepted.
      _log.info("Source URL '" + sourceUrl + "' is malformed, dropping all extracted links", e);
    }
    return filteredLinks;
  }

  /** check if context has a cached robots.txt for host of the URL and if it allows the URL to be crawled. */
  private boolean allowedByRobotsTxt(final URL url, final WebCrawlingContext context) throws MalformedURLException {
    final String hostAndPort = UriHelper.getHostAndPort(url);
    final RobotsTxt robotsTxt = context.getRobotsTxt(hostAndPort, _jobRunDataProvider);
    // no robots.txt available means no restriction.
    final boolean allowed = robotsTxt == null || robotsTxt.isAllowed(url.getFile());
    if (!allowed) {
      _log.info("URL " + url + " not allowed by robots.txt");
    }
    return allowed;
  }

  /**
   * check if URL is no duplicate (not already contained in <tt>links</tt>) and is not filtered out by url patterns.
   * As a side effect the URL is added to <tt>links</tt>. A null filter configuration means no pattern restriction.
   */
  private boolean isLinkAllowed(final FilterConfiguration filterConfig, final String url, final Set<String> links) {
    if (!links.add(url)) {
      // filter out duplicates
      return false;
    }
    // filter out non-matching URLs
    return filterConfig == null || filterConfig.getUrlPatternMatcher().matches(url);
  }

  /**
   * Decides whether a redirect target may be followed.
   *
   * @param url
   *          redirect target, may be null (then the redirect is rejected).
   * @param originalUrl
   *          URL that produced the redirect; used as reference for the stay-on checks.
   * @param context
   *          crawling context providing the filter configuration.
   * @return true only if a filter configuration exists, the target passes the stay-on checks and matches the URL
   *         patterns. Without a filter configuration, or if one of the URLs is malformed, false is returned.
   */
  @Override
  public boolean allowRedirectLink(final String url, final String originalUrl, final WebCrawlingContext context)
    throws WebCrawlerException {
    if (url == null) {
      return false;
    }
    final FilterConfiguration filterConfig = context.getFilterConfiguration();
    if (filterConfig == null) {
      return false;
    }
    try {
      final String sourceHost = UriHelper.getHost(originalUrl);
      final String sourceDomain = UriHelper.getDomain(sourceHost);
      return checkStayOn(url, sourceHost, sourceDomain, context) && filterConfig.getUrlPatternMatcher().matches(url);
    } catch (final MalformedURLException e) {
      if (_log.isDebugEnabled()) {
        _log.debug("Rejecting redirect to '" + url + "': original URL '" + originalUrl + "' is malformed", e);
      }
      return false;
    }
  }

  /**
   * checks whether the extracted link would leave the current host/domain, and if so, whether this is allowed. A
   * malformed URL is never allowed.
   */
  private boolean checkStayOn(final String url, final String currentHost, final String currentDomain,
    final WebCrawlingContext context) {
    try {
      return checkStayOn(new URL(url), currentHost, currentDomain, context);
    } catch (final MalformedURLException e) {
      return false;
    }
  }

  /**
   * checks whether the extracted link would leave the current host/domain, and if so, whether this is allowed.
   * Stay-on-host takes precedence over stay-on-domain. Without a filter configuration nothing is restricted. If the
   * current host/domain is unknown (null) while the respective restriction is active, the link is rejected.
   */
  private boolean checkStayOn(final URL url, final String currentHost, final String currentDomain,
    final WebCrawlingContext context) {
    final FilterConfiguration filterConfig = context.getFilterConfiguration();
    if (filterConfig == null) {
      // consistent with isLinkAllowed(): no configuration, no restriction.
      return true;
    }
    if (filterConfig.isStayOnHost()) {
      return currentHost != null && currentHost.equals(url.getHost());
    }
    if (filterConfig.isStayOnDomain()) {
      return currentDomain != null && currentDomain.equals(UriHelper.getDomain(url.getHost()));
    }
    return true; // input url is ok
  }

  /** DS service reference injection method. */
  public void setJobRunDataProvider(final JobRunDataProvider jobRunDataProvider) {
    _jobRunDataProvider = jobRunDataProvider;
  }

  /** DS service reference removal method. Only clears the reference if it is the currently bound instance. */
  public void unsetJobRunDataProvider(final JobRunDataProvider jobRunDataProvider) {
    if (_jobRunDataProvider == jobRunDataProvider) {
      _jobRunDataProvider = null;
    }
  }

}
