/*********************************************************************************************************************
 * Copyright (c) 2008, 2013 Empolis Information Management GmbH and brox IT Solutions GmbH. All rights reserved. This
 * program and the accompanying materials are made available under the terms of the Eclipse Public License v1.0 which
 * accompanies this distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
 *********************************************************************************************************************/
package org.eclipse.smila.importing.crawler.web.filter;

import org.eclipse.smila.datamodel.Any;
import org.eclipse.smila.datamodel.AnyMap;
import org.eclipse.smila.datamodel.AnySeq;
import org.eclipse.smila.importing.util.RegexPatternMatcher;

/**
 * Web crawler filter configuration, used by the {@link LinkFilter} class.
 *
 * <p>
 * All values are read once from the filter section passed to the constructor and are not changed by this class
 * afterwards. Parameters that are missing from the section keep their defaults: crawl depth {@code -1}, redirects
 * not followed, at most one redirect, no "stay on" restriction and no URL patterns.
 * </p>
 */
public class FilterConfiguration {

  /** Name of the parameter holding the maximum crawl depth. */
  public static final String MAX_CRAWL_DEPTH = "maxCrawlDepth";

  /** Name of the parameter that signals whether redirects are followed. */
  public static final String FOLLOW_REDIRECTS = "followRedirects";

  /** Name of the parameter holding the maximum number of redirects. */
  public static final String MAX_REDIRECTS = "maxRedirects";

  /** Name of the task parameter that signals whether to stay on host resp. domain. */
  public static final String STAY_ON = "stayOn";

  /** Name of the map parameter that contains the include and exclude pattern lists. */
  public static final String URL_PATTERNS = "urlPatterns";

  /** Key of the include pattern sequence inside the {@link #URL_PATTERNS} map. */
  public static final String INCLUDE_PATTERNS = "include";

  /** Key of the exclude pattern sequence inside the {@link #URL_PATTERNS} map. */
  public static final String EXCLUDE_PATTERNS = "exclude";

  /** Number of redirects used if none, or a value less than or equal to zero, is configured. */
  private static final long DEFAULT_NUMBER_OF_REDIRECTS = 1;

  /** Crawl depth used if the {@link #MAX_CRAWL_DEPTH} parameter is not configured at all. */
  private static final long DEFAULT_MAX_CRAWL_DEPTH = -1;

  /** Possible values for 'stayOn' parameter. */
  public enum StayOn {
    /** value of the 'stayOn' parameter that selects the host. */
    host,
    /** value of the 'stayOn' parameter that selects the domain. */
    domain;
  }

  /**
   * the maximum depth when following links. A negative value is meant as "unlimited": it is {@code -1} if the
   * parameter was not configured, while a configured negative value is stored as {@link Long#MAX_VALUE}.
   */
  private final long _maxCrawlDepth;

  /** whether to follow redirects or not. */
  private final boolean _followRedirects;

  /** the maximum number of allowed redirects when following links. */
  private final long _maxRedirects;

  /** matcher for checking include and exclude patterns for URLs. */
  private final RegexPatternMatcher _urlMatcher = new RegexPatternMatcher();

  /** whether to stay on host resp. domain; {@code null} if not configured or if the configured value is unknown. */
  private final StayOn _stayOn;

  /**
   * Reads all supported filter parameters from the given configuration section.
   *
   * @param filterConfig
   *          filter section from web crawler configuration, must not be {@code null}.
   */
  public FilterConfiguration(final AnyMap filterConfig) {
    // the parameters are read in this fixed order, so the first invalid one is the one that is reported.
    _maxCrawlDepth = readMaxCrawlDepth(filterConfig);
    _followRedirects = readFollowRedirects(filterConfig);
    _maxRedirects = readMaxRedirects(filterConfig);
    _stayOn = readStayOn(filterConfig);
    addUrlPatterns(filterConfig);
  }

  /** @return configured crawl depth, with configured negative values mapped to {@link Long#MAX_VALUE}. */
  private static long readMaxCrawlDepth(final AnyMap filterConfig) {
    if (!filterConfig.containsKey(MAX_CRAWL_DEPTH)) {
      return DEFAULT_MAX_CRAWL_DEPTH;
    }
    final long configuredDepth = filterConfig.getLongValue(MAX_CRAWL_DEPTH);
    if (configuredDepth < 0) {
      return Long.MAX_VALUE;
    }
    return configuredDepth;
  }

  /** @return configured 'followRedirects' flag, 'false' if the parameter is not set. */
  private static boolean readFollowRedirects(final AnyMap filterConfig) {
    if (!filterConfig.containsKey(FOLLOW_REDIRECTS)) {
      return false;
    }
    return filterConfig.getBooleanValue(FOLLOW_REDIRECTS);
  }

  /** @return configured number of redirects, the default if it is not set or not greater than zero. */
  private static long readMaxRedirects(final AnyMap filterConfig) {
    if (!filterConfig.containsKey(MAX_REDIRECTS)) {
      return DEFAULT_NUMBER_OF_REDIRECTS;
    }
    final long configuredRedirects = filterConfig.getLongValue(MAX_REDIRECTS);
    if (configuredRedirects <= 0) {
      return DEFAULT_NUMBER_OF_REDIRECTS;
    }
    return configuredRedirects;
  }

  /** @return configured 'stayOn' value, {@code null} if it is not set or does not name a {@link StayOn} constant. */
  private static StayOn readStayOn(final AnyMap filterConfig) {
    if (!filterConfig.containsKey(STAY_ON)) {
      return null;
    }
    final String configuredValue = filterConfig.getStringValue(STAY_ON);
    if (configuredValue == null) {
      // valueOf(null) would throw a NullPointerException, which the handler below does not cover.
      return null;
    }
    try {
      return StayOn.valueOf(configuredValue);
    } catch (final IllegalArgumentException ignored) {
      // deliberately lenient: an unknown value is handled like a missing parameter.
      return null;
    }
  }

  /** registers the configured include and exclude patterns, if there are any, at the URL matcher. */
  private void addUrlPatterns(final AnyMap filterConfig) {
    if (!filterConfig.containsKey(URL_PATTERNS)) {
      return;
    }
    final AnyMap patterns = filterConfig.getMap(URL_PATTERNS);
    if (patterns.containsKey(INCLUDE_PATTERNS)) {
      final AnySeq includes = patterns.getSeq(INCLUDE_PATTERNS);
      for (final Any include : includes) {
        _urlMatcher.addIncludePattern(include.asValue().asString());
      }
    }
    if (patterns.containsKey(EXCLUDE_PATTERNS)) {
      final AnySeq excludes = patterns.getSeq(EXCLUDE_PATTERNS);
      for (final Any exclude : excludes) {
        _urlMatcher.addExcludePattern(exclude.asValue().asString());
      }
    }
  }

  /** @return matcher for checking include and exclude patterns of URLs. */
  public RegexPatternMatcher getUrlPatternMatcher() {
    return _urlMatcher;
  }

  /** @return 'true' if we should follow redirects, 'false' otherwise. */
  public boolean followRedirects() {
    return _followRedirects;
  }

  /** @return the maximum number of allowed redirects when following links, always greater than zero. */
  public long getMaxRedirects() {
    return _maxRedirects;
  }

  /**
   * NOTE(review): "unlimited" shows up as two different values here, callers have to cope with both.
   *
   * @return the maximum depth when following links: {@code -1} if none was configured, {@link Long#MAX_VALUE} if a
   *         negative value was configured, else the configured value.
   */
  public long getMaxCrawlDepth() {
    return _maxCrawlDepth;
  }

  /** @return 'true' if the 'stayOn' parameter is set to 'host', 'false' otherwise. */
  public boolean isStayOnHost() {
    return _stayOn == StayOn.host;
  }

  /** @return 'true' if the 'stayOn' parameter is set to 'domain', 'false' otherwise. */
  public boolean isStayOnDomain() {
    return _stayOn == StayOn.domain;
  }
}
