/*******************************************************************************
 * Copyright (c) 2008, 2011 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the
 * accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this
 * distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
 * 
 * Contributors: Juergen Schumacher (Attensity Europe GmbH) - initial API and implementation
 *******************************************************************************/

package org.eclipse.smila.importing.crawler.web.filter;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;

import org.eclipse.smila.datamodel.AnyMap;
import org.eclipse.smila.datamodel.Record;
import org.eclipse.smila.importing.crawler.web.LinkFilter;
import org.eclipse.smila.importing.crawler.web.WebCrawlerConstants;
import org.eclipse.smila.importing.crawler.web.WebCrawlerException;
import org.eclipse.smila.taskworker.TaskLog;

/**
 * Simple example implementation of a {@link LinkFilter}:
 * <ul>
 * <li>Removes fragment parts from URLs ("#...").
 * <li>Filters links with parameters ("?...").
 * <li>Filters links that do not start with the URL prefix of the filter configuration, if one is set.
 * <li>Filters links to other hosts than the start host of the filter configuration.
 * </ul>
 * Also removes duplicates with exactly the same URL, and skips link records that carry no URL at all.
 */
public class SimpleLinkFilter implements LinkFilter {

  /**
   * Normalize the URLs of the extracted links and return only those links that pass all filter checks.
   * 
   * @param extractedLinks
   *          link records to check. The URL attribute of a record is rewritten in place if it contained a fragment.
   * @param sourceLink
   *          record of the page the links were extracted from (not used by this implementation).
   * @param parameters
   *          parameters used to create the {@link FilterConfiguration}.
   * @param taskLog
   *          log for reporting skipped or unparsable links.
   * @return the accepted link records, in the iteration order of <tt>extractedLinks</tt>.
   * @throws WebCrawlerException
   *           declared by the interface; this method contains no throw statement of its own.
   */
  @Override
  public Collection<Record> filterLinks(final Collection<Record> extractedLinks, final Record sourceLink,
    final AnyMap parameters, final TaskLog taskLog) throws WebCrawlerException {
    final Set<String> links = new HashSet<String>();
    final Collection<Record> filteredLinks = new ArrayList<Record>(extractedLinks.size());
    final FilterConfiguration configuration = new FilterConfiguration(parameters, taskLog);
    for (final Record link : extractedLinks) {
      final String url = normalizeUrl(link);
      if (url == null) {
        // a single record without URL must not abort filtering of all the other links.
        taskLog.info("Skipping extracted link without URL attribute.");
        continue;
      }
      if (isLinkAllowed(url, links, configuration, taskLog)) {
        filteredLinks.add(link);
      }
    }
    return filteredLinks;
  }

  /**
   * Cut off the fragment part ("#...") of the link's URL and write the shortened URL back to the record.
   * 
   * @return the URL without fragment, or <tt>null</tt> if the record has no URL attribute value.
   */
  private String normalizeUrl(final Record link) {
    final String url = link.getMetadata().getStringValue(WebCrawlerConstants.ATTRIBUTE_URL);
    if (url == null) {
      return null;
    }
    final int hashIndex = url.indexOf('#');
    // ">= 0" so that a URL consisting only of a fragment ("#top") is normalized, too.
    if (hashIndex >= 0) {
      final String baseUrl = url.substring(0, hashIndex);
      link.getMetadata().put(WebCrawlerConstants.ATTRIBUTE_URL, baseUrl);
      return baseUrl;
    }
    return url;
  }

  /**
   * Check if the URL is not already contained in the <tt>links</tt> set (case-sensitive), does not contain
   * parameters (question mark), starts with the URL prefix of the configuration (if one is set) and points to the
   * start host of the configuration (case-insensitive).
   * 
   * The URL is added to <tt>links</tt> before the other checks are done, so a rejected URL is not checked a second
   * time when it occurs again.
   * 
   * @return true if the link passes all checks.
   */
  private boolean isLinkAllowed(final String url, final Set<String> links, final FilterConfiguration configuration,
    final TaskLog log) {
    if (!links.add(url)) {
      // exact duplicate of an URL seen earlier in this call.
      return false;
    }
    if (url.indexOf('?') >= 0) {
      return false;
    }
    final String urlPrefix = configuration.getUrlPrefix();
    if (urlPrefix != null && !url.startsWith(urlPrefix)) {
      return false;
    }
    final String linkHost = getHost(url, log);
    return linkHost != null && linkHost.equalsIgnoreCase(configuration.getStartHost());
  }

  /**
   * Extract the host part of an URL string.
   * 
   * @param urlString
   *          URL to parse.
   * @param log
   *          log that receives an info message if the URL cannot be parsed.
   * @return host part of the URL, or <tt>null</tt> if the string is not a well-formed URL.
   */
  protected static String getHost(final String urlString, final TaskLog log) {
    try {
      final URL url = new URL(urlString);
      return url.getHost();
    } catch (final MalformedURLException ex) {
      log.info("Failed to extract hostname from " + urlString + ": " + ex.toString());
      return null;
    }
  }

}
