package org.eclipse.smila.importing.crawler.web.filter;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;

import org.eclipse.smila.datamodel.Record;
import org.eclipse.smila.importing.crawler.web.LinkFilter;
import org.eclipse.smila.importing.crawler.web.WebCrawlerConstants;
import org.eclipse.smila.importing.crawler.web.WebCrawlerException;
import org.eclipse.smila.importing.crawler.web.WebCrawlingContext;
import org.eclipse.smila.taskworker.TaskLog;

/** Default {@link LinkFilter} implementation. */
public class DefaultLinkFilter implements LinkFilter {

  /**
   * Filters the given extracted links: a link is kept only if its URL attribute is not <tt>null</tt>, has not been
   * seen earlier in the same collection, and matches the URL patterns of the context's filter configuration (if a
   * filter configuration is present).
   *
   * @param extractedLinks
   *          link records to filter, the URL is read from attribute {@link WebCrawlerConstants#ATTRIBUTE_URL}.
   * @param context
   *          crawling context providing the filter configuration.
   * @return the accepted link records, in iteration order of the input.
   * @throws WebCrawlerException
   *           declared by the {@link LinkFilter} interface, not thrown by this implementation itself.
   */
  @Override
  public Collection<Record> filterLinks(final Collection<Record> extractedLinks, final WebCrawlingContext context)
    throws WebCrawlerException {
    // both values are the same for every link, so look them up only once.
    final FilterConfiguration filterConfig = context.getFilterConfiguration();
    final TaskLog log = context.getTaskLog();
    final Set<String> seenUrls = new HashSet<String>();
    final Collection<Record> filteredLinks = new ArrayList<Record>(extractedLinks.size());
    for (final Record link : extractedLinks) {
      final String url = link.getMetadata().getStringValue(WebCrawlerConstants.ATTRIBUTE_URL);
      if (isLinkAllowed(filterConfig, url, seenUrls, log)) {
        filteredLinks.add(link);
      }
    }
    return filteredLinks;
  }

  /**
   * Checks a single URL against the URL patterns of the context's filter configuration.
   *
   * @param url
   *          URL to check, may be <tt>null</tt>.
   * @param context
   *          crawling context providing the filter configuration.
   * @return <tt>false</tt> if the URL is <tt>null</tt> or rejected by the URL patterns, <tt>true</tt> if it matches
   *         the patterns or if no filter configuration is present (same rule as used by
   *         {@link #filterLinks(Collection, WebCrawlingContext)}).
   * @throws WebCrawlerException
   *           declared by the {@link LinkFilter} interface, not thrown by this implementation itself.
   */
  @Override
  public boolean allowLink(final String url, final WebCrawlingContext context) throws WebCrawlerException {
    if (url == null) {
      return false;
    }
    return matchesUrlPatterns(context.getFilterConfiguration(), url);
  }

  /**
   * check if URL is not null, is no duplicate (not already contained in <tt>links</tt>) and is not filtered out by
   * url patterns. A non-null URL is always added to <tt>links</tt>, even if the url patterns reject it afterwards.
   * The <tt>log</tt> parameter is not used at present.
   */
  private boolean isLinkAllowed(final FilterConfiguration filterConfig, final String url, final Set<String> links,
    final TaskLog log) {
    if (url == null) {
      // a link without URL cannot be followed; also keeps null away from the pattern matcher
      return false;
    }
    if (!links.add(url)) {
      // filter out duplicates
      return false;
    }
    // filter out non-matching URLs
    return matchesUrlPatterns(filterConfig, url);
  }

  /**
   * single place for the pattern rule shared by both entry points: without a filter configuration there is nothing
   * to restrict, so every URL passes.
   */
  private static boolean matchesUrlPatterns(final FilterConfiguration filterConfig, final String url) {
    return filterConfig == null || filterConfig.getUrlPatternMatcher().matches(url);
  }

}
