/*******************************************************************************
 * Copyright (c) 2008, 2011 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the
 * accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this
 * distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
 * 
 * Contributors: Juergen Schumacher (Attensity Europe GmbH) - initial API and implementation
 *******************************************************************************/

package org.eclipse.smila.importing.crawler.web.test;

import java.util.ArrayList;
import java.util.Collection;

import org.apache.commons.collections.CollectionUtils;
import org.eclipse.smila.datamodel.AnyMap;
import org.eclipse.smila.datamodel.DataFactory;
import org.eclipse.smila.datamodel.Record;
import org.eclipse.smila.importing.ImportingConstants;
import org.eclipse.smila.importing.crawler.web.LinkFilter;
import org.eclipse.smila.importing.crawler.web.WebCrawlerConstants;
import org.eclipse.smila.importing.crawler.web.filter.SimpleLinkFilter;
import org.eclipse.smila.taskworker.DefaultTaskLogFactory;
import org.eclipse.smila.taskworker.TaskLog;
import org.eclipse.smila.test.DeclarativeServiceTestCase;

/** test class for SimpleFetcher. */
public class TestSimpleLinkFilter extends DeclarativeServiceTestCase {
  private static final String START_URL = "http://www.example.com/";

  /** dummy logger. */
  private final TaskLog _taskLog = new DefaultTaskLogFactory().getTaskLog(null);

  private LinkFilter _filter;

  /** test if SimpleFetcher is active. */
  @Override
  protected void setUp() throws Exception {
    _filter = getService(LinkFilter.class);
  }

  /** assert that we are testing the right implementation. */
  public void testService() throws Exception {
    assertTrue(_filter instanceof SimpleLinkFilter);
  }

  /** check if one link is kept. */
  public void testEmptyLinks() throws Exception {
    final Record sourceLink = createLinkRecord(START_URL);
    final Collection<Record> extractedLinks = new ArrayList<Record>();
    final AnyMap parameters = createParameters(START_URL);
    final Collection<Record> filteredLinks = _filter.filterLinks(extractedLinks, sourceLink, parameters, _taskLog);
    assertEquals(extractedLinks, filteredLinks);
  }

  /** check if one ".html" link is kept. */
  public void testKeepOneLink() throws Exception {
    final Record sourceLink = createLinkRecord(START_URL);
    final Collection<Record> extractedLinks = new ArrayList<Record>();
    extractedLinks.add(createLinkRecord(START_URL + "index.html"));
    final AnyMap parameters = createParameters(START_URL);
    final Collection<Record> filteredLinks = _filter.filterLinks(extractedLinks, sourceLink, parameters, _taskLog);
    assertEquals(extractedLinks, filteredLinks);
  }

  /** check if link with parameters is removed. */
  public void testRemoveParameterLink() throws Exception {
    final Record sourceLink = createLinkRecord(START_URL);
    final Collection<Record> extractedLinks = new ArrayList<Record>();
    extractedLinks.add(createLinkRecord(START_URL + "index.php?query=test"));
    final AnyMap parameters = createParameters(START_URL);
    final Collection<Record> filteredLinks = _filter.filterLinks(extractedLinks, sourceLink, parameters, _taskLog);
    assertTrue(filteredLinks.isEmpty());
  }

  /** check if link to other host is removed. */
  public void testRemoveExternalLink() throws Exception {
    final Record sourceLink = createLinkRecord(START_URL);
    final Collection<Record> extractedLinks = new ArrayList<Record>();
    extractedLinks.add(createLinkRecord("http://www.theothers.com/index.html"));
    final AnyMap parameters = createParameters(START_URL);
    final Collection<Record> filteredLinks = _filter.filterLinks(extractedLinks, sourceLink, parameters, _taskLog);
    assertTrue(filteredLinks.isEmpty());
  }

  /** check if fragment part of link is removed. */
  public void testKeepBaseOfFragmentLink() throws Exception {
    final Record sourceLink = createLinkRecord(START_URL);
    final Collection<Record> extractedLinks = new ArrayList<Record>();
    extractedLinks.add(createLinkRecord(START_URL + "index.html#Documentation"));
    final AnyMap parameters = createParameters(START_URL);
    final Collection<Record> filteredLinks = _filter.filterLinks(extractedLinks, sourceLink, parameters, _taskLog);
    assertEquals(1, filteredLinks.size());
    assertEquals(START_URL + "index.html",
      filteredLinks.iterator().next().getMetadata().getStringValue(WebCrawlerConstants.ATTRIBUTE_URL));
  }

  /** combined test. */
  public void testFilterLinkSet() throws Exception {
    final Record sourceLink = createLinkRecord(START_URL);
    final Collection<Record> extractedLinks = new ArrayList<Record>();
    extractedLinks.add(createLinkRecord(START_URL + "index.html"));
    extractedLinks.add(createLinkRecord(START_URL + "index.htm"));
    extractedLinks.add(createLinkRecord(START_URL + "INDEX.HTML"));
    final String invalidUrl = START_URL + "index.php?query=test";
    extractedLinks.add(createLinkRecord(invalidUrl));
    final AnyMap parameters = createParameters(START_URL);
    final Collection<Record> filteredLinks = _filter.filterLinks(extractedLinks, sourceLink, parameters, _taskLog);
    assertEquals(3, filteredLinks.size());
    extractedLinks.removeAll(filteredLinks);
    assertEquals(1, extractedLinks.size());
    assertEquals(invalidUrl, extractedLinks.iterator().next().getMetadata().getStringValue("http.url"));
  }

  /** check if one ".html" link is kept. */
  public void testFilterDuplicateLinks() throws Exception {
    final Record sourceLink = createLinkRecord(START_URL);
    final Collection<Record> extractedLinks = new ArrayList<Record>();
    extractedLinks.add(createLinkRecord(START_URL + "index.html"));
    extractedLinks.add(createLinkRecord(START_URL + "index.html"));
    final AnyMap parameters = createParameters(START_URL);
    final Collection<Record> filteredLinks = _filter.filterLinks(extractedLinks, sourceLink, parameters, _taskLog);
    assertEquals(1, filteredLinks.size());
    assertEquals(CollectionUtils.get(extractedLinks, 0), CollectionUtils.get(filteredLinks, 0));
  }

  /** test link exclusion by url prefix. */
  public void testFilterLinksWithPrefixRestriction() throws Exception {
    final Record sourceLink = createLinkRecord(START_URL);
    final Collection<Record> extractedLinks = new ArrayList<Record>();
    extractedLinks.add(createLinkRecord(START_URL + "crawl/page.html"));
    extractedLinks.add(createLinkRecord(START_URL + "dontcrawl/page.html"));
    final AnyMap parameters = createParameters(START_URL, START_URL + "crawl");
    final Collection<Record> filteredLinks = _filter.filterLinks(extractedLinks, sourceLink, parameters, _taskLog);
    assertEquals(1, filteredLinks.size());
    assertEquals(CollectionUtils.get(extractedLinks, 0), CollectionUtils.get(filteredLinks, 0));
  }

  /** test link exclusion by url prefix. */
  public void testMatchPrefixCaseSensitive() throws Exception {
    final Record sourceLink = createLinkRecord(START_URL);
    final Collection<Record> extractedLinks = new ArrayList<Record>();
    extractedLinks.add(createLinkRecord(START_URL + "crawl/page1.html"));
    extractedLinks.add(createLinkRecord(START_URL + "CRAWL/page2.html"));
    final AnyMap parameters = createParameters(START_URL, START_URL + "crawl");
    final Collection<Record> filteredLinks = _filter.filterLinks(extractedLinks, sourceLink, parameters, _taskLog);
    assertEquals(1, filteredLinks.size());
    assertEquals(CollectionUtils.get(extractedLinks, 0), CollectionUtils.get(filteredLinks, 0));
  }

  /** create a link record. */
  private Record createLinkRecord(final String url) {
    final Record record =
      DataFactory.DEFAULT.createRecord("TestSimpleLinkFilter", Long.toString(System.nanoTime()));
    record.getMetadata().put("http.url", url);
    return record;
  }

  /** create a parameter map with a urlPrefix. */
  private AnyMap createParameters(final String startUrl, final String urlPrefix) {
    final AnyMap parameters = createParameters(startUrl);
    final AnyMap filterParameters = parameters.getMap("filter", true);
    filterParameters.put("urlPrefix", urlPrefix);
    return parameters;
  }

  /** create a parameter map with a urlPrefix. */
  private AnyMap createParameters(final String startUrl) {
    final AnyMap parameters = DataFactory.DEFAULT.createAnyMap();
    parameters.put(ImportingConstants.TASK_PARAM_START_URL, startUrl);
    return parameters;
  }
}
