/*******************************************************************************
 * Copyright (c) 2008, 2011 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the
 * accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this
 * distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
 * 
 * Contributors: Andreas Weber (Attensity Europe GmbH) - initial API and implementation
 *******************************************************************************/
package org.eclipse.smila.importing.crawler.web.test;

import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.eclipse.smila.datamodel.DataFactory;
import org.eclipse.smila.datamodel.Record;
import org.eclipse.smila.importing.crawler.web.LinkExtractor;
import org.eclipse.smila.importing.crawler.web.WebCrawlerConstants;
import org.eclipse.smila.importing.crawler.web.extractor.LinkExtractorHtmlNeko;
import org.eclipse.smila.importing.crawler.web.extractor.LinkExtractorHtmlSoup;
import org.eclipse.smila.importing.crawler.web.extractor.SimpleLinkExtractor;
import org.eclipse.smila.taskworker.DefaultTaskLogFactory;
import org.eclipse.smila.taskworker.TaskLog;
import org.eclipse.smila.test.DeclarativeServiceTestCase;

/**
 * Test for {@link SimpleLinkExtractor} class.
 * 
 * Each test builds a small HTML snippet, wraps it into an input record whose URL is
 * {@link #BASE_URI_WITH_PATH_AND_FILE}, runs the extractor and compares the URLs of the returned records with the
 * expected absolute links (order is ignored).
 * 
 * NOTE(review): the extractor is looked up as a service and its HTML parser is replaced by the tests without being
 * restored afterwards - confirm that no other test relies on the parser that was configured before.
 */
public class TestSimpleLinkExtractor extends DeclarativeServiceTestCase {

  private static final String BASE_URI = "http://www.attensity.com";

  private static final String BASE_URI_WITH_PATH = BASE_URI + "/p";

  private static final String BASE_URI_WITH_PATH_AND_FILE = BASE_URI_WITH_PATH + "/test.html";

  /** charset used to encode the HTML test content, fixed so that the tests do not depend on the platform default. */
  private static final Charset CONTENT_CHARSET = Charset.forName("UTF-8");

  private SimpleLinkExtractor _extractor;

  private final TaskLog _taskLog = new DefaultTaskLogFactory().getTaskLog(null);

  /** looks up the {@link LinkExtractor} service, which is expected to be a {@link SimpleLinkExtractor}. */
  @Override
  protected void setUp() throws Exception {
    super.setUp();
    _extractor = (SimpleLinkExtractor) getService(LinkExtractor.class);
  }

  /** test for extracting absolute link. */
  public void testSimpleHref() throws Exception {
    final Map<String, String> testData = new HashMap<String, String>();
    final String link = BASE_URI_WITH_PATH + "/link.html";
    testData.put(link, link);
    final String htmlString = "<html> <a href=\"" + link + "\"/> </html>";
    final Record inputRecord = createInputRecord(htmlString);
    doTest(inputRecord, testData);
  }

  /** test for extracting absolute link with incomplete html (the anchor tag and the document are never closed). */
  public void testSimpleHrefIncompleteHtml() throws Exception {
    final Map<String, String> testData = new HashMap<String, String>();
    final String link = BASE_URI_WITH_PATH + "/link.htm";
    testData.put(link, link);
    final String htmlString = "<html> <a href=\"" + link + "\"";
    final Record inputRecord = createInputRecord(htmlString);
    doTest(inputRecord, testData);
  }

  /** test for extracting relative links, which must be resolved against the URL of the input record. */
  public void testRelativeLinks() throws Exception {
    final Map<String, String> testData = new HashMap<String, String>();
    // key: test link, value: expected absolute link
    testData.put("/link1.html", BASE_URI + "/link1.html");
    testData.put("link2.html", BASE_URI_WITH_PATH + "/link2.html");
    testData.put("../link3.htm", BASE_URI + "/link3.htm");
    testData.put("./link4.htm", BASE_URI_WITH_PATH + "/link4.htm");
    final String htmlString = createHtmlString(testData);
    final Record inputRecord = createInputRecord(htmlString);
    doTest(inputRecord, testData);
  }

  /**
   * tests that links can be extracted even if HTML is malformed. This test succeeds for tagsoup but fails for nekohtml.
   */
  public void testMalformedHtml() throws Exception {
    final String link1 = BASE_URI_WITH_PATH + "/link1.html";
    final String link2 = BASE_URI_WITH_PATH + "/link2.html";
    final Map<String, String> testData = new HashMap<String, String>();
    testData.put(link1, link1);
    testData.put(link2, link2);
    final String htmlString = "<p> <title> </p> " //
      + "<a href=\"" + link1 + "\"> </A>" //
      + "<p> </title> </p> </p>" //
      + "<A HrEF=\"" + link2 + "\"> </a>"; //
    final Record inputRecord = createInputRecord(htmlString);
    // this only works for tagsoup!
    doTestWithTagSoup(inputRecord, testData);
  }

  /**
   * test for extracting links from 'FRAME' element. The 'src' attributes are relative, so the expected links are
   * resolved against the URL of the input record.
   */
  public void testFrameLink() throws Exception {
    final Map<String, String> testData = new HashMap<String, String>();
    final String link1 = BASE_URI_WITH_PATH + "/navigation_Left.htm";
    final String link2 = BASE_URI_WITH_PATH + "/introduction.htm";
    testData.put(link1, link1);
    testData.put(link2, link2);
    final String htmlString = "<frameset> " //
      + "<frame name=\"Navigation_Frame\" src=\"navigation_Left.htm\" marginheight=\"0\" marginwidth=\"0\"/>" //
      + "<FRAME name=\"ContentFrame\" SRC=\"introduction.htm\" marginheight=\"0\" marginwidth=\"0\"/>" //
      + "</frameset>";
    final Record inputRecord = createInputRecord(htmlString);
    // this only works for tagsoup!
    doTestWithTagSoup(inputRecord, testData);
  }

  /** test for extracting links from 'IMG' element. */
  public void testImageLink() throws Exception {
    final Map<String, String> testData = new HashMap<String, String>();
    final String link1 = BASE_URI_WITH_PATH + "/icon.gif";
    final String link2 = BASE_URI_WITH_PATH + "/images/picture.jpg";
    testData.put(link1, link1);
    testData.put(link2, link2);
    final String htmlString = "<html><body><img src=\"" + link1 + "\"> <img src=\"" + link2 + "\"></body></html>";
    final Record inputRecord = createInputRecord(htmlString);
    // this only works for tagsoup!
    doTestWithTagSoup(inputRecord, testData);
  }

  /**
   * test with unescaped link: a link containing blanks and a '+' is expected to be resolved against the URL of the
   * input record and returned in escaped form, without affecting the valid links around it.
   */
  public void testInvalidLink() throws Exception {
    final Map<String, String> testData = new HashMap<String, String>();
    final String link1 = BASE_URI_WITH_PATH + "/test1.html";
    final String linkUnescaped = "this link + is not escaped";
    final String link2 = BASE_URI_WITH_PATH + "/test2.html";
    testData.put(link1, link1);
    testData.put(linkUnescaped, "http://www.attensity.com/p/this%20link%20%2B%20is%20not%20escaped");
    testData.put(link2, link2);
    final String htmlString = "<title>" // 
      + "<a href=\"" + link1 + "\"> </a>" //
      + "<a href=\"" + linkUnescaped + "\"> </a>" //
      + "<a href=\"" + link2 + "\"> </a>"; //
    final Record inputRecord = createInputRecord(htmlString);
    doTest(inputRecord, testData);
  }

  /**
   * do testing with nekohtml and tagsoup html parser.
   * 
   * @param inputRecord
   *          record with HTML attachment and URL attribute
   * @param testData
   *          map whose values are the expected absolute links
   */
  private void doTest(final Record inputRecord, final Map<String, String> testData) throws Exception {
    doTestWithNeko(inputRecord, testData);
    doTestWithTagSoup(inputRecord, testData);
  }

  /** switch the extractor to the nekohtml parser, extract the links and check them against the map values. */
  private void doTestWithNeko(final Record inputRecord, final Map<String, String> testData) throws Exception {
    _extractor.setLinkExtractorHtml(new LinkExtractorHtmlNeko());
    final Collection<Record> result = _extractor.extractLinks(inputRecord, null, _taskLog);
    checkResults(new ArrayList<String>(testData.values()), result);
  }

  /** switch the extractor to the tagsoup parser, extract the links and check them against the map values. */
  private void doTestWithTagSoup(final Record inputRecord, final Map<String, String> testData) throws Exception {
    _extractor.setLinkExtractorHtml(new LinkExtractorHtmlSoup());
    final Collection<Record> result = _extractor.extractLinks(inputRecord, null, _taskLog);
    checkResults(new ArrayList<String>(testData.values()), result);
  }

  /**
   * helper method to create html input from map with URIs.
   * 
   * @param uriMap
   *          map whose keys are written as 'href' values of anchor elements
   * @return unclosed HTML document with one anchor per key, each followed by some junk markup
   */
  private String createHtmlString(final Map<String, String> uriMap) {
    final StringBuilder html = new StringBuilder("<html>");
    for (final String uri : uriMap.keySet()) {
      html.append("<a href=\"").append(uri).append("\"/>");
      // German filler, roughly "<some> html <junk> that <sits> in between": unknown tags the parser has to cope with
      html.append("<irgendein> html <schrott> der dazwischen <steht>");
    }
    return html.toString();
  }

  /**
   * helper method to create input record from html.
   * 
   * @param htmlContent
   *          HTML to store as content attachment
   * @return new record with the content attachment and {@link #BASE_URI_WITH_PATH_AND_FILE} as URL attribute
   */
  private Record createInputRecord(final String htmlContent) {
    final Record r = DataFactory.DEFAULT.createRecord();
    // explicit charset: the no-arg getBytes() would encode with the platform default
    final byte[] htmlBytes = htmlContent.getBytes(CONTENT_CHARSET);
    r.setAttachment(WebCrawlerConstants.ATTACHMENT_CONTENT, htmlBytes);
    r.getMetadata().put(WebCrawlerConstants.ATTRIBUTE_URL, BASE_URI_WITH_PATH_AND_FILE);
    return r;
  }

  /**
   * helper method to check the extracted links. Null entries in the expected links are ignored and the order of the
   * links is not significant.
   * 
   * @param expectedLinks
   *          expected absolute links, not modified by this method
   * @param actualRecords
   *          records returned by the extractor, their URL attribute is compared
   */
  private void checkResults(final List<String> expectedLinks, final Collection<Record> actualRecords) {
    // work on a copy so that the caller's list is neither filtered nor re-ordered
    final List<String> expected = new ArrayList<String>(expectedLinks);
    expected.removeAll(Collections.singleton(null));
    assertEquals(expected.size(), actualRecords.size());
    final List<String> actualLinks = new ArrayList<String>(actualRecords.size());
    for (final Record r : actualRecords) {
      actualLinks.add(r.getMetadata().getStringValue(WebCrawlerConstants.ATTRIBUTE_URL));
    }
    Collections.sort(expected);
    Collections.sort(actualLinks);
    assertEquals(expected, actualLinks);
  }
}
