/*******************************************************************************
 * Copyright (c) 2008, 2011 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the
 * accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this
 * distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
 * 
 * Contributors: Andreas Weber (Attensity Europe GmbH) - initial API and implementation
 *******************************************************************************/
package org.eclipse.smila.importing.crawler.web.test;

import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.eclipse.smila.datamodel.DataFactory;
import org.eclipse.smila.datamodel.Record;
import org.eclipse.smila.importing.crawler.web.LinkExtractor;
import org.eclipse.smila.importing.crawler.web.WebCrawlerConstants;
import org.eclipse.smila.importing.crawler.web.extractor.DefaultLinkExtractor;
import org.eclipse.smila.importing.crawler.web.extractor.LinkExtractorHtmlSoup;

/**
 * Test for {@link DefaultLinkExtractor} class.
 * <p>
 * Every test wraps an HTML snippet into a record whose URL attribute is {@link #BASE_URI_WITH_PATH_AND_FILE}, runs the
 * extractor on it and compares the URLs of the returned records with the expected absolute links. Most tests keep
 * their data in a map with the link as written in the HTML as key and the expected extracted URL as value; only the
 * values are used for the comparison.
 */
public class TestDefaultLinkExtractor extends WebHelperTestBase {

  private static final String BASE_URI = "http://www.empolis.com";

  private static final String BASE_URI_WITH_PATH = BASE_URI + "/p";

  private static final String BASE_URI_WITH_PATH_AND_FILE = BASE_URI_WITH_PATH + "/test.html";

  /** explicit charset for encoding the test HTML, so that the fixture does not depend on the platform default. */
  private static final Charset HTML_CHARSET = Charset.forName("UTF-8");

  private DefaultLinkExtractor _extractor;

  /** fetches the {@link LinkExtractor} service under test. */
  @Override
  protected void setUp() throws Exception {
    super.setUp();
    _extractor = (DefaultLinkExtractor) getService(LinkExtractor.class);
  }

  /** test for extracting absolute link. */
  public void testSimpleHref() throws Exception {
    final String link = BASE_URI_WITH_PATH + "/link.html";
    final Map<String, String> testData = new HashMap<String, String>();
    testData.put(link, link);
    final Record inputRecord = createInputRecord("<html> <a href=\"" + link + "\"/> </html>");
    doTest(inputRecord, testData);
  }

  /** test for extracting absolute link with incomplete html: the anchor tag is never closed. */
  public void testSimpleHrefIncompleteHtml() throws Exception {
    final String link = BASE_URI_WITH_PATH + "/link.htm";
    final Map<String, String> testData = new HashMap<String, String>();
    testData.put(link, link);
    final Record inputRecord = createInputRecord("<html> <a href=\"" + link + "\"");
    doTest(inputRecord, testData);
  }

  /** test for extracting relative links, which are expected to be resolved against the URL of the input record. */
  public void testRelativeLinks() throws Exception {
    final Map<String, String> testData = new HashMap<String, String>();
    // key: test link, value: expected absolute link
    testData.put("/link1.html", BASE_URI + "/link1.html");
    testData.put("link2.html", BASE_URI_WITH_PATH + "/link2.html");
    testData.put("../link3.htm", BASE_URI + "/link3.htm");
    testData.put("./link4.htm", BASE_URI_WITH_PATH + "/link4.htm");
    final Record inputRecord = createInputRecord(createHtmlString(testData));
    doTest(inputRecord, testData);
  }

  /**
   * tests that links can be extracted even if HTML is malformed. This test succeeds for tagsoup but fails for nekohtml.
   */
  public void testMalformedHtml() throws Exception {
    final String link1 = BASE_URI_WITH_PATH + "/link1.html";
    final String link2 = BASE_URI_WITH_PATH + "/link2.html";
    final Map<String, String> testData = new HashMap<String, String>();
    testData.put(link1, link1);
    testData.put(link2, link2);
    // wrongly nested elements and mixed-case tag and attribute names
    final String htmlString = "<p> <title> </p> " //
      + "<a href=\"" + link1 + "\"> </A>" //
      + "<p> </title> </p> </p>" //
      + "<A HrEF=\"" + link2 + "\"> </a>";
    // this only works for tagsoup, which is the parser used by doTest()
    doTest(createInputRecord(htmlString), testData);
  }

  /** test for extracting links from 'FRAME' element. */
  public void testFrameLink() throws Exception {
    final String link1 = BASE_URI_WITH_PATH + "/navigation_Left.htm";
    final String link2 = BASE_URI_WITH_PATH + "/introduction.htm";
    final Map<String, String> testData = new HashMap<String, String>();
    testData.put(link1, link1);
    testData.put(link2, link2);
    // the frames reference their sources relatively, the expected links are absolute
    final String htmlString = "<frameset> " //
      + "<frame name=\"Navigation_Frame\" src=\"navigation_Left.htm\" marginheight=\"0\" marginwidth=\"0\"/>" //
      + "<FRAME name=\"ContentFrame\" SRC=\"introduction.htm\" marginheight=\"0\" marginwidth=\"0\"/>" //
      + "</frameset>";
    // this only works for tagsoup, which is the parser used by doTest()
    doTest(createInputRecord(htmlString), testData);
  }

  /** test for extracting links from 'IMG' element. */
  public void testImageLink() throws Exception {
    final String link1 = BASE_URI_WITH_PATH + "/icon.gif";
    final String link2 = BASE_URI_WITH_PATH + "/images/picture.jpg";
    final Map<String, String> testData = new HashMap<String, String>();
    testData.put(link1, link1);
    testData.put(link2, link2);
    final String htmlString = "<html><body><img src=\"" + link1 + "\"> <img src=\"" + link2 + "\"></body></html>";
    // this only works for tagsoup, which is the parser used by doTest()
    doTest(createInputRecord(htmlString), testData);
  }

  /** test with unescaped links: blanks are expected to be escaped as %20 in the extracted URLs. */
  public void testUnescapedLink() throws Exception {
    final String link1 = BASE_URI_WITH_PATH + "/test1.html";
    final String linkUnescapedRel = "this link is not escaped/?query=a b"; // yes it can be extracted!
    final String linkUnescapedAbs = "http://www.attensity.com/this link is not escaped/?query=a b";
    final String link2 = BASE_URI_WITH_PATH + "/test2.html";
    final Map<String, String> testData = new HashMap<String, String>();
    testData.put(link1, link1);
    testData.put(linkUnescapedRel, BASE_URI_WITH_PATH + "/this%20link%20is%20not%20escaped/?query=a%20b");
    testData.put(linkUnescapedAbs, "http://www.attensity.com/this%20link%20is%20not%20escaped/?query=a%20b");
    testData.put(link2, link2);
    final String htmlString = "<title>" //
      + anchor(link1) //
      + anchor(linkUnescapedRel) //
      + anchor(linkUnescapedAbs) //
      + anchor(link2);
    doTest(createInputRecord(htmlString), testData);
  }

  /** test with escaped links: already escaped links are expected to stay escaped. */
  public void testEscapedLink() throws Exception {
    final String link1 = BASE_URI_WITH_PATH + "/test1.html";
    final String linkEscapedAbs = "http://www.empolis.com/p/this%20link%20is%20escaped/?query=a%20b";
    final String linkEscapedRel = "this%20link%20is%20escaped/?query=a%20b";
    final String link2 = BASE_URI_WITH_PATH + "/test2.html";
    final Map<String, String> testData = new HashMap<String, String>();
    testData.put(link1, link1);
    testData.put(linkEscapedAbs, linkEscapedAbs);
    // the relative variant is expected to resolve to the same URL as the absolute one
    testData.put(linkEscapedRel, linkEscapedAbs);
    testData.put(link2, link2);
    final String htmlString = "<title>" //
      + anchor(link1) //
      + anchor(linkEscapedAbs) //
      + anchor(linkEscapedRel) //
      + anchor(link2);
    doTest(createInputRecord(htmlString), testData);
  }

  /** test link normalization. */
  public void testLinkNormalization() throws Exception {
    final String link0 = "http://www.attensity.com"; // / is added if path is empty
    final String link1 = "HTtp://WWW.Attensity.com:8080/Test1.html"; // -> scheme and host are converted to lower case
    final String link2 = "http://www.attensity.com#fragment"; // -> fragment parts are removed
    final String link3 = "http://www.attensity.com/?Query=q&query2=q"; // -> query parts remain
    final String link4 = "http://www.attensity.com:80/port"; // -> default port 80 is removed
    final String link5 = "http://www.attensity.com/test unescaped"; // -> link will be escaped
    final String link6 = "http://www.attensity.com/test%20escaped"; // -> escaped link remains
    final String link7 = "http://www.attensity.com/path/../path2"; // -> path is normalized
    final String link8 = "javascript:void(0);"; // invalid, not expected in the result
    final String link9 = "mailto:andreas.weber@empolis.com"; // invalid, not expected in the result

    final Map<String, String> testData = new HashMap<String, String>();
    // link0 and link2 have the same expected URL, so the expected list contains that URL twice
    testData.put(link0, "http://www.attensity.com/");
    testData.put(link1, "http://www.attensity.com:8080/Test1.html");
    testData.put(link2, "http://www.attensity.com/");
    testData.put(link3, link3);
    testData.put(link4, "http://www.attensity.com/port");
    testData.put(link5, "http://www.attensity.com/test%20unescaped");
    testData.put(link6, link6);
    testData.put(link7, "http://www.attensity.com/path2");
    // link8 and link9 are part of the HTML but intentionally have no entry in the expected data
    final String htmlString = "<title>" //
      + anchor(link0) //
      + anchor(link1) //
      + anchor(link2) //
      + anchor(link3) //
      + anchor(link4) //
      + anchor(link5) //
      + anchor(link6) //
      + anchor(link7) //
      + anchor(link8) //
      + anchor(link9);
    doTest(createInputRecord(htmlString), testData);
  }

  /** tests that "base"-tag is considered for relative links. */
  public void testRelativeLinksWithBase() throws Exception {
    final String link1 = "http://www.empolis.com/link1";
    final String link2 = "http://www.tecinno.de/link2";
    final Map<String, String> testData = new HashMap<String, String>();
    testData.put(link1, link1);
    testData.put(link2, link2);
    // the base href has no "/p" path, so "link1" must not be resolved against the URL of the input record
    final String htmlString = "<html>" //
      + "<base href=\"http://www.empolis.com/\">" //
      + "<bla/>" //
      + "<a href=\"link1\"/>" //
      + "<a href=\"http://www.tecinno.de/link2\"/>";
    doTest(createInputRecord(htmlString), testData);
  }

  /** test with a commented-out link which should not be extracted. */
  public void testOutcommentedLinks() throws Exception {
    final String link1 = BASE_URI_WITH_PATH + "/link1.html";
    final String link2 = BASE_URI_WITH_PATH + "/link2.html";
    final String link3 = BASE_URI_WITH_PATH + "/link3.html";
    final Map<String, String> testData = new HashMap<String, String>();
    // link2 is inside an HTML comment and therefore not part of the expected data
    testData.put(link1, link1);
    testData.put(link3, link3);
    final String htmlString = "<title>" //
      + anchor(link1) //
      + "<!-- " //
      + anchor(link2) //
      + "--> " //
      + anchor(link3);
    doTest(createInputRecord(htmlString), testData);
  }

  /**
   * runs the link extraction with the tagsoup based HTML parser and checks the result. Only
   * {@link LinkExtractorHtmlSoup} is exercised here; no other HTML parser implementation is tested.
   * 
   * @param inputRecord
   *          record containing the HTML to extract links from.
   * @param testData
   *          key: link as contained in the HTML, value: expected extracted link. Only the values are checked.
   */
  private void doTest(final Record inputRecord, final Map<String, String> testData) throws Exception {
    // tagsoup
    _extractor.setLinkExtractorHtml(new LinkExtractorHtmlSoup());
    final Collection<Record> result = _extractor.extractLinks(inputRecord, _webCrawlingContext);
    checkResults(new ArrayList<String>(testData.values()), result);
  }

  /**
   * helper method to create an anchor element for a link.
   * 
   * @param href
   *          the link, inserted as-is (no escaping) as value of the href attribute.
   * @return HTML snippet of an anchor element pointing to the given link.
   */
  private String anchor(final String href) {
    return "<a href=\"" + href + "\"> </a>";
  }

  /**
   * helper method to create html input from map with URIs.
   * 
   * @param uriMap
   *          map whose keys are written as links to the HTML. The values are ignored.
   * @return HTML snippet with one anchor per key, each followed by some junk markup.
   */
  private String createHtmlString(final Map<String, String> uriMap) {
    final StringBuilder html = new StringBuilder("<html>");
    for (final String uri : uriMap.keySet()) {
      html.append("<a href=\"").append(uri).append("\"/>");
      // arbitrary junk markup between the links
      html.append("<irgendein> html <schrott> der dazwischen <steht>");
    }
    return html.toString();
  }

  /**
   * helper method to create input record from html.
   * 
   * @param htmlContent
   *          the HTML to set as content attachment.
   * @return new record with the encoded HTML as attachment and {@link #BASE_URI_WITH_PATH_AND_FILE} as URL.
   */
  private Record createInputRecord(final String htmlContent) {
    final Record r = DataFactory.DEFAULT.createRecord();
    // encode with a fixed charset instead of the platform default to keep the test platform independent
    r.setAttachment(WebCrawlerConstants.ATTACHMENT_CONTENT, htmlContent.getBytes(HTML_CHARSET));
    r.getMetadata().put(WebCrawlerConstants.ATTRIBUTE_URL, BASE_URI_WITH_PATH_AND_FILE);
    return r;
  }

  /**
   * helper method to check the extracted links. The order of the links is not relevant, but the number of
   * occurrences of each link is.
   * 
   * @param expectedLinks
   *          expected URLs. The list is modified: null entries are removed and the list is sorted.
   * @param actualRecords
   *          the records created by the extractor, their URL attributes are compared to the expected URLs.
   */
  private void checkResults(final List<String> expectedLinks, final Collection<Record> actualRecords) {
    // null entries are dropped from the expected list before comparing
    expectedLinks.removeAll(Collections.singleton(null));
    assertEquals(expectedLinks.size(), actualRecords.size());
    final List<String> actualLinks = new ArrayList<String>(actualRecords.size());
    for (final Record r : actualRecords) {
      actualLinks.add(r.getMetadata().getStringValue(WebCrawlerConstants.ATTRIBUTE_URL));
    }
    Collections.sort(expectedLinks);
    Collections.sort(actualLinks);
    assertEquals(expectedLinks, actualLinks);
  }
}
