/*******************************************************************************
 * Copyright (c) 2008, 2011 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the
 * accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this
 * distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
 * 
 * Contributors: Juergen Schumacher (Attensity Europe GmbH) - initial API and implementation
 *******************************************************************************/

package org.eclipse.smila.importing.crawler.web.test;

import java.io.IOException;
import java.io.InputStream;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.util.Arrays;

import org.apache.commons.io.IOUtils;
import org.apache.http.client.RedirectException;
import org.eclipse.smila.datamodel.AnyMap;
import org.eclipse.smila.datamodel.AnySeq;
import org.eclipse.smila.datamodel.DataFactory;
import org.eclipse.smila.datamodel.Record;
import org.eclipse.smila.http.server.HttpService;
import org.eclipse.smila.importing.ContentFetcher;
import org.eclipse.smila.importing.ImportingConstants;
import org.eclipse.smila.importing.VisitedLinksService;
import org.eclipse.smila.importing.crawler.web.Fetcher;
import org.eclipse.smila.importing.crawler.web.WebCrawlerConstants;
import org.eclipse.smila.importing.crawler.web.WebCrawlerException;
import org.eclipse.smila.importing.crawler.web.fetcher.DefaultFetcher;
import org.eclipse.smila.importing.crawler.web.filter.FilterConfiguration;
import org.eclipse.smila.importing.crawler.web.test.httphandler.RedirectHandler;
import org.eclipse.smila.importing.util.PropertyNameMapper;
import org.eclipse.smila.utils.config.ConfigUtils;

/**
 * test class for {@link DefaultFetcher}. All tests run against the test web server expected at
 * <code>http://localhost:8765</code> and use the web crawling context provided by the base class.
 */
public class TestDefaultFetcher extends WebExtractorTestBase {

  /** URL of the directory served by the test web server; the tests expect it to deliver index.html. */
  private static final String FILES_URL = "http://localhost:8765/files/";

  /** URL prefix of the redirect handler, the number of the page must be appended. */
  private static final String REDIRECT_URL = "http://localhost:8765/smila/redirect?page=";

  /** URL at which the tests expect every followed redirect chain to end. */
  private static final String REDIRECT_TARGET_URL = REDIRECT_URL + "0";

  /** size in bytes the tests expect for index.html. */
  private static final int INDEX_HTML_SIZE = 83;

  /** size in bytes the tests expect for plain.txt. */
  private static final int PLAIN_TXT_SIZE = 18;

  /** size in bytes the tests expect for icon.png. */
  private static final int ICON_PNG_SIZE = 1157;

  /** the fetcher service under test. */
  private Fetcher _fetcher;

  /** visited links service, cleared before each test. */
  private VisitedLinksService _visitedLinks;

  /**
   * mapper used to look up the record attribute name for the URL.
   * NOTE(review): read once at construction time and not refreshed when a test calls
   * initWebCrawlingContext(...) - confirm that the mapping does not depend on the task parameters.
   */
  private final PropertyNameMapper _mapper = _webCrawlingContext.getMapper();

  /** look up the services used by the tests and reset the visited links. */
  @Override
  protected void setUp() throws Exception {
    _fetcher = getService(Fetcher.class);
    _visitedLinks = getService(VisitedLinksService.class);
    _visitedLinks.clearAll();
  }

  /** assert that we are testing the right implementation. */
  public void testService() throws Exception {
    assertTrue(_fetcher instanceof DefaultFetcher);
    assertNotNull(_visitedLinks);
  }

  /** just check if the test web server has been started correctly and delivers the expected page. */
  public void testWebServerAvailable() throws Exception {
    final HttpService server = getService(HttpService.class);
    assertNotNull(server);
    final URL testUrl = new URL(FILES_URL);
    final InputStream testStream = testUrl.openStream();
    try {
      final byte[] content = IOUtils.toByteArray(testStream);
      assertTrue("unexpected content", Arrays.equals(getFileContent("index.html"), content));
    } finally {
      IOUtils.closeQuietly(testStream);
    }
  }

  /** crawl a simple HTML page, content is attached immediately for extraction. */
  public void testCrawlHtml() throws Exception {
    final Record link = createLink("testCrawlHtml", FILES_URL);
    _fetcher.crawl(FILES_URL, link, _webCrawlingContext);
    assertTrue(link.hasAttachment(WebCrawlerConstants.ATTACHMENT_CONTENT));
    assertFileContent("index.html", INDEX_HTML_SIZE,
      link.getAttachmentAsBytes(WebCrawlerConstants.ATTACHMENT_CONTENT));
    assertCrawledMetadata(link, "text/html", INDEX_HTML_SIZE);
  }

  /** crawl plain text page, don't fetch content yet. */
  public void testCrawlPlainText() throws Exception {
    final String url = FILES_URL + "plain.txt";
    final Record link = createLink("testCrawlPlainText", url);
    _fetcher.crawl(url, link, _webCrawlingContext);
    assertFalse(link.hasAttachment(WebCrawlerConstants.ATTACHMENT_CONTENT));
    assertCrawledMetadata(link, "text/plain", PLAIN_TXT_SIZE);
  }

  /** crawl binary, don't fetch content yet. */
  public void testCrawlBinary() throws Exception {
    final String url = FILES_URL + "icon.png";
    final Record link = createLink("testCrawlBinary", url);
    _fetcher.crawl(url, link, _webCrawlingContext);
    assertFalse(link.hasAttachment(WebCrawlerConstants.ATTACHMENT_CONTENT));
    assertCrawledMetadata(link, "image/png", ICON_PNG_SIZE);
  }

  /** throw non-recoverable exception when crawling a resource that does not exist. */
  public void testCrawlMissingLink() throws Exception {
    final String url = FILES_URL + "no.such.resource";
    final Record link = createLink("testCrawlMissingLink", url);
    assertCrawlFailsNonRecoverable(url, link);
  }

  /** fetch a single HTML page. no other attributes are set. */
  public void testFetchHtml() throws Exception {
    final Record link = createLink("testFetchHtml", FILES_URL);
    _fetcher.fetch(FILES_URL, link, _webCrawlingContext);
    assertFetchedFile(link, "index.html", INDEX_HTML_SIZE);
  }

  /** fetch a single plain text page. no other attributes are set. */
  public void testFetchPlainText() throws Exception {
    final String url = FILES_URL + "plain.txt";
    final Record link = createLink("testFetchPlainText", url);
    _fetcher.fetch(url, link, _webCrawlingContext);
    assertFetchedFile(link, "plain.txt", PLAIN_TXT_SIZE);
  }

  /** fetch a binary resource. no other attributes are set. */
  public void testFetchBinary() throws Exception {
    final String url = FILES_URL + "icon.png";
    final Record link = createLink("testFetchBinary", url);
    _fetcher.fetch(url, link, _webCrawlingContext);
    assertFetchedFile(link, "icon.png", ICON_PNG_SIZE);
  }

  /** throw non-recoverable exception when fetching a resource that does not exist. */
  public void testFetchMissingLink() throws Exception {
    final String url = FILES_URL + "no.such.resource";
    final Record link = createLink("testFetchMissingLink", url);
    try {
      _fetcher.fetch(url, link, _webCrawlingContext);
      fail("should not work");
    } catch (final WebCrawlerException ex) {
      assertFalse(ex.isRecoverable());
    }
  }

  /** crawling a redirected URL without enabling redirects in the filter configuration must fail. */
  public void testCrawlNoRedirect() throws Exception {
    // this link is redirected to "http://localhost:8765/files/"
    final String url = "http://localhost:8765/files";
    final Record link = createLink("testCrawlNoRedirect", url);
    assertCrawlFailsNonRecoverable(url, link);
  }

  /** follow a single redirect, the record must describe the redirect target afterwards. */
  public void testCrawlSimpleRedirect() throws Exception {
    final byte[] expectedContent = expectedRedirectContent();
    final String url = REDIRECT_URL + "1";
    final Record link = createLink("testCrawlSimpleRedirect", url);
    initFilters(createFollowRedirectsConfig());

    _fetcher.crawl(url, link, _webCrawlingContext);
    assertRedirectTargetCrawled(link, expectedContent);
  }

  /** follow a chain of redirects that is exactly as long as the configured maximum. */
  public void testCrawlManyRedirects() throws Exception {
    final byte[] expectedContent = expectedRedirectContent();
    final String url = REDIRECT_URL + "10";
    final Record link = createLink("testCrawlSimpleRedirect", url);
    final AnyMap configAny = createFollowRedirectsConfig();
    configAny.put(FilterConfiguration.MAX_REDIRECTS, 10);
    initFilters(configAny);

    _fetcher.crawl(url, link, _webCrawlingContext);
    assertRedirectTargetCrawled(link, expectedContent);
  }

  /** a chain of redirects longer than the configured maximum must fail with a RedirectException. */
  public void testCrawlTooManyRedirects() throws Exception {
    final String url = REDIRECT_URL + "10";
    final Record link = createLink("testCrawlSimpleRedirect", url);
    final AnyMap configAny = createFollowRedirectsConfig();
    configAny.put(FilterConfiguration.MAX_REDIRECTS, 5);
    initFilters(configAny);

    assertCrawlFailsWithRedirectException(url, link);
  }

  /** a redirect to a URL matching an exclude pattern must fail with a RedirectException. */
  public void testCrawlFilteredRedirects() throws Exception {
    final String url = REDIRECT_URL + "10";
    final Record link = createLink("testCrawlSimpleRedirect", url);
    final AnyMap configAny = createFollowRedirectsConfig();
    configAny.put(FilterConfiguration.MAX_REDIRECTS, 10);
    final AnyMap urlPatterns = DataFactory.DEFAULT.createAnyMap();
    configAny.put(FilterConfiguration.URL_PATTERNS, urlPatterns);
    final AnySeq excludePatterns = DataFactory.DEFAULT.createAnySeq();
    urlPatterns.put(FilterConfiguration.EXCLUDE_PATTERNS, excludePatterns);
    excludePatterns.add(".*page=5");
    initFilters(configAny);

    assertCrawlFailsWithRedirectException(url, link);
  }

  /**
   * redirect targets are marked as visited while following redirects, and a later redirect chain that runs into an
   * already visited URL must fail with a RedirectException.
   */
  public void testCrawlVisitedRedirects() throws Exception {
    final byte[] expectedContent = expectedRedirectContent();
    final AnyMap configAny = createFollowRedirectsConfig();
    configAny.put(FilterConfiguration.MAX_REDIRECTS, 10);
    initFilters(configAny);

    final String firstUrl = REDIRECT_URL + "2";
    final Record firstLink = createLink("testCrawlSimpleRedirect", firstUrl);
    _fetcher.crawl(firstUrl, firstLink, _webCrawlingContext);
    assertRedirectTargetCrawled(firstLink, expectedContent);

    assertFalse(isVisited(REDIRECT_URL + "2")); // initial url is marked by crawler worker
    assertTrue(isVisited(REDIRECT_URL + "1"));
    assertTrue(isVisited(REDIRECT_URL + "0"));

    // now test with a url that has more redirects and comes across redirects that have already been visited
    final String secondUrl = REDIRECT_URL + "3";
    final Record secondLink = createLink("testCrawlSimpleRedirect", secondUrl);
    assertCrawlFailsWithRedirectException(secondUrl, secondLink);

    assertFalse(isVisited(REDIRECT_URL + "3")); // initial url is marked by crawler worker
    assertTrue(isVisited(REDIRECT_URL + "2"));
    assertTrue(isVisited(REDIRECT_URL + "1"));
    assertTrue(isVisited(REDIRECT_URL + "0"));
  }

  /** test the {@link ContentFetcher} method. */
  public void testContentFetcherHtml() throws Exception {
    final Record link = DataFactory.DEFAULT.createRecord("testContentFetcherHtml", getClass().getName());
    // the URL is stored under the constant attribute name here, not under the mapped one.
    link.getMetadata().put(WebCrawlerConstants.ATTRIBUTE_URL, FILES_URL);
    final InputStream contentStream = _fetcher.getContent(link, _webCrawlingContext.getTaskContext());
    try {
      assertFileContent("index.html", INDEX_HTML_SIZE, IOUtils.toByteArray(contentStream));
    } finally {
      // close quietly so that a failing close() cannot hide an assertion failure from the try block.
      IOUtils.closeQuietly(contentStream);
    }
  }

  /** test the socket timeout, that has been configured to 5 sec (see webcrawler.properties). */
  public void testSocketTimeout() throws Exception {
    final String url = "http://localhost:8765/smila/wait/";
    final Record link = createLink(getName(), url);
    final long startTime = System.nanoTime();
    try {
      _fetcher.crawl(url, link, _webCrawlingContext);
      fail("should not work");
    } catch (final WebCrawlerException ex) {
      assertTrue(ex.getCause() instanceof SocketTimeoutException);
    }
    // integer division: the duration is truncated to full seconds.
    final long durationInSeconds = (System.nanoTime() - startTime) / 1000000000;

    assertTrue(durationInSeconds > 4);
    assertTrue(durationInSeconds < 7);
  }

  /** create a record with the given ID and this class name as source, and store the URL in the mapped attribute. */
  private Record createLink(final String recordId, final String url) throws Exception {
    final Record link = DataFactory.DEFAULT.createRecord(recordId, getClass().getName());
    link.getMetadata().put(_mapper.get(WebCrawlerConstants.ATTRIBUTE_URL).get(0), url);
    return link;
  }

  /** create a filter configuration that enables following of redirects. */
  private AnyMap createFollowRedirectsConfig() {
    final AnyMap configAny = DataFactory.DEFAULT.createAnyMap();
    configAny.put(FilterConfiguration.FOLLOW_REDIRECTS, true);
    return configAny;
  }

  /** initialize the web crawling context with task parameters containing the given filter configuration. */
  private void initFilters(final AnyMap filterConfig) throws Exception {
    final AnyMap taskParameters = DataFactory.DEFAULT.createAnyMap();
    taskParameters.put(ImportingConstants.TASK_PARAM_FILTERS, filterConfig);
    initWebCrawlingContext(taskParameters);
  }

  /**
   * get the content of the redirect handler as bytes. The tests assert that the response is declared as UTF-8, so the
   * expected bytes are encoded as UTF-8, too, instead of using the platform default charset.
   */
  private byte[] expectedRedirectContent() throws Exception {
    return RedirectHandler.createStaticContent().getBytes("UTF-8");
  }

  /** check if the URL is marked as visited for the dummy data source and job run of the base class. */
  private boolean isVisited(final String url) throws Exception {
    return _visitedLinks.isVisited(DUMMY_DATA_SOURCE_ID, url, DUMMY_JOB_RUN_ID);
  }

  /** assert that the actual bytes have the expected size and are equal to the content of the test file. */
  private void assertFileContent(final String fileName, final int expectedSize, final byte[] actual)
    throws IOException {
    assertEquals(expectedSize, actual.length);
    assertTrue("content differs from file " + fileName, Arrays.equals(getFileContent(fileName), actual));
  }

  /**
   * assert the metadata set by crawling a resource of the test file server: content type and mime type are equal, no
   * charset is set, size matches and a last-modified date is available.
   */
  private void assertCrawledMetadata(final Record link, final String mimeType, final int expectedSize)
    throws Exception {
    final AnyMap metadata = link.getMetadata();
    assertEquals(mimeType, metadata.getStringValue(WebCrawlerConstants.ATTRIBUTE_CONTENTTYPE));
    assertEquals(mimeType, metadata.getStringValue(WebCrawlerConstants.ATTRIBUTE_MIMETYPE));
    assertFalse(metadata.containsKey(WebCrawlerConstants.ATTRIBUTE_CHARSET));
    assertEquals(expectedSize, metadata.getLongValue(WebCrawlerConstants.ATTRIBUTE_SIZE).intValue());
    assertTrue(metadata.containsKey(WebCrawlerConstants.ATTRIBUTE_LASTMODIFIED));
    assertTrue(metadata.get(WebCrawlerConstants.ATTRIBUTE_LASTMODIFIED).isDateTime());
  }

  /** assert that a fetched record has the file content attached and contains exactly four metadata entries. */
  private void assertFetchedFile(final Record link, final String fileName, final int expectedSize)
    throws Exception {
    assertTrue(link.hasAttachment(WebCrawlerConstants.ATTACHMENT_CONTENT));
    assertFileContent(fileName, expectedSize, link.getAttachmentAsBytes(WebCrawlerConstants.ATTACHMENT_CONTENT));
    assertEquals(4, link.getMetadata().size()); // only source, id, size and url attribute set.
  }

  /**
   * assert that the record describes the final redirect target: URL attribute replaced by the target URL, content of
   * the redirect handler attached, UTF-8 HTML metadata set and no last-modified date.
   */
  private void assertRedirectTargetCrawled(final Record link, final byte[] expectedContent) throws Exception {
    final AnyMap metadata = link.getMetadata();
    // check that returned URL is different than original URL
    assertEquals(REDIRECT_TARGET_URL, metadata.getStringValue(_mapper.get(WebCrawlerConstants.ATTRIBUTE_URL).get(0)));
    assertTrue(link.hasAttachment(WebCrawlerConstants.ATTACHMENT_CONTENT));
    final byte[] content = link.getAttachmentAsBytes(WebCrawlerConstants.ATTACHMENT_CONTENT);
    assertEquals(expectedContent.length, content.length);
    assertTrue("content differs from redirect handler content", Arrays.equals(expectedContent, content));
    assertEquals("text/html;charset=UTF-8", metadata.getStringValue(WebCrawlerConstants.ATTRIBUTE_CONTENTTYPE));
    assertEquals("text/html", metadata.getStringValue(WebCrawlerConstants.ATTRIBUTE_MIMETYPE));
    assertEquals("UTF-8", metadata.getStringValue(WebCrawlerConstants.ATTRIBUTE_CHARSET));
    assertEquals(expectedContent.length, metadata.getLongValue(WebCrawlerConstants.ATTRIBUTE_SIZE).intValue());
    assertFalse(metadata.containsKey(WebCrawlerConstants.ATTRIBUTE_LASTMODIFIED));
  }

  /** assert that crawling the URL fails with a non-recoverable {@link WebCrawlerException}. */
  private void assertCrawlFailsNonRecoverable(final String url, final Record link) throws Exception {
    try {
      _fetcher.crawl(url, link, _webCrawlingContext);
      fail("should not work");
    } catch (final WebCrawlerException ex) {
      assertFalse(ex.isRecoverable());
    }
  }

  /** assert that crawling the URL fails with a {@link WebCrawlerException} caused by a RedirectException. */
  private void assertCrawlFailsWithRedirectException(final String url, final Record link) throws Exception {
    try {
      _fetcher.crawl(url, link, _webCrawlingContext);
      fail("expected RedirectException");
    } catch (final WebCrawlerException e) {
      assertTrue(e.getCause() instanceof RedirectException);
    }
  }

  /** read the content of a test file from the bundle configuration, the stream is closed afterwards. */
  private byte[] getFileContent(final String fileName) throws IOException {
    final InputStream fileStream = ConfigUtils.getConfigStream(AllTests.BUNDLE_ID, "files/" + fileName);
    try {
      return IOUtils.toByteArray(fileStream);
    } finally {
      IOUtils.closeQuietly(fileStream);
    }
  }

}
