/*********************************************************************************************************************
 * Copyright (c) 2008, 2013 Empolis Information Management GmbH and brox IT Solutions GmbH. All rights reserved. This
 * program and the accompanying materials are made available under the terms of the Eclipse Public License v1.0 which
 * accompanies this distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
 *********************************************************************************************************************/
package org.eclipse.smila.importing.crawler.web.test;

import java.io.InputStream;
import java.util.List;
import java.util.Locale;

import org.eclipse.smila.datamodel.AnyMap;
import org.eclipse.smila.datamodel.DataFactory;
import org.eclipse.smila.datamodel.Record;
import org.eclipse.smila.datamodel.ipc.BinaryObjectStreamIterator;
import org.eclipse.smila.importing.crawler.web.WebCrawlerConstants;
import org.eclipse.smila.importing.crawler.web.WebCrawlerConstants.ErrorHandling;
import org.eclipse.smila.jobmanager.definitions.JobDefinition;
import org.eclipse.smila.jobmanager.definitions.JobManagerConstants;
import org.eclipse.smila.objectstore.StoreObject;

/**
 * Tests for {@link org.eclipse.smila.importing.crawler.web.WebFetcherWorker}.
 *
 * @author scum36
 *
 */
public class TestWebFetcherWorker extends WebWorkerTestBase {
  /** name of the job that is defined and started by every test. */
  private static final String JOBNAME = "fetchWeb";

  /** bucket whose bulks are read by the check methods. */
  private static final String BUCKET = "fetchedLinks";

  /** name of the metadata attribute holding the URL to fetch. */
  private static final String ATTRIBUTE_URL = "httpUrl";

  /** name of the attachment checked for fetched content. */
  private static final String ATTACHMENT_CONTENT = "httpContent";

  /** value passed to waitForTasksFinished() in every test (presumably milliseconds - confirm in test base). */
  private static final int TASKS_FINISHED_TIMEOUT = 7000;

  /**
   * check that with "linkErrorHandling"="retry" the task is retried if a record cannot be fetched because of an IO
   * error.
   */
  public void testInvalidLinkRetryOnIOError() throws Exception {
    final String jobRunId = startJob(ErrorHandling.RETRY);
    try {
      // the valid link is added so that the task doesn't fail because output is empty
      fetchUrls(BASEURL + "links1.html", INVALID_URL);
      assertTaskCounters(jobRunId, 1, 0, 1, 10);
      checkFetchedRecords(0);
    } finally {
      _jobRunEngine.cancelJob(JOBNAME, jobRunId);
    }
  }

  /**
   * check that with "linkErrorHandling"="ignore" the task will not be retried in case of error and the record will be
   * written to the output anyway.
   */
  public void testInvalidLinkIgnoreOnIOError() throws Exception {
    final String jobRunId = startJob(ErrorHandling.IGNORE);
    try {
      // since this is the only record, this would lead to a failed task under settings other than "ignore",
      // because the output would be empty
      fetchUrls(INVALID_URL);
      // 1x webFetcher + 1 xBB
      assertTaskCounters(jobRunId, 2, 0, 0, 0);
      checkFetchedEmptyRecords(1, INVALID_URL);
    } finally {
      _jobRunEngine.cancelJob(JOBNAME, jobRunId);
    }
  }

  /**
   * check that with "linkErrorHandling"="drop" the task is not retried if a record cannot be fetched because of an IO
   * error.
   */
  public void testInvalidLinkDropOnIOError() throws Exception {
    final String jobRunId = startJob(ErrorHandling.DROP);
    try {
      // the valid link is added so that the task doesn't fail because output is empty
      fetchUrls(BASEURL + "links1.html", INVALID_URL);
      assertTasksSuccessful(jobRunId);
      checkFetchedRecords(1, BASEURL + "links1.html");
    } finally {
      _jobRunEngine.cancelJob(JOBNAME, jobRunId);
    }
  }

  /**
   * check that "linkErrorHandling"="drop" is the default value. This test is identical to
   * {@link #testInvalidLinkDropOnIOError()} apart from the job initialisation.
   */
  public void testDefaultIsDrop() throws Exception {
    final String jobRunId = startJob(null);
    try {
      // the valid link is added so that the task doesn't fail because output is empty
      fetchUrls(BASEURL + "links1.html", INVALID_URL);
      assertTasksSuccessful(jobRunId);
      checkFetchedRecords(1, BASEURL + "links1.html");
    } finally {
      _jobRunEngine.cancelJob(JOBNAME, jobRunId);
    }
  }

  /**
   * check that with "linkErrorHandling"="retry" the task is not retried if a record cannot be fetched because the
   * link does not exist on the server, and that the invalid record is not dropped.
   */
  public void testInvalidLinkNoRetryOnNotFoundError() throws Exception {
    final String jobRunId = startJob(ErrorHandling.RETRY);
    try {
      // the valid link is added so that the task doesn't fail because output is empty
      fetchUrls(BASEURL + "links1.html", BASEURL + "no-such-link");
      assertTasksSuccessful(jobRunId);
      checkFetchedRecords(1, BASEURL + "links1.html", BASEURL + "no-such-link");
    } finally {
      _jobRunEngine.cancelJob(JOBNAME, jobRunId);
    }
  }

  /**
   * check that with "linkErrorHandling"="drop" the task is not retried if a record cannot be fetched because the
   * link does not exist on the server, and that the invalid record is not dropped.
   */
  public void testInvalidLinkNoDropOnNotFoundError() throws Exception {
    final String jobRunId = startJob(ErrorHandling.DROP);
    try {
      // the valid link is added so that the task doesn't fail because output is empty
      fetchUrls(BASEURL + "links1.html", BASEURL + "no-such-link");
      assertTasksSuccessful(jobRunId);
      checkFetchedRecords(1, BASEURL + "links1.html", BASEURL + "no-such-link");
    } finally {
      _jobRunEngine.cancelJob(JOBNAME, jobRunId);
    }
  }

  /**
   * check the job succeeds on invalid URIs and that the record with the invalid URI is still written to the output.
   */
  public void testInvalidUriNoDrop() throws Exception {
    final String jobRunId = startJob(ErrorHandling.DROP);
    try {
      // the valid link is added so that the task doesn't fail because output is empty
      fetchUrls(BASEURL + "links1.html", BASEURL + "document[id]/4711/");
      assertTasksSuccessful(jobRunId);
      checkFetchedRecords(1, BASEURL + "links1.html", BASEURL + "document[id]/4711/");
    } finally {
      _jobRunEngine.cancelJob(JOBNAME, jobRunId);
    }
  }

  /** create a record with a URL to fetch. */
  private Record createRecord(final String url) {
    final Record record = DataFactory.DEFAULT.createRecord("web:" + url, "web");
    record.getMetadata().put(ATTRIBUTE_URL, url);
    return record;
  }

  /**
   * add one record per URL (in the given order) to the job, commit the bulk and wait until the tasks are finished.
   */
  private void fetchUrls(final String... urls) throws Exception {
    for (final String url : urls) {
      _bulkbuilder.addRecord(JOBNAME, createRecord(url));
    }
    _bulkbuilder.commitJob(JOBNAME);
    waitForTasksFinished(TASKS_FINISHED_TIMEOUT);
  }

  /**
   * start a bulkbuilder+webFetcher job with the given error handling mode.
   *
   * @param errorHandlingMode
   *          mode to set as job parameter, or null to leave the parameter unset.
   * @return id of the started job run.
   */
  private String startJob(final ErrorHandling errorHandlingMode) throws Exception {
    final AnyMap jobDef = DataFactory.DEFAULT.createAnyMap();
    jobDef.put("name", JOBNAME);
    jobDef.put("workflow", "webFetching");
    final AnyMap params = jobDef.getMap("parameters", true);
    params.put("tempStore", STORENAME);
    params.put("store", STORENAME);
    params.put("dataSource", "web");
    if (errorHandlingMode != null) {
      // Locale.ROOT: the default-locale variant would e.g. turn "IGNORE" into "\u0131gnore" under a Turkish locale.
      params.put(WebCrawlerConstants.TASK_PARAM_LINK_ERROR_HANDLING,
        errorHandlingMode.toString().toLowerCase(Locale.ROOT));
    }
    final AnyMap mapping = params.getMap("mapping", true);
    mapping.put(ATTRIBUTE_URL, "httpUrl");
    mapping.put(ATTACHMENT_CONTENT, "httpContent");
    _defPersistence.addJob(new JobDefinition(jobDef));
    return _jobRunEngine.startJob(JOBNAME);
  }

  /** assert that the job run has two successful tasks and no failed or retried tasks. */
  private void assertTasksSuccessful(final String jobRunId) throws Exception {
    assertTaskCounters(jobRunId, 2, 0, 0, 0);
  }

  /**
   * assert the values of the task counters of the given job run. The complete counter map is part of the failure
   * message.
   */
  private void assertTaskCounters(final String jobRunId, final int successful, final int failedNotRetried,
    final int failedRetried, final int retriedByWorker) throws Exception {
    final AnyMap jobData = _jobRunDataProvider.getJobRunData(JOBNAME, jobRunId);
    final AnyMap taskCounter = jobData.getMap(JobManagerConstants.TASK_COUNTER);
    final String message = "Counter: " + taskCounter;
    assertEquals(message, successful,
      taskCounter.getLongValue(JobManagerConstants.DATA_JOB_NO_OF_SUCCESSFUL_TASKS).intValue());
    assertEquals(message, failedNotRetried,
      taskCounter.getLongValue(JobManagerConstants.DATA_JOB_NO_OF_FAILED_TASKS_NOT_RETRIED).intValue());
    assertEquals(message, failedRetried,
      taskCounter.getLongValue(JobManagerConstants.DATA_JOB_NO_OF_FAILED_TASKS_RETRIED).intValue());
    assertEquals(message, retriedByWorker,
      taskCounter.getLongValue(JobManagerConstants.DATA_JOB_NO_OF_RETRIED_TASKS_WORKER).intValue());
  }

  /**
   * check number of bulks in the output bucket and the records contained in them: the records must have the given
   * URLs in the given order, and a record must have a content attachment exactly if its URL ends with ".html".
   */
  private void checkFetchedRecords(final int expectedNumberOfBulks, final String... urls) throws Exception {
    final List<StoreObject> bulks = getSortedBulks(BUCKET, expectedNumberOfBulks);
    int recordCount = 0;
    if (expectedNumberOfBulks > 0) {
      for (final StoreObject bulk : bulks) {
        try (InputStream bulkStream = _objectStore.readObject(STORENAME, bulk.getId());
          BinaryObjectStreamIterator bulkReader = new BinaryObjectStreamIterator(bulkStream)) {
          while (bulkReader.hasNext()) {
            final Record record = bulkReader.next();
            assertTrue("found more records than expected", recordCount < urls.length);
            final String expectedUrl = urls[recordCount];
            assertEquals(expectedUrl, record.getMetadata().getStringValue(ATTRIBUTE_URL));
            assertEquals(expectedUrl.endsWith(".html"), record.hasAttachment(ATTACHMENT_CONTENT));
            recordCount++;
          }
        }
      }
    }
    // expected value first, so that a failure reports the numbers the right way round.
    assertEquals("found too few records", urls.length, recordCount);
  }

  /**
   * check number of bulks in the output bucket and 'not set content' of the records contained in them: the records
   * must have the given URLs in the given order.
   */
  private void checkFetchedEmptyRecords(final int expectedNumberOfBulks, final String... urls) throws Exception {
    final List<StoreObject> bulks = getSortedBulks(BUCKET, expectedNumberOfBulks);
    int recordCount = 0;
    if (expectedNumberOfBulks > 0) {
      for (final StoreObject bulk : bulks) {
        try (InputStream bulkStream = _objectStore.readObject(STORENAME, bulk.getId());
          BinaryObjectStreamIterator bulkReader = new BinaryObjectStreamIterator(bulkStream)) {
          while (bulkReader.hasNext()) {
            final Record record = bulkReader.next();
            assertTrue("found more records than expected", recordCount < urls.length);
            final String expectedUrl = urls[recordCount];
            assertEquals(expectedUrl, record.getMetadata().getStringValue(ATTRIBUTE_URL));
            // NOTE(review): this only asserts "no content" for URLs ending with ".html"; for any other URL it
            // requires the attachment to be present. Presumably INVALID_URL ends with ".html" - confirm.
            assertEquals(expectedUrl.endsWith(".html"), !record.hasAttachment(ATTACHMENT_CONTENT));
            recordCount++;
          }
        }
      }
    }
    // expected value first, so that a failure reports the numbers the right way round.
    assertEquals("found too few records", urls.length, recordCount);
  }
}
