/*******************************************************************************
 * Copyright (c) 2008, 2011 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the
 * accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this
 * distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
 * 
 * Contributors: Juergen Schumacher (Attensity Europe GmbH) - initial API and implementation
 *******************************************************************************/

package org.eclipse.smila.importing.crawler.web.test;

import java.io.InputStream;
import java.net.InetAddress;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.Set;

import org.eclipse.smila.datamodel.AnyMap;
import org.eclipse.smila.datamodel.AnySeq;
import org.eclipse.smila.datamodel.DataFactory;
import org.eclipse.smila.datamodel.Record;
import org.eclipse.smila.datamodel.ipc.BinaryObjectStreamIterator;
import org.eclipse.smila.importing.ImportingConstants;
import org.eclipse.smila.importing.crawler.web.WebCrawlerConstants;
import org.eclipse.smila.importing.crawler.web.WebCrawlerConstants.ErrorHandling;
import org.eclipse.smila.importing.crawler.web.WebCrawlerWorker;
import org.eclipse.smila.importing.crawler.web.filter.FilterConfiguration;
import org.eclipse.smila.importing.crawler.web.test.httphandler.HostReplaceHandler;
import org.eclipse.smila.importing.crawler.web.test.util.RobotsTxtUtil;
import org.eclipse.smila.jobmanager.JobState;
import org.eclipse.smila.jobmanager.definitions.JobDefinition;
import org.eclipse.smila.jobmanager.definitions.JobManagerConstants;
import org.eclipse.smila.jobmanager.definitions.JobRunMode;
import org.eclipse.smila.objectstore.StoreObject;
import org.eclipse.smila.taskworker.Worker;
import org.eclipse.smila.utils.config.ConfigUtils;
import org.eclipse.smila.utils.service.ServiceUtils;
import org.osgi.framework.ServiceReference;

/** test class for WebCrawlerWorker. */
public class TestWebCrawlerWorker extends WebWorkerTestBase {
  /** base name of the crawl jobs; the job template is looked up under this name plus "Template". */
  private static final String JOBNAME_CRAWLWEB = "crawlWeb";

  /** bucket prefix used to look up the bulks containing outgoing link records. */
  private static final String BUCKET_LINKS = "outgoingLinks/";

  /** bucket prefix used to look up the bulks containing crawled records. */
  private static final String BUCKET_RECORDS = "crawledRecords/";

  /**
   * checks that one of the registered {@link Worker} services is a {@link WebCrawlerWorker} and that it reports the
   * worker name "webCrawler".
   */
  @SuppressWarnings("rawtypes")
  public void testService() throws Exception {
    final ServiceReference[] workerReferences = ServiceUtils.getServiceReferences(Worker.class);
    assertTrue("no worker services started.", workerReferences.length > 0);
    // stop resolving services as soon as the first web crawler worker has been found.
    Worker webCrawler = null;
    for (int i = 0; webCrawler == null && i < workerReferences.length; i++) {
      final Worker candidate = ServiceUtils.getService(workerReferences[i], Worker.class);
      if (candidate instanceof WebCrawlerWorker) {
        webCrawler = candidate;
      }
    }
    if (webCrawler == null) {
      fail("WebCrawlerWorker not found");
    }
    assertEquals("webCrawler", webCrawler.getName());
  }

  /** crawl links1.html: expects index.html as the only outgoing link and one bulk with the start page record. */
  public void testCrawlPageWithOneLink() throws Exception {
    final String startFile = "links1.html";
    runWebCrawlerJob(startFile);
    checkOutgoingLinks(BASEURL + "index.html");
    checkCrawledRecords(1, BASEURL + startFile);
  }

  /** crawl a page with an external link that is removed by an include pattern admitting only URLs below BASEURL. */
  public void testCrawlPageWithFilteringExternalLink() throws Exception {
    final String startFile = "links2.html";

    // build the filter configuration inside-out: include patterns -> url patterns -> filters
    final AnySeq includes = DataFactory.DEFAULT.createAnySeq();
    includes.add(BASEURL + ".*");
    final AnyMap patterns = DataFactory.DEFAULT.createAnyMap();
    patterns.put(FilterConfiguration.INCLUDE_PATTERNS, includes);
    final AnyMap filters = DataFactory.DEFAULT.createAnyMap();
    filters.put(FilterConfiguration.URL_PATTERNS, patterns);

    runWebCrawlerJob(startFile, filters);
    checkOutgoingLinks(BASEURL + "index.html");
    checkCrawledRecords(1, BASEURL + startFile);
  }

  /**
   * crawl a page containing different kinds of links (with fragment, non-html, with parameters). An exclude pattern
   * drops every URL containing a '?', so only the parameterless links are expected as outgoing links.
   */
  public void testCrawlPageWithMixedLinks() throws Exception {
    final String startFile = "links3.html";

    // build the filter configuration inside-out: exclude patterns -> url patterns -> filters
    final AnySeq excludes = DataFactory.DEFAULT.createAnySeq();
    excludes.add(".*\\?.*");
    final AnyMap patterns = DataFactory.DEFAULT.createAnyMap();
    patterns.put(FilterConfiguration.EXCLUDE_PATTERNS, excludes);
    final AnyMap filters = DataFactory.DEFAULT.createAnyMap();
    filters.put(FilterConfiguration.URL_PATTERNS, patterns);

    runWebCrawlerJob(startFile, filters);
    checkOutgoingLinks(BASEURL + "index.html", BASEURL + "plain.txt", BASEURL + "icon.png");
    checkCrawledRecords(1, BASEURL + startFile);
  }

  /**
   * same as the mixed links test, but with a start page that causes an IO error while being parsed with the tagsoup
   * parser. The same outgoing links are expected nevertheless.
   */
  public void testCrawlPageWithMixedLinksAndTagsoupParseError() throws Exception {
    final String startFile = "links3_tagsouperror.html";

    // build the filter configuration inside-out: exclude patterns -> url patterns -> filters
    final AnySeq excludes = DataFactory.DEFAULT.createAnySeq();
    excludes.add(".*\\?.*");
    final AnyMap patterns = DataFactory.DEFAULT.createAnyMap();
    patterns.put(FilterConfiguration.EXCLUDE_PATTERNS, excludes);
    final AnyMap filters = DataFactory.DEFAULT.createAnyMap();
    filters.put(FilterConfiguration.URL_PATTERNS, patterns);

    runWebCrawlerJob(startFile, filters);
    checkOutgoingLinks(BASEURL + "index.html", BASEURL + "plain.txt", BASEURL + "icon.png");
    checkCrawledRecords(1, BASEURL + startFile);
  }

  /** crawl a page with eleven links; every link must show up as outgoing link and no record is a compound. */
  public void testCrawlPageWithManyLinks() throws Exception {
    final String startFile = "links11.html";
    runWebCrawlerJob(startFile);
    // the links are expected in exactly this order.
    final String[] expectedLinks =
      { BASEURL + "page00.html", BASEURL + "page01.html", BASEURL + "page10.html", BASEURL + "page02.html",
        BASEURL + "page03.html", BASEURL + "page04.html", BASEURL + "page05.html", BASEURL + "page06.html",
        BASEURL + "page07.html", BASEURL + "page08.html", BASEURL + "page09.html" };
    checkOutgoingLinks(expectedLinks);
    for (final Record crawled : checkCrawledRecords(1, BASEURL + startFile)) {
      assertFalse(crawled.getMetadata().containsKey(ImportingConstants.ATTRIBUTE_COMPOUNDFLAG));
    }
  }

  /**
   * crawl a page with links to several hosts using the "stay on host" filter: only the links page01 to page08 below
   * the start URL's host are expected as outgoing links, and no crawled record may be flagged as compound.
   */
  public void testCrawlPageWithManyLinksStayOnHost() throws Exception {
    // Locale.ROOT keeps the lower-casing independent of the JVM default locale (e.g. Turkish dotless i),
    // which would otherwise produce a host name that does not match the crawled URLs.
    final String canonicalHostName = InetAddress.getLocalHost().getCanonicalHostName().toLowerCase(Locale.ROOT);
    final AnyMap params = DataFactory.DEFAULT.createAnyMap();
    final String baseUrl = "http://" + canonicalHostName + ":8765/smila/hostreplace/";

    final AnyMap filtersAny = DataFactory.DEFAULT.createAnyMap();
    filtersAny.put(FilterConfiguration.STAY_ON, FilterConfiguration.StayOn.host.name());
    params.put(ImportingConstants.TASK_PARAM_FILTERS, filtersAny);

    runWebCrawlerJob("links11_moreHosts.html", params, baseUrl);
    checkOutgoingLinks(baseUrl + "page01.html", baseUrl + "page02.html", baseUrl + "page03.html", baseUrl
      + "page04.html", baseUrl + "page05.html", baseUrl + "page06.html", baseUrl + "page07.html", baseUrl
      + "page08.html");
    final List<Record> records = checkCrawledRecords(1, baseUrl + "links11_moreHosts.html");
    for (final Record record : records) {
      assertFalse(record.getMetadata().containsKey(ImportingConstants.ATTRIBUTE_COMPOUNDFLAG));
    }
  }

  /**
   * crawl a page with links to several hosts using the "stay on domain" filter: links to the start host and to
   * another host name derived from it by replacing the first name segment are expected as outgoing links, and no
   * crawled record may be flagged as compound.
   */
  public void testCrawlPageWithManyLinksStayOnDomain() throws Exception {
    // Locale.ROOT keeps the lower-casing independent of the JVM default locale (e.g. Turkish dotless i),
    // which would otherwise produce a host name that does not match the crawled URLs.
    final String canonicalHostName = InetAddress.getLocalHost().getCanonicalHostName().toLowerCase(Locale.ROOT);
    // replace the first segment of the host name (up to and including the first dot) to get a second host name.
    final String canonicalHostNameOtherHostFromThisDomain =
      canonicalHostName.replaceFirst("[^\\.]*\\.", HostReplaceHandler.I_DO_NOT_EXIST);
    final AnyMap params = DataFactory.DEFAULT.createAnyMap();
    final String baseUrl = "http://" + canonicalHostName + ":8765/smila/hostreplace/";
    final String baseUrlOtherHost =
      "http://" + canonicalHostNameOtherHostFromThisDomain + ":8765/smila/hostreplace/";

    final AnyMap filtersAny = DataFactory.DEFAULT.createAnyMap();
    filtersAny.put(FilterConfiguration.STAY_ON, FilterConfiguration.StayOn.domain.name());
    params.put(ImportingConstants.TASK_PARAM_FILTERS, filtersAny);

    runWebCrawlerJob("links11_moreHosts.html", params, baseUrl);
    checkOutgoingLinks(baseUrlOtherHost + "page00.html", baseUrl + "page01.html", baseUrlOtherHost + "page10.html",
      baseUrl + "page02.html", baseUrl + "page03.html", baseUrl + "page04.html", baseUrl + "page05.html", baseUrl
        + "page06.html", baseUrl + "page07.html", baseUrl + "page08.html", baseUrlOtherHost + "page09.html");
    final List<Record> records = checkCrawledRecords(1, baseUrl + "links11_moreHosts.html");
    for (final Record record : records) {
      assertFalse(record.getMetadata().containsKey(ImportingConstants.ATTRIBUTE_COMPOUNDFLAG));
    }
  }

  /** crawl plain.txt: no outgoing links are expected and the single record must not carry the compound flag. */
  public void testCrawlTextResource() throws Exception {
    final String resource = "plain.txt";
    runWebCrawlerJob(resource);
    checkOutgoingLinks();
    final Record crawled = checkCrawledRecords(1, BASEURL + resource).get(0);
    assertFalse(crawled.getMetadata().containsKey(ImportingConstants.ATTRIBUTE_COMPOUNDFLAG));
  }

  /** crawl icon.png: no outgoing links are expected and the single record must not carry the compound flag. */
  public void testCrawlBinaryResource() throws Exception {
    final String resource = "icon.png";
    runWebCrawlerJob(resource);
    checkOutgoingLinks();
    final Record crawled = checkCrawledRecords(1, BASEURL + resource).get(0);
    assertFalse(crawled.getMetadata().containsKey(ImportingConstants.ATTRIBUTE_COMPOUNDFLAG));
  }

  /**
   * crawl a zip archive: no outgoing links are expected, and the record must be flagged as compound, have no
   * attachments, have mime type "application/zip" and contain the mapped last-modified attribute.
   */
  public void testCrawlCompoundWithOneLink() throws Exception {
    final String archive = "document_in_compound.zip";
    runWebCrawlerJob(archive);
    checkOutgoingLinks();
    for (final Record compound : checkCrawledRecords(1, BASEURL + archive)) {
      final AnyMap metadata = compound.getMetadata();
      assertTrue(metadata.getBooleanValue(ImportingConstants.ATTRIBUTE_COMPOUNDFLAG));
      assertFalse(compound.hasAttachments());
      final String mimeTypeKey = _mapper.get(WebCrawlerConstants.ATTRIBUTE_MIMETYPE).get(0);
      assertEquals("application/zip", metadata.getStringValue(mimeTypeKey));
      final String lastModifiedKey = _mapper.get(WebCrawlerConstants.ATTRIBUTE_LASTMODIFIED).get(0);
      assertTrue(metadata.containsKey(lastModifiedKey));
    }
  }

  /**
   * test a workflow in which the start URLs are pushed via the bulkbuilder: three start URLs are committed one after
   * another within a single job run, and the run must finish with three successful workflow runs.
   */
  public void testPushStartUrlsCrawlJob() throws Exception {
    final String job = "crawlMultipleStartUrls";
    final String runId = _jobRunEngine.startJob(job);
    final Record urlRecord = DataFactory.DEFAULT.createRecord("startUrl");

    // first start URL: links1.html and the linked index.html are both crawled.
    urlRecord.getMetadata().put(WebCrawlerConstants.ATTRIBUTE_URL, BASEURL + "links1.html");
    _bulkbuilder.addRecord(job, urlRecord);
    _bulkbuilder.commitJob(job);
    waitForTasksFinished(5000);
    checkOutgoingLinks();
    checkCrawledRecords(2, BASEURL + "links1.html", BASEURL + "index.html");
    _objectStore.clearStore(STORENAME);

    // second start URL: links2.html points to index.html, but that was visited already.
    urlRecord.getMetadata().put(WebCrawlerConstants.ATTRIBUTE_URL, BASEURL + "links2.html");
    _bulkbuilder.addRecord(job, urlRecord);
    _bulkbuilder.commitJob(job);
    waitForTasksFinished(5000);
    checkOutgoingLinks();
    checkCrawledRecords(1, BASEURL + "links2.html");
    _objectStore.clearStore(STORENAME);

    // third start URL: links11.html points to 10 unknown pages, but that should not disturb the job run.
    urlRecord.getMetadata().put(WebCrawlerConstants.ATTRIBUTE_URL, BASEURL + "links11.html");
    _bulkbuilder.addRecord(job, urlRecord);
    _bulkbuilder.commitJob(job);
    waitForTasksFinished(5000);
    checkOutgoingLinks();
    checkCrawledRecords(1, BASEURL + "links11.html");

    _jobRunEngine.finishJob(job, runId);
    waitForJobRunSucceeded(job, runId, 10000);
    final AnyMap workflowRunCounter =
      _jobRunDataProvider.getJobRunData(job, runId).getMap(JobManagerConstants.WORKFLOW_RUN_COUNTER);
    final int successfulWorkflowRuns =
      workflowRunCounter.getLongValue(JobManagerConstants.DATA_JOB_NO_OF_SUCCESSFUL_WORKFLOW_RUNS).intValue();
    assertEquals(3, successfulWorkflowRuns);
  }

  /**
   * test a workflow in which the start URLs are pushed via the bulkbuilder, using two consecutive runs of the same
   * job: the second run gets the start URL of the first run again plus an additional one.
   */
  public void testPushStartUrlsCrawlJobMultipleJobRuns() throws Exception {
    final String job = "crawlMultipleStartUrls";
    final Record urlRecord = DataFactory.DEFAULT.createRecord("startUrl");

    // first job run: a single start URL.
    final String firstRunId = _jobRunEngine.startJob(job);
    urlRecord.getMetadata().put(WebCrawlerConstants.ATTRIBUTE_URL, BASEURL + "links1.html");
    _bulkbuilder.addRecord(job, urlRecord);
    _bulkbuilder.commitJob(job);
    waitForTasksFinished(5000);
    checkOutgoingLinks();
    checkCrawledRecords(2, BASEURL + "links1.html", BASEURL + "index.html");
    _jobRunEngine.finishJob(job, firstRunId);
    _objectStore.clearStore(STORENAME);
    waitForJobRunSucceeded(job, firstRunId, 10000);

    // second job run: the record is added once with the previous URL and once with the new URL.
    final String secondRunId = _jobRunEngine.startJob(job);
    _bulkbuilder.addRecord(job, urlRecord);
    urlRecord.getMetadata().put(WebCrawlerConstants.ATTRIBUTE_URL, BASEURL + "links2.html");
    _bulkbuilder.addRecord(job, urlRecord);
    _bulkbuilder.commitJob(job);
    waitForTasksFinished(5000);
    checkOutgoingLinks();
    checkCrawledRecords(2, BASEURL + "links1.html", BASEURL + "index.html", BASEURL + "links2.html");
    _jobRunEngine.finishJob(job, secondRunId);
    _objectStore.clearStore(STORENAME);
    waitForJobRunSucceeded(job, secondRunId, 10000);
  }

  /**
   * test if the crawler uses the configurable "User-Agent" string: the "userAgent" property from
   * webcrawler.properties must be set and differ from the default, and the content attachment(s) of the single
   * record crawled from "smila/useragent" must equal that value.
   */
  public void testUserAgentHeader() throws Exception {
    final Properties props =
      ConfigUtils.getConfigProperties("org.eclipse.smila.importing.crawler.web", "webcrawler.properties");
    final String expectedUserAgent = props.getProperty("userAgent");
    // fail with a readable message instead of a NullPointerException if the property is not configured.
    assertNotNull("property 'userAgent' not found in webcrawler.properties", expectedUserAgent);
    assertFalse("configured user agent must differ from the default user agent",
      expectedUserAgent.equals(WebCrawlerConstants.DEFAULT_USERAGENT));
    runWebCrawlerJob("smila/useragent", null, TESTSERVER);
    final List<StoreObject> bulks = getSortedBulks(BUCKET_RECORDS, 1);
    assertEquals(1, bulks.size());
    try (BinaryObjectStreamIterator bulkReader =
      new BinaryObjectStreamIterator(_objectStore.readObject(STORENAME, bulks.get(0).getId()))) {
      assertTrue(bulkReader.hasNext());
      final Record r = bulkReader.next();
      for (final String attachmentName : _mapper.get(WebCrawlerConstants.ATTACHMENT_CONTENT)) {
        assertEquals(expectedUserAgent,
          new String(r.getAttachmentAsBytes(attachmentName), StandardCharsets.UTF_8));
      }
      // exactly one record is expected in the bulk.
      assertFalse(bulkReader.hasNext());
    }
  }

  /**
   * crawl tests concerning robots.txt: the same start page is crawled repeatedly while the robots.txt served for it
   * is changed between the runs (missing, disallow/allow rules for different user agents, error responses).
   */
  public void testRobotsTxt() throws Exception {
    final String startFile = "links_robots.html";
    final String startUrl = BASEURL + startFile;
    final String indexLink = BASEURL + "index.html";
    final String textLink = BASEURL + "plain.txt";
    final String iconLink = BASEURL + "icon.png";
    final Map<String, List<String>> robotsMap = new LinkedHashMap<>();

    // without robots.txt (-> 404)
    runWebCrawlerJob(startFile);
    checkOutgoingLinks(indexLink, textLink, iconLink);
    checkCrawledRecords(1, startUrl);
    resetStore();

    // with robots.txt that disallows anything
    robotsMap.put("SMILA", Arrays.asList("/"));
    RobotsTxtUtil.putRobotsTxt(robotsMap);
    runWebCrawlerJob(startFile, JobState.FAILED);
    checkOutgoingLinks();
    checkCrawledRecords(0);
    resetStore();
    robotsMap.clear();

    // with robots.txt that allows anything
    robotsMap.put("SMILA", Arrays.asList(" "));
    RobotsTxtUtil.putRobotsTxt(robotsMap);
    runWebCrawlerJob(startFile);
    checkOutgoingLinks(indexLink, textLink, iconLink);
    checkCrawledRecords(1, startUrl);
    resetStore();
    robotsMap.clear();

    // with robots.txt that disallows icon.png
    robotsMap.put("SMILA", Arrays.asList("/files/icon.png"));
    RobotsTxtUtil.putRobotsTxt(robotsMap);
    runWebCrawlerJob(startFile);
    checkOutgoingLinks(indexLink, textLink);
    checkCrawledRecords(1, startUrl);
    resetStore();
    robotsMap.clear();

    // with robots.txt that disallows icon.png and index.html
    robotsMap.put("SMILA", Arrays.asList("/files/icon.png", "/files/index.html"));
    RobotsTxtUtil.putRobotsTxt(robotsMap);
    runWebCrawlerJob(startFile);
    checkOutgoingLinks(textLink);
    checkCrawledRecords(1, startUrl);
    resetStore();
    robotsMap.clear();

    // with robots.txt having User-Agent that doesn't fit
    robotsMap.put("Google", Arrays.asList("/"));
    RobotsTxtUtil.putRobotsTxt(robotsMap);
    runWebCrawlerJob(startFile);
    checkOutgoingLinks(indexLink, textLink, iconLink);
    checkCrawledRecords(1, startUrl);
    resetStore();
    robotsMap.clear();

    // with robots.txt having User-Agent '*'
    robotsMap.put("*", Arrays.asList("/"));
    RobotsTxtUtil.putRobotsTxt(robotsMap);
    runWebCrawlerJob(startFile, JobState.FAILED);
    checkOutgoingLinks();
    checkCrawledRecords(0);
    resetStore();
    robotsMap.clear();

    // with robots.txt having different User-Agents
    robotsMap.put("Google", Arrays.asList("/"));
    robotsMap.put("SMILA", Arrays.asList(" "));
    robotsMap.put("Nutch", Arrays.asList("/"));
    RobotsTxtUtil.putRobotsTxt(robotsMap);
    runWebCrawlerJob(startFile);
    checkOutgoingLinks(indexLink, textLink, iconLink);
    checkCrawledRecords(1, startUrl);
    resetStore();
    robotsMap.clear();

    // 401 when reading robots.txt
    RobotsTxtUtil.postRobotsTxtException("401");
    runWebCrawlerJob(startFile, JobState.FAILED);
    checkOutgoingLinks();
    checkCrawledRecords(0);
    resetStore();

    // 403 when reading robots.txt
    RobotsTxtUtil.postRobotsTxtException("403");
    runWebCrawlerJob(startFile, JobState.FAILED);
    checkOutgoingLinks();
    checkCrawledRecords(0);
    resetStore();
  }

  /**
   * with "linkErrorHandling"="retry" an unreachable start URL must fail the job, with the initial task counted as
   * failed after retries and ten worker retries recorded.
   */
  public void testRetryStartUrl() throws Exception {
    final AnyMap counters =
      runErrorHandlingWebCrawlerJob("http://abc.lmn.xyz/index.html", ErrorHandling.RETRY, JobState.FAILED).getMap(
        JobManagerConstants.TASK_COUNTER);
    final int successful = counters.getLongValue(JobManagerConstants.DATA_JOB_NO_OF_SUCCESSFUL_TASKS).intValue();
    assertEquals(0, successful);
    final int failedNotRetried =
      counters.getLongValue(JobManagerConstants.DATA_JOB_NO_OF_FAILED_TASKS_NOT_RETRIED).intValue();
    assertEquals(0, failedNotRetried);
    final int failedRetried =
      counters.getLongValue(JobManagerConstants.DATA_JOB_NO_OF_FAILED_TASKS_RETRIED).intValue();
    assertEquals(1, failedRetried);
    final int workerRetries =
      counters.getLongValue(JobManagerConstants.DATA_JOB_NO_OF_RETRIED_TASKS_WORKER).intValue();
    assertEquals(10, workerRetries);
  }

  /**
   * with "linkErrorHandling"="drop" an unreachable start URL must fail the job immediately: one failed task that
   * was not retried and no retries at all.
   */
  public void testDropStartUrl() throws Exception {
    final AnyMap counters =
      runErrorHandlingWebCrawlerJob("http://abc.lmn.xyz/index.html", ErrorHandling.DROP, JobState.FAILED).getMap(
        JobManagerConstants.TASK_COUNTER);
    final int successful = counters.getLongValue(JobManagerConstants.DATA_JOB_NO_OF_SUCCESSFUL_TASKS).intValue();
    assertEquals(0, successful);
    final int failedNotRetried =
      counters.getLongValue(JobManagerConstants.DATA_JOB_NO_OF_FAILED_TASKS_NOT_RETRIED).intValue();
    assertEquals(1, failedNotRetried);
    final int failedRetried =
      counters.getLongValue(JobManagerConstants.DATA_JOB_NO_OF_FAILED_TASKS_RETRIED).intValue();
    assertEquals(0, failedRetried);
    final int workerRetries =
      counters.getLongValue(JobManagerConstants.DATA_JOB_NO_OF_RETRIED_TASKS_WORKER).intValue();
    assertEquals(0, workerRetries);
  }

  /**
   * check that "drop" is the default error handling: same expectations as {@link #testDropStartUrl()}, but the job is
   * started without setting an explicit error handling mode.
   */
  public void testDropIsDefault() throws Exception {
    final AnyMap counters =
      runErrorHandlingWebCrawlerJob("http://abc.lmn.xyz/index.html", null, JobState.FAILED).getMap(
        JobManagerConstants.TASK_COUNTER);
    final int successful = counters.getLongValue(JobManagerConstants.DATA_JOB_NO_OF_SUCCESSFUL_TASKS).intValue();
    assertEquals(0, successful);
    final int failedNotRetried =
      counters.getLongValue(JobManagerConstants.DATA_JOB_NO_OF_FAILED_TASKS_NOT_RETRIED).intValue();
    assertEquals(1, failedNotRetried);
    final int failedRetried =
      counters.getLongValue(JobManagerConstants.DATA_JOB_NO_OF_FAILED_TASKS_RETRIED).intValue();
    assertEquals(0, failedRetried);
    final int workerRetries =
      counters.getLongValue(JobManagerConstants.DATA_JOB_NO_OF_RETRIED_TASKS_WORKER).intValue();
    assertEquals(0, workerRetries);
  }

  /**
   * with "linkErrorHandling"="retry" an invalid link found in a crawled page must fail the job: the start page task
   * succeeds, the task for the invalid link fails after ten worker retries.
   */
  public void testRetryCrawledUrl() throws Exception {
    final AnyMap counters =
      runErrorHandlingWebCrawlerJob(BASEURL + "invalid-link.html", ErrorHandling.RETRY, JobState.FAILED).getMap(
        JobManagerConstants.TASK_COUNTER);
    final int successful = counters.getLongValue(JobManagerConstants.DATA_JOB_NO_OF_SUCCESSFUL_TASKS).intValue();
    assertEquals(1, successful);
    final int failedNotRetried =
      counters.getLongValue(JobManagerConstants.DATA_JOB_NO_OF_FAILED_TASKS_NOT_RETRIED).intValue();
    assertEquals(0, failedNotRetried);
    final int failedRetried =
      counters.getLongValue(JobManagerConstants.DATA_JOB_NO_OF_FAILED_TASKS_RETRIED).intValue();
    assertEquals(1, failedRetried);
    final int workerRetries =
      counters.getLongValue(JobManagerConstants.DATA_JOB_NO_OF_RETRIED_TASKS_WORKER).intValue();
    assertEquals(10, workerRetries);
  }

  /**
   * with "linkErrorHandling"="drop" an invalid link found in a crawled page must not fail the job: the job succeeds
   * with two successful tasks, no failed tasks and no retries.
   */
  public void testDropCrawledUrl() throws Exception {
    final AnyMap counters =
      runErrorHandlingWebCrawlerJob(BASEURL + "invalid-link.html", ErrorHandling.DROP, JobState.SUCCEEDED).getMap(
        JobManagerConstants.TASK_COUNTER);
    final int successful = counters.getLongValue(JobManagerConstants.DATA_JOB_NO_OF_SUCCESSFUL_TASKS).intValue();
    assertEquals(2, successful);
    final int failedNotRetried =
      counters.getLongValue(JobManagerConstants.DATA_JOB_NO_OF_FAILED_TASKS_NOT_RETRIED).intValue();
    assertEquals(0, failedNotRetried);
    final int failedRetried =
      counters.getLongValue(JobManagerConstants.DATA_JOB_NO_OF_FAILED_TASKS_RETRIED).intValue();
    assertEquals(0, failedRetried);
    final int workerRetries =
      counters.getLongValue(JobManagerConstants.DATA_JOB_NO_OF_RETRIED_TASKS_WORKER).intValue();
    assertEquals(0, workerRetries);
  }

  /**
   * start a webcrawler job based on the second job template (job name base "crawlWeb2") and wait until it has ended.
   * 
   * @param startUrl
   *          complete start URL, it is used as is (no base URL is prepended).
   * @param errorHandling
   *          link error handling mode to set as job parameter (lower-cased), or null to not set the parameter.
   * @param expectedState
   *          state the job run is expected to end in.
   * @return the job run data of the ended job run.
   */
  private AnyMap runErrorHandlingWebCrawlerJob(final String startUrl, final ErrorHandling errorHandling,
    final JobState expectedState) throws Exception {
    final AnyMap params = DataFactory.DEFAULT.createAnyMap();
    if (errorHandling != null) {
      // Locale.ROOT: the parameter value must not depend on the JVM default locale (e.g. Turkish dotless i).
      params.put(WebCrawlerConstants.TASK_PARAM_LINK_ERROR_HANDLING,
        errorHandling.toString().toLowerCase(Locale.ROOT));
    }
    return runWebCrawlerJob(startUrl, JOBNAME_CRAWLWEB + "2", params, "", expectedState);
  }

  /** crawl the given file below BASEURL without any filter parameters and expect the job to succeed. */
  private AnyMap runWebCrawlerJob(final String startFile) throws Exception {
    // the typed null selects the (String, AnyMap) overload instead of the (String, JobState) one.
    final AnyMap noFilters = null;
    return runWebCrawlerJob(startFile, noFilters);
  }

  /**
   * crawl the given file below BASEURL and expect the job to succeed. If filter parameters are given, they are set
   * as job parameter "filters".
   */
  private AnyMap runWebCrawlerJob(final String startFile, final AnyMap filterParams) throws Exception {
    final AnyMap jobParams = DataFactory.DEFAULT.createAnyMap();
    final boolean hasFilters = filterParams != null;
    if (hasFilters) {
      jobParams.put("filters", filterParams);
    }
    return runWebCrawlerJob(startFile, jobParams, BASEURL);
  }

  /** crawl the given file below BASEURL without extra parameters and expect the job run to end in the given state. */
  private AnyMap runWebCrawlerJob(final String startFile, final JobState expectedState) throws Exception {
    final AnyMap noParams = null;
    return runWebCrawlerJob(startFile, noParams, BASEURL, expectedState);
  }

  /**
   * starts a web crawler job with the given parameters and the start URL built from baseUrl and startFile. The job
   * run is expected to end in state SUCCEEDED.
   */
  private AnyMap runWebCrawlerJob(final String startFile, final AnyMap params, final String baseUrl)
    throws Exception {
    return runWebCrawlerJob(startFile, params, baseUrl, JobState.SUCCEEDED);
  }

  /**
   * starts a web crawler job with the given parameters and the start URL built from baseUrl and startFile, using the
   * default job name base {@link #JOBNAME_CRAWLWEB}. The job run is expected to end in the given state.
   */
  private AnyMap runWebCrawlerJob(final String startFile, final AnyMap params, final String baseUrl,
    final JobState expectedState) throws Exception {
    return runWebCrawlerJob(startFile, JOBNAME_CRAWLWEB, params, baseUrl, expectedState);
  }

  /**
   * creates a uniquely named job from the template "&lt;jobNameBase&gt;Template", starts it in RUNONCE mode and waits
   * until the job run has ended.
   * 
   * @param startFile
   *          appended to baseUrl to build the "startUrl" job parameter.
   * @param jobNameBase
   *          prefix of the template name and of the generated job name.
   * @param params
   *          additional job parameters to merge into the template's parameters, may be null.
   * @param baseUrl
   *          prefix of the start URL.
   * @param expectedState
   *          state the job run is expected to end in.
   * @return the job run data of the ended job run.
   */
  private AnyMap runWebCrawlerJob(final String startFile, final String jobNameBase, final AnyMap params,
    final String baseUrl, final JobState expectedState) throws Exception {
    final JobDefinition template = _defPersistence.getJob(jobNameBase + "Template");
    // the nano time suffix makes the job name unique for each invocation.
    final String uniqueJobName = jobNameBase + System.nanoTime();
    final AnyMap jobDefinition = DataFactory.DEFAULT.cloneAnyMap(template.toAny(false));
    jobDefinition.put("name", uniqueJobName);
    final AnyMap jobParameters = jobDefinition.getMap("parameters");
    if (params != null) {
      jobParameters.putAll(params);
    }
    // set last, so the start URL cannot be overwritten by the additional parameters.
    jobParameters.put("startUrl", baseUrl + startFile);
    _defPersistence.addJob(new JobDefinition(jobDefinition));
    final String runId = _jobRunEngine.startJob(uniqueJobName, JobRunMode.RUNONCE);
    waitForJobRunEnded(uniqueJobName, runId, expectedState, 10000);
    return _jobRunDataProvider.getJobRunData(uniqueJobName, runId);
  }

  /**
   * Waits until the given job run is completed and asserts that it ended in state SUCCEEDED. Fails if the job run is
   * not completed within maxWaitTime milliseconds.
   */
  private void waitForJobRunSucceeded(final String jobName, final String jobId, final long maxWaitTime)
    throws Exception {
    waitForJobRunEnded(jobName, jobId, JobState.SUCCEEDED, maxWaitTime);
  }

  /**
   * Polls every 500 ms until the given job run is listed as completed, then asserts that it ended in the expected
   * state. Fails if the job run is still not completed after maxWaitTime milliseconds.
   */
  private void waitForJobRunEnded(final String jobName, final String jobId, final JobState expectedState,
    final long maxWaitTime) throws Exception {
    final long pollIntervalMillis = 500L;
    final long startedAt = System.currentTimeMillis();
    while (!_jobRunDataProvider.getCompletedJobRunIds(jobName).contains(jobId)) {
      final long waitedMillis = System.currentTimeMillis() - startedAt;
      assertTrue("Waited too long for job to complete", waitedMillis <= maxWaitTime);
      Thread.sleep(pollIntervalMillis);
    }
    final String finalState =
      _jobRunDataProvider.getJobRunData(jobName, jobId).getStringValue(JobManagerConstants.DATA_JOB_STATE);
    assertEquals(expectedState, JobState.valueOf(finalState));
  }

  /**
   * check number and content of the outgoing links bulks. Expects each link in a separate bulk, and the links in
   * exactly the given order. Outgoing link records are NOT mapped, so the URL is read from the unmapped URL
   * attribute.
   * 
   * @param urls
   *          the expected link URLs in the expected order; if empty, no bulks must exist.
   */
  private void checkOutgoingLinks(final String... urls) throws Exception {
    final int expectedNumberOfBulks = urls.length;
    final List<StoreObject> bulks = getSortedBulks(BUCKET_LINKS, expectedNumberOfBulks);
    if (expectedNumberOfBulks == 0) {
      assertEquals("Too many bulks", 0, bulks.size());
    } else {
      int recordCount = 0;
      for (final StoreObject bulk : bulks) {
        try (InputStream bulkStream = _objectStore.readObject(STORENAME, bulk.getId());
          BinaryObjectStreamIterator bulkReader = new BinaryObjectStreamIterator(bulkStream)) {
          while (bulkReader.hasNext()) {
            final Record linkRecord = bulkReader.next();
            System.out.println("Outgoing Link: " + linkRecord);
            // guard the array access below, so surplus links fail with a message instead of an index error.
            assertTrue("Too many links", recordCount < urls.length);
            assertEquals(urls[recordCount],
              linkRecord.getMetadata().getStringValue(WebCrawlerConstants.ATTRIBUTE_URL));
            recordCount++;
          }
        }
      }
      // expected value first, so the failure message reports expected/actual the right way round.
      assertEquals("Too few links.", urls.length, recordCount);
    }
  }

  /**
   * check number and content of the crawled records bulks. For each record the source, the mapped URL, size,
   * last-modified, mime type and content type attributes and the delta hash are checked. Records with mime type
   * "text/html" must have the content attachment(s) with the reported size, all other records must not have them.
   * 
   * @param expectedNumberOfBulks
   *          number of bulks expected in the records bucket; if 0, no record content is checked.
   * @param urls
   *          the URLs expected in the crawled records, in any order; the total number of records must equal the
   *          number of URLs.
   * @return all records read from the bulks.
   */
  private List<Record> checkCrawledRecords(final int expectedNumberOfBulks, final String... urls) throws Exception {
    final List<StoreObject> bulks = getSortedBulks(BUCKET_RECORDS, expectedNumberOfBulks);
    final List<Record> records = new ArrayList<>();
    final Set<String> expectedUrls = new HashSet<>(Arrays.asList(urls));
    if (expectedNumberOfBulks > 0) {
      int recordCount = 0;
      for (final StoreObject bulk : bulks) {
        try (InputStream bulkStream = _objectStore.readObject(STORENAME, bulk.getId());
          BinaryObjectStreamIterator bulkReader = new BinaryObjectStreamIterator(bulkStream)) {
          while (bulkReader.hasNext()) {
            // -1 marks "no size attribute read yet"; all mapped size attributes must agree on the same value.
            int size = -1;
            final Record crawledRecord = bulkReader.next();
            System.out.println("Crawled Record: " + crawledRecord);
            assertTrue("Too many records", recordCount < urls.length);
            assertEquals("web", crawledRecord.getSource());
            final AnyMap metadata = crawledRecord.getMetadata();
            for (final String mappedUrlAttribute : _mapper.get(WebCrawlerConstants.ATTRIBUTE_URL)) {
              final String crawledUrl = metadata.getStringValue(mappedUrlAttribute);
              assertTrue("Unexpected URL in crawled record: " + crawledUrl, expectedUrls.contains(crawledUrl));
            }
            for (final String mappedSizeAttribute : _mapper.get(WebCrawlerConstants.ATTRIBUTE_SIZE)) {
              assertTrue(metadata.get(mappedSizeAttribute).isLong());
              if (size < 0) {
                size = metadata.getLongValue(mappedSizeAttribute).intValue();
              } else {
                assertEquals(size, metadata.getLongValue(mappedSizeAttribute).intValue());
              }
            }
            assertTrue(metadata.get(_mapper.get(WebCrawlerConstants.ATTRIBUTE_LASTMODIFIED).get(0)).isDateTime());
            assertTrue(metadata.containsKey(ImportingConstants.ATTRIBUTE_DELTA_HASH));
            final String mimetype =
              metadata.getStringValue(_mapper.get(WebCrawlerConstants.ATTRIBUTE_MIMETYPE).get(0));
            assertTrue(metadata.getStringValue(_mapper.get(WebCrawlerConstants.ATTRIBUTE_CONTENTTYPE).get(0))
              .startsWith(mimetype));
            if ("text/html".equals(mimetype)) {
              for (final String attachmentName : _mapper.get(WebCrawlerConstants.ATTACHMENT_CONTENT)) {
                assertTrue(crawledRecord.hasAttachment(attachmentName));
                // expected value (size from metadata) first, actual attachment length second.
                assertEquals(size, crawledRecord.getAttachmentAsBytes(attachmentName).length);
              }
            } else {
              for (final String attachmentName : _mapper.get(WebCrawlerConstants.ATTACHMENT_CONTENT)) {
                assertFalse(crawledRecord.hasAttachment(attachmentName));
              }
            }
            recordCount++;
            records.add(crawledRecord);
          }
        }
      }
      // expected value first, so the failure message reports expected/actual the right way round.
      assertEquals("Too few records.", urls.length, recordCount);
    }
    return records;
  }

}
