/*******************************************************************************
 * Copyright (c) 2008, 2011 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the
 * accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this
 * distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
 * 
 * Contributors: Andreas Weber (Attensity Europe GmbH) - initial API and implementation
 *******************************************************************************/
package org.eclipse.smila.importing.test;

import java.io.InputStream;
import java.net.URL;
import java.util.Collection;

import org.apache.commons.io.IOUtils;
import org.eclipse.smila.datamodel.AnyMap;
import org.eclipse.smila.datamodel.Record;
import org.eclipse.smila.datamodel.ipc.BinaryObjectStreamIterator;
import org.eclipse.smila.http.server.HttpService;
import org.eclipse.smila.importing.ImportingConstants;
import org.eclipse.smila.importing.crawler.web.WebCrawlerConstants;
import org.eclipse.smila.importing.crawler.web.WebCrawlerWorker;
import org.eclipse.smila.importing.crawler.web.WebFetcherWorker;
import org.eclipse.smila.jobmanager.definitions.JobDefinition;
import org.eclipse.smila.jobmanager.definitions.JobRunMode;
import org.eclipse.smila.objectstore.ObjectStoreException;
import org.eclipse.smila.objectstore.StoreObject;
import org.eclipse.smila.taskworker.Worker;
import org.eclipse.smila.utils.service.ServiceUtils;
import org.osgi.framework.ServiceReference;

/**
 * Integration test for the web crawling workers: checks that the crawler and fetcher workers are registered, that the
 * local test web server answers, and that crawling the test sites below {@link #BASE_URL} yields the expected number
 * of records in the "added" bulks.
 */
public class TestWebCrawling extends AImportingIntegrationTest {

  /** root URL of the test web server, all crawled test sites are located below it. */
  private static final String BASE_URL = "http://localhost:8765/web/";

  /** name of the crawl job created from the job template named {@code JOBNAME_CRAWL_WEB + "Template"}. */
  private static final String JOBNAME_CRAWL_WEB = "crawlWeb";

  /** data source ID expected in the delta service and in each crawled record. */
  private static final String DATA_SOURCE = "web";

  /** check that a WebCrawlerWorker and a WebFetcherWorker are registered under their expected names. */
  public void testWorkers() throws Exception {
    final ServiceReference[] services = ServiceUtils.getServiceReferences(Worker.class);
    // an OSGi service lookup may yield null instead of an empty array if nothing matches,
    // so guard against that to get a proper test failure instead of a NullPointerException.
    assertNotNull("no worker services started.", services);
    assertTrue("no worker services started.", services.length > 0);
    boolean foundCrawler = false;
    boolean foundFetcher = false;
    for (final ServiceReference service : services) {
      final Worker worker = ServiceUtils.getService(service, Worker.class);
      if (worker instanceof WebCrawlerWorker) {
        assertEquals("webCrawler", worker.getName());
        foundCrawler = true;
      }
      if (worker instanceof WebFetcherWorker) {
        assertEquals("webFetcher", worker.getName());
        foundFetcher = true;
      }
    }
    assertTrue("WebCrawlerWorker not found", foundCrawler);
    assertTrue("WebFetcherWorker not found", foundFetcher);
  }

  /** just check if the test web server has been started correctly. */
  public void testWebServerAvailable() throws Exception {
    getService(HttpService.class);
    final URL testUrl = new URL(BASE_URL);
    final InputStream testStream = testUrl.openStream();
    try {
      final byte[] content = IOUtils.toByteArray(testStream);
      assertTrue("test web server delivered no content for " + BASE_URL, content.length > 0);
    } finally {
      IOUtils.closeQuietly(testStream);
    }
  }

  /** crawl the "simple" test site and check the number of added records. */
  public void testCrawlSimple() throws Exception {
    crawlUrl(BASE_URL + "simple/");
    assertTrue("delta service does not know source '" + DATA_SOURCE + "'",
      _deltaService.getSourceIds().contains(DATA_SOURCE));
    final int expectedRecordCount = 6;
    checkAddedBulks(expectedRecordCount);
  }

  /** crawl the "loops" test site and check the number of added records. */
  public void testCrawlWithCycles() throws Exception {
    crawlUrl(BASE_URL + "loops/");
    assertTrue("delta service does not know source '" + DATA_SOURCE + "'",
      _deltaService.getSourceIds().contains(DATA_SOURCE));
    final int expectedRecordCount = 6;
    checkAddedBulks(expectedRecordCount);
  }

  /** crawl the "nonhtml" test site and check the number of added records. */
  public void testCrawlWithNonHtml() throws Exception {
    crawlUrl(BASE_URL + "nonhtml/");
    assertTrue("delta service does not know source '" + DATA_SOURCE + "'",
      _deltaService.getSourceIds().contains(DATA_SOURCE));
    final int expectedRecordCount = 3;
    checkAddedBulks(expectedRecordCount);
  }

  /**
   * ad-hoc crawl of a fixed external host with a longer wait time than the regular tests. NOTE(review): presumably
   * the method name deliberately lacks the "test" prefix so that it is not executed automatically - confirm.
   */
  public void notAutomatic_testAdhoc() throws Exception {
    final String crawlJobId = startWebCrawlerJob("http://empgt-dev18/IASDoc/index.html");
    waitForJobRunCompleted(JOBNAME_CRAWL_WEB, crawlJobId, 60000);
    _bulkbuilder.commitJob(JOBNAME_BUILDBULKS);
    final int expectedRecordCount = 176;
    checkAddedBulks(expectedRecordCount);
  }

  /**
   * start a crawl job for the given URL, wait for the job run to complete and commit the bulkbuilder job afterwards.
   * 
   * @param url
   *          start URL of the crawl.
   */
  private void crawlUrl(final String url) throws Exception {
    final String crawlJobId = startWebCrawlerJob(url);
    waitForJobRunCompleted(JOBNAME_CRAWL_WEB, crawlJobId, 15000);
    _bulkbuilder.commitJob(JOBNAME_BUILDBULKS);
  }

  /**
   * create the crawl job from the job template with the given start URL and start it in run-once mode.
   * 
   * @param url
   *          value for the "startUrl" job parameter.
   * @return the value returned by the job run engine on starting the job.
   */
  private String startWebCrawlerJob(final String url) throws Exception {
    final String templateName = JOBNAME_CRAWL_WEB + "Template";
    final JobDefinition jobTemplate = _defPersistence.getJob(templateName);
    // fail with a clear message instead of a NullPointerException if the test configuration is incomplete.
    assertNotNull("job template '" + templateName + "' not found", jobTemplate);
    final AnyMap jobAny = jobTemplate.toAny(false);
    jobAny.put("name", JOBNAME_CRAWL_WEB);
    final AnyMap parameters = jobAny.getMap("parameters");
    assertNotNull("job template '" + templateName + "' has no parameters", parameters);
    parameters.put("startUrl", url);
    final JobDefinition job = new JobDefinition(jobAny);
    _defPersistence.addJob(job);
    return _jobRunEngine.startJob(JOBNAME_CRAWL_WEB, JobRunMode.RUNONCE);
  }

  /**
   * check that the bulks in the "added" bucket contain exactly the expected number of valid records.
   * 
   * @param expectedRecordCount
   *          expected total number of records. If 0, the bucket is expected to contain no bulks at all.
   * @throws ObjectStoreException
   *           declared for errors on accessing the object store.
   */
  private void checkAddedBulks(final int expectedRecordCount) throws ObjectStoreException, Exception {
    final Collection<StoreObject> bulks = _objectStore.getStoreObjectInfos(STORENAME_BULKS, BUCKET_ADDED);
    assertNotNull("no bulk infos returned by object store", bulks);
    if (expectedRecordCount == 0) {
      assertTrue("expected no added bulks, but found " + bulks.size(), bulks.isEmpty());
    } else {
      assertEquals("wrong number of added records", expectedRecordCount, checkRecords(bulks, false));
    }
  }

  /**
   * read all records from the given bulks and check that each one has the metadata and attachment expected from the
   * web crawler.
   * 
   * @param bulks
   *          bulk objects to read from the bulk store.
   * @param update
   *          if true, each record must have the update attribute set to true, else it must not have this attribute.
   * @return total number of records found in the bulks.
   */
  private int checkRecords(final Collection<StoreObject> bulks, final boolean update) throws Exception {
    int recordCount = 0;
    for (final StoreObject bulk : bulks) {
      final InputStream bulkStream = _objectStore.readObject(STORENAME_BULKS, bulk.getId());
      try {
        final BinaryObjectStreamIterator records = new BinaryObjectStreamIterator(bulkStream);
        while (records.hasNext()) {
          final Record record = records.next();
          assertNotNull("bulk " + bulk.getId() + " contains a null record", record);
          recordCount++;
          assertNotNull("record without ID in bulk " + bulk.getId(), record.getId());
          assertEquals(DATA_SOURCE, record.getSource());
          final AnyMap metadata = record.getMetadata();
          if (update) {
            // compare via equals() to avoid a NullPointerException from unboxing if the attribute is missing.
            assertTrue("record " + record.getId() + " is not marked as update",
              Boolean.TRUE.equals(metadata.getBooleanValue(ImportingConstants.ATTRIBUTE_UPDATE)));
          } else {
            assertFalse("record " + record.getId() + " is unexpectedly marked as update",
              metadata.containsKey(ImportingConstants.ATTRIBUTE_UPDATE));
          }
          assertHasAttribute(record, ImportingConstants.ATTRIBUTE_DELTA_HASH);
          assertHasAttribute(record, WebCrawlerConstants.ATTRIBUTE_URL);
          // check presence first so that a missing attribute is reported by name, not as a NullPointerException.
          assertHasAttribute(record, WebCrawlerConstants.ATTRIBUTE_LASTMODIFIED);
          assertTrue("last-modified attribute of record " + record.getId() + " is not a datetime value",
            metadata.get(WebCrawlerConstants.ATTRIBUTE_LASTMODIFIED).isDateTime());
          // WebCrawlerConstants.ATTRIBUTE_CHARSET is deliberately not checked: not reported by our webserver.
          assertHasAttribute(record, WebCrawlerConstants.ATTRIBUTE_MIMETYPE);
          assertHasAttribute(record, WebCrawlerConstants.ATTRIBUTE_CONTENTTYPE);
          assertHasAttribute(record, WebCrawlerConstants.ATTRIBUTE_SIZE);
          assertTrue("size attribute of record " + record.getId() + " is not a long value",
            metadata.get(WebCrawlerConstants.ATTRIBUTE_SIZE).isLong());
          assertTrue("record " + record.getId() + " has no content attachment",
            record.hasAttachment(WebCrawlerConstants.ATTACHMENT_CONTENT));
        }
      } finally {
        IOUtils.closeQuietly(bulkStream);
      }
    }
    return recordCount;
  }

  /**
   * assert that the metadata of the record contains the given attribute, naming record and attribute on failure.
   * 
   * @param record
   *          record to check.
   * @param attributeName
   *          metadata key that must be present.
   */
  private void assertHasAttribute(final Record record, final String attributeName) {
    assertTrue("record " + record.getId() + " lacks metadata attribute '" + attributeName + "'", record.getMetadata()
      .containsKey(attributeName));
  }

}
