/*******************************************************************************
 * Copyright (c) 2008, 2012 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the
 * accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this
 * distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
 * 
 * Contributors: Juergen Schumacher (Empolis Information GmbH) - initial API and implementation
 *******************************************************************************/
package org.eclipse.smila.importing.crawler.web.test;

import java.io.InputStream;

import junit.framework.TestCase;

import org.eclipse.smila.importing.crawler.web.WebCrawlerConstants;
import org.eclipse.smila.importing.crawler.web.utils.RobotsTxt;
import org.eclipse.smila.utils.config.ConfigUtils;

/**
 * Tests for {@link RobotsTxt}: default and "forbidden" instances, instances parsed from the robots.txt sample files
 * located below {@code robotstxt/} in the test bundle configuration, and the binary serialization round trip.
 *
 * <p>
 * The repeated path checks are delegated to {@link #assertAllowed(RobotsTxt, String...)} and
 * {@link #assertDisallowed(RobotsTxt, String...)}, which name the offending path in the failure message.
 * </p>
 */
public class TestRobotsTxt extends TestCase {

  /** Paths probed by the all-or-nothing tests. Note that the last entry has no leading slash. */
  private static final String[] SAMPLE_PATHS = { "/", "/forum", "/index.html", "/search?q=answer", "test" };

  /** Paths probed by the tests that expect the same result for the forum and search sections. */
  private static final String[] SECTION_PATHS = { "/", "/forum", "/forum/thread/42", "/index.html", "/search",
    "/search?q=answer" };

  /** An instance created with the no-argument constructor must allow every sample path. */
  public void testNoRobotsTxt() throws Exception {
    assertAllowed(new RobotsTxt(), SAMPLE_PATHS);
  }

  /** An instance created with {@code new RobotsTxt(true)} must reject every sample path. */
  public void testForbiddenRobotsTxt() throws Exception {
    assertDisallowed(new RobotsTxt(true), SAMPLE_PATHS);
  }

  /** Parsing {@code empty.txt} for the default user agent must allow every sample path. */
  public void testEmptyRobotsTxt() throws Exception {
    try (InputStream stream = getRobotsTxtStream("empty.txt")) {
      assertAllowed(parseForDefaultAgent(stream), SAMPLE_PATHS);
    }
  }

  /** Parsing {@code allowAll.txt} for the default user agent must allow every sample path. */
  public void testAllowAllRobotsTxt() throws Exception {
    try (InputStream stream = getRobotsTxtStream("allowAll.txt")) {
      assertAllowed(parseForDefaultAgent(stream), SAMPLE_PATHS);
    }
  }

  /** Parsing {@code allowSmila.txt} for the default user agent must allow every sample path. */
  public void testAllowSmilaRobotsTxt() throws Exception {
    try (InputStream stream = getRobotsTxtStream("allowSmila.txt")) {
      assertAllowed(parseForDefaultAgent(stream), SAMPLE_PATHS);
    }
  }

  /** Parsing {@code disallowAll.txt} for the default user agent must reject every sample path. */
  public void testDisallowAllRobotsTxt() throws Exception {
    try (InputStream stream = getRobotsTxtStream("disallowAll.txt")) {
      assertDisallowed(parseForDefaultAgent(stream), SAMPLE_PATHS);
    }
  }

  /** Parsing {@code disallowSmila.txt} for the default user agent must reject every sample path. */
  public void testDisallowSmilaRobotsTxt() throws Exception {
    try (InputStream stream = getRobotsTxtStream("disallowSmila.txt")) {
      assertDisallowed(parseForDefaultAgent(stream), SAMPLE_PATHS);
    }
  }

  /** Parsing {@code disallowOther.txt} for the default user agent must allow every sample path. */
  public void testDisallowOtherRobotsTxt() throws Exception {
    try (InputStream stream = getRobotsTxtStream("disallowOther.txt")) {
      assertAllowed(parseForDefaultAgent(stream), SAMPLE_PATHS);
    }
  }

  /** Parsing {@code disallowOther.txt} for the user agent "Some other robot" must reject every sample path. */
  public void testDisallowOtherRobotsTxtAsOther() throws Exception {
    try (InputStream stream = getRobotsTxtStream("disallowOther.txt")) {
      assertDisallowed(new RobotsTxt("Some other robot", stream), SAMPLE_PATHS);
    }
  }

  /** {@code disallowPartsForAll.txt}, default user agent: forum paths and the search query are rejected. */
  public void testDisallowPartsForAll() throws Exception {
    try (InputStream stream = getRobotsTxtStream("disallowPartsForAll.txt")) {
      assertForumAndSearchQueryDisallowed(parseForDefaultAgent(stream));
    }
  }

  /** {@code disallowPartsForSome.txt}, default user agent: forum paths and the search query are rejected. */
  public void testDisallowPartsForSome() throws Exception {
    try (InputStream stream = getRobotsTxtStream("disallowPartsForSome.txt")) {
      assertForumAndSearchQueryDisallowed(parseForDefaultAgent(stream));
    }
  }

  /** {@code disallowPartsForSome.txt}, user agent "goodbot": every probed path is allowed. */
  public void testDisallowPartsForSomeAsGoodbot() throws Exception {
    try (InputStream stream = getRobotsTxtStream("disallowPartsForSome.txt")) {
      assertAllowed(new RobotsTxt("goodbot", stream), SECTION_PATHS);
    }
  }

  /** {@code disallowMultiple.txt}, default user agent: only the forum paths are rejected. */
  public void testDisallowMultipleAsSmila() throws Exception {
    try (InputStream stream = getRobotsTxtStream("disallowMultiple.txt")) {
      assertOnlyForumDisallowed(parseForDefaultAgent(stream));
    }
  }

  /** {@code disallowMultiple.txt}, user agent "badbot1": only the search query is rejected. */
  public void testDisallowMultipleAsBadbot1() throws Exception {
    try (InputStream stream = getRobotsTxtStream("disallowMultiple.txt")) {
      final RobotsTxt rules = new RobotsTxt("badbot1", stream);
      assertAllowed(rules, "/", "/forum", "/forum/thread/42", "/index.html", "/search");
      assertDisallowed(rules, "/search?q=answer");
    }
  }

  /** {@code disallowMultiple.txt}, user agent "goodbot": every probed path is allowed. */
  public void testDisallowMultipleAsGoodbot() throws Exception {
    try (InputStream stream = getRobotsTxtStream("disallowMultiple.txt")) {
      assertAllowed(new RobotsTxt("goodbot", stream), SECTION_PATHS);
    }
  }

  /**
   * {@code encoded.txt}, default user agent: the result must be the same whether the tilde in the path is written
   * literally, as {@code %7E} or as {@code %7e}.
   */
  public void testEncodedCharacters() throws Exception {
    try (InputStream stream = getRobotsTxtStream("encoded.txt")) {
      final RobotsTxt rules = parseForDefaultAgent(stream);
      assertAllowed(rules, "/~gandalf/spells", "/%7Egandalf/", "/%7egandalf");
      assertDisallowed(rules, "/~frodo/ring", "/%7Efrodo/", "/%7efrodo");
      assertDisallowed(rules, "/~bilbo/book", "/%7Ebilbo/", "/%7ebilbo");
      assertDisallowed(rules, "/~samweis/rope", "/%7Esamweis/", "/%7esamweis");
    }
  }

  /**
   * An instance rebuilt from {@link RobotsTxt#asBinary()} output must give the same answers that
   * {@link #testDisallowMultipleAsSmila()} expects from the freshly parsed instance.
   */
  public void testSerialization() throws Exception {
    try (InputStream stream = getRobotsTxtStream("disallowMultiple.txt")) {
      final byte[] binary = parseForDefaultAgent(stream).asBinary();
      assertOnlyForumDisallowed(new RobotsTxt(binary));
    }
  }

  /** A default instance must still allow every probed path after a binary round trip. */
  public void testSerializationOfEmpty() throws Exception {
    final byte[] binary = new RobotsTxt().asBinary();
    assertAllowed(new RobotsTxt(binary), SECTION_PATHS);
  }

  /**
   * Asserts that every given path is allowed, checking them in the given order.
   *
   * @param robotsTxt
   *          instance under test
   * @param paths
   *          paths expected to be allowed
   */
  private static void assertAllowed(final RobotsTxt robotsTxt, final String... paths) {
    for (final String path : paths) {
      assertTrue("expected path to be allowed: " + path, robotsTxt.isAllowed(path));
    }
  }

  /**
   * Asserts that every given path is rejected, checking them in the given order.
   *
   * @param robotsTxt
   *          instance under test
   * @param paths
   *          paths expected to be rejected
   */
  private static void assertDisallowed(final RobotsTxt robotsTxt, final String... paths) {
    for (final String path : paths) {
      assertFalse("expected path to be disallowed: " + path, robotsTxt.isAllowed(path));
    }
  }

  /** Expectation shared by the "disallow parts" tests: forum paths and the search query are rejected. */
  private static void assertForumAndSearchQueryDisallowed(final RobotsTxt robotsTxt) {
    assertAllowed(robotsTxt, "/");
    assertDisallowed(robotsTxt, "/forum", "/forum/thread/42");
    assertAllowed(robotsTxt, "/index.html", "/search");
    assertDisallowed(robotsTxt, "/search?q=answer");
  }

  /** Expectation shared by the "disallow multiple" and serialization tests: only forum paths are rejected. */
  private static void assertOnlyForumDisallowed(final RobotsTxt robotsTxt) {
    assertAllowed(robotsTxt, "/");
    assertDisallowed(robotsTxt, "/forum", "/forum/thread/42");
    assertAllowed(robotsTxt, "/index.html", "/search", "/search?q=answer");
  }

  /**
   * Creates a {@link RobotsTxt} from the stream for {@link WebCrawlerConstants#DEFAULT_USERAGENT}.
   *
   * @param stream
   *          robots.txt content; the caller remains responsible for closing it
   * @return the parsed instance
   * @throws Exception
   *           whatever the {@link RobotsTxt} constructor throws
   */
  private static RobotsTxt parseForDefaultAgent(final InputStream stream) throws Exception {
    return new RobotsTxt(WebCrawlerConstants.DEFAULT_USERAGENT, stream);
  }

  /**
   * Opens a robots.txt sample file from the {@code robotstxt/} folder of the test bundle configuration.
   *
   * @param filename
   *          file name relative to {@code robotstxt/}
   * @return the stream returned by {@link ConfigUtils#getConfigStream(String, String)}; callers close it
   */
  private InputStream getRobotsTxtStream(final String filename) {
    return ConfigUtils.getConfigStream(AllTests.BUNDLE_ID, "robotstxt/" + filename);
  }
}
