/*******************************************************************************
 * Copyright (c) 2008, 2012 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the
 * accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this
 * distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
 * 
 * Contributors: Juergen Schumacher (Empolis Information GmbH) - initial API and implementation
 *******************************************************************************/
package org.eclipse.smila.importing.crawler.web.utils;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

/**
 * parses a robots.txt file and checks if paths are allowed.
 * 
 * See [1] http://www.robotstxt.org/norobots-rfc.txt for syntax definition
 */
public class RobotsTxt {

  /** tag starting a line that names a user agent. Matched case-insensitively. */
  private static final String TAG_AGENTLINE = "User-agent:";

  /** tag starting a line that names a disallowed path prefix. Matched case-insensitively. */
  private static final String TAG_DISALLOWLINE = "Disallow:";

  /** name of the encoding used for reading the file, URL-decoding paths and (de)serialization. */
  private static final String ENCODING = "utf-8";

  /** charset object for {@link #ENCODING}, resolved once. */
  private static final Charset CHARSET = Charset.forName(ENCODING);

  /** separator written in front of each disallowed path in the serialized form. */
  private static final String SEPARATOR = "##";

  /** byte order mark that some editors put at the start of UTF-8 files. */
  private static final char BYTE_ORDER_MARK = '\uFEFF';

  /** URL-decoded path prefixes that must not be visited. */
  private final List<String> _disallows = new ArrayList<>();

  /** create empty instance allowing everything. */
  public RobotsTxt() {
    this(false);
  }

  /**
   * create empty instance allowing or forbidding everything.
   * 
   * @param disallowAll
   *          true to forbid every path, false to allow every path.
   */
  public RobotsTxt(final boolean disallowAll) {
    if (disallowAll) {
      _disallows.add("/");
    }
  }

  /**
   * create instance by reading "Disallow" entries from stream that apply to the given user-agent. Only the first
   * record that matches the user agent (either by "*" or because the agent value is contained in the given user agent
   * name, ignoring case) is used. Tags are recognized regardless of case and leading whitespace, a byte order mark at
   * the start of the file is skipped. The stream is not closed by this constructor.
   * 
   * @param userAgent
   *          name of the user agent to find the matching record for.
   * @param stream
   *          content of the robots.txt file, read as UTF-8.
   * @throws IOException
   *           error reading the stream
   */
  public RobotsTxt(final String userAgent, final InputStream stream) throws IOException {
    // the stream belongs to the caller, so the reader wrapping it is intentionally not closed here.
    final BufferedReader reader = new BufferedReader(new InputStreamReader(stream, CHARSET));
    String line;
    boolean firstLine = true;
    boolean foundAgentLine = false;
    boolean foundDisallowLines = false;
    while ((line = reader.readLine()) != null) {
      if (firstLine) {
        firstLine = false;
        if (!line.isEmpty() && line.charAt(0) == BYTE_ORDER_MARK) {
          line = line.substring(1); // otherwise the BOM would hide the first directive
        }
      }
      line = line.trim();
      if (startsWithTag(line, TAG_AGENTLINE)) {
        if (foundDisallowLines) {
          return; // record concerning our agent finished, we can stop parsing.
        }
        if (!foundAgentLine) {
          final String agent = getLineValue(line, TAG_AGENTLINE.length());
          // Locale.ROOT: the comparison must not depend on the default locale (e.g. Turkish dotless i).
          foundAgentLine =
            agent.equals("*") || userAgent.toLowerCase(Locale.ROOT).contains(agent.toLowerCase(Locale.ROOT));
        }
      } else if (foundAgentLine && startsWithTag(line, TAG_DISALLOWLINE)) {
        final String disallowPath = getLineValue(line, TAG_DISALLOWLINE.length());
        if (!disallowPath.isEmpty()) {
          _disallows.add(decodeDisallowPath(disallowPath));
        }
        foundDisallowLines = true; // an empty Disallow line still marks the record as ours
      }
    }
  }

  /**
   * create instance from serialized version, as created by {@link #asBinary()}.
   * 
   * @param serialized
   *          UTF-8 bytes of the disallowed paths, each one preceded by the separator.
   */
  public RobotsTxt(final byte[] serialized) {
    final String concatenatedDisallows = new String(serialized, CHARSET);
    final String[] split = concatenatedDisallows.split(SEPARATOR);
    addDisallows(split);
  }

  /**
   * create instance from given paths, used for tests.
   * 
   * @param disallows
   *          already URL-decoded path prefixes to forbid, empty strings are ignored.
   */
  public RobotsTxt(final String... disallows) {
    addDisallows(disallows);
  }

  /** add all non-empty paths to the list of disallowed path prefixes. */
  private void addDisallows(final String... disallows) {
    for (final String disallow : disallows) {
      if (!disallow.isEmpty()) {
        _disallows.add(disallow);
      }
    }
  }

  /** check if the line starts with the given tag, ignoring case as field names are case-insensitive. */
  private static boolean startsWithTag(final String line, final String tag) {
    return line.regionMatches(true, 0, tag, 0, tag.length());
  }

  /**
   * URL-decode a path read from a Disallow line. A path with a malformed percent escape is kept as it is, so that a
   * single broken line does not make the complete robots.txt file unusable.
   * 
   * @throws UnsupportedEncodingException
   *           the encoding is not supported by the platform.
   */
  private static String decodePath(final String path) throws UnsupportedEncodingException {
    return URLDecoder.decode(path, ENCODING);
  }

  /** see {@link #decodePath(String)}, but falls back to the raw path if it contains an invalid escape sequence. */
  private static String decodeDisallowPath(final String path) throws UnsupportedEncodingException {
    try {
      return decodePath(path);
    } catch (final IllegalArgumentException ignored) {
      // URLDecoder rejects incomplete or non-hex escapes; use the path literally instead of aborting the parse.
      return path;
    }
  }

  /**
   * get value part of a line after the tag, strip trailing comments and whitespace.
   * 
   * @param line
   *          complete line, starting with the tag.
   * @param offset
   *          length of the tag the line starts with.
   * @return trimmed value, empty if the line has no value or only a comment after the tag.
   */
  private String getLineValue(final String line, final int offset) {
    String value = line.substring(offset);
    final int commentStart = value.indexOf('#'); // [1] allows comments after path
    if (commentStart >= 0) { // >= 0: the comment may start directly after the tag
      value = value.substring(0, commentStart);
    }
    return value.trim();
  }

  /**
   * check if the path is allowed for the user agent in the parsed robots.txt file. The path is URL-decoded and a
   * leading "/" is added if missing, then it is compared against the disallowed path prefixes.
   * 
   * @param path
   *          path portion of the URL to check.
   * @return true if the user agent is allowed to see this path.
   * @throws IllegalArgumentException
   *           the path contains an invalid escape sequence or the encoding is not supported.
   */
  public boolean isAllowed(final String path) {
    try {
      String urlDecodedPath = decodePath(path);
      if (!urlDecodedPath.startsWith("/")) {
        urlDecodedPath = "/" + urlDecodedPath;
      }
      for (final String disallow : _disallows) {
        if (urlDecodedPath.startsWith(disallow)) {
          return false;
        }
      }
      return true;
    } catch (final UnsupportedEncodingException ex) {
      throw new IllegalArgumentException(ex);
    }
  }

  /**
   * serialized to byte array for cluster serialization.
   * 
   * @return UTF-8 bytes of all disallowed paths, each one preceded by the separator. Empty if everything is allowed.
   */
  public byte[] asBinary() {
    final StringBuilder builder = new StringBuilder();
    for (final String disallow : _disallows) {
      builder.append(SEPARATOR).append(disallow);
    }
    return builder.toString().getBytes(CHARSET);
  }

}
