/*********************************************************************************************************************
 * Copyright (c) 2008, 2013 Empolis Information Management GmbH and brox IT Solutions GmbH. All rights reserved.
 * This program and the accompanying materials are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
 *********************************************************************************************************************/
package org.eclipse.smila.importing.crawler.web.utils;

import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.Locale;

import org.apache.commons.lang.StringUtils;
import org.apache.http.client.utils.URIUtils;

/**
 * Helper class for working with URLs: making relative links absolute, extracting host information and normalizing URLs
 * so that equivalent links yield the same string.
 */
public final class UriHelper {
  private static final String SCHEME_HTTP = "http";

  private static final String SCHEME_HTTPS = "https";

  private static final int DEFAULT_HTTP_PORT = 80;

  private static final int DEFAULT_HTTPS_PORT = 443;

  /** port value meaning "no port specified" in {@link URI#getPort()} and the {@link URI} constructors. */
  private static final int NO_PORT = -1;

  /** private constructor to avoid instantiation. */
  private UriHelper() {
  }

  /**
   * Make an URI absolute by resolving it against a base URI.
   * 
   * @param baseUriString
   *          the base URI, expected to be absolute. It is only parsed if uriString is not absolute itself.
   * @param uriString
   *          the URI to make absolute.
   * @return uriString itself if it is already absolute, the URI resolved against the base URI otherwise, or 'null' if
   *         uriString is relative and the base URI is not absolute.
   * @throws URISyntaxException
   *           if one of the parsed strings is not a valid URI.
   */
  public static String makeAbsolute(final String baseUriString, final String uriString) throws URISyntaxException {
    final URI uri = new URI(uriString);
    if (uri.isAbsolute()) {
      return uri.toString();
    }
    final URI baseUri = new URI(baseUriString);
    if (!baseUri.isAbsolute()) {
      return null;
    }
    final URI absUri = URIUtils.resolve(baseUri, uri);
    return absUri.toString();
  }

  /**
   * @return host from given URL string.
   * @throws MalformedURLException
   *           if the string cannot be parsed as an URL.
   */
  public static String getHost(final String url) throws MalformedURLException {
    return new URL(url).getHost();
  }

  /**
   * @return "host:port" from given URL string, see {@link #getHostAndPort(URL)}.
   * @throws MalformedURLException
   *           if the string cannot be parsed as an URL.
   */
  public static String getHostAndPort(final String urlString) throws MalformedURLException {
    return getHostAndPort(new URL(urlString));
  }

  /**
   * @return "host:port" from given URL. The port is taken from {@link URL#getPort()} as is, i.e. it is "-1" if the URL
   *         does not contain an explicit port. The default port of the protocol is NOT substituted.
   */
  public static String getHostAndPort(final URL url) {
    return url.getHost() + ":" + url.getPort();
  }

  /**
   * @param host
   *          host name, must not be null.
   * @return domain from given host string. This is a simple implementation that just cuts off the first part of the
   *         host name, if at least two parts remain. This won't work for sth. like 'bbc.co.uk'. (A more sophisticated
   *         solution would have to use something like a TLD list, see https://wiki.mozilla.org/TLD_List)
   */
  public static String getDomain(final String host) {
    // Only cut off first part of host, if remaining string has two domain parts:
    // www.foo.com -> foo.com
    // foo.com -> foo.com
    final int firstDot = host.indexOf('.');
    final int lastDot = host.lastIndexOf('.');
    if (firstDot != lastDot) {
      return host.substring(firstDot + 1);
    }
    return host;
  }

  /**
   * URI normalization:
   * <ul>
   * <li>make relative URLs absolute wrt. to context (if given)</li>
   * <li>normalize path (host/path/../path2 -> host/path2), use "/" if path is empty</li>
   * <li>convert scheme and host to lower case (HTTP -> http, WWW.Host.de -> www.host.de)</li>
   * <li>remove fragment parts (...#fragment)</li>
   * <li>remove default ports (http://host:80 -> http://host, https://host:443 -> https://host)</li>
   * <li>filter out 'opaque' URIs (e.g. mailto:...@...)</li>
   * </ul>
   * 
   * @param context
   *          URL used as base for relative URLs, may be null or empty.
   * @param urlString
   *          the URL to normalize.
   * @return normalized URL, or 'null' if urlString is null, opaque or couldn't be parsed.
   * @throws URISyntaxException
   *           if the normalized parts cannot be assembled to a valid URI.
   */
  public static String normalizeUrl(final String context, final String urlString) throws URISyntaxException {
    if (urlString == null) {
      return null;
    }
    final String urlWithoutFragment = removeFragment(urlString);
    final URI uri = createUri(context, urlWithoutFragment);

    // filter out opaque URIs, e.g.: javascript:void(0), mailto:...@...
    if (uri == null || uri.isOpaque()) {
      return null;
    }
    // normalize URI path (remove '/..' etc.)
    final URI u = uri.normalize();
    // extract all parts of URI and normalize them.
    // hierarchical URI: [scheme:][//[user-info@]host[:port]][path][?query][#fragment]
    String scheme = u.getScheme();
    if (scheme != null) {
      // Locale.ROOT: the result must not depend on the default locale (e.g. Turkish 'I' -> dotless 'i')
      scheme = scheme.toLowerCase(Locale.ROOT);
    }
    String path = u.getPath(); // this will return unescaped path
    if (path.isEmpty()) {
      path = "/";
    }
    final String query = u.getQuery(); // this will return unescaped query

    final String host = u.getHost();
    if (host == null) {
      final String authority = u.getAuthority();
      if (authority != null) {
        // The authority could not be parsed as [user-info@]host[:port] (e.g. host name containing '_'), so host,
        // user-info and port are undefined. Keep the authority as a whole instead of silently dropping it, which
        // would yield something like "http:/path".
        return new URI(scheme, lowerCaseHostPart(authority), path, query, null).toString();
      }
    }
    final String normalizedHost = host == null ? null : host.toLowerCase(Locale.ROOT);
    final String userInfo = u.getUserInfo();
    int port = u.getPort();
    if (SCHEME_HTTP.equals(scheme) && port == DEFAULT_HTTP_PORT) {
      port = NO_PORT;
    }
    if (SCHEME_HTTPS.equals(scheme) && port == DEFAULT_HTTPS_PORT) {
      port = NO_PORT;
    }
    // this will escape the URI (last parameter 'null' -> no fragments)
    return new URI(scheme, userInfo, normalizedHost, port, path, query, null).toString();
  }

  /** @return authority with everything behind the last '@' in lower case, a leading user-info part is not changed. */
  private static String lowerCaseHostPart(final String authority) {
    final int hostStart = authority.lastIndexOf('@') + 1; // 0 if there is no '@'
    return authority.substring(0, hostStart) + authority.substring(hostStart).toLowerCase(Locale.ROOT);
  }

  /**
   * @return urlString cut off at the first '#'. A string starting with '#' is returned unchanged; its fragment is
   *         dropped later when {@link #normalizeUrl(String, String)} reassembles the URI without fragment.
   */
  private static String removeFragment(final String urlString) {
    final int fragmentStart = urlString.indexOf('#');
    if (fragmentStart > 0) {
      return urlString.substring(0, fragmentStart);
    }
    return urlString;
  }

  /**
   * Parse the URL string and resolve it against the context, if it is relative and a usable context is given.
   * 
   * @return the parsed URI, or 'null' if the string could be parsed neither as URI nor as URL.
   * @throws URISyntaxException
   *           if the string could be parsed as URL only, but the parts of the URL do not form a valid URI.
   */
  private static URI createUri(final String context, final String urlWithoutFragment) throws URISyntaxException {
    URL baseUrl = null;
    if (!StringUtils.isEmpty(context)) {
      try {
        baseUrl = new URL(context);
      } catch (final MalformedURLException ignored) {
        // best effort: an unusable context is treated like no context at all.
        baseUrl = null;
      }
    }
    URI uri = null;
    try {
      uri = new URI(urlWithoutFragment);
      if (!uri.isAbsolute() && baseUrl != null) {
        uri = URIUtils.resolve(baseUrl.toURI(), uri);
      }
    } catch (final URISyntaxException e) {
      // urlString (or context) may be not escaped, try second way to create URI object
      try {
        final URL url = new URL(baseUrl, urlWithoutFragment);
        // multi-argument constructor escapes illegal characters; last parameter 'null' -> no fragments
        uri = new URI(url.getProtocol(), url.getAuthority(), url.getPath(), url.getQuery(), null);
      } catch (final MalformedURLException ignored) {
        // best effort: not parseable as URL either, the caller gets 'null'.
        uri = null;
      }
    }
    return uri;
  }
}
