package org.eclipse.smila.importing.crawler.web.utils;

import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.Locale;

import org.apache.http.client.utils.URIUtils;

/** helper class for working with URLs: resolving relative references and normalizing URL strings. */
public class UriHelper {

  /** default port of the 'http' scheme. */
  private static final int HTTP_DEFAULT_PORT = 80;

  /** default port of the 'https' scheme. */
  private static final int HTTPS_DEFAULT_PORT = 443;

  /**
   * Make a URI absolute by resolving it against a base URI.
   * 
   * @param baseUriString
   *          the base URI, expected to be absolute. It is only parsed if uriString is not absolute itself.
   * @param uriString
   *          the URI to make absolute.
   * @return uriString (as parsed by {@link URI}) if it is already absolute, else the result of resolving it against
   *         the base URI. 'null' if uriString is relative and the base URI is not absolute.
   * @throws URISyntaxException
   *           if one of the strings that had to be parsed is not a valid URI.
   */
  public static String makeAbsolute(final String baseUriString, final String uriString) throws URISyntaxException {
    final URI uri = new URI(uriString);
    if (uri.isAbsolute()) {
      return uri.toString();
    }
    final URI baseUri = new URI(baseUriString);
    if (!baseUri.isAbsolute()) {
      // a relative reference cannot be resolved against a relative base
      return null;
    }
    final URI absUri = URIUtils.resolve(baseUri, uri);
    return absUri.toString();
  }

  /**
   * URI normalization:
   * <ul>
   * <li>normalize path ({@code host/path/../path2 -> host/path2})</li>
   * <li>convert scheme and host to lower case, independent of the JVM's default locale ({@code HTTP -> http},
   * {@code WWW.Host.de -> www.host.de})</li>
   * <li>remove fragment parts ({@code ...#fragment})</li>
   * <li>remove the port if it is the default port of the scheme ({@code http://host:80 -> http://host},
   * {@code https://host:443 -> https://host}). Other ports are kept, because removing them would change the
   * endpoint the URL refers to.</li>
   * <li>filter out 'opaque' URIs (e.g. {@code mailto:...@...})</li>
   * </ul>
   * 
   * @param urlString
   *          the URL to normalize, may be unescaped.
   * @return the normalized (and escaped) URL, or 'null' if urlString couldn't be normalized: it is null, opaque, or
   *         can be parsed neither as {@link URI} nor as {@link URL}.
   * @throws URISyntaxException
   *           if the normalized parts cannot be put together to a valid URI.
   */
  public static String normalizeUrl(final String urlString) throws URISyntaxException {
    if (urlString == null) {
      return null;
    }
    final URI uri = parseLenient(urlString);
    // filter out opaque URIs, e.g.: javascript:void(0), mailto:...@...
    if (uri == null || uri.isOpaque()) {
      return null;
    }

    // normalize URI path (remove '/..' etc.)
    final URI u = uri.normalize();

    // extract all parts of URI and normalize them.
    // hierarchical URI: [scheme:][//[user-info@]host[:port]][path][?query][#fragment]
    String scheme = u.getScheme();
    if (scheme != null) {
      // Locale.ROOT: the default locale must not influence the result (e.g. Turkish 'I' -> dotless 'i')
      scheme = scheme.toLowerCase(Locale.ROOT);
    }
    final String path = u.getPath(); // this will return unescaped path
    final String query = u.getQuery(); // this will return unescaped query

    String host = u.getHost();
    if (host == null && u.getAuthority() != null) {
      // The authority could not be split into user-info, host and port (e.g. host name contains '_').
      // Keep it as a whole instead of silently dropping it, which would yield something like "http:/path".
      return new URI(scheme, u.getAuthority(), path, query, null).toString(); // null -> no fragments
    }
    if (host != null) {
      host = host.toLowerCase(Locale.ROOT);
    }
    final int port = removeDefaultPort(scheme, u.getPort());

    // this will escape the URI
    return new URI(scheme, u.getUserInfo(), host, port, path, query, null).toString(); // null -> no fragments
  }

  /**
   * parse urlString as URI. If this fails because urlString is not escaped, try again by parsing it as URL.
   * 
   * @return parsed URI, or 'null' if urlString could not be parsed as URL either.
   */
  private static URI parseLenient(final String urlString) throws URISyntaxException {
    try {
      return new URI(urlString);
    } catch (final URISyntaxException e) {
      // urlString may be not escaped, try second way to create URI object
      try {
        final URL url = new URL(urlString); // works for unescaped urlString, doesn't work for relative urlString
        return new URI(url.getProtocol(), url.getAuthority(), url.getPath(), url.getQuery(), null); // no fragment
      } catch (final MalformedURLException me) {
        return null;
      }
    }
  }

  /** @return -1 (no port) if port is the default port of the given lower case scheme, else the port itself. */
  private static int removeDefaultPort(final String scheme, final int port) {
    if (port == HTTP_DEFAULT_PORT && "http".equals(scheme)) {
      return -1;
    }
    if (port == HTTPS_DEFAULT_PORT && "https".equals(scheme)) {
      return -1;
    }
    return port;
  }
}
