/*******************************************************************************
 * Copyright (c) 2008, 2011 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the
 * accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this
 * distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
 * 
 * Contributors: Juergen Schumacher (Attensity Europe GmbH) - initial API and implementation
 *******************************************************************************/

package org.eclipse.smila.importing.crawler.web;

import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

/**
 * Shared constants of the web crawler and its subcomponents: names of record attributes and attachments, names of
 * task parameters and their default values.
 */
public final class WebCrawlerConstants {

  /** utility holder: instantiation is always rejected. */
  private WebCrawlerConstants() {
    throw new UnsupportedOperationException();
  }

  // ----- attribute and attachment names -----

  /** attribute name: URL of the web resource. */
  public static final String ATTRIBUTE_URL = "httpUrl";

  /** attribute name: last-modified header as reported by the web server (if any). */
  public static final String ATTRIBUTE_LASTMODIFIED = "httpLastModified";

  /** attribute name: content-type of the web resource as reported by the web server (if any). */
  public static final String ATTRIBUTE_CONTENTTYPE = "httpContenttype";

  /** attribute name: mimetype of the web resource as reported by the web server (if any). */
  public static final String ATTRIBUTE_MIMETYPE = "httpMimetype";

  /** attribute name: charset of the web resource as reported by the web server (if any). */
  public static final String ATTRIBUTE_CHARSET = "httpCharset";

  /** attribute name: content-length of the web resource as reported by the web server (if any). */
  public static final String ATTRIBUTE_SIZE = "httpSize";

  /** attachment name: content of a web resource. */
  public static final String ATTACHMENT_CONTENT = "httpContent";

  /** internal attribute name, used for applying the max crawl depth. */
  public static final String ATTRIBUTE_CRAWL_DEPTH = "_crawlDepth";

  // ----- task parameters and defaults -----

  /** task parameter name: the start URL for crawling. */
  public static final String TASK_PARAM_START_URL = "startUrl";

  /** task parameter name: a long value giving the time in milliseconds to wait between http requests. */
  public static final String TASK_PARAM_WAIT_BETWEEN_REQUESTS = "waitBetweenRequests";

  /** task parameter name: the number of links to write to one bulk object. */
  public static final String TASK_PARAM_LINKS_PER_BULK = "linksPerBulk";

  /** value to use for the 'linksPerBulk' parameter by default. */
  public static final int DEFAULT_LINKS_PER_BULK = 10;

  /** user agent to use by default, if webcrawler.properties does not define anything valid. */
  public static final String DEFAULT_USERAGENT =
    "SMILA (http://wiki.eclipse.org/SMILA/UserAgent; smila-dev@eclipse.org)";

  /** task parameter name: how to handle links that cannot be fetched. */
  public static final String TASK_PARAM_LINK_ERROR_HANDLING = "linkErrorHandling";

  /** possible reactions to IO errors that occur when fetching links. */
  public enum ErrorHandling {
    /** finish the task as recoverable so that it can be retried. */
    RETRY,
    /** drop the record. */
    DROP
  }

  // ----- mapping support -----

  /** the property names the web ETL workers should support for mapping, as an unmodifiable set. */
  public static final Set<String> PROPERTY_NAMES = createPropertyNames();

  /** @return read-only set containing the names of the mappable attributes and of the content attachment. */
  private static Set<String> createPropertyNames() {
    final Set<String> names = new HashSet<String>();
    Collections.addAll(names,
      ATTACHMENT_CONTENT,
      ATTRIBUTE_CHARSET,
      ATTRIBUTE_CONTENTTYPE,
      ATTRIBUTE_LASTMODIFIED,
      ATTRIBUTE_MIMETYPE,
      ATTRIBUTE_SIZE,
      ATTRIBUTE_URL);
    return Collections.unmodifiableSet(names);
  }
}
