/*******************************************************************************
 * Copyright (c) 2008, 2011 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the
 * accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this
 * distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
 * 
 * Contributors: Andreas Weber (Attensity Europe GmbH) - initial API and implementation
 *******************************************************************************/
package org.eclipse.smila.importing.crawler.web.extractor;

import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Collection;

import org.ccil.cowan.tagsoup.Parser;
import org.eclipse.smila.common.logging.MessageCollector;
import org.eclipse.smila.datamodel.AnyMap;
import org.eclipse.smila.importing.crawler.web.utils.UriHelper;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;

/**
 * {@link LinkExtractorHtml} implementation using the TagSoup SAX parser.
 * 
 * Links are collected from the "href" attribute of "a" elements and from the "src" attribute of "frame" and "img"
 * elements. If the document contains a "base" element, relative links found after it are made absolute against its
 * "href" value.
 */
public class LinkExtractorHtmlSoup implements LinkExtractorHtml {

  /**
   * Parses the given HTML stream and returns all links found in it, in document order (duplicates are kept).
   * 
   * @param input
   *          the HTML content to parse.
   * @param parameters
   *          not used by this implementation.
   * @param messageCollector
   *          receives a message if an {@link IOException} occurs while reading the input.
   * @return the links extracted from the document. If an {@link IOException} interrupted parsing, the links found up
   *         to that point are returned.
   * @throws Exception
   *           errors other than {@link IOException} raised while configuring or running the parser (e.g.
   *           {@link SAXException}) are passed on to the caller.
   */
  @Override
  public Collection<String> extractLinks(final InputStream input, final AnyMap parameters,
    final MessageCollector messageCollector) throws Exception {
    final LinkContentHandler linkHandler = new LinkContentHandler();
    final XMLReader parser = new Parser();
    parser.setContentHandler(linkHandler);
    // the handler matches elements by qName only, so namespace processing is switched off.
    parser.setFeature(Parser.namespacesFeature, false);
    try {
      parser.parse(new InputSource(input));
    } catch (final IOException e) {
      // best effort: report the problem and return whatever was collected so far.
      messageCollector.add("Error during parsing of document: " + e.getMessage());
    }
    return linkHandler._links;
  }

  /** SAX parser extension for extracting links. */
  private static class LinkContentHandler extends DefaultHandler {
    /** extracted links. */
    private final Collection<String> _links = new ArrayList<String>();

    /** for handling the html "base" tag: value of the most recently seen "base" element's href, may be null. */
    private String _base;

    /**
     * Collects the link of an "a", "frame" or "img" element, or remembers the href of a "base" element.
     * 
     * Attribute values are trimmed first, because HTML permits whitespace around URLs in these attributes. If a base
     * is known, links that are not valid URIs are skipped and relative links are made absolute. Without a base, links
     * are added as found, without any syntax check.
     */
    @Override
    public void startElement(final String uri, final String localName, final String qName, final Attributes atts)
      throws SAXException {
      String link = null;
      if (qName.equalsIgnoreCase("a")) {
        link = getTrimmedValue(atts, "href");
      } else if (qName.equalsIgnoreCase("frame") || qName.equalsIgnoreCase("img")) {
        link = getTrimmedValue(atts, "src");
      } else if (qName.equalsIgnoreCase("base")) {
        _base = getTrimmedValue(atts, "href");
      }
      if (link != null && !link.isEmpty()) {
        try {
          if (_base != null && !(new URI(link).isAbsolute())) {
            link = UriHelper.makeAbsolute(_base, link);
          }
          _links.add(link);
        } catch (final URISyntaxException e) {
          // wrong URI syntax -> ignore link
        }
      }
      super.startElement(uri, localName, qName, atts);
    }

    /**
     * @return the value of the named attribute with leading and trailing whitespace removed, or null if the attribute
     *         is not set.
     */
    private static String getTrimmedValue(final Attributes atts, final String name) {
      final String value = atts.getValue(name);
      return value == null ? null : value.trim();
    }
  }
}
