/*******************************************************************************
 * Copyright (c) 2008, 2011 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the
 * accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this
 * distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
 * 
 * Contributors: Andreas Weber (Attensity Europe GmbH) - initial API and implementation
 *******************************************************************************/
package org.eclipse.smila.importing.crawler.web.extractor;

import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;

import org.ccil.cowan.tagsoup.Parser;
import org.eclipse.smila.datamodel.AnyMap;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;

/**
 * {@link LinkExtractorHtml} implementation that uses the TagSoup SAX parser to collect links from HTML content.
 */
public class LinkExtractorHtmlSoup implements LinkExtractorHtml {

  /**
   * Parses the given stream with TagSoup and returns the link targets found in it.
   * 
   * @param input
   *          stream to parse.
   * @param parameters
   *          not evaluated by this implementation.
   * @return the raw attribute values of all collected links, in the order the elements were reported by the parser.
   *         Duplicates are kept and values are returned exactly as they appear in the attributes.
   * @throws Exception
   *           if configuring the parser or parsing the stream fails.
   */
  @Override
  public Collection<String> extractLinks(final InputStream input, final AnyMap parameters) throws Exception {
    final LinkContentHandler collector = new LinkContentHandler();
    final XMLReader reader = new Parser();
    reader.setContentHandler(collector);
    // the handler below matches on qName only, the parser's namespaces feature is switched off.
    reader.setFeature(Parser.namespacesFeature, false);
    final InputSource source = new InputSource(input);
    reader.parse(source);
    return collector._links;
  }

  /** SAX handler that records link targets while the document is being parsed. */
  private static class LinkContentHandler extends DefaultHandler {
    /** link targets in the order they were encountered. */
    private final Collection<String> _links = new ArrayList<String>();

    /**
     * Records the link attribute of the started element, if it is a link-carrying element and the attribute is set.
     */
    @Override
    public void startElement(final String uri, final String localName, final String qName, final Attributes atts)
      throws SAXException {
      final String target = findLinkTarget(qName, atts);
      if (target != null) {
        _links.add(target);
      }
      super.startElement(uri, localName, qName, atts);
    }

    /**
     * Selects the link attribute for an element: "href" for "a" elements, "src" for "frame" and "img" elements.
     * Element names are compared case-insensitively.
     * 
     * @return the attribute value, or null if the element is none of the above or the attribute is missing.
     */
    private static String findLinkTarget(final String qName, final Attributes atts) {
      if (qName.equalsIgnoreCase("a")) {
        return atts.getValue("href");
      }
      final boolean hasSourceAttribute = qName.equalsIgnoreCase("frame") || qName.equalsIgnoreCase("img");
      if (hasSourceAttribute) {
        return atts.getValue("src");
      }
      return null;
    }

  }
}
