/*******************************************************************************
 * Copyright (c) 2008, 2011 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the
 * accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this
 * distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
 * 
 * Contributors: Andreas Weber (Attensity Europe GmbH) - initial API and implementation
 *******************************************************************************/
package org.eclipse.smila.importing.crawler.web.extractor;

import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;

import org.apache.xerces.xni.Augmentations;
import org.apache.xerces.xni.QName;
import org.apache.xerces.xni.XMLAttributes;
import org.apache.xerces.xni.parser.XMLInputSource;
import org.apache.xerces.xni.parser.XMLParserConfiguration;
import org.cyberneko.html.HTMLConfiguration;
import org.cyberneko.html.filters.DefaultFilter;
import org.eclipse.smila.datamodel.AnyMap;
import org.eclipse.smila.importing.util.MessageCollector;

/**
 * {@link LinkExtractorHtml} implementation using NekoHTML.
 * <p>
 * The HTML input is pushed through a NekoHTML {@link HTMLConfiguration} whose document handler is a small XNI filter
 * that records the {@code href} attribute of {@code a} elements and the {@code src} attribute of {@code frame} and
 * {@code img} elements.
 */
public class LinkExtractorHtmlNeko implements LinkExtractorHtml {

  /**
   * Parses the given HTML stream and collects the link targets found in it.
   * 
   * @param input
   *          stream delivering the HTML content to parse.
   * @param parameters
   *          not used by this implementation.
   * @param messageCollector
   *          not used by this implementation.
   * @return the attribute values of all detected links, in the order in which the parser reported the elements. Values
   *         are returned exactly as found in the attributes, duplicates are not removed.
   * @throws Exception
   *           if the parser fails to read or process the input.
   */
  @Override
  public Collection<String> extractLinks(final InputStream input, final AnyMap parameters,
    final MessageCollector messageCollector) throws Exception {
    final LinkFilter linkFilter = new LinkFilter();
    final XMLParserConfiguration parser = new HTMLConfiguration();
    parser.setDocumentHandler(linkFilter);
    final XMLInputSource source = new XMLInputSource(null, null, null, input, null);
    parser.parse(source);
    return linkFilter._links;
  }

  /** XNI document filter that records link attributes of the elements passing through it. */
  private static class LinkFilter extends DefaultFilter {
    /** extracted links. */
    private final Collection<String> _links = new ArrayList<String>();

    /** Inspects a regular start tag for a link, then forwards the event. */
    @Override
    public void startElement(final QName element, final XMLAttributes attrs, final Augmentations augs) {
      collectLink(element, attrs);
      super.startElement(element, attrs, augs);
    }

    /**
     * Inspects an empty element for a link, then forwards the event.
     * <p>
     * Elements without content (e.g. {@code <img>} or {@code <frame>}) may be reported by the parser pipeline via this
     * callback instead of {@link #startElement}, so links must be collected here as well or they would be missed.
     */
    @Override
    public void emptyElement(final QName element, final XMLAttributes attrs, final Augmentations augs) {
      collectLink(element, attrs);
      super.emptyElement(element, attrs, augs);
    }

    /**
     * Adds the link attribute of the element to the result, if the element is one of the supported link elements and
     * the attribute is present.
     */
    private void collectLink(final QName element, final XMLAttributes attrs) {
      if (attrs == null) {
        // nothing to look up without attributes.
        return;
      }
      // element names are compared case-insensitively, the parser may report them in upper or lower case.
      final String name = element.rawname;
      String link = null;
      if ("a".equalsIgnoreCase(name)) {
        link = attrs.getValue("href");
      } else if ("frame".equalsIgnoreCase(name) || "img".equalsIgnoreCase(name)) {
        link = attrs.getValue("src");
      }
      if (link != null) {
        _links.add(link);
      }
    }
  }
}
