/*
 * Decompiled with CFR 0.152.
 */
package org.eclipse.smila.importing.crawler.web.filter;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;
import org.eclipse.smila.datamodel.AnyMap;
import org.eclipse.smila.datamodel.Record;
import org.eclipse.smila.importing.crawler.web.LinkFilter;
import org.eclipse.smila.importing.crawler.web.WebCrawlerException;
import org.eclipse.smila.importing.crawler.web.filter.FilterConfiguration;
import org.eclipse.smila.taskworker.TaskLog;

/**
 * Link filter that keeps only links which pass a few simple URL checks.
 *
 * <p>For every extracted link the fragment part (everything from the first {@code #} on) is
 * removed from its {@code http.url} attribute. The link is then kept only if its URL
 * <ul>
 * <li>was not already encountered earlier in the same call,</li>
 * <li>does not contain a {@code ?} after its first character,</li>
 * <li>starts with the configured URL prefix, if one is configured, and</li>
 * <li>has a host equal (ignoring case) to the configured start host.</li>
 * </ul>
 */
public class SimpleLinkFilter
implements LinkFilter {
    /** Name of the metadata attribute holding the URL of a link record. */
    private static final String URL_ATTRIBUTE = "http.url";

    /** Separator between a URL and its fragment. */
    private static final char FRAGMENT_SEPARATOR = '#';

    /** Separator between a URL and its query part. */
    private static final char QUERY_SEPARATOR = '?';

    /**
     * Filters the extracted links, keeping the accepted ones in their original order.
     *
     * <p>As a side effect, the {@code http.url} attribute of a link record is overwritten with
     * the fragment-free URL when a fragment was stripped.
     *
     * @param extractedLinks link records to check
     * @param sourceLink record of the page the links were extracted from (not used by this filter)
     * @param parameters parameters from which the {@link FilterConfiguration} is created
     * @param taskLog log that receives messages about links that could not be processed
     * @return a new collection containing the accepted link records
     * @throws WebCrawlerException declared by the {@link LinkFilter} contract
     */
    @Override
    public Collection<Record> filterLinks(Collection<Record> extractedLinks, Record sourceLink, AnyMap parameters, TaskLog taskLog) throws WebCrawlerException {
        Set<String> seenUrls = new HashSet<String>();
        Collection<Record> filteredLinks = new ArrayList<Record>(extractedLinks.size());
        FilterConfiguration configuration = new FilterConfiguration(parameters, taskLog);
        for (Record link : extractedLinks) {
            String url = this.normalizeUrl(link);
            if (url == null) {
                // A record without a URL cannot be checked; skip it instead of failing the
                // whole call with a NullPointerException.
                taskLog.info("Skipping link without " + URL_ATTRIBUTE + " attribute.");
                continue;
            }
            if (this.isLinkAllowed(url, seenUrls, configuration, taskLog)) {
                filteredLinks.add(link);
            }
        }
        return filteredLinks;
    }

    /**
     * Reads the URL of a link record and strips its fragment part, if any.
     *
     * <p>When a fragment is stripped, the shortened URL is also written back to the record.
     * A {@code #} at the very first position is left alone.
     *
     * @param link link record to read the URL from
     * @return the URL without fragment, or {@code null} if the record has no URL value
     */
    private String normalizeUrl(Record link) {
        String url = link.getMetadata().getStringValue(URL_ATTRIBUTE);
        if (url == null) {
            return null;
        }
        int fragmentStart = url.indexOf(FRAGMENT_SEPARATOR);
        if (fragmentStart > 0) {
            String baseUrl = url.substring(0, fragmentStart);
            link.getMetadata().put(URL_ATTRIBUTE, baseUrl);
            return baseUrl;
        }
        return url;
    }

    /**
     * Decides whether a normalized URL passes the filter.
     *
     * <p>The URL is always recorded in {@code links}, even if it is rejected afterwards, so a
     * later occurrence of the same URL is rejected as a duplicate without further checks.
     *
     * @param url normalized URL to check, must not be {@code null}
     * @param links URLs already seen in the current call; the given URL is added to it
     * @param configuration provides the optional URL prefix and the start host
     * @param log log used to report URLs whose host cannot be determined
     * @return {@code true} if the URL is new and passes all checks
     */
    private boolean isLinkAllowed(String url, Set<String> links, FilterConfiguration configuration, TaskLog log) {
        if (!links.add(url)) {
            // Duplicate of a URL seen earlier in this call.
            return false;
        }
        if (url.indexOf(QUERY_SEPARATOR) > 0) {
            return false;
        }
        String urlPrefix = configuration.getUrlPrefix();
        if (urlPrefix != null && !url.startsWith(urlPrefix)) {
            return false;
        }
        String linkHost = SimpleLinkFilter.getHost(url, log);
        return linkHost != null && linkHost.equalsIgnoreCase(configuration.getStartHost());
    }

    /**
     * Extracts the host name from a URL string.
     *
     * @param urlString URL to parse
     * @param log log that receives an info message if the URL cannot be parsed
     * @return the host of the URL, or {@code null} if the string is not a well-formed URL
     */
    protected static String getHost(String urlString, TaskLog log) {
        try {
            return new URL(urlString).getHost();
        }
        catch (MalformedURLException ex) {
            log.info("Failed to extract hostname from " + urlString + ": " + ex.toString());
            return null;
        }
    }
}

