/*
 * Decompiled with CFR 0.152.
 */
package org.eclipse.smila.importing.crawler.web.filter;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.eclipse.smila.datamodel.Record;
import org.eclipse.smila.importing.crawler.web.LinkFilter;
import org.eclipse.smila.importing.crawler.web.WebCrawlerException;
import org.eclipse.smila.importing.crawler.web.WebCrawlingContext;
import org.eclipse.smila.importing.crawler.web.filter.FilterConfiguration;
import org.eclipse.smila.importing.crawler.web.utils.RobotsTxt;
import org.eclipse.smila.importing.crawler.web.utils.UriHelper;
import org.eclipse.smila.jobmanager.JobRunDataProvider;

/**
 * Default {@link LinkFilter} implementation.
 *
 * <p>An extracted link is kept only if it passes, in this order: the stay-on-host / stay-on-domain
 * check, the robots.txt check for its host and port, a per-call duplicate check and the URL pattern
 * matcher of the {@link FilterConfiguration}. A crawling context without a filter configuration
 * imposes no stay-on and no pattern restriction on extracted links.
 */
public class DefaultLinkFilter
implements LinkFilter {
    /** Handed to {@link WebCrawlingContext#getRobotsTxt} when looking up robots.txt data; may be unset. */
    private JobRunDataProvider _jobRunDataProvider;
    private final Log _log = LogFactory.getLog(this.getClass());

    /**
     * Filters the links extracted from one source document.
     *
     * @param extractedLinks link records; the URL is read from the {@code "httpUrl"} metadata value
     * @param sourceUrl URL of the document the links were extracted from
     * @param context crawling context providing the filter configuration and robots.txt data
     * @return the accepted link records in iteration order; empty if {@code sourceUrl} is rejected
     *         as malformed
     */
    @Override
    public Collection<Record> filterExtractedLinks(Collection<Record> extractedLinks, String sourceUrl, WebCrawlingContext context) throws WebCrawlerException {
        // URLs already seen during this call; used to drop duplicates.
        Set<String> seenUrls = new HashSet<String>();
        Collection<Record> filteredLinks = new ArrayList<Record>(extractedLinks.size());
        try {
            String sourceHost = UriHelper.getHost(sourceUrl);
            String sourceDomain = UriHelper.getDomain(sourceHost);
            for (Record link : extractedLinks) {
                String urlString = null;
                try {
                    urlString = link.getMetadata().getStringValue("httpUrl");
                    URL url = new URL(urlString);
                    // Short-circuit order matters: a URL is only recorded as "seen" once it has
                    // passed the stay-on and robots.txt checks.
                    if (this.checkStayOn(url, sourceHost, sourceDomain, context)
                        && this.allowedByRobotsTxt(url, context)
                        && this.isLinkAllowed(context.getFilterConfiguration(), urlString, seenUrls)) {
                        filteredLinks.add(link);
                    }
                }
                catch (MalformedURLException ex) {
                    // Best effort: one bad link must not discard the others, but leave a trace.
                    if (this._log.isDebugEnabled()) {
                        this._log.debug((Object)("Skipping malformed link '" + urlString + "' extracted from " + sourceUrl), ex);
                    }
                }
            }
        }
        catch (MalformedURLException ex) {
            // Without a usable source URL the stay-on checks cannot be evaluated; return what was
            // accepted so far (nothing), but make the reason visible.
            this._log.warn((Object)("Cannot filter links: source URL '" + sourceUrl + "' is malformed"), ex);
        }
        return filteredLinks;
    }

    /**
     * Checks the URL against the robots.txt data the context provides for the URL's host and port.
     *
     * @return {@code true} if the context supplies no robots.txt data or the data allows the URL's
     *         file part; {@code false} (logged at info level) otherwise
     */
    private boolean allowedByRobotsTxt(URL url, WebCrawlingContext context) throws MalformedURLException {
        String hostAndPort = UriHelper.getHostAndPort(url);
        RobotsTxt robotsTxt = context.getRobotsTxt(hostAndPort, this._jobRunDataProvider);
        boolean allowed = robotsTxt == null || robotsTxt.isAllowed(url.getFile());
        if (!allowed) {
            this._log.info((Object)("URL " + url + " not allowed by robots.txt"));
        }
        return allowed;
    }

    /**
     * Rejects URLs already contained in {@code links}, otherwise records the URL there and applies
     * the configured URL pattern matcher. A {@code null} configuration accepts every new URL.
     */
    private boolean isLinkAllowed(FilterConfiguration filterConfig, String url, Set<String> links) {
        if (!links.add(url)) {
            return false;
        }
        return filterConfig == null || filterConfig.getUrlPatternMatcher().matches(url);
    }

    /**
     * Decides whether a redirect may be followed.
     *
     * @param url redirect target
     * @param originalUrl URL that issued the redirect; its host/domain is the stay-on reference
     * @param context crawling context providing the filter configuration
     * @return {@code true} only if {@code url} is non-null, a filter configuration exists, and the
     *         target passes both the stay-on check and the URL pattern matcher
     */
    @Override
    public boolean allowRedirectLink(String url, String originalUrl, WebCrawlingContext context) throws WebCrawlerException {
        if (url == null) {
            return false;
        }
        FilterConfiguration filterConfig = context.getFilterConfiguration();
        if (filterConfig == null) {
            return false;
        }
        try {
            String sourceHost = UriHelper.getHost(originalUrl);
            String sourceDomain = UriHelper.getDomain(sourceHost);
            return this.checkStayOn(url, sourceHost, sourceDomain, context)
                && filterConfig.getUrlPatternMatcher().matches(url);
        }
        catch (MalformedURLException ex) {
            if (this._log.isDebugEnabled()) {
                this._log.debug((Object)("Rejecting redirect to '" + url + "': original URL '" + originalUrl + "' is malformed"), ex);
            }
            return false;
        }
    }

    /** String variant of the stay-on check; a URL that cannot be parsed is treated as not allowed. */
    private boolean checkStayOn(String url, String currentHost, String currentDomain, WebCrawlingContext context) {
        try {
            return this.checkStayOn(new URL(url), currentHost, currentDomain, context);
        }
        catch (MalformedURLException malformedURLException) {
            return false;
        }
    }

    /**
     * Applies the stay-on-host / stay-on-domain restriction. Stay-on-host takes precedence: if it is
     * set, only the host is compared and the domain setting is ignored.
     *
     * @return {@code true} if no configuration or no restriction is set, or if the URL's host
     *         (respectively the domain derived from it) equals the current one
     */
    private boolean checkStayOn(URL url, String currentHost, String currentDomain, WebCrawlingContext context) {
        FilterConfiguration filterConfig = context.getFilterConfiguration();
        if (filterConfig == null) {
            // Consistent with isLinkAllowed: no configuration means no restriction.
            return true;
        }
        if (filterConfig.isStayOnHost()) {
            return currentHost.equals(url.getHost());
        }
        if (filterConfig.isStayOnDomain()) {
            return currentDomain.equals(UriHelper.getDomain(url.getHost()));
        }
        return true;
    }

    /** Sets the provider that is passed on to robots.txt lookups. */
    public void setJobRunDataProvider(JobRunDataProvider jobRunDataProvider) {
        this._jobRunDataProvider = jobRunDataProvider;
    }

    /** Clears the provider, but only if the given instance is the one currently set. */
    public void unsetJobRunDataProvider(JobRunDataProvider jobRunDataProvider) {
        if (this._jobRunDataProvider == jobRunDataProvider) {
            this._jobRunDataProvider = null;
        }
    }
}

