/*
 * Decompiled with CFR 0.152.
 */
package org.eclipse.smila.importing.crawler.web.fetcher;

import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.ProxySelector;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.Date;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.Header;
import org.apache.http.HeaderElement;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.HttpClient;
import org.apache.http.client.RedirectException;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.params.HttpClientParams;
import org.apache.http.conn.ClientConnectionManager;
import org.apache.http.conn.params.ConnRouteParams;
import org.apache.http.conn.routing.HttpRoutePlanner;
import org.apache.http.impl.client.AbstractHttpClient;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.ProxySelectorRoutePlanner;
import org.apache.http.impl.cookie.DateParseException;
import org.apache.http.impl.cookie.DateUtils;
import org.apache.http.params.HttpConnectionParams;
import org.apache.http.params.HttpParams;
import org.apache.http.params.HttpProtocolParams;
import org.eclipse.smila.datamodel.Any;
import org.eclipse.smila.datamodel.AnyMap;
import org.eclipse.smila.datamodel.Record;
import org.eclipse.smila.http.client.util.HttpClientUtil;
import org.eclipse.smila.importing.ImportingException;
import org.eclipse.smila.importing.VisitedLinksException;
import org.eclipse.smila.importing.VisitedLinksService;
import org.eclipse.smila.importing.crawler.web.Fetcher;
import org.eclipse.smila.importing.crawler.web.LinkFilter;
import org.eclipse.smila.importing.crawler.web.WebCrawlerException;
import org.eclipse.smila.importing.crawler.web.WebCrawlingContext;
import org.eclipse.smila.importing.crawler.web.fetcher.HttpResponseInputStream;
import org.eclipse.smila.importing.crawler.web.filter.FilterConfiguration;
import org.eclipse.smila.importing.crawler.web.utils.RobotsTxt;
import org.eclipse.smila.importing.crawler.web.utils.UriHelper;
import org.eclipse.smila.importing.crawler.web.utils.WebCrawlerConfiguration;
import org.eclipse.smila.importing.util.PropertyNameMapper;
import org.eclipse.smila.jobmanager.JobRunDataProvider;
import org.eclipse.smila.taskworker.TaskContext;

/**
 * {@link Fetcher} implementation that downloads web resources through a shared Apache
 * {@link HttpClient}.
 *
 * <p>Automatic redirect handling of the HTTP client is switched off; redirects are followed
 * manually so that every hop can be checked against the link filter, the visited-links
 * service and (optionally) robots.txt.</p>
 *
 * <p>The service references ({@link VisitedLinksService}, {@link LinkFilter},
 * {@link JobRunDataProvider}) are injected through the {@code set…}/{@code unset…} methods.</p>
 */
public class DefaultFetcher
implements Fetcher {
    private static final String HEADER_LASTMODIFIED = "Last-Modified";
    private static final String HEADER_CONTENTTYPE = "Content-Type";
    private static final String HEADER_LOCATION = "location";
    private static final String HEADER_PARAM_CHARSET = "charset";
    private static final String MIMETYPE_HTML = "text/html";
    private static final int DEFAULT_MAX_CONNECTIONS_PER_HOST = 32;
    private static final int DEFAULT_MAX_TOTAL_CONNECTIONS = 128;
    /** Upper bound of redirects followed while looking for a robots.txt file. */
    private static final int MAX_ROBOTS_TXT_REDIRECTS = 10;
    private VisitedLinksService _visitedLinks;
    private LinkFilter _linkFilter;
    private JobRunDataProvider _jobRunDataProvider;
    private final HttpClient _httpClient;
    private final WebCrawlerConfiguration _configuration;
    private final Log _log = LogFactory.getLog(this.getClass());

    /** Creates the fetcher with a fresh {@link WebCrawlerConfiguration} and a configured HTTP client. */
    public DefaultFetcher() {
        _configuration = new WebCrawlerConfiguration();
        _httpClient = createAndConfigureClient();
    }

    /**
     * Builds the HTTP client used for all requests of this fetcher.
     *
     * <p>Redirect following is disabled, the user agent is taken from the configuration, and
     * either the configured proxy or the JVM default {@link ProxySelector} is used for routing.
     * A socket timeout is only applied when the configured value is positive.</p>
     *
     * @return the configured client.
     */
    private HttpClient createAndConfigureClient() {
        ClientConnectionManager connectionManager = HttpClientUtil.createThreadSafeConnectionManager(
            DEFAULT_MAX_TOTAL_CONNECTIONS, DEFAULT_MAX_CONNECTIONS_PER_HOST);
        DefaultHttpClient httpClient = new DefaultHttpClient(connectionManager);
        HttpParams params = httpClient.getParams();
        // Redirects are handled by handleRedirects() so that each hop can be filtered.
        HttpClientParams.setRedirecting(params, false);
        HttpProtocolParams.setUserAgent(params, _configuration.getUserAgent());
        HttpHost proxyHost = _configuration.getProxyHost();
        if (proxyHost != null) {
            ConnRouteParams.setDefaultProxy(params, proxyHost);
        } else {
            httpClient.setRoutePlanner(new ProxySelectorRoutePlanner(
                httpClient.getConnectionManager().getSchemeRegistry(), ProxySelector.getDefault()));
        }
        Integer socketTimeout = _configuration.getSocketTimeout();
        // Guard against a missing value before unboxing.
        if (socketTimeout != null && socketTimeout.intValue() > 0) {
            HttpConnectionParams.setSoTimeout(params, socketTimeout.intValue());
        }
        return httpClient;
    }

    /**
     * Fetches {@code url} in crawl mode: robots.txt is honoured, redirect targets are checked
     * against the visited links, response metadata is copied to {@code linkRecord} and the body
     * is attached only when the resource is {@code text/html}.
     *
     * @param url the URL to get.
     * @param linkRecord record to fill with metadata and content.
     * @param context crawling context providing filter configuration and attribute mapping.
     * @throws WebCrawlerException if the resource could not be retrieved.
     */
    @Override
    public void crawl(String url, Record linkRecord, WebCrawlingContext context) throws WebCrawlerException {
        download(url, linkRecord, context, true);
    }

    /**
     * Fetches {@code url} in fetch mode: no robots.txt check, no visited-links check on
     * redirects, response metadata is copied to {@code crawledRecord} and the body is always
     * attached.
     *
     * @param url the URL to get.
     * @param crawledRecord record to fill with metadata and content.
     * @param context crawling context providing filter configuration and attribute mapping.
     * @throws WebCrawlerException if the resource could not be retrieved.
     */
    @Override
    public void fetch(String url, Record crawledRecord, WebCrawlingContext context) throws WebCrawlerException {
        download(url, crawledRecord, context, false);
    }

    /**
     * Shared implementation of {@link #crawl} and {@link #fetch}.
     *
     * @param crawling {@code true} for crawl mode (robots.txt and visited-links checks enabled,
     *        only HTML content attached), {@code false} for fetch mode.
     * @throws WebCrawlerException wrapping redirect, visited-links, IO and runtime failures; a
     *         {@link WebCrawlerException} raised while getting the resource is passed through.
     */
    private void download(String url, Record record, WebCrawlingContext context, boolean crawling)
        throws WebCrawlerException {
        try (HttpResponseInputStream response = getResource(url, context, crawling, crawling)) {
            resetUrlAttributeOnRedirect(record, response, context.getMapper());
            readMetadata(record, response);
            if (crawling) {
                readHtmlContent(record, response);
            } else {
                readContent(record, response);
            }
        }
        catch (RedirectException ex) {
            throw new WebCrawlerException("Error while handling redirects for web resource " + url + ": " + ex.getMessage(), ex, false);
        }
        catch (VisitedLinksException ex) {
            throw new WebCrawlerException("Error while handling redirects for web resource " + url + ": " + ex.getMessage(), ex);
        }
        catch (IOException ex) {
            throw new WebCrawlerException("IO error while getting web resource " + url + ": " + ex.getMessage(), ex);
        }
        catch (RuntimeException ex) {
            throw new WebCrawlerException("Error while getting web resource " + url + ": " + ex.getMessage(), ex, false);
        }
    }

    /**
     * Opens the content of the resource referenced by {@code crawledRecord}.
     *
     * <p>The URL is read from the first attribute mapped to {@code httpUrl}; if that attribute
     * has no value, the unmapped {@code httpUrl} attribute is used. When the request was
     * redirected, the URL attributes of the record are updated to the final URL.</p>
     *
     * @param crawledRecord record describing the resource.
     * @param taskContext task context used to create the attribute mapper and crawling context.
     * @return the open response stream; the caller is responsible for closing it.
     * @throws ImportingException if the resource could not be retrieved.
     */
    public InputStream getContent(Record crawledRecord, TaskContext taskContext) throws ImportingException {
        PropertyNameMapper mapper = PropertyNameMapper.createFrom(taskContext);
        String url = crawledRecord.getMetadata().getStringValue(mapper.get("httpUrl").get(0));
        if (url == null) {
            url = crawledRecord.getMetadata().getStringValue("httpUrl");
        }
        try {
            HttpResponseInputStream response = getResource(url, new WebCrawlingContext(taskContext), false, false);
            try {
                resetUrlAttributeOnRedirect(crawledRecord, response, mapper);
            }
            catch (RuntimeException ex) {
                // The stream is not handed to the caller in this case, so release it here.
                IOUtils.closeQuietly(response);
                throw ex;
            }
            return response;
        }
        catch (RedirectException ex) {
            throw new ImportingException("Error while handling redirects for web resource " + url + ": " + ex.getMessage(), ex, false);
        }
        catch (VisitedLinksException ex) {
            throw new ImportingException("Error while handling redirects for web resource " + url + ": " + ex.getMessage(), ex, true);
        }
        catch (IOException ex) {
            throw new ImportingException("IO error while getting web resource " + url + ": " + ex.getMessage(), ex, true);
        }
        catch (Exception ex) {
            throw new ImportingException("Http error while getting web resource " + url + ": " + ex.getMessage(), ex, false);
        }
    }

    /** Gets a resource starting at redirect level 0. See the five-argument overload. */
    private HttpResponseInputStream getResource(String url, WebCrawlingContext context, boolean checkRobotsTxt, boolean checkVisitedLinksInRedirects) throws WebCrawlerException, VisitedLinksException, RedirectException, IOException {
        return getResource(url, context, checkRobotsTxt, checkVisitedLinksInRedirects, 0);
    }

    /**
     * Executes a GET request for {@code url} and returns the response stream for a 200 response.
     *
     * @param url the URL to request.
     * @param context crawling context; its filter configuration decides whether redirects are followed.
     * @param checkRobotsTxt if {@code true}, the request is refused when robots.txt disallows the URL.
     * @param checkVisitedLinksInRedirects if {@code true}, redirect targets that were already visited are rejected.
     * @param redirectLevel number of redirects followed so far; a value above 0 marks the
     *        returned stream as the result of a redirect.
     * @return the open response stream.
     * @throws WebCrawlerException if robots.txt disallows the URL or the server answers with a
     *         status that is neither 200 nor a redirect.
     * @throws RedirectException if a redirect was received but cannot or must not be followed.
     * @throws VisitedLinksException if checking the visited links fails.
     * @throws IOException on transport errors.
     */
    private HttpResponseInputStream getResource(String url, WebCrawlingContext context, boolean checkRobotsTxt, boolean checkVisitedLinksInRedirects, int redirectLevel) throws WebCrawlerException, VisitedLinksException, RedirectException, IOException {
        if (checkRobotsTxt && disallowedByRobotsTxt(url, context)) {
            throw new WebCrawlerException("Crawling " + url + " is not allowed by robots.txt", false);
        }
        FilterConfiguration filterConfig = context.getFilterConfiguration();
        HttpGet request = new HttpGet(url);
        HttpResponse response = _httpClient.execute(request);
        HttpResponseInputStream responseStream = new HttpResponseInputStream(url, response, redirectLevel > 0);
        int responseCode = response.getStatusLine().getStatusCode();
        if (responseCode == 200) {
            return responseStream;
        }
        if (isRedirect(responseCode)) {
            if (filterConfig != null && filterConfig.followRedirects()) {
                // handleRedirects() closes responseStream itself.
                return handleRedirects(responseStream, context, checkRobotsTxt, checkVisitedLinksInRedirects, redirectLevel, url);
            }
            IOUtils.closeQuietly(responseStream);
            throw new RedirectException("Follow redirects not configured, skipping link " + url);
        }
        IOUtils.closeQuietly(responseStream);
        throw new WebCrawlerException("GET " + url + ": server responded with " + responseCode + ".");
    }

    /**
     * @return {@code true} if {@code statusCode} is one of the redirect codes 301, 302, 303,
     *         307 or 308.
     */
    private boolean isRedirect(int statusCode) {
        switch (statusCode) {
            case 301:
            case 302:
            case 303:
            case 307:
            case 308:
                return true;
            default:
                return false;
        }
    }

    /**
     * Follows the redirect announced by {@code responseStream}.
     *
     * <p>The redirect is refused when the maximum number of redirects is reached, the location
     * header is missing or invalid, the link filter rejects the target, or (if requested) the
     * target was already visited. Otherwise the target is recorded in the context's visited
     * URLs and requested with an incremented redirect level. The redirect response itself is
     * always closed before this method returns.</p>
     *
     * @param responseStream the redirect response.
     * @param originalUrl the URL whose request produced this redirect response, passed to the link filter.
     * @return the response stream of the redirect target.
     * @throws RedirectException if the redirect must not or cannot be followed.
     */
    private HttpResponseInputStream handleRedirects(HttpResponseInputStream responseStream, WebCrawlingContext context, boolean checkRobotsTxt, boolean checkVisitedLinksInRedirects, int redirectLevel, String originalUrl) throws WebCrawlerException, VisitedLinksException, IOException, RedirectException {
        try {
            if (redirectLevel >= context.getFilterConfiguration().getMaxRedirects()) {
                throw new RedirectException("Reached maximum number of redirects");
            }
            HttpResponse response = responseStream.getResponse();
            Header locationHeader = response.getFirstHeader(HEADER_LOCATION);
            if (locationHeader == null) {
                throw new RedirectException("Received redirect response " + response.getStatusLine() + " but no location header");
            }
            String location = locationHeader.getValue();
            try {
                String redirectUrl = UriHelper.normalizeUrl(responseStream.getUrl(), location);
                if (redirectUrl == null) {
                    throw new RedirectException("Couldn't create absolute link from baseUri " + responseStream.getUrl() + " and link " + location);
                }
                if (!_linkFilter.allowRedirectLink(redirectUrl, originalUrl, context)) {
                    throw new RedirectException("Redirect to URL '" + redirectUrl + "' is not allowed by filter configuration");
                }
                if (checkVisitedLinksInRedirects && checkIfLinkIsVisited(context, redirectUrl)) {
                    throw new RedirectException("Redirect to URL '" + redirectUrl + "' is not allowed. URL was already visited");
                }
                context.getVisitedUrls().add(redirectUrl);
                return getResource(redirectUrl, context, checkRobotsTxt, checkVisitedLinksInRedirects, redirectLevel + 1);
            }
            catch (URISyntaxException ex) {
                throw new RedirectException("Invalid Redirect location '" + location + "'", ex);
            }
        }
        finally {
            IOUtils.closeQuietly(responseStream);
        }
    }

    /**
     * @return {@code true} if {@code redirectUrl} is contained in the context's visited URLs or
     *         the visited-links service reports it as visited. The service call is
     *         {@code checkAndMarkVisited}, so by its name it also marks the URL as visited.
     */
    private boolean checkIfLinkIsVisited(WebCrawlingContext context, String redirectUrl) throws VisitedLinksException {
        return context.getVisitedUrls().contains(redirectUrl)
            || _visitedLinks.checkAndMarkVisited(context.getDataSource(), redirectUrl, context.getJobRunId(), context.getCurrentInputBulkId());
    }

    /**
     * If the response is the result of a redirect, replaces the value of every URL attribute
     * already present in the record (the mapped names of {@code httpUrl} and {@code httpUrl}
     * itself) with the final URL. Attributes that are not present are not created.
     */
    private void resetUrlAttributeOnRedirect(Record record, HttpResponseInputStream response, PropertyNameMapper mapper) {
        if (!response.isRedirect()) {
            return;
        }
        AnyMap metadata = record.getMetadata();
        for (String attributeName : mapper.get("httpUrl")) {
            if (metadata.containsKey(attributeName)) {
                metadata.put(attributeName, response.getUrl());
            }
        }
        if (metadata.containsKey("httpUrl")) {
            metadata.put("httpUrl", response.getUrl());
        }
    }

    /**
     * Copies size, content type, mime type, charset and last-modified date from the response
     * to the record metadata. An attribute that already has a value is never overwritten.
     *
     * <p>A {@code Last-Modified} header that cannot be parsed as a date is stored as plain
     * string.</p>
     */
    private void readMetadata(Record record, HttpResponseInputStream response) {
        AnyMap metadata = record.getMetadata();
        HttpEntity entity = response.getResponseEntity();
        if (entity != null && metadata.get("httpSize") == null) {
            // May be negative; readContent() replaces a negative size by the real length.
            metadata.put("httpSize", Long.valueOf(entity.getContentLength()));
        }
        Header contentType = response.getResponse().getFirstHeader(HEADER_CONTENTTYPE);
        if (contentType != null && metadata.get("httpContenttype") == null) {
            metadata.put("httpContenttype", contentType.getValue());
            HeaderElement[] elements = contentType.getElements();
            if (elements.length > 0) {
                String mimetype = elements[0].getName();
                if (mimetype != null && metadata.get("httpMimetype") == null) {
                    metadata.put("httpMimetype", mimetype);
                }
                NameValuePair charset = elements[0].getParameterByName(HEADER_PARAM_CHARSET);
                if (charset != null && metadata.get("httpCharset") == null) {
                    metadata.put("httpCharset", charset.getValue());
                }
            }
        }
        Header lastModified = response.getResponse().getFirstHeader(HEADER_LASTMODIFIED);
        if (lastModified != null && metadata.get("httpLastModified") == null) {
            try {
                Date parsedDate = DateUtils.parseDate(lastModified.getValue());
                if (parsedDate != null) {
                    metadata.put("httpLastModified", metadata.getFactory().createDateTimeValue(parsedDate));
                }
            }
            catch (DateParseException ex) {
                // Keep the raw header value rather than losing the information.
                metadata.put("httpLastModified", lastModified.getValue());
            }
        }
    }

    /**
     * Attaches the content only if the record's {@code httpMimetype} is {@code text/html}
     * (compared case-insensitively).
     */
    private void readHtmlContent(Record record, InputStream contentStream) throws IOException {
        String mimetype = record.getMetadata().getStringValue("httpMimetype");
        if (MIMETYPE_HTML.equalsIgnoreCase(mimetype)) {
            readContent(record, contentStream);
        }
    }

    /**
     * Reads the complete stream into memory, stores it as attachment {@code httpContent} and
     * sets {@code httpSize} to the number of bytes read when no non-negative size is known yet.
     */
    private void readContent(Record record, InputStream contentStream) throws IOException {
        byte[] content = IOUtils.toByteArray(contentStream);
        record.setAttachment("httpContent", content);
        Long size = record.getMetadata().getLongValue("httpSize");
        if (size == null || size.longValue() < 0L) {
            record.getMetadata().put("httpSize", Integer.valueOf(content.length));
        }
    }

    /**
     * @return {@code true} if a robots.txt is available for the host of {@code recordUrl} and
     *         it does not allow the file part of the URL.
     * @throws WebCrawlerException if {@code recordUrl} is not a valid URL.
     */
    private boolean disallowedByRobotsTxt(String recordUrl, WebCrawlingContext context) throws WebCrawlerException {
        try {
            URL url = new URL(recordUrl);
            RobotsTxt hostRobotsTxt = getRobotsTxt(url, context);
            return hostRobotsTxt != null && !hostRobotsTxt.isAllowed(url.getFile());
        }
        catch (MalformedURLException ex) {
            throw new WebCrawlerException(ex, false);
        }
    }

    /**
     * Returns the robots.txt rules for the host of {@code url}, looking them up in the context
     * first and fetching and storing them in the context when they are not available yet.
     */
    private RobotsTxt getRobotsTxt(URL url, WebCrawlingContext context) throws WebCrawlerException {
        String protocol = url.getProtocol();
        String hostAndPort = UriHelper.getHostAndPort(url);
        RobotsTxt robotsTxt = context.getRobotsTxt(hostAndPort, _jobRunDataProvider);
        if (robotsTxt == null) {
            robotsTxt = fetchRobotsTxt(protocol, hostAndPort);
            context.putRobotsTxt(hostAndPort, robotsTxt, _jobRunDataProvider);
        }
        return robotsTxt;
    }

    /**
     * Downloads and evaluates {@code <protocol>://<hostAndPort>/robots.txt}.
     *
     * <p>Mapping of the final response status:</p>
     * <ul>
     * <li>200 to 300 (inclusive): the body is parsed for the configured user agent.
     *     NOTE(review): 300 is accepted here although it is not a 2xx status; kept as found.</li>
     * <li>404 and 410: {@code new RobotsTxt(false)}.</li>
     * <li>500 and above: an {@link IOException} is raised and reported as
     *     {@code WebCrawlerException(ex, true)}.</li>
     * <li>anything else: {@code new RobotsTxt(true)}.</li>
     * </ul>
     * <p>A {@link WebCrawlerException} while fetching (missing location header, too many
     * redirects) results in {@code new RobotsTxt(false)}.</p>
     */
    private RobotsTxt fetchRobotsTxt(String protocol, String hostAndPort) throws WebCrawlerException {
        _log.info("Fetching robots.txt for " + hostAndPort);
        String robotsUrl = protocol + "://" + hostAndPort + "/robots.txt";
        try (HttpResponseInputStream robotsTxtStream = fetchRobotsTxtStream(robotsUrl)) {
            int responseCode = robotsTxtStream.getResponse().getStatusLine().getStatusCode();
            if (responseCode >= 200 && responseCode <= 300) {
                return new RobotsTxt(_configuration.getUserAgent(), robotsTxtStream);
            }
            if (responseCode == 404 || responseCode == 410) {
                return new RobotsTxt(false);
            }
            if (responseCode >= 500) {
                throw new IOException("Failed to accesss robots.txt for " + hostAndPort + ", server responded with status " + responseCode + ", condition may be temporarily, will retry later.");
            }
            return new RobotsTxt(true);
        }
        catch (IOException ex) {
            throw new WebCrawlerException(ex, true);
        }
        catch (WebCrawlerException ex) {
            return new RobotsTxt(false);
        }
    }

    /**
     * Requests {@code robotsUrl}, following up to {@value #MAX_ROBOTS_TXT_REDIRECTS} redirects,
     * and returns the first non-redirect response (whatever its status).
     *
     * <p>NOTE(review): the location header value is used verbatim as next URL, so a relative
     * location is not resolved against the current URL here.</p>
     *
     * @throws WebCrawlerException if a redirect has no location header or the redirect limit is exceeded.
     * @throws IOException on transport errors.
     */
    private HttpResponseInputStream fetchRobotsTxtStream(String robotsUrl) throws WebCrawlerException, IOException {
        String currentUrl = robotsUrl;
        for (int redirectCount = 0; redirectCount < MAX_ROBOTS_TXT_REDIRECTS; ++redirectCount) {
            _log.info("Fetching robots.txt at " + currentUrl);
            HttpResponse response = _httpClient.execute(new HttpGet(currentUrl));
            HttpResponseInputStream responseStream = new HttpResponseInputStream(currentUrl, response, false);
            if (!isRedirect(response.getStatusLine().getStatusCode())) {
                return responseStream;
            }
            // Body of a redirect response is of no interest; headers stay readable after closing.
            IOUtils.closeQuietly(responseStream);
            Header locationHeader = response.getFirstHeader(HEADER_LOCATION);
            if (locationHeader == null) {
                throw new WebCrawlerException("Received redirect response " + response.getStatusLine() + " but no location header set.");
            }
            currentUrl = locationHeader.getValue();
            _log.info("... redirected to " + currentUrl);
        }
        throw new WebCrawlerException("Got more than " + MAX_ROBOTS_TXT_REDIRECTS + " redirects for fetching " + robotsUrl + ", giving up.", false);
    }

    /** Sets the visited-links service reference. */
    public void setVisitedLinks(VisitedLinksService visitedLinks) {
        _visitedLinks = visitedLinks;
    }

    /** Clears the visited-links service reference if it is the given instance. */
    public void unsetVisitedLinks(VisitedLinksService visitedLinks) {
        if (_visitedLinks == visitedLinks) {
            _visitedLinks = null;
        }
    }

    /** Sets the link filter reference. */
    public void setLinkFilter(LinkFilter linkFilter) {
        _linkFilter = linkFilter;
    }

    /** Clears the link filter reference if it is the given instance. */
    public void unsetLinkFilter(LinkFilter linkFilter) {
        if (_linkFilter == linkFilter) {
            _linkFilter = null;
        }
    }

    /** Sets the job run data provider reference. */
    public void setJobRunDataProvider(JobRunDataProvider jobRunDataProvider) {
        _jobRunDataProvider = jobRunDataProvider;
    }

    /** Clears the job run data provider reference if it is the given instance. */
    public void unsetJobRunDataProvider(JobRunDataProvider jobRunDataProvider) {
        if (_jobRunDataProvider == jobRunDataProvider) {
            _jobRunDataProvider = null;
        }
    }
}

