/*
 * Decompiled with CFR 0.152.
 */
package org.eclipse.smila.importing.crawler.web;

import java.io.IOException;
import java.net.URISyntaxException;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.client.RedirectException;
import org.eclipse.smila.datamodel.DataFactory;
import org.eclipse.smila.datamodel.Record;
import org.eclipse.smila.importing.VisitedLinksException;
import org.eclipse.smila.importing.VisitedLinksService;
import org.eclipse.smila.importing.compounds.CompoundExtractor;
import org.eclipse.smila.importing.crawler.web.Fetcher;
import org.eclipse.smila.importing.crawler.web.LinkExtractor;
import org.eclipse.smila.importing.crawler.web.LinkFilter;
import org.eclipse.smila.importing.crawler.web.RecordProducer;
import org.eclipse.smila.importing.crawler.web.WebCrawlerConstants;
import org.eclipse.smila.importing.crawler.web.WebCrawlerException;
import org.eclipse.smila.importing.crawler.web.WebCrawlingContext;
import org.eclipse.smila.importing.crawler.web.filter.FilterConfiguration;
import org.eclipse.smila.importing.crawler.web.utils.UriHelper;
import org.eclipse.smila.objectstore.ObjectStoreException;
import org.eclipse.smila.taskworker.TaskContext;
import org.eclipse.smila.taskworker.Worker;
import org.eclipse.smila.taskworker.input.Inputs;
import org.eclipse.smila.taskworker.input.RecordInput;
import org.eclipse.smila.taskworker.output.Outputs;
import org.eclipse.smila.taskworker.output.RecordOutput;
import org.eclipse.smila.utils.MaybeRecoverableException;

/**
 * Task worker that crawls web resources.
 *
 * <p>When the task has no input in the {@code linksToCrawl} slot, crawling is started from the
 * {@code startUrl} task parameter. Otherwise every link record of the input bulk is fetched, its
 * outgoing links are extracted, filtered and written to the {@code linksToCrawl} output slot, and
 * the records produced for the fetched resource are written to the {@code crawledRecords} output
 * slot.
 *
 * <p>The collaborating services are injected through the {@code set*}/{@code unset*} methods.
 */
public class WebCrawlerWorker
implements Worker {
    /** Name of this worker, as returned by {@link #getName()}. */
    public static final String NAME = "webCrawler";
    /** Input slot containing the link records to crawl. */
    public static final String INPUT_SLOT_LINKS_TO_CRAWL = "linksToCrawl";
    /** Output slot receiving the link records extracted from crawled resources. */
    public static final String OUTPUT_SLOT_LINKS_TO_CRAWL = "linksToCrawl";
    /** Output slot receiving the records produced for crawled resources. */
    public static final String OUTPUT_SLOT_CRAWLED_RECORDS = "crawledRecords";
    /** Maximum number of records written to one {@code crawledRecords} bulk. */
    private static final int RECORDS_PER_BULK = 100;
    /** Input bulk id used while processing the initial (start URL) task. */
    private static final String BULK_ID_FOR_INITIAL_TASK = "initial";
    private VisitedLinksService _visitedLinks;
    private Fetcher _fetcher;
    private LinkExtractor _linkExtractor;
    private LinkFilter _linkFilter;
    private RecordProducer _recordProducer;
    private final Log _log = LogFactory.getLog(this.getClass());
    private CompoundExtractor _compoundExtractor;

    /**
     * Reads the MIME type stored in a record.
     *
     * @param record record to read from
     * @return string value of the {@code httpMimetype} metadata attribute
     */
    public static String getMimeType(Record record) {
        return record.getMetadata().getStringValue("httpMimetype");
    }

    /**
     * @return the worker name, {@link #NAME}
     */
    public String getName() {
        return NAME;
    }

    /**
     * Executes one crawl task: starts a new crawl if the {@code linksToCrawl} input slot is empty,
     * otherwise crawls the link records of the input bulk.
     *
     * @param taskContext context of the task to execute
     * @throws Exception if crawling fails and the error is not handled by skipping the link
     */
    public void perform(TaskContext taskContext) throws Exception {
        Inputs inputs = taskContext.getInputs();
        WebCrawlingContext webCrawlingContext = new WebCrawlingContext(taskContext);
        if (inputs.getDataObjectCount(INPUT_SLOT_LINKS_TO_CRAWL) == 0) {
            initiateCrawling(webCrawlingContext);
        } else {
            RecordInput linksToCrawl = inputs.getAsRecordInput(INPUT_SLOT_LINKS_TO_CRAWL);
            crawlLinkRecords(linksToCrawl, webCrawlingContext);
        }
    }

    /**
     * Starts a crawl from the {@code startUrl} task parameter at crawl depth 0. The visited-links
     * entries of the data source are cleared before the start URL is crawled.
     *
     * @param webCrawlingContext crawling context of the current task
     * @throws MaybeRecoverableException if the start URL is invalid or crawling it fails
     */
    private void initiateCrawling(WebCrawlingContext webCrawlingContext) throws MaybeRecoverableException {
        // links-per-bulk is 1 here, so every link found on the start page goes to its own bulk
        RecordOutputHandler outputBulks =
            new RecordOutputHandler(webCrawlingContext.getTaskContext().getOutputs(), 1, RECORDS_PER_BULK);
        Record initialLinkRecord = DataFactory.DEFAULT.createRecord();
        String startUrl = webCrawlingContext.getTaskParameters().getStringValue("startUrl");
        setUrl(initialLinkRecord, startUrl);
        setCrawlDepth(initialLinkRecord, 0L);
        _visitedLinks.clearSource(webCrawlingContext.getDataSource());
        webCrawlingContext.setCurrentInputBulkId(BULK_ID_FOR_INITIAL_TASK);
        crawlLinkRecord(initialLinkRecord, outputBulks, webCrawlingContext);
    }

    /**
     * Crawls all link records of the input bulk until the input is exhausted or the task is
     * canceled. Links already visited are skipped; between two requests the worker waits for the
     * time given by the {@code waitBetweenRequests} task parameter, if set.
     *
     * @param linksToCrawl input bulk with the link records
     * @param context crawling context of the current task
     * @throws ObjectStoreException if reading the input bulk fails
     * @throws IOException if reading the input bulk fails
     * @throws MaybeRecoverableException if a crawl error must be propagated to the task framework
     */
    private void crawlLinkRecords(RecordInput linksToCrawl, WebCrawlingContext context) throws ObjectStoreException, IOException, MaybeRecoverableException {
        Long sleepTime = context.getTaskParameters().getLongValue("waitBetweenRequests");
        RecordOutputHandler outputBulks =
            new RecordOutputHandler(context.getTaskContext().getOutputs(), context.getLinksPerBulk(), RECORDS_PER_BULK);
        context.setCurrentInputBulkId(linksToCrawl.getObjectName());
        for (Record record = linksToCrawl.getRecord();
            record != null && !context.getTaskContext().isCanceled();
            record = linksToCrawl.getRecord()) {
            normalizeUrl(record, context);
            if (hasNotBeenVisited(record, context)) {
                waitBetweenRequests(sleepTime);
                crawlLinkRecord(record, outputBulks, context);
                // remember the URL so duplicates within this task are detected without the service
                context.getVisitedUrls().add(getUrl(record));
            }
        }
    }

    /**
     * Replaces the URL of the record by its normalized form. Failures are passed to
     * {@link #handleCrawlException}.
     *
     * @param record link record to normalize
     * @param context crawling context of the current task
     * @throws MaybeRecoverableException if the error handling decides to propagate the failure
     */
    private void normalizeUrl(Record record, WebCrawlingContext context) throws MaybeRecoverableException {
        try {
            setUrl(record, getUrl(record));
        }
        catch (MaybeRecoverableException ex) {
            handleCrawlException(record, context, ex);
        }
    }

    /**
     * Checks whether the link must still be crawled: it must neither be contained in the
     * task-local set of visited URLs nor be reported as visited by the visited-links service.
     *
     * @param record link record to check
     * @param context crawling context of the current task
     * @return {@code true} if the link should be crawled
     * @throws VisitedLinksException if the visited-links service fails
     */
    private boolean hasNotBeenVisited(Record record, WebCrawlingContext context) throws VisitedLinksException {
        String url = getUrl(record);
        debugLogUrl("Check if visited", record);
        if (context.getVisitedUrls().contains(url)) {
            debugLogUrl("Duplicate URL in task", record);
            return false;
        }
        boolean notVisited = !checkAndMarkVisitedTimed(record, context);
        debugLogUrl("Will visit: " + notVisited, record);
        return notVisited;
    }

    /**
     * Calls {@code checkAndMarkVisited} of the visited-links service for the record's URL and
     * reports the elapsed time as {@code checkVisitedLinks}.
     *
     * @param record link record to check
     * @param context crawling context of the current task
     * @return result of the service call
     * @throws VisitedLinksException if the visited-links service fails
     */
    private boolean checkAndMarkVisitedTimed(Record record, WebCrawlingContext context) throws VisitedLinksException {
        String url = getUrl(record);
        debugLogUrl("Check if visited", record);
        long time = context.getTaskContext().getTimestamp();
        try {
            return _visitedLinks.checkAndMarkVisited(
                context.getDataSource(), url, context.getJobRunId(), context.getCurrentInputBulkId());
        }
        finally {
            context.getTaskContext().measureTime("checkVisitedLinks", time);
        }
    }

    /**
     * Fetches a single link and, unless the visited check made after fetching reports it as
     * visited, extracts its outgoing links and writes the produced records.
     *
     * @param linkRecord link record to crawl
     * @param outputBulks handler for the output bulks
     * @param context crawling context of the current task
     * @throws MaybeRecoverableException if the error handling decides to propagate a failure
     */
    private void crawlLinkRecord(Record linkRecord, RecordOutputHandler outputBulks, WebCrawlingContext context) throws MaybeRecoverableException {
        try {
            invokeFetcherTimed(linkRecord, context);
            // the visited check is repeated after fetching
            // NOTE(review): presumably because the fetcher may change the record's URL (redirects) - confirm
            if (!checkAndMarkVisitedTimed(linkRecord, context)) {
                extractAndFilterLinks(linkRecord, outputBulks, context);
                produceAndWriteRecords(linkRecord, outputBulks, context);
            }
        }
        catch (MaybeRecoverableException ex) {
            handleCrawlException(linkRecord, context, ex);
        }
        catch (RuntimeException ex) {
            logNonRecoverableError(linkRecord, ex, context);
        }
    }

    /**
     * Decides what to do with a failure that occurred while crawling a link.
     * <ul>
     * <li>Recoverable errors are rethrown if the error handling is {@code RETRY}.</li>
     * <li>In the initial task every remaining error is thrown; recoverable ones are wrapped in a
     * non-recoverable {@link WebCrawlerException}.</li>
     * <li>All other errors are only logged and the link is skipped.</li>
     * </ul>
     *
     * @param linkRecord link record that failed
     * @param context crawling context of the current task
     * @param ex the failure
     * @throws MaybeRecoverableException if the failure must be propagated
     */
    private void handleCrawlException(Record linkRecord, WebCrawlingContext context, MaybeRecoverableException ex) throws MaybeRecoverableException {
        boolean recoverable = ex.isRecoverable();
        if (recoverable && context.getErrorHandling() == WebCrawlerConstants.ErrorHandling.RETRY) {
            throw ex;
        }
        if (BULK_ID_FOR_INITIAL_TASK.equals(context.getCurrentInputBulkId())) {
            if (recoverable) {
                // error handling cannot be RETRY at this point, so the error is made non-recoverable
                throw new WebCrawlerException(ex.getMessage(), ex, false);
            }
            throw ex;
        }
        // instanceof is false for a null cause, no separate null check needed
        if (ex.getCause() instanceof RedirectException) {
            logRedirectErrors(linkRecord, context, ex);
        } else {
            logNonRecoverableError(linkRecord, ex, context);
        }
    }

    /**
     * Logs an error caused by a {@link RedirectException}: at info level if there is no filter
     * configuration or it does not follow redirects, as a skipped-link warning otherwise.
     *
     * @param linkRecord link record that failed
     * @param context crawling context of the current task
     * @param ex the failure
     */
    private void logRedirectErrors(Record linkRecord, WebCrawlingContext context, MaybeRecoverableException ex) {
        FilterConfiguration filterConfig = context.getFilterConfiguration();
        boolean followsRedirects = filterConfig != null && filterConfig.followRedirects();
        if (followsRedirects) {
            logNonRecoverableError(linkRecord, ex, context);
        } else {
            _log.info(ex.getLocalizedMessage());
        }
    }

    /**
     * Produces the records for a fetched link, flags those the compound extractor can extract as
     * compounds and writes all of them to the {@code crawledRecords} output.
     *
     * @param linkRecord fetched link record
     * @param outputBulks handler for the output bulks
     * @param context crawling context of the current task
     * @throws WebCrawlerException if producing or writing records fails
     */
    private void produceAndWriteRecords(Record linkRecord, RecordOutputHandler outputBulks, WebCrawlingContext context) throws WebCrawlerException {
        for (Record crawledRecord : produceRecordsTimed(linkRecord, context)) {
            if (isCompoundRecord(crawledRecord, context)) {
                setIsCompound(crawledRecord);
            }
            outputBulks.mapAndAddCrawledRecord(crawledRecord, context);
        }
    }

    /**
     * @param record record to check
     * @param context crawling context of the current task (not used)
     * @return whether the compound extractor can extract the record, judged by URL and MIME type
     */
    private boolean isCompoundRecord(Record record, WebCrawlingContext context) {
        return _compoundExtractor.canExtract(getUrl(record), getMimeType(record));
    }

    /**
     * Extracts and filters the outgoing links of a fetched record that has an {@code httpContent}
     * attachment. A link is written to the {@code linksToCrawl} output only if its URL was not
     * extracted before in this context and the visited-links service does not report it as visited.
     *
     * @param linkRecord fetched link record
     * @param outputBulks handler for the output bulks
     * @param context crawling context of the current task
     * @throws WebCrawlerException if extracting, filtering or writing links fails
     * @throws VisitedLinksException if the visited-links service fails
     */
    private void extractAndFilterLinks(Record linkRecord, RecordOutputHandler outputBulks, WebCrawlingContext context) throws WebCrawlerException, VisitedLinksException {
        if (!linkRecord.hasAttachment("httpContent")) {
            return;
        }
        Collection<Record> extractedLinks = extractLinksTimed(linkRecord, context);
        Collection<Record> filteredLinks = filterLinksTimed(extractedLinks, linkRecord, context);
        for (Record outgoingLink : filteredLinks) {
            // add() returns false for URLs already extracted; the service is only asked for new ones
            boolean newInContext = context.getExtractedUrls().add(getUrl(outgoingLink));
            if (newInContext && isNotVisitedTimed(outgoingLink, context)) {
                outputBulks.addLinkToCrawl(outgoingLink);
            }
        }
    }

    /**
     * Invokes the fetcher for the link and reports the elapsed time as {@code fetchResource}.
     *
     * @param linkRecord link record to fetch
     * @param context crawling context of the current task
     * @throws WebCrawlerException if fetching fails
     */
    private void invokeFetcherTimed(Record linkRecord, WebCrawlingContext context) throws WebCrawlerException {
        debugLogUrl("Call fetcher for ", linkRecord);
        long time = context.getTaskContext().getTimestamp();
        try {
            _fetcher.crawl(getUrl(linkRecord), linkRecord, context);
        }
        finally {
            context.getTaskContext().measureTime("fetchResource", time);
        }
    }

    /**
     * Invokes the link extractor and reports the elapsed time as {@code extractLinks}.
     *
     * @param linkRecord fetched link record
     * @param context crawling context of the current task
     * @return the extracted link records
     * @throws WebCrawlerException if link extraction fails
     */
    private Collection<Record> extractLinksTimed(Record linkRecord, WebCrawlingContext context) throws WebCrawlerException {
        debugLogUrl("Extract links from ", linkRecord);
        long time = context.getTaskContext().getTimestamp();
        try {
            return _linkExtractor.extractLinks(linkRecord, context);
        }
        finally {
            context.getTaskContext().measureTime("extractLinks", time);
        }
    }

    /**
     * Applies the crawl depth limit and the link filter to the extracted links and reports the
     * elapsed time as {@code filterLink}. If the source link has already reached the maximum crawl
     * depth, no links are returned; otherwise each extracted link gets the source depth plus one
     * before filtering.
     *
     * @param extractedLinks links extracted from the source link
     * @param sourceLink link record the links were extracted from
     * @param context crawling context of the current task
     * @return the links remaining after filtering
     * @throws WebCrawlerException if filtering fails
     */
    private Collection<Record> filterLinksTimed(Collection<Record> extractedLinks, Record sourceLink, WebCrawlingContext context) throws WebCrawlerException {
        if (_log.isDebugEnabled()) {
            _log.debug("Filter links " + extractedLinks + " extracted from " + getUrl(sourceLink));
        }
        long time = context.getTaskContext().getTimestamp();
        try {
            // NOTE(review): logRedirectErrors treats a null filter configuration as possible,
            // here it is dereferenced unchecked - confirm whether it can be null at this point
            long maxCrawlDepth = context.getFilterConfiguration().getMaxCrawlDepth();
            long crawlDepth = getCrawlDepth(sourceLink);
            if (crawlDepth >= maxCrawlDepth) {
                return Collections.emptyList();
            }
            long nextDepth = crawlDepth + 1L;
            for (Record extracted : extractedLinks) {
                setCrawlDepth(extracted, nextDepth);
            }
            Collection<Record> filteredLinks =
                _linkFilter.filterExtractedLinks(extractedLinks, getUrl(sourceLink), context);
            if (_log.isDebugEnabled()) {
                _log.debug("Remaining links: " + filteredLinks);
            }
            return filteredLinks;
        }
        finally {
            context.getTaskContext().measureTime("filterLink", time);
        }
    }

    /**
     * Asks the visited-links service whether an extracted link is visited, without marking it, and
     * reports the elapsed time as {@code checkVisitedLinks}.
     *
     * @param record extracted link record
     * @param context crawling context of the current task
     * @return {@code true} if the service does not report the link as visited
     * @throws VisitedLinksException if the visited-links service fails
     */
    private boolean isNotVisitedTimed(Record record, WebCrawlingContext context) throws VisitedLinksException {
        String url = getUrl(record);
        debugLogUrl("Check if extracted link is visited", record);
        long time = context.getTaskContext().getTimestamp();
        try {
            return !_visitedLinks.isVisited(context.getDataSource(), url, context.getJobRunId());
        }
        finally {
            context.getTaskContext().measureTime("checkVisitedLinks", time);
        }
    }

    /**
     * Invokes the record producer and reports the elapsed time as {@code produceRecords}.
     *
     * @param crawledRecord fetched link record
     * @param context crawling context of the current task
     * @return the records produced for the fetched resource
     * @throws WebCrawlerException if producing records fails
     */
    private Collection<Record> produceRecordsTimed(Record crawledRecord, WebCrawlingContext context) throws WebCrawlerException {
        debugLogUrl("Produce record for ", crawledRecord);
        long time = context.getTaskContext().getTimestamp();
        try {
            return _recordProducer.produceRecords(crawledRecord, context);
        }
        finally {
            context.getTaskContext().measureTime("produceRecords", time);
        }
    }

    /**
     * Writes {@code message}, a blank and the URL of the link to the debug log, if enabled.
     *
     * @param message text to put in front of the URL
     * @param link link record whose URL is logged
     */
    private void debugLogUrl(String message, Record link) {
        if (_log.isDebugEnabled()) {
            _log.debug(message + " " + getUrl(link));
        }
    }

    /**
     * Writes a warning to the task log stating that the link is skipped because of an error.
     *
     * @param linkRecord link record that failed
     * @param ex the failure
     * @param context crawling context of the current task
     */
    private void logNonRecoverableError(Record linkRecord, Exception ex, WebCrawlingContext context) {
        context.getTaskLog().warn(
            "Error crawling link " + getUrl(linkRecord) + " in source " + context.getDataSource() + ", skipping.", ex);
    }

    /**
     * @param record record to read from
     * @return string value of the {@code httpUrl} metadata attribute
     */
    private String getUrl(Record record) {
        return record.getMetadata().getStringValue("httpUrl");
    }

    /**
     * Stores the normalized form of {@code url} in the {@code httpUrl} metadata attribute.
     *
     * @param record record to modify
     * @param url URL to normalize and store
     * @throws MaybeRecoverableException non-recoverable, if the URL is syntactically invalid
     */
    private void setUrl(Record record, String url) throws MaybeRecoverableException {
        try {
            record.getMetadata().put("httpUrl", UriHelper.normalizeUrl(null, url));
        }
        catch (URISyntaxException ex) {
            throw new MaybeRecoverableException(ex, false);
        }
    }

    /**
     * Stores the crawl depth in the {@code _crawlDepth} metadata attribute.
     *
     * @param record record to modify
     * @param crawlDepth depth to store
     */
    private void setCrawlDepth(Record record, long crawlDepth) {
        record.getMetadata().put("_crawlDepth", Long.valueOf(crawlDepth));
    }

    /**
     * @param record record to read from
     * @return value of the {@code _crawlDepth} metadata attribute, or 0 if the attribute is not set
     */
    private long getCrawlDepth(Record record) {
        if (!record.getMetadata().containsKey("_crawlDepth")) {
            return 0L;
        }
        return record.getMetadata().getLongValue("_crawlDepth");
    }

    /**
     * Sets the {@code _isCompound} metadata attribute of the record to {@code true}.
     *
     * @param record record to modify
     */
    private void setIsCompound(Record record) {
        record.getMetadata().put("_isCompound", Boolean.TRUE);
    }

    /**
     * Sleeps for the given number of milliseconds; does nothing for {@code null} or non-positive
     * values. If the thread is interrupted while sleeping, the wait ends early and the interrupt
     * status is restored so that callers further up can still observe the interruption.
     *
     * @param sleepTime time to wait in milliseconds, may be {@code null}
     */
    private void waitBetweenRequests(Long sleepTime) {
        if (sleepTime == null || sleepTime.longValue() <= 0L) {
            return;
        }
        try {
            Thread.sleep(sleepTime.longValue());
        }
        catch (InterruptedException ex) {
            // do not swallow the interruption: Thread.sleep cleared the flag, set it again
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Binds the visited-links service.
     *
     * @param visitedLinks service to use
     */
    public void setVisitedLinks(VisitedLinksService visitedLinks) {
        this._visitedLinks = visitedLinks;
    }

    /**
     * Unbinds the visited-links service if it is the one currently bound.
     *
     * @param visitedLinks service to remove
     */
    public void unsetVisitedLinks(VisitedLinksService visitedLinks) {
        if (this._visitedLinks == visitedLinks) {
            this._visitedLinks = null;
        }
    }

    /**
     * Binds the fetcher.
     *
     * @param fetcher fetcher to use
     */
    public void setFetcher(Fetcher fetcher) {
        this._fetcher = fetcher;
    }

    /**
     * Unbinds the fetcher if it is the one currently bound.
     *
     * @param fetcher fetcher to remove
     */
    public void unsetFetcher(Fetcher fetcher) {
        if (this._fetcher == fetcher) {
            this._fetcher = null;
        }
    }

    /**
     * Binds the link extractor.
     *
     * @param linkExtractor link extractor to use
     */
    public void setLinkExtractor(LinkExtractor linkExtractor) {
        this._linkExtractor = linkExtractor;
    }

    /**
     * Unbinds the link extractor if it is the one currently bound.
     *
     * @param linkExtractor link extractor to remove
     */
    public void unsetLinkExtractor(LinkExtractor linkExtractor) {
        if (this._linkExtractor == linkExtractor) {
            this._linkExtractor = null;
        }
    }

    /**
     * Binds the link filter.
     *
     * @param linkFilter link filter to use
     */
    public void setLinkFilter(LinkFilter linkFilter) {
        this._linkFilter = linkFilter;
    }

    /**
     * Unbinds the link filter if it is the one currently bound.
     *
     * @param linkFilter link filter to remove
     */
    public void unsetLinkFilter(LinkFilter linkFilter) {
        if (this._linkFilter == linkFilter) {
            this._linkFilter = null;
        }
    }

    /**
     * Binds the record producer.
     *
     * @param recordProducer record producer to use
     */
    public void setRecordProducer(RecordProducer recordProducer) {
        this._recordProducer = recordProducer;
    }

    /**
     * Unbinds the record producer if it is the one currently bound.
     *
     * @param recordProducer record producer to remove
     */
    public void unsetRecordProducer(RecordProducer recordProducer) {
        if (this._recordProducer == recordProducer) {
            this._recordProducer = null;
        }
    }

    /**
     * Binds the compound extractor.
     *
     * @param compoundExtractor compound extractor to use
     */
    public void setCompoundExtractor(CompoundExtractor compoundExtractor) {
        this._compoundExtractor = compoundExtractor;
    }

    /**
     * Unbinds the compound extractor if it is the one currently bound.
     *
     * @param compoundExtractor compound extractor to remove
     */
    public void unsetCompoundExtractor(CompoundExtractor compoundExtractor) {
        if (this._compoundExtractor == compoundExtractor) {
            this._compoundExtractor = null;
        }
    }

    /**
     * Writes link records and crawled records to their output slots. The first bulk of a slot is
     * opened on the first write; when the current bulk has reached its record limit it is committed
     * and a new bulk with the next index is opened.
     */
    private static final class RecordOutputHandler {
        private final Outputs _outputs;
        private final int _linksPerBulk;
        private final int _recordsPerBulk;
        private RecordOutput _linksToCrawl;
        private int _linksToCrawlBulkIndex;
        private RecordOutput _crawledRecords;
        private int _crawledRecordsBulkIndex;

        /**
         * @param outputs outputs of the current task
         * @param linksPerBulk record limit of a {@code linksToCrawl} bulk
         * @param recordsPerBulk record limit of a {@code crawledRecords} bulk
         */
        private RecordOutputHandler(Outputs outputs, int linksPerBulk, int recordsPerBulk) {
            this._outputs = outputs;
            this._linksPerBulk = linksPerBulk;
            this._recordsPerBulk = recordsPerBulk;
        }

        /**
         * Writes a link record to the current {@code linksToCrawl} bulk, starting a new bulk first
         * if the current one is full.
         *
         * @param record link record to write
         * @throws WebCrawlerException if opening, committing or writing the bulk fails
         */
        private void addLinkToCrawl(Record record) throws WebCrawlerException {
            try {
                if (_linksToCrawl == null) {
                    _linksToCrawl = _outputs.getAsRecordOutput(OUTPUT_SLOT_LINKS_TO_CRAWL);
                } else if (_linksToCrawl.getRecordCount() >= (long)_linksPerBulk) {
                    _linksToCrawl.commit();
                    ++_linksToCrawlBulkIndex;
                    _linksToCrawl = _outputs.getAsRecordOutput(OUTPUT_SLOT_LINKS_TO_CRAWL, _linksToCrawlBulkIndex);
                }
                _linksToCrawl.writeRecord(record);
            }
            catch (Exception ex) {
                throw new WebCrawlerException("Error writing to linksToCrawl bulk", ex);
            }
        }

        /**
         * Maps the attribute names of a crawled record with the context's mapper and writes the
         * record to the current {@code crawledRecords} bulk, starting a new bulk first if the
         * current one is full.
         *
         * @param record crawled record to map and write
         * @param context crawling context providing the mapper
         * @throws WebCrawlerException if mapping, opening, committing or writing fails
         */
        private void mapAndAddCrawledRecord(Record record, WebCrawlingContext context) throws WebCrawlerException {
            try {
                if (_crawledRecords == null) {
                    _crawledRecords = _outputs.getAsRecordOutput(OUTPUT_SLOT_CRAWLED_RECORDS);
                } else if (_crawledRecords.getRecordCount() >= (long)_recordsPerBulk) {
                    _crawledRecords.commit();
                    ++_crawledRecordsBulkIndex;
                    _crawledRecords = _outputs.getAsRecordOutput(OUTPUT_SLOT_CRAWLED_RECORDS, _crawledRecordsBulkIndex);
                }
                context.getMapper().mapNames(record, WebCrawlerConstants.PROPERTY_NAMES);
                _crawledRecords.writeRecord(record);
            }
            catch (Exception ex) {
                throw new WebCrawlerException("Error writing to crawledRecords bulk", ex);
            }
        }
    }
}

