/*
 * Decompiled with CFR 0.152.
 */
package org.eclipse.smila.importing.crawler.web;

import java.io.IOException;
import java.util.Collection;
import java.util.HashSet;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.eclipse.smila.datamodel.DataFactory;
import org.eclipse.smila.datamodel.Record;
import org.eclipse.smila.importing.VisitedLinksException;
import org.eclipse.smila.importing.VisitedLinksService;
import org.eclipse.smila.importing.crawler.web.Fetcher;
import org.eclipse.smila.importing.crawler.web.LinkExtractor;
import org.eclipse.smila.importing.crawler.web.LinkFilter;
import org.eclipse.smila.importing.crawler.web.RecordProducer;
import org.eclipse.smila.importing.crawler.web.WebCrawlerException;
import org.eclipse.smila.objectstore.ObjectStoreException;
import org.eclipse.smila.taskworker.TaskContext;
import org.eclipse.smila.taskworker.Worker;
import org.eclipse.smila.taskworker.input.Inputs;
import org.eclipse.smila.taskworker.input.RecordInput;
import org.eclipse.smila.taskworker.output.Outputs;
import org.eclipse.smila.taskworker.output.RecordOutput;
import org.eclipse.smila.utils.MaybeRecoverableException;

/**
 * Task worker that crawls web links.
 *
 * <p>For every link record it processes, the worker calls the {@link Fetcher}, then the
 * {@link LinkExtractor} and {@link LinkFilter} to find follow-up links, and finally the
 * {@link RecordProducer} to create the records to emit. Follow-up links are written to the
 * {@code linksToCrawl} output slot, produced records to the {@code crawledRecords} output slot.
 *
 * <p>A task without any input object in the {@code linksToCrawl} slot is treated as the initial
 * task of a crawl: it starts from the {@code startUrl} task parameter.
 *
 * <p>The collaborating services are injected through the {@code set...}/{@code unset...} methods.
 */
public class WebCrawlerWorker
implements Worker {
    /** Name of this worker, as returned by {@link #getName()}. */
    public static final String NAME = "webCrawler";
    /** Name of the input slot containing the links to crawl. */
    public static final String INPUT_SLOT_LINKS_TO_CRAWL = "linksToCrawl";
    /** Name of the output slot receiving the follow-up links to crawl. */
    public static final String OUTPUT_SLOT_LINKS_TO_CRAWL = "linksToCrawl";
    /** Name of the output slot receiving the produced records. */
    public static final String OUTPUT_SLOT_CRAWLED_RECORDS = "crawledRecords";
    /** Maximum number of links written to one output bulk by a follow-up task. */
    private static final int LINKS_PER_BULK = 10;
    /** Maximum number of records written to one crawled-records output bulk. */
    private static final int RECORDS_PER_BULK = 100;
    /** Pseudo input bulk id used while processing the initial task (which has no input bulk). */
    private static final String BULK_ID_FOR_INITIAL_TASK = "initial";
    private VisitedLinksService _visitedLinks;
    private Fetcher _fetcher;
    private LinkExtractor _linkExtractor;
    private LinkFilter _linkFilter;
    private RecordProducer _recordProducer;
    private final Log _log = LogFactory.getLog(this.getClass());

    /** @return the worker name, {@link #NAME}. */
    public String getName() {
        return NAME;
    }

    /**
     * Processes one task: starts a new crawl if the {@code linksToCrawl} input slot is empty,
     * otherwise crawls the link records read from that slot.
     *
     * @param taskContext context of the task to process.
     * @throws Exception any error raised while crawling that is not handled by skipping the link.
     */
    public void perform(TaskContext taskContext) throws Exception {
        String source = taskContext.getTaskParameters().getStringValue("dataSource");
        String jobRunId = (String)taskContext.getTask().getProperties().get("jobRunId");
        Inputs inputs = taskContext.getInputs();
        if (inputs.getDataObjectCount(INPUT_SLOT_LINKS_TO_CRAWL) == 0) {
            initiateCrawling(source, taskContext, jobRunId);
        } else {
            RecordInput linksToCrawl = inputs.getAsRecordInput(INPUT_SLOT_LINKS_TO_CRAWL);
            crawlLinkRecords(source, linksToCrawl, taskContext, jobRunId);
        }
    }

    /**
     * Starts a crawl: clears the visited-links state of the source and crawls the link given by
     * the {@code startUrl} task parameter.
     */
    private void initiateCrawling(String source, TaskContext taskContext, String jobRunId) throws MaybeRecoverableException {
        // The initial task writes only one link per linksToCrawl bulk.
        // NOTE(review): presumably so that follow-up work is spread over many tasks - confirm.
        OutputBulks outputBulks = new OutputBulks(taskContext.getOutputs(), 1, RECORDS_PER_BULK);
        Record initialLinkRecord = DataFactory.DEFAULT.createRecord();
        String startUrl = taskContext.getTaskParameters().getStringValue("startUrl");
        setUrl(initialLinkRecord, startUrl);
        _visitedLinks.clearSource(source);
        crawlLinkRecord(source, initialLinkRecord, outputBulks, jobRunId, BULK_ID_FOR_INITIAL_TASK, taskContext);
    }

    /**
     * Reads all link records from the input bulk and crawls each one that has neither been
     * crawled earlier in this task nor is reported as visited by the visited-links service.
     */
    private void crawlLinkRecords(String source, RecordInput linksToCrawl, TaskContext taskContext, String jobRunId) throws ObjectStoreException, IOException, MaybeRecoverableException {
        OutputBulks outputBulks = new OutputBulks(taskContext.getOutputs(), LINKS_PER_BULK, RECORDS_PER_BULK);
        HashSet<String> urlVisitedInThisTask = new HashSet<String>();
        String inputBulkId = linksToCrawl.getObjectName();
        // getRecord() returns null once the input bulk is exhausted.
        Record record = linksToCrawl.getRecord();
        while (record != null) {
            if (hasNotBeenVisited(source, record, urlVisitedInThisTask, jobRunId, taskContext, inputBulkId)) {
                crawlLinkRecord(source, record, outputBulks, jobRunId, inputBulkId, taskContext);
                urlVisitedInThisTask.add(getUrl(record));
            }
            record = linksToCrawl.getRecord();
        }
    }

    /**
     * Checks whether the link of the record still has to be crawled.
     *
     * @return {@code false} if the URL was already crawled in this task or is reported as
     *         visited by the visited-links service, {@code true} otherwise.
     */
    private boolean hasNotBeenVisited(String source, Record record, Collection<String> urlVisitedInThisTask, String jobRunId, TaskContext taskContext, String inputBulkId) throws VisitedLinksException {
        String url = getUrl(record);
        debugLogUrl("Check if visited", record);
        if (urlVisitedInThisTask.contains(url)) {
            // Same URL occurred earlier in this input bulk: no need to ask the service.
            debugLogUrl("Duplicate URL in task", record);
            return false;
        }
        boolean notVisited = isNotVisitedTimed(source, record, jobRunId, inputBulkId, taskContext);
        debugLogUrl("Will visit: " + notVisited, record);
        return notVisited;
    }

    /**
     * Asks the visited-links service about the URL of the record and measures the duration of
     * the call as {@code checkVisitedLinks}.
     *
     * @return {@code true} if the service does not report the URL as visited.
     */
    private boolean isNotVisitedTimed(String source, Record record, String jobRunId, String inputBulkId, TaskContext taskContext) throws VisitedLinksException {
        String url = getUrl(record);
        debugLogUrl("Check if visited", record);
        long time = taskContext.getTimestamp();
        try {
            return !_visitedLinks.isVisited(source, url, jobRunId, inputBulkId);
        }
        finally {
            taskContext.measureTime("checkVisitedLinks", time);
        }
    }

    /**
     * Fetches a single link and, if the visited-links service does not report it as visited
     * after fetching, extracts its follow-up links and produces its records.
     *
     * <p>Recoverable errors, and any {@link MaybeRecoverableException} in the initial task, are
     * rethrown. Other errors are logged as a warning and the link is skipped.
     */
    private void crawlLinkRecord(String source, Record linkRecord, OutputBulks outputBulks, String jobRunId, String inputBulkId, TaskContext taskContext) throws MaybeRecoverableException {
        try {
            invokeFetcherTimed(linkRecord, taskContext);
            // NOTE(review): the visited check is repeated after fetching; presumably the fetcher
            // may change the record's URL (e.g. redirects) - confirm against Fetcher.
            if (isNotVisitedTimed(source, linkRecord, jobRunId, inputBulkId, taskContext)) {
                extractAndFilterLinks(linkRecord, outputBulks, taskContext);
                produceAndWriteRecords(linkRecord, outputBulks, taskContext);
            }
        }
        catch (MaybeRecoverableException ex) {
            if (ex.isRecoverable() || BULK_ID_FOR_INITIAL_TASK.equals(inputBulkId)) {
                throw ex;
            }
            logNonRecoverableError(source, linkRecord, taskContext, ex);
        }
        catch (RuntimeException ex) {
            logNonRecoverableError(source, linkRecord, taskContext, ex);
        }
    }

    /** Produces the records for a fetched link and adds them to the crawled-records output. */
    private void produceAndWriteRecords(Record linkRecord, OutputBulks outputBulks, TaskContext taskContext) throws WebCrawlerException {
        for (Record crawledRecord : produceRecordsTimed(linkRecord, taskContext)) {
            outputBulks.addCrawledRecord(crawledRecord);
        }
    }

    /**
     * Extracts and filters the outgoing links of a fetched link and adds the remaining ones to
     * the links-to-crawl output. Does nothing if the record has no {@code http.content}
     * attachment.
     */
    private void extractAndFilterLinks(Record linkRecord, OutputBulks outputBulks, TaskContext taskContext) throws WebCrawlerException {
        if (!linkRecord.hasAttachment("http.content")) {
            return;
        }
        Collection<Record> extractedLinks = extractLinksTimed(linkRecord, taskContext);
        Collection<Record> filteredLinks = filterLinksTimed(extractedLinks, linkRecord, taskContext);
        for (Record outgoingLink : filteredLinks) {
            outputBulks.addLinkToCrawl(outgoingLink);
        }
    }

    /** Calls the fetcher for the link and measures the duration as {@code fetchResource}. */
    private void invokeFetcherTimed(Record linkRecord, TaskContext taskContext) throws WebCrawlerException {
        debugLogUrl("Call fetcher for ", linkRecord);
        long time = taskContext.getTimestamp();
        try {
            _fetcher.crawl(linkRecord, taskContext.getTaskParameters(), taskContext.getLog());
        }
        finally {
            taskContext.measureTime("fetchResource", time);
        }
    }

    /** Calls the link extractor and measures the duration as {@code extractLinks}. */
    private Collection<Record> extractLinksTimed(Record linkRecord, TaskContext taskContext) throws WebCrawlerException {
        debugLogUrl("Extract links from ", linkRecord);
        long time = taskContext.getTimestamp();
        try {
            return _linkExtractor.extractLinks(linkRecord, taskContext.getTaskParameters(), taskContext.getLog());
        }
        finally {
            taskContext.measureTime("extractLinks", time);
        }
    }

    /** Calls the link filter and measures the duration as {@code filterLink}. */
    private Collection<Record> filterLinksTimed(Collection<Record> extractedLinks, Record sourceLink, TaskContext taskContext) throws WebCrawlerException {
        if (_log.isDebugEnabled()) {
            _log.debug("Filter links " + extractedLinks + " extracted from " + getUrl(sourceLink));
        }
        long time = taskContext.getTimestamp();
        try {
            Collection<Record> filteredLinks = _linkFilter.filterLinks(extractedLinks, sourceLink, taskContext.getTaskParameters(), taskContext.getLog());
            if (_log.isDebugEnabled()) {
                _log.debug("Remaining links: " + filteredLinks);
            }
            return filteredLinks;
        }
        finally {
            taskContext.measureTime("filterLink", time);
        }
    }

    /** Calls the record producer and measures the duration as {@code produceRecords}. */
    private Collection<Record> produceRecordsTimed(Record crawledRecord, TaskContext taskContext) throws WebCrawlerException {
        debugLogUrl("Produce record for ", crawledRecord);
        long time = taskContext.getTimestamp();
        try {
            return _recordProducer.produceRecords(crawledRecord, taskContext.getTaskParameters(), taskContext.getLog());
        }
        finally {
            taskContext.measureTime("produceRecords", time);
        }
    }

    /** Writes {@code message}, a blank and the URL of the link to the debug log, if enabled. */
    private void debugLogUrl(String message, Record link) {
        if (_log.isDebugEnabled()) {
            _log.debug(message + " " + getUrl(link));
        }
    }

    /** Writes a warning about a skipped link to the task log. */
    private void logNonRecoverableError(String source, Record linkRecord, TaskContext taskContext, Exception ex) {
        taskContext.getLog().warn("Error crawling link " + getUrl(linkRecord) + " in source " + source + ", skipping.", (Throwable)ex);
    }

    /** @return the value of the {@code http.url} metadata attribute of the record. */
    private String getUrl(Record record) {
        return record.getMetadata().getStringValue("http.url");
    }

    /** Sets the {@code http.url} metadata attribute of the record. */
    private void setUrl(Record record, String startUrl) {
        record.getMetadata().put("http.url", startUrl);
    }

    /** Sets the visited-links service. */
    public void setVisitedLinks(VisitedLinksService visitedLinks) {
        this._visitedLinks = visitedLinks;
    }

    /** Removes the visited-links service, if it is the one currently set. */
    public void unsetVisitedLinks(VisitedLinksService visitedLinks) {
        if (this._visitedLinks == visitedLinks) {
            this._visitedLinks = null;
        }
    }

    /** Sets the fetcher. */
    public void setFetcher(Fetcher fetcher) {
        this._fetcher = fetcher;
    }

    /** Removes the fetcher, if it is the one currently set. */
    public void unsetFetcher(Fetcher fetcher) {
        if (this._fetcher == fetcher) {
            this._fetcher = null;
        }
    }

    /** Sets the link extractor. */
    public void setLinkExtractor(LinkExtractor linkExtractor) {
        this._linkExtractor = linkExtractor;
    }

    /** Removes the link extractor, if it is the one currently set. */
    public void unsetLinkExtractor(LinkExtractor linkExtractor) {
        if (this._linkExtractor == linkExtractor) {
            this._linkExtractor = null;
        }
    }

    /** Sets the link filter. */
    public void setLinkFilter(LinkFilter linkFilter) {
        this._linkFilter = linkFilter;
    }

    /** Removes the link filter, if it is the one currently set. */
    public void unsetLinkFilter(LinkFilter linkFilter) {
        if (this._linkFilter == linkFilter) {
            this._linkFilter = null;
        }
    }

    /** Sets the record producer. */
    public void setRecordProducer(RecordProducer recordProducer) {
        this._recordProducer = recordProducer;
    }

    /** Removes the record producer, if it is the one currently set. */
    public void unsetRecordProducer(RecordProducer recordProducer) {
        if (this._recordProducer == recordProducer) {
            this._recordProducer = null;
        }
    }

    /**
     * Manages the output bulks of one task. Bulks are opened lazily on the first write; when
     * the current bulk has reached its record limit it is committed and a new bulk with the
     * next index is opened in the same slot.
     */
    private static class OutputBulks {
        private final Outputs _outputs;
        private final int _linksPerBulk;
        private final int _recordsPerBulk;
        private RecordOutput _linksToCrawl;
        private int _linksToCrawlBulkIndex;
        private RecordOutput _crawledRecords;
        private int _crawledRecordsBulkIndex;

        /**
         * @param outputs outputs of the current task.
         * @param linksPerBulk record limit of a links-to-crawl bulk.
         * @param recordsPerBulk record limit of a crawled-records bulk.
         */
        OutputBulks(Outputs outputs, int linksPerBulk, int recordsPerBulk) {
            this._outputs = outputs;
            this._linksPerBulk = linksPerBulk;
            this._recordsPerBulk = recordsPerBulk;
        }

        /**
         * Writes a link record to the current links-to-crawl bulk, starting a new bulk first if
         * the current one is full.
         *
         * @throws WebCrawlerException wrapping any error raised while opening, committing or
         *         writing the bulk.
         */
        void addLinkToCrawl(Record record) throws WebCrawlerException {
            try {
                if (this._linksToCrawl == null) {
                    this._linksToCrawl = this._outputs.getAsRecordOutput(WebCrawlerWorker.OUTPUT_SLOT_LINKS_TO_CRAWL);
                } else if (this._linksToCrawl.getRecordCount() >= (long)this._linksPerBulk) {
                    this._linksToCrawl.commit();
                    ++this._linksToCrawlBulkIndex;
                    this._linksToCrawl = this._outputs.getAsRecordOutput(WebCrawlerWorker.OUTPUT_SLOT_LINKS_TO_CRAWL, this._linksToCrawlBulkIndex);
                }
                this._linksToCrawl.writeRecord(record);
            }
            catch (Exception ex) {
                throw new WebCrawlerException("Error writing to linksToCrawl bulk", ex);
            }
        }

        /**
         * Writes a produced record to the current crawled-records bulk, starting a new bulk
         * first if the current one is full.
         *
         * @throws WebCrawlerException wrapping any error raised while opening, committing or
         *         writing the bulk.
         */
        void addCrawledRecord(Record record) throws WebCrawlerException {
            try {
                if (this._crawledRecords == null) {
                    this._crawledRecords = this._outputs.getAsRecordOutput(WebCrawlerWorker.OUTPUT_SLOT_CRAWLED_RECORDS);
                } else if (this._crawledRecords.getRecordCount() >= (long)this._recordsPerBulk) {
                    this._crawledRecords.commit();
                    ++this._crawledRecordsBulkIndex;
                    // Follow-up bulks must stay in the crawledRecords slot; opening them in the
                    // linksToCrawl slot would feed produced records back in as links to crawl.
                    this._crawledRecords = this._outputs.getAsRecordOutput(WebCrawlerWorker.OUTPUT_SLOT_CRAWLED_RECORDS, this._crawledRecordsBulkIndex);
                }
                this._crawledRecords.writeRecord(record);
            }
            catch (Exception ex) {
                throw new WebCrawlerException("Error writing to crawledRecords bulk", ex);
            }
        }
    }
}

