/*
 * Decompiled with CFR 0.152.
 */
package org.eclipse.smila.processing.pipelets;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import org.apache.commons.collections.map.MultiValueMap;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.xerces.xni.Augmentations;
import org.apache.xerces.xni.QName;
import org.apache.xerces.xni.XMLAttributes;
import org.apache.xerces.xni.XMLString;
import org.apache.xerces.xni.parser.XMLDocumentFilter;
import org.apache.xerces.xni.parser.XMLInputSource;
import org.apache.xerces.xni.parser.XMLParserConfiguration;
import org.cyberneko.html.HTMLConfiguration;
import org.cyberneko.html.filters.DefaultFilter;
import org.cyberneko.html.filters.ElementRemover;
import org.eclipse.smila.blackboard.Blackboard;
import org.eclipse.smila.blackboard.BlackboardAccessException;
import org.eclipse.smila.datamodel.Any;
import org.eclipse.smila.datamodel.AnyMap;
import org.eclipse.smila.datamodel.AnySeq;
import org.eclipse.smila.datamodel.Value;
import org.eclipse.smila.processing.ProcessingException;
import org.eclipse.smila.processing.parameters.MissingParameterException;
import org.eclipse.smila.processing.parameters.ParameterAccessor;
import org.eclipse.smila.processing.pipelets.ATransformationPipelet;
import org.eclipse.smila.processing.pipelets.SourceType;
import org.eclipse.smila.processing.util.ResultCollector;

/**
 * Pipelet that parses HTML input with the NekoHTML parser and stores the character
 * content as plain text, optionally copying {@code <meta>} values and the
 * {@code <title>} text into record attributes.
 *
 * <p>Configuration keys read by this class:
 * <ul>
 * <li>{@code removeContentTags} - comma separated list of tags handed to the element
 * remover; defaults to {@code applet, frame, object, script, style}.</li>
 * <li>{@code defaultEncoding} - passed to the parser as its default encoding when set.</li>
 * <li>{@code meta:<name>} - name of the attribute that receives the content of the HTML
 * meta tag {@code <name>}.</li>
 * <li>{@code tag:title} - name of the attribute that receives the document title.</li>
 * </ul>
 */
public class HtmlToTextPipelet
extends ATransformationPipelet {
    /** Tags used when {@code removeContentTags} is not configured. */
    private static final String[] DEFAULT_REMOVE_CONTENT_TAGS = new String[]{"applet", "frame", "object", "script", "style"};
    private static final String PROP_REMOVE_CONTENT_TAGS = "removeContentTags";
    private static final String PROP_DEFAULT_ENCODING = "defaultEncoding";
    private static final String PROP_META = "meta:";
    private static final String PROP_TITLE = "tag:title";
    /** NekoHTML property that takes the filter chain. */
    private static final String NEKO_PROPERTY_FILTERS = "http://cyberneko.org/html/properties/filters";
    /** NekoHTML property that takes the default encoding. */
    private static final String NEKO_PROPERTY_DEFAULT_ENCODING = "http://cyberneko.org/html/properties/default-encoding";
    /**
     * Maps lower-cased HTML meta names (and the special key {@code tag:title}) to the
     * names of the attributes that receive the extracted values.
     */
    private final Map<String, String> _metaAttributeMapping = new HashMap<String, String>();
    private final Log _log = LogFactory.getLog(this.getClass());

    /**
     * Reads the optional {@code defaultEncoding} parameter.
     *
     * @param paramAccessor accessor used to look up the parameter
     * @return the configured encoding or {@code null} if none is set
     * @throws MissingParameterException declared by the parameter lookup
     */
    protected String getDefaultEncoding(ParameterAccessor paramAccessor) throws MissingParameterException {
        return paramAccessor.getParameter(PROP_DEFAULT_ENCODING, null);
    }

    /**
     * Determines the tags that are passed to the element remover.
     *
     * <p>If {@code removeContentTags} is a non-blank string it is split at commas; each
     * entry is trimmed and empty entries are dropped, so {@code "script, style"} yields
     * {@code script} and {@code style}. Otherwise a copy of the default list is returned.
     *
     * @param paramAccessor accessor used to look up the parameter
     * @return the tag names, never {@code null}
     * @throws MissingParameterException declared by the parameter lookup
     */
    protected final String[] getRemoveContentTags(ParameterAccessor paramAccessor) throws MissingParameterException {
        Any configuredValue = paramAccessor.getParameterAny(PROP_REMOVE_CONTENT_TAGS);
        if (configuredValue != null && configuredValue.isString()) {
            String tagList = ((Value)configuredValue).asString().trim();
            if (tagList.length() > 0) {
                List<String> tags = new ArrayList<String>();
                for (String rawTag : tagList.split(",")) {
                    // Without trimming, " style" would never match an element name.
                    String tag = rawTag.trim();
                    if (tag.length() > 0) {
                        tags.add(tag);
                    }
                }
                String[] removeContentTags = tags.toArray(new String[tags.size()]);
                if (this._log.isInfoEnabled()) {
                    this._log.info("Removing complete content of these tags: " + Arrays.toString(removeContentTags));
                }
                return removeContentTags;
            }
        }
        // Hand out a copy so callers cannot modify the shared default array.
        return DEFAULT_REMOVE_CONTENT_TAGS.clone();
    }

    /**
     * Reads the {@code meta:<name>} and {@code tag:title} entries from the configuration
     * and remembers which attribute each one should be written to.
     *
     * <p>Meta names are stored trimmed and lower-cased because the names found in the
     * HTML are normalized the same way before the lookup. Entries with other keys are
     * ignored; a mapping entry whose value is not a simple value is skipped with a warning.
     *
     * @param configuration the pipelet configuration
     * @throws ProcessingException if the super class rejects the configuration
     */
    @Override
    public void configure(AnyMap configuration) throws ProcessingException {
        super.configure(configuration);
        for (Map.Entry<String, Any> entry : configuration.entrySet()) {
            String key = entry.getKey();
            boolean metaKey = key.startsWith(PROP_META);
            if (!metaKey && !key.equalsIgnoreCase(PROP_TITLE)) {
                // Unrelated parameter: do not touch its value, it may be a map or sequence.
                continue;
            }
            Any value = entry.getValue();
            if (value == null || !value.isValue()) {
                this._log.warn("Ignoring configuration entry '" + key + "': attribute name must be a simple value");
                continue;
            }
            String attributeName = ((Value)value).asString();
            if (metaKey) {
                String metaName = key.substring(PROP_META.length()).trim().toLowerCase(Locale.ROOT);
                this._metaAttributeMapping.put(metaName, attributeName);
            } else {
                this._metaAttributeMapping.put(PROP_TITLE, attributeName);
            }
        }
    }

    /**
     * Converts the HTML input of each record to plain text and stores text and extracted
     * metadata on the blackboard. A failure for one record is reported to the result
     * collector and does not stop the remaining records from being processed here.
     *
     * @param blackboard blackboard holding the records
     * @param recordIds ids of the records to process, may be {@code null}
     * @return the ids reported by the result collector
     * @throws ProcessingException declared by the pipelet contract
     */
    public String[] process(Blackboard blackboard, String[] recordIds) throws ProcessingException {
        ParameterAccessor paramAccessor = new ParameterAccessor(blackboard, this._config);
        ResultCollector resultCollector = new ResultCollector(paramAccessor, this._log, false);
        if (recordIds != null) {
            for (String id : recordIds) {
                try {
                    paramAccessor.setCurrentRecord(id);
                    SourceType inputType = this.getInputType(paramAccessor);
                    String inputName = this.getInputName(paramAccessor);
                    MultiValueMap metadata = new MultiValueMap();
                    ArrayList<String> results = new ArrayList<String>();
                    if (this.isReadFromAttribute(inputType)) {
                        this.processAttributeValues(blackboard, id, results, metadata, paramAccessor);
                    } else {
                        InputStream stream = blackboard.getAttachmentAsStream(id, inputName);
                        if (stream != null) {
                            try {
                                results.add(this.extractText(id, stream, metadata, paramAccessor));
                            }
                            finally {
                                // The stream was opened here, so it is released here even if parsing fails.
                                this.closeQuietly(id, stream);
                            }
                        }
                    }
                    this.storeResults(blackboard, id, results, paramAccessor);
                    this.storeMetadata(blackboard, id, metadata);
                    resultCollector.addResult(id);
                }
                catch (Exception ex) {
                    resultCollector.addFailedResult(id, ex);
                }
            }
        }
        return resultCollector.getResultIds();
    }

    /**
     * Closes the attachment stream; a failure to close is logged and otherwise ignored
     * because the text has already been extracted (or the original error is propagating).
     */
    private void closeQuietly(String id, InputStream stream) {
        try {
            stream.close();
        }
        catch (IOException e) {
            this._log.warn("error closing attachment stream of record " + id, e);
        }
    }

    /**
     * Extracts text from the values of the input attribute. Simple values are converted
     * directly; for an element that is itself a sequence, its string elements are converted.
     *
     * @param results receives one extracted text per converted value
     * @param metadata receives metadata found while parsing
     */
    private void processAttributeValues(Blackboard blackboard, String id, List<String> results, MultiValueMap metadata, ParameterAccessor paramAccessor) throws BlackboardAccessException, ProcessingException {
        AnyMap recordMetadata = blackboard.getMetadata(id);
        Any any = (Any)recordMetadata.get(this.getInputName(paramAccessor));
        if (any == null) {
            return;
        }
        for (Any value : any) {
            if (value.isValue()) {
                String content = ((Value)value).asString();
                if (content != null) {
                    results.add(this.extractText(id, content, metadata, paramAccessor));
                }
            } else if (value.isSeq()) {
                // Descend into the current element, not the enclosing attribute value;
                // re-reading the outer value here would extract the same strings again.
                for (Any element : (AnySeq)value) {
                    if (!element.isString()) {
                        continue;
                    }
                    String content = ((Value)element).asString();
                    if (content != null) {
                        results.add(this.extractText(id, content, metadata, paramAccessor));
                    }
                }
            }
        }
    }

    /**
     * Writes the collected metadata to the record: each attribute that has at least one
     * value is replaced by a sequence containing all values collected for it.
     */
    private void storeMetadata(Blackboard blackboard, String id, MultiValueMap metadata) throws BlackboardAccessException {
        if (metadata.isEmpty()) {
            return;
        }
        AnyMap recordMetadata = blackboard.getMetadata(id);
        // MultiValueMap is not generic in this library version; keys and values are the
        // Strings put in by MetadataExtractor.
        for (Object key : metadata.keySet()) {
            Collection<?> values = metadata.getCollection(key);
            if (values == null || values.isEmpty()) {
                continue;
            }
            String attributeName = (String)key;
            AnySeq sequence = blackboard.getDataFactory().createAnySeq();
            for (Object value : values) {
                sequence.add((String)value);
            }
            recordMetadata.remove(attributeName);
            recordMetadata.put(attributeName, sequence);
        }
    }

    /**
     * Extracts the text of an HTML document read from a byte stream. The stream is not
     * closed by this method.
     *
     * @throws ProcessingException if the parser cannot be created or parsing fails
     */
    private String extractText(String id, InputStream stream, MultiValueMap metadata, ParameterAccessor paramAccessor) throws ProcessingException {
        return this.parse(id, new XMLInputSource(null, id, null, stream, null), metadata, paramAccessor);
    }

    /**
     * Extracts the text of an HTML document given as a string.
     *
     * @throws ProcessingException if the parser cannot be created or parsing fails
     */
    private String extractText(String id, String content, MultiValueMap metadata, ParameterAccessor paramAccessor) throws ProcessingException {
        return this.parse(id, new XMLInputSource(null, id, null, (Reader)new StringReader(content), null), metadata, paramAccessor);
    }

    /**
     * Runs a freshly configured parser over the given source and returns the text that
     * the filter chain collected. Any parse failure is logged and rethrown as a
     * {@link ProcessingException} carrying the original exception as cause.
     */
    private String parse(String id, XMLInputSource source, MultiValueMap metadata, ParameterAccessor paramAccessor) throws ProcessingException {
        StringBuilder result = new StringBuilder();
        XMLParserConfiguration parser = this.createParser(result, metadata, paramAccessor);
        try {
            parser.parse(source);
        }
        catch (Exception e) {
            String message = "error parsing HTML document in record " + id;
            this._log.error(message, e);
            throw new ProcessingException(message + ": " + e.toString(), (Throwable)e);
        }
        return result.toString();
    }

    /**
     * Builds a NekoHTML configuration whose filter chain is, in order: comment remover,
     * metadata extractor, element remover and plain text writer.
     *
     * @param result buffer that receives the document text
     * @param metadata map that receives extracted meta and title values
     * @throws MissingParameterException declared by the parameter lookups
     */
    private XMLParserConfiguration createParser(StringBuilder result, MultiValueMap metadata, ParameterAccessor paramAccessor) throws MissingParameterException {
        ElementRemover elementRemover = new ElementRemover();
        for (String tag : this.getRemoveContentTags(paramAccessor)) {
            elementRemover.removeElement(tag);
        }
        XMLDocumentFilter[] filters = new XMLDocumentFilter[]{
            new CommentRemover(), new MetadataExtractor(metadata), elementRemover, new PlainTextWriter(result)};
        HTMLConfiguration parser = new HTMLConfiguration();
        parser.setProperty(NEKO_PROPERTY_FILTERS, (Object)filters);
        String defaultEncoding = this.getDefaultEncoding(paramAccessor);
        if (defaultEncoding != null) {
            parser.setProperty(NEKO_PROPERTY_DEFAULT_ENCODING, (Object)defaultEncoding);
        }
        return parser;
    }

    /** Filter that drops comments by not forwarding them to the next filter. */
    public class CommentRemover
    extends DefaultFilter {
        /** Intentionally empty: the comment event is swallowed. */
        public void comment(XMLString text, Augmentations augs) {
        }
    }

    /**
     * Filter that collects the content of mapped {@code <meta>} tags and the text of the
     * {@code <title>} tag while forwarding every event unchanged.
     */
    public class MetadataExtractor
    extends DefaultFilter {
        private final MultiValueMap _metadata;
        private boolean _inTitleTag;
        private StringBuilder _titleBuffer = new StringBuilder();

        /**
         * @param metadata map that receives attribute name / value pairs
         */
        public MetadataExtractor(MultiValueMap metadata) {
            this._metadata = metadata;
        }

        public void startElement(QName element, XMLAttributes attributes, Augmentations augs) {
            super.startElement(element, attributes, augs);
            if ("meta".equalsIgnoreCase(element.localpart)) {
                this.extractMetadata(attributes);
            } else if ("title".equalsIgnoreCase(element.localpart)) {
                this._inTitleTag = true;
                this._titleBuffer = new StringBuilder();
            }
        }

        public void emptyElement(QName element, XMLAttributes attributes, Augmentations augs) {
            super.emptyElement(element, attributes, augs);
            if ("meta".equalsIgnoreCase(element.localpart)) {
                this.extractMetadata(attributes);
            }
        }

        public void endElement(QName element, Augmentations augs) {
            super.endElement(element, augs);
            if ("title".equalsIgnoreCase(element.localpart)) {
                this._inTitleTag = false;
                this.setTitle();
            }
        }

        public void characters(XMLString text, Augmentations augs) {
            super.characters(text, augs);
            if (this._inTitleTag && text.length > 0) {
                this._titleBuffer.append(text.ch, text.offset, text.length);
            }
        }

        /**
         * Reads the {@code name} and {@code content} attributes of a meta tag and, if the
         * name is mapped to an attribute, records the content for that attribute.
         */
        private void extractMetadata(XMLAttributes attributes) {
            String metaName = null;
            String metaValue = null;
            for (int i = 0; i < attributes.getLength(); ++i) {
                String htmlAttributeName = attributes.getLocalName(i);
                String htmlAttributeValue = attributes.getValue(i);
                if (htmlAttributeValue == null) {
                    continue;
                }
                if ("name".equalsIgnoreCase(htmlAttributeName)) {
                    // Locale.ROOT keeps the result independent of the JVM default locale.
                    metaName = htmlAttributeValue.trim().toLowerCase(Locale.ROOT);
                } else if ("content".equalsIgnoreCase(htmlAttributeName)) {
                    metaValue = htmlAttributeValue;
                }
            }
            if (metaName == null || metaValue == null) {
                return;
            }
            String attributeName = HtmlToTextPipelet.this._metaAttributeMapping.get(metaName);
            if (attributeName != null) {
                this._metadata.put((Object)attributeName, (Object)metaValue);
            }
        }

        /** Records the trimmed title text if a title attribute is configured and the text is not empty. */
        private void setTitle() {
            String attributeName = HtmlToTextPipelet.this._metaAttributeMapping.get(PROP_TITLE);
            if (attributeName == null) {
                return;
            }
            String title = this._titleBuffer.toString().trim();
            if (title.length() > 0) {
                this._metadata.put((Object)attributeName, (Object)title);
            }
        }
    }

    /** Filter that appends all character data it receives to a target buffer. */
    public class PlainTextWriter
    extends DefaultFilter {
        private final StringBuilder _target;

        /**
         * @param target buffer that receives the text
         */
        public PlainTextWriter(StringBuilder target) {
            this._target = target;
        }

        public void characters(XMLString text, Augmentations augs) {
            if (text.length > 0) {
                this._target.append(text.ch, text.offset, text.length);
            }
            super.characters(text, augs);
        }
    }
}

