/*
 * Decompiled with CFR 0.152.
 */
package org.eclipse.smila.processing.pipelets;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import org.apache.commons.collections.map.MultiValueMap;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.xerces.xni.Augmentations;
import org.apache.xerces.xni.QName;
import org.apache.xerces.xni.XMLAttributes;
import org.apache.xerces.xni.XMLString;
import org.apache.xerces.xni.parser.XMLDocumentFilter;
import org.apache.xerces.xni.parser.XMLInputSource;
import org.apache.xerces.xni.parser.XMLParserConfiguration;
import org.cyberneko.html.HTMLConfiguration;
import org.cyberneko.html.filters.DefaultFilter;
import org.cyberneko.html.filters.ElementRemover;
import org.eclipse.smila.blackboard.Blackboard;
import org.eclipse.smila.blackboard.BlackboardAccessException;
import org.eclipse.smila.blackboard.path.Path;
import org.eclipse.smila.datamodel.id.Id;
import org.eclipse.smila.datamodel.record.Literal;
import org.eclipse.smila.processing.ProcessingException;
import org.eclipse.smila.processing.configuration.PipeletConfiguration;
import org.eclipse.smila.processing.pipelets.ATransformationPipelet;

/**
 * Pipelet that extracts the plain text of HTML content using the NekoHTML parser.
 *
 * <p>The HTML is read either from the literals of the input attribute or from the input
 * attachment (decided by {@code isReadFromAttribute()} of the base class). The extracted text is
 * handed to {@code storeResults(...)} of the base class. In addition, the content of
 * {@code <meta name="..." content="...">} tags and of the {@code <title>} tag can be copied to
 * record attributes.
 *
 * <p>Configuration properties read by this class:
 * <ul>
 * <li>{@code removeContentTags}: comma separated list of tags that are dropped together with
 * their content. Defaults to {@code applet, frame, object, script, style}.</li>
 * <li>{@code meta:<name>}: name of the attribute that receives the content of the meta tag with
 * the given name. Meta names are compared case-insensitively.</li>
 * <li>{@code tag:title}: name of the attribute that receives the text of the title tag.</li>
 * </ul>
 */
public class HtmlToTextPipelet
extends ATransformationPipelet {
    /** Tags removed together with their content when nothing else is configured. */
    private static final String[] DEFAULT_REMOVE_CONTENT_TAGS = new String[]{"applet", "frame", "object", "script", "style"};
    /** Name of the property listing the tags to remove together with their content. */
    private static final String PROP_REMOVE_CONTENT_TAGS = "removeContentTags";
    /** Prefix of properties that map a meta tag name to an attribute name. */
    private static final String PROP_META = "meta:";
    /** Name of the property that maps the title tag to an attribute name. */
    private static final String PROP_TITLE = "tag:title";
    /** Tags that are removed together with their content. */
    private String[] _removeContentTags = DEFAULT_REMOVE_CONTENT_TAGS;
    /** Maps lower-cased meta names (and {@link #PROP_TITLE}) to the target attribute name. */
    private final Map<String, String> _metaAttributeMapping = new HashMap<String, String>();
    private final Log _log = LogFactory.getLog(this.getClass());

    /**
     * Reads the tag removal list and the meta/title attribute mappings from the configuration.
     *
     * <p>Entries of the tag list are trimmed and empty entries are ignored, so
     * {@code "script, style"} removes both tags. If no usable entry remains, the current list is
     * kept. Meta names are trimmed and lower-cased so that they match the names extracted from
     * the HTML, which are normalized the same way. Mapping properties without a value are
     * skipped with a warning.
     *
     * @param configuration the pipelet configuration
     * @throws ProcessingException if the base class rejects the configuration
     */
    @Override
    public void configure(PipeletConfiguration configuration) throws ProcessingException {
        super.configure(configuration);
        Object removeContentTagValue = configuration.getPropertyFirstValue(PROP_REMOVE_CONTENT_TAGS);
        if (removeContentTagValue != null) {
            String[] configuredTags = HtmlToTextPipelet.parseTagList(removeContentTagValue.toString());
            if (configuredTags.length > 0) {
                this._removeContentTags = configuredTags;
                this._log.info("Removing complete content of these tags: " + Arrays.toString(this._removeContentTags));
            }
        }
        for (PipeletConfiguration.Property property : configuration.getProperties()) {
            String propertyName = property.getName();
            if (propertyName == null) {
                continue;
            }
            String mappingKey;
            if (propertyName.startsWith(PROP_META)) {
                // Normalize exactly like MetadataExtractor does for names found in the HTML.
                mappingKey = propertyName.substring(PROP_META.length()).trim().toLowerCase(Locale.ROOT);
            } else if (propertyName.equalsIgnoreCase(PROP_TITLE)) {
                mappingKey = PROP_TITLE;
            } else {
                continue;
            }
            Object attributeName = property.getValue();
            if (attributeName == null) {
                this._log.warn("Ignoring property " + propertyName + " because it has no value");
                continue;
            }
            this._metaAttributeMapping.put(mappingKey, attributeName.toString());
        }
    }

    /**
     * Extracts text and metadata for each of the given records and stores them on the blackboard.
     *
     * <p>A {@link BlackboardAccessException} for one record is logged and processing continues
     * with the next record. A {@link ProcessingException} (for example a parse error) is
     * propagated to the caller.
     *
     * @param blackboard the blackboard holding the records
     * @param recordIds ids of the records to process, may be {@code null}
     * @return the unmodified {@code recordIds} argument
     * @throws ProcessingException if the HTML of a record cannot be parsed
     */
    public Id[] process(Blackboard blackboard, Id[] recordIds) throws ProcessingException {
        if (recordIds == null) {
            return recordIds;
        }
        for (Id id : recordIds) {
            try {
                MultiValueMap metadata = new MultiValueMap();
                ArrayList<String> results = new ArrayList<String>();
                if (this.isReadFromAttribute()) {
                    this.processAttributeValues(blackboard, id, results, metadata);
                } else {
                    this.processAttachment(blackboard, id, results, metadata);
                }
                this.storeResults(blackboard, id, results);
                this.storeMetadata(blackboard, id, metadata);
            }
            catch (BlackboardAccessException ex) {
                this._log.error("Error processing ID " + id, ex);
            }
        }
        return recordIds;
    }

    /**
     * Splits a comma separated tag list into its trimmed, non-empty entries.
     *
     * @param tagList the raw property value
     * @return the tag names, possibly an empty array
     */
    private static String[] parseTagList(String tagList) {
        List<String> tags = new ArrayList<String>();
        for (String candidate : tagList.split(",")) {
            String tag = candidate.trim();
            if (tag.length() > 0) {
                tags.add(tag);
            }
        }
        return tags.toArray(new String[tags.size()]);
    }

    /**
     * Extracts the text of the input attachment, if there is one, and always closes the stream.
     *
     * @param blackboard the blackboard to read the attachment from
     * @param id id of the record
     * @param results receives the extracted text
     * @param metadata receives the extracted meta and title values
     */
    private void processAttachment(Blackboard blackboard, Id id, List<String> results, MultiValueMap metadata) throws BlackboardAccessException, ProcessingException {
        InputStream stream = blackboard.getAttachmentAsStream(id, this._inputName);
        if (stream == null) {
            return;
        }
        try {
            results.add(this.extractText(id, stream, metadata));
        }
        finally {
            try {
                stream.close();
            }
            catch (IOException ex) {
                // The text has been extracted (or the parse error is already on its way up);
                // a failing close must not hide that.
                this._log.warn("Could not close attachment stream of record " + id, ex);
            }
        }
    }

    /**
     * Extracts the text of every literal of the input attribute that has a string value.
     *
     * @param blackboard the blackboard to read the literals from
     * @param id id of the record
     * @param results receives one extracted text per literal
     * @param metadata receives the extracted meta and title values
     */
    private void processAttributeValues(Blackboard blackboard, Id id, List<String> results, MultiValueMap metadata) throws BlackboardAccessException, ProcessingException {
        List<Literal> literals = blackboard.getLiterals(id, this._inputPath);
        if (literals == null) {
            return;
        }
        for (Literal literal : literals) {
            String content = literal.getStringValue();
            if (content != null) {
                results.add(this.extractText(id, content, metadata));
            }
        }
    }

    /**
     * Writes the collected metadata to the record. For each attribute that has new values, the
     * existing literals are removed first and one literal per value is added.
     *
     * @param blackboard the blackboard to write to
     * @param id id of the record
     * @param metadata attribute name to values, as filled by {@link MetadataExtractor}
     */
    private void storeMetadata(Blackboard blackboard, Id id, MultiValueMap metadata) throws BlackboardAccessException {
        // MultiValueMap is not generic; keys and values are the Strings put by MetadataExtractor.
        for (Object key : metadata.keySet()) {
            Collection<?> values = metadata.getCollection(key);
            if (values == null || values.isEmpty()) {
                continue;
            }
            Path path = new Path((String)key);
            blackboard.removeLiterals(id, path);
            for (Object value : values) {
                Literal literal = blackboard.createLiteral(id);
                literal.setStringValue((String)value);
                blackboard.addLiteral(id, path, literal);
            }
        }
    }

    /**
     * Extracts the plain text from an HTML byte stream. The stream is not closed here.
     *
     * @param id id of the record, used as system id of the input and in error messages
     * @param stream the HTML content
     * @param metadata receives the extracted meta and title values
     * @return the extracted text
     * @throws ProcessingException if parsing fails
     */
    private String extractText(Id id, InputStream stream, MultiValueMap metadata) throws ProcessingException {
        return this.parse(id, new XMLInputSource(null, id.toString(), null, stream, null), metadata);
    }

    /**
     * Extracts the plain text from an HTML string.
     *
     * @param id id of the record; its hash is used as system id of the input
     * @param content the HTML content
     * @param metadata receives the extracted meta and title values
     * @return the extracted text
     * @throws ProcessingException if parsing fails
     */
    private String extractText(Id id, String content, MultiValueMap metadata) throws ProcessingException {
        // NOTE(review): the stream variant passes id.toString() as system id, this one the id
        // hash. Kept as found; confirm whether the difference is intended.
        Reader reader = new StringReader(content);
        return this.parse(id, new XMLInputSource(null, id.getIdHash(), null, reader, null), metadata);
    }

    /**
     * Runs a freshly created parser over the input and returns the collected text.
     *
     * @param id id of the record, used in error messages
     * @param source the input to parse
     * @param metadata receives the extracted meta and title values
     * @return the extracted text
     * @throws ProcessingException wrapping any exception thrown by the parser
     */
    private String parse(Id id, XMLInputSource source, MultiValueMap metadata) throws ProcessingException {
        StringBuilder result = new StringBuilder();
        XMLParserConfiguration parser = this.createParser(result, metadata);
        try {
            parser.parse(source);
        }
        catch (Exception e) {
            this._log.error("error parsing HTML document in record " + id, e);
            throw new ProcessingException("error parsing HTML document in record " + id + ": " + e.toString(), e);
        }
        return result.toString();
    }

    /**
     * Creates a NekoHTML parser configuration with the filter chain comment remover, metadata
     * extractor, element remover, plain text writer.
     *
     * @param result buffer that receives the extracted text
     * @param metadata map that receives the extracted meta and title values
     * @return the configured parser
     */
    private XMLParserConfiguration createParser(StringBuilder result, MultiValueMap metadata) {
        ElementRemover elementRemover = new ElementRemover();
        for (String tag : this._removeContentTags) {
            elementRemover.removeElement(tag);
        }
        // The metadata extractor is placed before the element remover so that it still sees
        // the meta and title elements.
        XMLDocumentFilter[] filters = new XMLDocumentFilter[]{
            new CommentRemover(),
            new MetadataExtractor(metadata),
            elementRemover,
            new PlainTextWriter(result)
        };
        HTMLConfiguration parser = new HTMLConfiguration();
        parser.setProperty("http://cyberneko.org/html/properties/filters", filters);
        return parser;
    }

    /**
     * Filter that drops comments by not passing them on to the next filter.
     */
    public class CommentRemover
    extends DefaultFilter {
        /** Intentionally empty: the comment is not forwarded. */
        @Override
        public void comment(XMLString text, Augmentations augs) {
        }
    }

    /**
     * Filter that collects the content of mapped meta tags and the title text into a
     * {@link MultiValueMap} keyed by target attribute name. All events are passed on unchanged.
     */
    public class MetadataExtractor
    extends DefaultFilter {
        private final MultiValueMap _metadata;
        /** True while the parser is between the start and the end of a title element. */
        private boolean _inTitleTag;
        private StringBuilder _titleBuffer = new StringBuilder();

        /**
         * @param metadata map that receives the extracted values
         */
        public MetadataExtractor(MultiValueMap metadata) {
            this._metadata = metadata;
        }

        /** Handles meta elements and starts collecting text when a title element begins. */
        @Override
        public void startElement(QName element, XMLAttributes attributes, Augmentations augs) {
            super.startElement(element, attributes, augs);
            if ("meta".equalsIgnoreCase(element.localpart)) {
                this.extractMetadata(attributes);
            } else if ("title".equalsIgnoreCase(element.localpart)) {
                this._inTitleTag = true;
                this._titleBuffer = new StringBuilder();
            }
        }

        /** Handles meta elements reported as empty elements. */
        @Override
        public void emptyElement(QName element, XMLAttributes attributes, Augmentations augs) {
            super.emptyElement(element, attributes, augs);
            if ("meta".equalsIgnoreCase(element.localpart)) {
                this.extractMetadata(attributes);
            }
        }

        /** Stores the collected title text when the title element ends. */
        @Override
        public void endElement(QName element, Augmentations augs) {
            super.endElement(element, augs);
            if ("title".equalsIgnoreCase(element.localpart)) {
                this._inTitleTag = false;
                this.setTitle();
            }
        }

        /** Appends the characters to the title buffer while inside a title element. */
        @Override
        public void characters(XMLString text, Augmentations augs) {
            super.characters(text, augs);
            if (this._inTitleTag) {
                this._titleBuffer.append(text.ch, text.offset, text.length);
            }
        }

        /**
         * Reads the name and content attributes of a meta element and, if the trimmed,
         * lower-cased name has a configured mapping, stores the content under the mapped
         * attribute name.
         */
        private void extractMetadata(XMLAttributes attributes) {
            String metaName = null;
            String metaValue = null;
            for (int i = 0; i < attributes.getLength(); ++i) {
                String htmlAttribute = attributes.getLocalName(i);
                String value = attributes.getValue(i);
                if (value == null) {
                    continue;
                }
                if ("name".equalsIgnoreCase(htmlAttribute)) {
                    // Locale.ROOT keeps the lookup key independent of the default locale.
                    metaName = value.trim().toLowerCase(Locale.ROOT);
                } else if ("content".equalsIgnoreCase(htmlAttribute)) {
                    metaValue = value;
                }
            }
            if (metaName == null || metaValue == null) {
                return;
            }
            String attributeName = HtmlToTextPipelet.this._metaAttributeMapping.get(metaName);
            if (attributeName != null) {
                this._metadata.put(attributeName, metaValue);
            }
        }

        /** Stores the trimmed title text if a title mapping is configured and the text is not empty. */
        private void setTitle() {
            String attributeName = HtmlToTextPipelet.this._metaAttributeMapping.get(HtmlToTextPipelet.PROP_TITLE);
            if (attributeName == null) {
                return;
            }
            String title = this._titleBuffer.toString().trim();
            if (title.length() > 0) {
                this._metadata.put(attributeName, title);
            }
        }
    }

    /**
     * Filter that appends all character data it receives to a {@link StringBuilder} and then
     * passes the event on.
     */
    public class PlainTextWriter
    extends DefaultFilter {
        private final StringBuilder _target;

        /**
         * @param target buffer that receives the text
         */
        public PlainTextWriter(StringBuilder target) {
            this._target = target;
        }

        /** Appends the characters to the target buffer. */
        @Override
        public void characters(XMLString text, Augmentations augs) {
            this._target.append(text.ch, text.offset, text.length);
            super.characters(text, augs);
        }
    }
}

