/*
 * Decompiled with CFR 0.152.
 */
package org.eclipse.smila.connectivity.framework.crawler.web.parse.html;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.html.dom.HTMLDocumentImpl;
import org.eclipse.smila.connectivity.framework.crawler.web.configuration.Configurable;
import org.eclipse.smila.connectivity.framework.crawler.web.configuration.Configuration;
import org.eclipse.smila.connectivity.framework.crawler.web.metadata.Metadata;
import org.eclipse.smila.connectivity.framework.crawler.web.parse.Content;
import org.eclipse.smila.connectivity.framework.crawler.web.parse.Outlink;
import org.eclipse.smila.connectivity.framework.crawler.web.parse.Parse;
import org.eclipse.smila.connectivity.framework.crawler.web.parse.ParseData;
import org.eclipse.smila.connectivity.framework.crawler.web.parse.ParseImpl;
import org.eclipse.smila.connectivity.framework.crawler.web.parse.ParseStatus;
import org.eclipse.smila.connectivity.framework.crawler.web.parse.Parser;
import org.eclipse.smila.connectivity.framework.crawler.web.parse.html.DOMBuilder;
import org.eclipse.smila.connectivity.framework.crawler.web.parse.html.DOMContentUtils;
import org.eclipse.smila.connectivity.framework.crawler.web.parse.html.HTMLMetaProcessor;
import org.eclipse.smila.connectivity.framework.crawler.web.parse.html.HTMLMetaTags;
import org.eclipse.smila.connectivity.framework.crawler.web.util.StringUtil;
import org.w3c.dom.DOMException;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentFragment;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

/**
 * {@link Parser} implementation that turns fetched {@code text/*} or {@code *html*} content into a
 * {@link Parse}: it determines the character encoding, builds a DOM fragment with TagSoup, and then
 * extracts meta tags, text, title and outlinks from that fragment.
 *
 * <p>{@link #setConf(Configuration)} must be called before {@link #getParse(Content)}, because it
 * creates the {@link DOMContentUtils} instance and reads the default character encoding that
 * {@code getParse} relies on.
 */
public class HtmlParser implements Parser, Configurable {

    /** Content types reported by {@link #getContentTypes()}. */
    private static final String[] CONTENT_TYPES = new String[]{"text/html", "text/plain"};

    /** Maximum number of leading content bytes inspected when sniffing the charset from a meta tag. */
    private static final int CHUNK_SIZE = 2000;

    /**
     * Charset used to turn the raw leading bytes into a string for sniffing. ISO-8859-1 maps every
     * byte to the char with the same value, so no byte sequence can fail to decode.
     */
    private static final Charset SNIFF_CHARSET = Charset.forName("ISO-8859-1");

    /** Matches content types containing "text". */
    private static final Pattern s_textPattern = Pattern.compile("text");

    /** Matches content types containing "html". */
    private static final Pattern s_htmlPattern = Pattern.compile("html");

    /** Matches a {@code <meta http-equiv="content-type" ...>} tag and captures its attributes. */
    private static final Pattern s_metaPattern =
        Pattern.compile("<meta\\s+([^>]*http-equiv=\"?content-type\"?[^>]*)>", Pattern.CASE_INSENSITIVE);

    /** Captures the charset name out of a {@code charset=...} declaration. */
    private static final Pattern s_charsetPattern =
        Pattern.compile("charset=\\s*([a-z][_\\-0-9a-z]*)", Pattern.CASE_INSENSITIVE);

    private final Log _log = LogFactory.getLog(HtmlParser.class);

    /** Encoding used when neither the Content-Type header nor a meta tag yields one. */
    private String _defaultCharEncoding;

    private Configuration _configuration;

    private DOMContentUtils _utils;

    /** Optional parser used to extract outlinks from javascript; {@code null} when none is bound. */
    private Parser _javascriptParser;

    /**
     * Looks for a charset declaration inside a content-type meta tag within the first
     * {@link #CHUNK_SIZE} bytes of the content.
     *
     * @param content raw page bytes
     * @return the declared charset name exactly as written in the page, or {@code null} if no
     *         content-type meta tag with a charset was found in the inspected prefix
     */
    private static String sniffCharacterEncoding(byte[] content) {
        final int length = Math.min(content.length, CHUNK_SIZE);
        // The patterns only contain ASCII, so a byte-to-char 1:1 decoding is sufficient here.
        final String head = new String(content, 0, length, SNIFF_CHARSET);
        final Matcher metaMatcher = s_metaPattern.matcher(head);
        if (!metaMatcher.find()) {
            return null;
        }
        final Matcher charsetMatcher = s_charsetPattern.matcher(metaMatcher.group(1));
        return charsetMatcher.find() ? charsetMatcher.group(1) : null;
    }

    /**
     * Parses the given content.
     *
     * <p>An empty parse built from a failure status is returned when the base URL is malformed, when
     * no Content-Type is available, or when parsing throws. Content whose Content-Type contains
     * neither "text" nor "html" is not parsed; an empty-text parse with status code 1 is returned
     * for it.
     *
     * @param content fetched content with its metadata
     * @return the parse result; never {@code null} as far as this method's own code paths go
     */
    @Override
    public Parse getParse(Content content) {
        final HTMLMetaTags metaTags = new HTMLMetaTags();
        final URL base;
        try {
            base = new URL(content.getBaseUrl());
        } catch (MalformedURLException exception) {
            return emptyParse(exception);
        }

        final Metadata metadata = new Metadata();
        final DocumentFragment root;
        try {
            final byte[] contentInOctets = content.getContent();
            final InputSource input = new InputSource(new ByteArrayInputStream(contentInOctets));
            final String contentType = content.getMetadata().get("Content-Type");
            if (contentType == null) {
                // Fail with a descriptive cause instead of letting the pattern matchers below throw
                // a NullPointerException that would be reported as an "Unknown parsing error".
                return emptyParse(new IllegalArgumentException("Missing Content-Type for " + base));
            }
            if (!s_textPattern.matcher(contentType).find() && !s_htmlPattern.matcher(contentType).find()) {
                // Not something this parser handles: hand back an empty result without parsing.
                return buildParse(new ParseStatus(1), "", "", new Outlink[]{}, content, metadata, metaTags);
            }
            input.setEncoding(determineEncoding(contentType, contentInOctets, metadata, base));
            if (_log.isTraceEnabled()) {
                _log.trace("Parsing...");
            }
            root = parse(input);
        } catch (IOException exception) {
            return emptyParse(exception);
        } catch (DOMException exception) {
            return emptyParse(exception);
        } catch (SAXException exception) {
            return emptyParse(exception);
        } catch (Exception exception) {
            _log.error("Unknown parsing error", exception);
            return emptyParse(exception);
        }

        HTMLMetaProcessor.getMetaTags(metaTags, root, base);
        if (_log.isTraceEnabled()) {
            _log.trace("Meta tags for " + base + ": " + metaTags.toString());
        }

        String text = "";
        String title = "";
        if (!metaTags.getNoIndex()) {
            if (_log.isDebugEnabled()) {
                _log.debug("Getting text");
            }
            final StringBuffer textBuffer = new StringBuffer();
            _utils.getText(textBuffer, root);
            text = textBuffer.toString();

            if (_log.isDebugEnabled()) {
                _log.debug("Getting title");
            }
            final StringBuffer titleBuffer = new StringBuffer();
            _utils.getTitle(titleBuffer, root);
            title = titleBuffer.toString().trim();
        }

        final ArrayList<Outlink> links = new ArrayList<Outlink>();
        if (!metaTags.getNoFollow()) {
            final URL baseTag = _utils.getBase(root);
            if (_log.isTraceEnabled()) {
                _log.trace("Getting links...");
            }
            // A base URL declared in the document takes precedence over the content's base URL.
            _utils.getOutlinks(baseTag != null ? baseTag : base, links, root);
            logOutlinks("outlinks", links, content);
        }

        // NOTE(review): 1 and 100 are ParseStatus codes whose constant names are not visible here.
        final ParseStatus status = new ParseStatus(1);
        if (metaTags.getRefresh()) {
            status.setMinorCode((short) 100);
            final Object refreshHref = metaTags.getRefreshHref();
            // Guard against a refresh flag without a target so the whole parse is not lost.
            if (refreshHref != null) {
                status.setMessage(refreshHref.toString());
            }
        }

        if (_javascriptParser != null) {
            final ArrayList<Outlink> javascriptLinks = new ArrayList<Outlink>();
            _javascriptParser.setConf(_configuration);
            _utils.setJavascriptParser(_javascriptParser);
            _utils.getJavascriptOutlinks(base.toString(), javascriptLinks, root);
            logOutlinks("javascript outlinks", javascriptLinks, content);
            links.addAll(javascriptLinks);
        }

        final Outlink[] outlinks = links.toArray(new Outlink[links.size()]);
        return buildParse(status, title, text, outlinks, content, metadata, metaTags);
    }

    /**
     * Builds the empty parse returned for a failure.
     *
     * @param cause the exception describing the failure
     * @return the empty parse produced by a {@link ParseStatus} created from {@code cause}
     */
    private Parse emptyParse(Exception cause) {
        return new ParseStatus(cause).getEmptyParse(getConf());
    }

    /**
     * Assembles the {@link ParseData} for a result, attaches the configuration and wraps it
     * together with the extracted text.
     */
    private Parse buildParse(ParseStatus status, String title, String text, Outlink[] outlinks,
        Content content, Metadata metadata, HTMLMetaTags metaTags) {
        final ParseData parseData =
            new ParseData(status, title, outlinks, content.getMetadata(), metadata, metaTags);
        parseData.setConf(_configuration);
        return new ParseImpl(text, parseData);
    }

    /**
     * Chooses the encoding to parse with and records it in {@code metadata}. The Content-Type value
     * is consulted first, then a content-type meta tag at the start of the content, and finally the
     * configured default encoding.
     *
     * @param contentType value of the Content-Type metadata entry
     * @param contentInOctets raw content bytes used for meta tag sniffing
     * @param metadata parse metadata that receives the encoding entries
     * @param base base URL, used for trace output only
     * @return the encoding to set on the input source
     */
    private String determineEncoding(String contentType, byte[] contentInOctets, Metadata metadata, URL base) {
        String encoding = StringUtil.parseCharacterEncoding(contentType);
        if (encoding != null && !"".equals(encoding)) {
            encoding = registerEncoding(encoding, metadata, base);
        }
        if (encoding == null || "".equals(encoding)) {
            // Nothing usable from the header (or its alias did not resolve): try the page itself.
            encoding = sniffCharacterEncoding(contentInOctets);
            if (encoding != null) {
                encoding = registerEncoding(encoding, metadata, base);
            }
        }
        if (encoding == null) {
            encoding = _defaultCharEncoding;
            metadata.set("CharEncodingForConversion", _defaultCharEncoding);
            if (_log.isTraceEnabled()) {
                _log.trace(base + ": falling back to " + _defaultCharEncoding);
            }
        }
        return encoding;
    }

    /**
     * Stores a declared encoding as the original encoding, resolves its alias and, when that
     * succeeds, stores the resolved name as the encoding used for conversion.
     *
     * @return the resolved encoding name, or {@code null} if alias resolution returned {@code null}
     */
    private String registerEncoding(String declared, Metadata metadata, URL base) {
        metadata.set("OriginalCharEncoding", declared);
        final String resolved = StringUtil.resolveEncodingAlias(declared);
        if (resolved != null) {
            metadata.set("CharEncodingForConversion", resolved);
            if (_log.isTraceEnabled()) {
                _log.trace(base + ": setting encoding to " + resolved);
            }
        }
        return resolved;
    }

    /** Writes the number of found links and each link to the debug log, if debug is enabled. */
    private void logOutlinks(String kind, ArrayList<Outlink> found, Content content) {
        if (!_log.isDebugEnabled()) {
            return;
        }
        _log.debug("found " + found.size() + " " + kind + " in " + content.getUrl());
        for (Outlink outlink : found) {
            _log.debug(outlink.toString());
        }
    }

    /**
     * Parses the input into a DOM fragment. Currently always delegates to TagSoup.
     *
     * @param input source with its encoding already set
     * @return the fragment holding the parsed document
     * @throws Exception whatever the underlying parser throws
     */
    private DocumentFragment parse(InputSource input) throws Exception {
        return parseTagSoup(input);
    }

    /**
     * Runs the TagSoup parser over the input and collects the result in a fresh document fragment
     * via a {@link DOMBuilder} registered as content handler and lexical handler.
     *
     * @param input source to parse
     * @return the populated fragment
     * @throws Exception if configuring the reader or parsing fails
     */
    private DocumentFragment parseTagSoup(InputSource input) throws Exception {
        final HTMLDocumentImpl doc = new HTMLDocumentImpl();
        final DocumentFragment frag = doc.createDocumentFragment();
        final DOMBuilder builder = new DOMBuilder(doc, frag);
        final org.ccil.cowan.tagsoup.Parser reader = new org.ccil.cowan.tagsoup.Parser();
        reader.setContentHandler(builder);
        reader.setFeature("http://www.ccil.org/~cowan/tagsoup/features/ignore-bogons", true);
        reader.setFeature("http://www.ccil.org/~cowan/tagsoup/features/bogons-empty", false);
        reader.setProperty("http://xml.org/sax/properties/lexical-handler", builder);
        reader.parse(input);
        return frag;
    }

    /**
     * Stores the configuration, reads the default character encoding from
     * {@code parser.character.encoding.default} (falling back to {@code windows-1252}) and creates
     * the DOM helper used during parsing.
     *
     * @param configuration crawler configuration
     */
    @Override
    public void setConf(Configuration configuration) {
        this._configuration = configuration;
        this._defaultCharEncoding = getConf().get("parser.character.encoding.default", "windows-1252");
        this._utils = new DOMContentUtils(configuration);
    }

    /**
     * @return the configuration passed to {@link #setConf(Configuration)}, or {@code null} if it has
     *         not been called yet
     */
    @Override
    public Configuration getConf() {
        return this._configuration;
    }

    /**
     * @return a copy of the content types handled by this parser, so callers cannot modify the
     *         shared constant
     */
    @Override
    public String[] getContentTypes() {
        return CONTENT_TYPES.clone();
    }

    /**
     * Binds the parser used to extract outlinks from javascript.
     *
     * @param parser javascript parser to use
     */
    public void setJavascriptParser(Parser parser) {
        this._javascriptParser = parser;
        if (_log.isDebugEnabled()) {
            _log.debug("Javascript parser bound");
        }
    }

    /**
     * Unbinds the javascript parser. The currently bound parser is removed regardless of which
     * instance is passed in.
     *
     * @param parser ignored
     */
    public void unsetJavascriptParser(Parser parser) {
        this._javascriptParser = null;
        if (_log.isDebugEnabled()) {
            _log.debug("Javascript parser unbound");
        }
    }
}

