/*
 * Decompiled with CFR 0.152.
 */
package org.eclipse.smila.connectivity.framework.crawler.web.http;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Hashtable;
import java.util.List;
import java.util.Locale;
import java.util.zip.GZIPInputStream;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.eclipse.smila.connectivity.framework.crawler.web.configuration.Configurable;
import org.eclipse.smila.connectivity.framework.crawler.web.configuration.Configuration;
import org.eclipse.smila.connectivity.framework.crawler.web.http.HttpBase;
import org.eclipse.smila.connectivity.framework.crawler.web.http.Response;
import org.eclipse.smila.connectivity.framework.crawler.web.parse.Outlink;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

/**
 * Downloads a host's sitemap file and converts the {@code <loc>} entries of its
 * {@code <url>} elements into {@link Outlink}s.
 *
 * <p>Results are remembered per lower-cased host name in a static cache that is shared by all
 * instances. Once a host has a cache entry, further lookups for that host return an empty array.
 * NOTE(review): this looks intentional (links are handed out only once per host) - confirm
 * against the callers before changing it.
 */
public class SitemapParser
implements Configurable {
    private static final Log LOG = LogFactory.getLog(SitemapParser.class);

    /** Sitemap links already retrieved, keyed by lower-cased host name. */
    private static final Hashtable<String, Outlink[]> CACHE = new Hashtable<String, Outlink[]>();

    private static final Outlink[] EMPTY_LINKS = new Outlink[0];

    /** File names probed at the host root, in order, until one does not answer with 404. */
    private static final String[] SITEMAP_FILENAME = new String[]{"sitemap.xml", "sitemap.xml.gz", "sitemap.gz"};

    /** Value of {@code javax.xml.XMLConstants.FEATURE_SECURE_PROCESSING}. */
    private static final String FEATURE_SECURE_PROCESSING = "http://javax.xml.XMLConstants/feature/secure-processing";

    /** Parser features that must be switched off so a remote sitemap cannot pull in external resources. */
    private static final String[] EXTERNAL_RESOURCE_FEATURES = new String[]{
        "http://xml.org/sax/features/external-general-entities",
        "http://xml.org/sax/features/external-parameter-entities",
        "http://apache.org/xml/features/nonvalidating/load-external-dtd"
    };

    private Configuration _conf;

    /**
     * Creates a parser that uses the given configuration.
     *
     * @param conf configuration handed to every {@link Outlink} this parser creates
     */
    public SitemapParser(Configuration conf) {
        this.setConf(conf);
    }

    @Override
    public void setConf(Configuration conf) {
        this._conf = conf;
    }

    @Override
    public Configuration getConf() {
        return this._conf;
    }

    /**
     * Extracts one {@link Outlink} per {@code <loc>} element found inside the {@code <url>}
     * children of the document root.
     *
     * <p>The content may be plain XML or a gzip stream (detected by its magic bytes). A
     * {@code <loc>} whose value cannot be turned into an outlink is logged and skipped; the
     * remaining entries are still returned.
     *
     * @param sitemapContent raw bytes of the sitemap file, may be {@code null}
     * @return the outlinks found; an empty array if the content is {@code null}, empty, or
     *         cannot be parsed
     */
    Outlink[] parseSitemapLinks(byte[] sitemapContent) {
        if (sitemapContent == null || sitemapContent.length == 0) {
            return EMPTY_LINKS;
        }
        List<Outlink> sitemapLinks = new ArrayList<Outlink>();
        InputStream content = null;
        try {
            DocumentBuilder documentBuilder = newHardenedFactory().newDocumentBuilder();
            content = openContentStream(sitemapContent);
            Document document = documentBuilder.parse(content);
            NodeList childNodeList = document.getDocumentElement().getChildNodes();
            for (int i = 0; i < childNodeList.getLength(); ++i) {
                Node sitemapNode = childNodeList.item(i);
                if (sitemapNode instanceof Element && "url".equals(sitemapNode.getNodeName())) {
                    this.collectLocations((Element) sitemapNode, sitemapLinks);
                }
            }
        } catch (Exception exception) {
            // Any failure to read or parse the document discards the whole sitemap.
            LOG.error("Error parsing sitemap", exception);
            return EMPTY_LINKS;
        } finally {
            closeQuietly(content);
        }
        return sitemapLinks.toArray(new Outlink[sitemapLinks.size()]);
    }

    /**
     * Returns the links listed in the sitemap of the host that {@code url} points to.
     *
     * @param http HTTP client used to download the sitemap
     * @param url any URL on the host whose sitemap is wanted
     * @return the sitemap links; an empty array if no sitemap could be obtained or if the host
     *         already has a cache entry
     */
    public Outlink[] getSitemapLinks(HttpBase http, URL url) {
        return this.getSitemapLinks(http, url, 0);
    }

    /**
     * Tries the sitemap file name at index {@code sitemapFilename} and, on a 404 response, falls
     * through to the next name in {@link #SITEMAP_FILENAME}.
     *
     * @param http HTTP client used to download the sitemap
     * @param url any URL on the host whose sitemap is wanted
     * @param sitemapFilename index into {@link #SITEMAP_FILENAME} of the name to try
     * @return the sitemap links, or an empty array
     */
    private Outlink[] getSitemapLinks(HttpBase http, URL url, int sitemapFilename) {
        // Locale.ROOT keeps the cache key independent of the JVM's default locale.
        String host = url.getHost().toLowerCase(Locale.ROOT);
        if (CACHE.get(host) != null) {
            // The host already has a cache entry: nothing is returned a second time.
            return EMPTY_LINKS;
        }
        if (LOG.isTraceEnabled()) {
            LOG.trace("cache miss " + url);
        }
        Outlink[] sitemapLinks = EMPTY_LINKS;
        boolean cacheSitemap = true;
        try {
            URL sitemapUrl = new URL(url, "/" + SITEMAP_FILENAME[sitemapFilename]);
            Response response = http.getResponse(sitemapUrl.toString());
            int code = response.getCode();
            if (code == 200) {
                sitemapLinks = this.parseSitemapLinks(response.getContent());
            } else if (code == 404 && sitemapFilename < SITEMAP_FILENAME.length - 1) {
                return this.getSitemapLinks(http, url, sitemapFilename + 1);
            } else if (code >= 500) {
                // Server-side failure: leave the host uncached so a later call can retry.
                cacheSitemap = false;
            }
            // Every other status (403 included) is cached as "no links".
        } catch (Exception exception) {
            LOG.error("Couldn't get sitemap.xml for " + url + ": " + exception.toString());
            cacheSitemap = false;
            sitemapLinks = EMPTY_LINKS;
        }
        if (cacheSitemap) {
            CACHE.put(host, sitemapLinks);
        }
        return sitemapLinks;
    }

    /** Appends an outlink for every non-empty {@code <loc>} child of the given {@code <url>} element. */
    private void collectLocations(Element urlElement, List<Outlink> links) {
        NodeList urlItems = urlElement.getChildNodes();
        for (int j = 0; j < urlItems.getLength(); ++j) {
            Node urlItemNode = urlItems.item(j);
            if (!(urlItemNode instanceof Element) || !"loc".equals(urlItemNode.getNodeName())) {
                continue;
            }
            String text = urlItemNode.getTextContent();
            // Pretty-printed sitemaps put whitespace and line breaks around the URL.
            String location = text == null ? "" : text.trim();
            if (location.length() == 0) {
                continue;
            }
            try {
                links.add(new Outlink(location, location, this._conf));
            } catch (MalformedURLException malformedURLException) {
                // Skip only the offending entry; the rest of the sitemap is still usable.
                LOG.error("Error creationg outlink while parsing sitemap: " + location, malformedURLException);
            }
        }
    }

    /** Creates a DOM factory with secure processing enabled and external entity/DTD loading disabled. */
    private static DocumentBuilderFactory newHardenedFactory() {
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        setFeatureIfSupported(factory, FEATURE_SECURE_PROCESSING, true);
        for (String feature : EXTERNAL_RESOURCE_FEATURES) {
            setFeatureIfSupported(factory, feature, false);
        }
        return factory;
    }

    /** Sets a parser feature, logging instead of failing when the XML implementation does not know it. */
    private static void setFeatureIfSupported(DocumentBuilderFactory factory, String feature, boolean value) {
        try {
            factory.setFeature(feature, value);
        } catch (ParserConfigurationException unsupported) {
            LOG.warn("XML parser does not support feature " + feature + "; continuing without it", unsupported);
        }
    }

    /** Wraps the bytes in a stream, transparently inflating them when they start with the gzip magic number. */
    private static InputStream openContentStream(byte[] content) throws IOException {
        InputStream raw = new ByteArrayInputStream(content);
        boolean gzipped = content.length >= 2
            && (content[0] & 0xFF) == 0x1F
            && (content[1] & 0xFF) == 0x8B;
        return gzipped ? new GZIPInputStream(raw) : raw;
    }

    /** Closes the stream if it is non-null, ignoring any failure to close. */
    private static void closeQuietly(InputStream stream) {
        if (stream == null) {
            return;
        }
        try {
            stream.close();
        } catch (IOException ignored) {
            // Nothing useful can be done when closing an in-memory stream fails.
        }
    }
}

