/*
 * Decompiled with CFR 0.152.
 */
package org.eclipse.smila.connectivity.framework.crawler.web.http;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLDecoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.StringTokenizer;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.eclipse.smila.connectivity.framework.crawler.web.configuration.Configurable;
import org.eclipse.smila.connectivity.framework.crawler.web.configuration.Configuration;
import org.eclipse.smila.connectivity.framework.crawler.web.http.HttpBase;
import org.eclipse.smila.connectivity.framework.crawler.web.http.Response;
import org.eclipse.smila.connectivity.framework.crawler.web.messages.Robotstxt;

public class RobotRulesParser
implements Configurable {
    private static final int BUFSIZE = 2048;
    private static final String USER_AGENT = "User-agent:";
    private static final String ALLOW = "Allow:";
    private static final String DISALLOW = "Disallow:";
    private static final String CRAWL_DELAY = "Crawl-delay:";
    private static final String COLON = ":";
    private static final String SEMICOLON = ";";
    private static final Log LOG = LogFactory.getLog(RobotRulesParser.class);
    private static final Hashtable<String, RobotRuleSet> CACHE = new Hashtable();
    private static final String CHARACTER_ENCODING = "UTF-8";
    private static final int NO_PRECEDENCE = Integer.MAX_VALUE;
    private static final RobotRuleSet EMPTY_RULES = new RobotRuleSet();
    private static final RobotRuleSet FORBID_ALL_RULES = RobotRulesParser.getForbidAllRules();
    private boolean _allowForbidden;
    private Configuration _conf;
    private Robotstxt _policy;
    private String _policyValue;
    private Map<String, Integer> _robotNames;

    RobotRulesParser() {
    }

    RobotRulesParser(String[] robotNames) {
        this.setRobotNames(robotNames);
    }

    public RobotRulesParser(Configuration conf) {
        this.setConf(conf);
    }

    @Override
    public void setConf(Configuration conf) {
        this._conf = conf;
        this._policy = Robotstxt.valueOf(conf.get("http.robotstxt.policy").toUpperCase());
        this._policyValue = conf.get("http.robotstxt.value");
        this._allowForbidden = conf.getBoolean("http.robots.403.allow", false);
        String agentName = conf.get("http.agent.name");
        String agentNames = conf.get("http.robots.agents");
        StringTokenizer tok = new StringTokenizer(agentNames, SEMICOLON);
        ArrayList<String> agents = new ArrayList<String>();
        while (tok.hasMoreTokens()) {
            agents.add(tok.nextToken().trim());
        }
        if (agents.size() == 0) {
            agents.add(agentName);
            if (LOG.isDebugEnabled()) {
                LOG.debug((Object)"No agents listed in AgentNames attribute!");
            }
        } else if (!((String)agents.get(0)).equalsIgnoreCase(agentName)) {
            agents.add(0, agentName);
            if (LOG.isDebugEnabled()) {
                LOG.debug((Object)("Agent we advertise (" + agentName + ") not listed first in AgentNames attribute!"));
            }
        }
        this.setRobotNames(agents.toArray(new String[agents.size()]));
    }

    @Override
    public Configuration getConf() {
        return this._conf;
    }

    private void setRobotNames(String[] robotNames) {
        this._robotNames = new HashMap<String, Integer>();
        int i = 0;
        while (i < robotNames.length) {
            this._robotNames.put(robotNames[i].toLowerCase(), new Integer(i));
            ++i;
        }
        if (!this._robotNames.containsKey("*")) {
            this._robotNames.put("*", new Integer(robotNames.length));
        }
    }

    RobotRuleSet parseRules(byte[] robotContent) {
        if (robotContent == null) {
            return EMPTY_RULES;
        }
        String content = new String(robotContent);
        StringTokenizer lineParser = new StringTokenizer(content, "\n\r");
        RobotRuleSet bestRulesSoFar = null;
        int bestPrecedenceSoFar = Integer.MAX_VALUE;
        RobotRuleSet currentRules = new RobotRuleSet();
        int currentPrecedence = Integer.MAX_VALUE;
        boolean addRules = false;
        boolean doneAgents = false;
        while (lineParser.hasMoreTokens()) {
            String line = lineParser.nextToken();
            int hashPos = line.indexOf("#");
            if (hashPos >= 0) {
                line = line.substring(0, hashPos);
            }
            if ((line = line.trim()).length() >= USER_AGENT.length() && line.substring(0, USER_AGENT.length()).equalsIgnoreCase(USER_AGENT)) {
                if (doneAgents) {
                    if (currentPrecedence < bestPrecedenceSoFar) {
                        bestPrecedenceSoFar = currentPrecedence;
                        bestRulesSoFar = currentRules;
                        currentPrecedence = Integer.MAX_VALUE;
                        currentRules = new RobotRuleSet();
                    }
                    addRules = false;
                }
                doneAgents = false;
                String agentNames = line.substring(line.indexOf(COLON) + 1);
                agentNames = agentNames.trim();
                StringTokenizer agentTokenizer = new StringTokenizer(agentNames);
                while (agentTokenizer.hasMoreTokens()) {
                    int precedence;
                    String agentName = agentTokenizer.nextToken().toLowerCase();
                    Integer precedenceInt = this._robotNames.get(agentName);
                    if (precedenceInt == null || (precedence = precedenceInt.intValue()) >= currentPrecedence || precedence >= bestPrecedenceSoFar) continue;
                    currentPrecedence = precedence;
                }
                if (currentPrecedence >= bestPrecedenceSoFar) continue;
                addRules = true;
                continue;
            }
            if (line.length() >= DISALLOW.length() && line.substring(0, DISALLOW.length()).equalsIgnoreCase(DISALLOW)) {
                doneAgents = true;
                String path = line.substring(line.indexOf(COLON) + 1);
                path = path.trim();
                try {
                    path = URLDecoder.decode(path, CHARACTER_ENCODING);
                }
                catch (UnsupportedEncodingException unsupportedEncodingException) {
                    LOG.warn((Object)("error parsing robots rules- can't decode path: " + path));
                }
                if (path.length() == 0) {
                    if (!addRules) continue;
                    currentRules.clearPrefixes();
                    continue;
                }
                if (!addRules) continue;
                currentRules.addPrefix(path, false);
                continue;
            }
            if (line.length() >= ALLOW.length() && line.substring(0, ALLOW.length()).equalsIgnoreCase(ALLOW)) {
                doneAgents = true;
                String path = line.substring(line.indexOf(COLON) + 1);
                if ((path = path.trim()).length() == 0) {
                    if (!addRules) continue;
                    currentRules.clearPrefixes();
                    continue;
                }
                if (!addRules) continue;
                currentRules.addPrefix(path, true);
                continue;
            }
            if (line.length() < CRAWL_DELAY.length() || !line.substring(0, CRAWL_DELAY.length()).equalsIgnoreCase(CRAWL_DELAY)) continue;
            doneAgents = true;
            long crawlDelay = -1L;
            String delay = line.substring("Crawl-Delay:".length(), line.length()).trim();
            if (delay.length() <= 0) continue;
            try {
                crawlDelay = Long.parseLong(delay) * 1000L;
            }
            catch (NumberFormatException exception) {
                LOG.info((Object)("can not parse Crawl-Delay:" + exception.toString()));
            }
            currentRules.setCrawlDelay(crawlDelay);
        }
        if (currentPrecedence < bestPrecedenceSoFar) {
            bestPrecedenceSoFar = currentPrecedence;
            bestRulesSoFar = currentRules;
        }
        if (bestPrecedenceSoFar == Integer.MAX_VALUE) {
            return EMPTY_RULES;
        }
        return bestRulesSoFar;
    }

    static RobotRuleSet getEmptyRules() {
        return EMPTY_RULES;
    }

    static RobotRuleSet getForbidAllRules() {
        RobotRuleSet rules = new RobotRuleSet();
        rules.addPrefix("", false);
        return rules;
    }

    private RobotRuleSet getRobotRulesSet(HttpBase http, URL url) {
        String host = url.getHost().toLowerCase();
        RobotRuleSet robotRules = CACHE.get(host);
        boolean cacheRule = true;
        if (robotRules == null) {
            if (LOG.isTraceEnabled()) {
                LOG.trace((Object)("cache miss " + url));
            }
            try {
                URL robotstxtUrl = new URL(url, "/robots.txt");
                Response response = http.getResponse(robotstxtUrl.toString());
                if (response.getCode() == 200) {
                    robotRules = this.parseRules(response.getContent());
                } else if (response.getCode() == 403 && !this._allowForbidden) {
                    robotRules = FORBID_ALL_RULES;
                } else if (response.getCode() >= 500) {
                    cacheRule = false;
                    robotRules = EMPTY_RULES;
                } else {
                    robotRules = EMPTY_RULES;
                }
            }
            catch (Exception exception) {
                LOG.info((Object)("Couldn't get robots.txt for " + url + ": " + exception.toString()));
                cacheRule = false;
                robotRules = EMPTY_RULES;
            }
            if (cacheRule) {
                CACHE.put(host, robotRules);
            }
        }
        return robotRules;
    }

    private RobotRuleSet getRobotRulesSet(String robotsFile, URL url) {
        String host = url.getHost().toLowerCase();
        RobotRuleSet robotRules = CACHE.get(host);
        boolean cacheRule = true;
        if (robotRules == null) {
            block11: {
                LOG.debug((Object)("Robotstxt cache miss " + url));
                FileInputStream robotsIn = null;
                try {
                    try {
                        robotsIn = new FileInputStream(robotsFile);
                        ArrayList<byte[]> bufs = new ArrayList<byte[]>();
                        byte[] buf = new byte[2048];
                        int totBytes = 0;
                        int rsize = robotsIn.read(buf);
                        while (rsize >= 0) {
                            totBytes += rsize;
                            if (rsize != 2048) {
                                byte[] tmp = new byte[rsize];
                                System.arraycopy(buf, 0, tmp, 0, rsize);
                                bufs.add(tmp);
                            } else {
                                bufs.add(buf);
                                buf = new byte[2048];
                            }
                            rsize = robotsIn.read(buf);
                        }
                        byte[] robotsBytes = new byte[totBytes];
                        int pos = 0;
                        int i = 0;
                        while (i < bufs.size()) {
                            byte[] currBuf = (byte[])bufs.get(i);
                            int currBufLen = currBuf.length;
                            System.arraycopy(currBuf, 0, robotsBytes, pos, currBufLen);
                            pos += currBufLen;
                            ++i;
                        }
                        robotRules = this.parseRules(robotsBytes);
                        LOG.debug((Object)("Rules from file:" + robotsFile));
                        LOG.debug((Object)robotRules);
                    }
                    catch (IOException iOException) {
                        LOG.error((Object)("Error reading robots.txt file: " + robotsFile));
                        cacheRule = false;
                        robotRules = EMPTY_RULES;
                        IOUtils.closeQuietly((InputStream)robotsIn);
                        break block11;
                    }
                }
                catch (Throwable throwable) {
                    IOUtils.closeQuietly(robotsIn);
                    throw throwable;
                }
                IOUtils.closeQuietly((InputStream)robotsIn);
            }
            if (cacheRule) {
                CACHE.put(host, robotRules);
            }
        }
        return robotRules;
    }

    public boolean isAllowed(HttpBase http, URL url) {
        String path = url.getPath();
        if (path == null || "".equals(path)) {
            path = "/";
        }
        if (this._policy.equals((Object)Robotstxt.IGNORE)) {
            return true;
        }
        if (this._policy.equals((Object)Robotstxt.CLASSIC)) {
            return this.getRobotRulesSet(http, url).isAllowed(path);
        }
        if (this._policy.equals((Object)Robotstxt.CUSTOM)) {
            return this.getRobotRulesSet(this._policyValue, url).isAllowed(path);
        }
        if (this._policy.equals((Object)Robotstxt.SET)) {
            String[] sd = this._policyValue.split(SEMICOLON);
            this.setRobotNames(sd);
            return this.getRobotRulesSet(http, url).isAllowed(path);
        }
        return this.getRobotRulesSet(http, url).isAllowed(path);
    }

    public long getCrawlDelay(HttpBase http, URL url) {
        return this.getRobotRulesSet(http, url).getCrawlDelay();
    }

    public static class RobotRuleSet {
        private List<RobotsEntry> _tmpEntries = new ArrayList<RobotsEntry>();
        private RobotsEntry[] _entries;
        private long _expireTime;
        private long _crawlDelay = -1L;

        private void addPrefix(String prefix, boolean allow) {
            if (this._tmpEntries == null) {
                this._tmpEntries = new ArrayList<RobotsEntry>();
                if (this._entries != null) {
                    int i = 0;
                    while (i < this._entries.length) {
                        this._tmpEntries.add(this._entries[i]);
                        ++i;
                    }
                }
                this._entries = null;
            }
            this._tmpEntries.add(new RobotsEntry(prefix, allow));
        }

        private void clearPrefixes() {
            if (this._tmpEntries == null) {
                this._tmpEntries = new ArrayList<RobotsEntry>();
                this._entries = null;
            } else {
                this._tmpEntries.clear();
            }
        }

        public void setExpireTime(long expireTime) {
            this._expireTime = expireTime;
        }

        public long getExpireTime() {
            return this._expireTime;
        }

        public long getCrawlDelay() {
            return this._crawlDelay;
        }

        public void setCrawlDelay(long crawlDelay) {
            this._crawlDelay = crawlDelay;
        }

        public boolean isAllowed(String path) {
            try {
                path = URLDecoder.decode(path, RobotRulesParser.CHARACTER_ENCODING);
            }
            catch (UnsupportedEncodingException unsupportedEncodingException) {
                LOG.debug((Object)("Couldn't decode the path specified: " + path));
            }
            if (this._entries == null) {
                this._entries = new RobotsEntry[this._tmpEntries.size()];
                this._entries = this._tmpEntries.toArray(this._entries);
                this._tmpEntries = null;
            }
            int pos = 0;
            int end = this._entries.length;
            while (pos < end) {
                if (path.startsWith(this._entries[pos].getPrefix())) {
                    return this._entries[pos].isAllowed();
                }
                ++pos;
            }
            return true;
        }

        public String toString() {
            this.isAllowed("x");
            StringBuffer buf = new StringBuffer();
            int i = 0;
            while (i < this._entries.length) {
                if (this._entries[i].isAllowed()) {
                    buf.append("Allow: " + this._entries[i].getPrefix() + System.getProperty("line.separator"));
                } else {
                    buf.append("Disallow: " + this._entries[i].getPrefix() + System.getProperty("line.separator"));
                }
                ++i;
            }
            return buf.toString();
        }

        private class RobotsEntry {
            private final String _prefix;
            private final boolean _allowed;

            RobotsEntry(String prefix, boolean allowed) {
                this._prefix = prefix;
                this._allowed = allowed;
            }

            public String getPrefix() {
                return this._prefix;
            }

            public boolean isAllowed() {
                return this._allowed;
            }
        }
    }
}

