//////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2010, 2022 Contributors to the Eclipse Foundation
//
// See the NOTICE file(s) distributed with this work for additional
// information regarding copyright ownership.
//
// This program and the accompanying materials are made available
// under the terms of the MIT License which is available at
// https://opensource.org/licenses/MIT
//
// SPDX-License-Identifier: MIT
//////////////////////////////////////////////////////////////////////////////

package org.eclipse.escet.setext.runtime;

import static org.eclipse.escet.common.app.framework.output.OutputProvider.out;
import static org.eclipse.escet.common.java.Lists.list;
import static org.eclipse.escet.common.java.Strings.fmt;

import java.io.IOException;
import java.io.StringReader;
import java.util.List;

import org.eclipse.escet.common.java.Strings;
import org.eclipse.escet.common.java.TextPosition;
import org.eclipse.escet.setext.runtime.exceptions.ScanException;
import org.eclipse.escet.setext.runtime.exceptions.SyntaxException;

/** Base class for scanners generated by SeText. */
public abstract class Scanner {
    /**
     * The singleton token instance indicating a skipped token.
     *
     * <p>
     * It is constructed with text {@code "<skipped>"}, terminal id {@code -1} and a {@code null} position. It is
     * recognized by identity comparison in {@link #nextToken}, which never returns it to its caller.
     * </p>
     *
     * @see #shouldSkipToken
     */
    private static final Token SKIPPED_TOKEN = new Token("<skipped>", -1, null);

    /** The reader to use to read code points from the input. Set by {@link #initScanner}. */
    private CodePointReader reader;

    /**
     * The location of the source file being scanned. Must be an absolute local file system path, with platform specific
     * path separators. The path does not have to refer to an existing file. That is, it may not be assumed that a file
     * with that path actually exists on disk.
     *
     * <p>
     * Set by {@link #initScanner}, and passed along to the text positions constructed by {@link #acceptOrError}.
     * </p>
     */
    protected String location;

    /**
     * The source of the input data. This text is prefixed to exception messages. May be {@code null} to indicate no
     * source information is available.
     *
     * <p>
     * Set by {@link #initScanner}, and passed along to the text positions constructed by {@link #acceptOrError}.
     * </p>
     */
    protected String src;

    /**
     * The buffer to use to store code points in, which are not yet part of a fully accepted token. A fresh buffer is
     * created by every invocation of {@link #initScanner}.
     */
    private CodePointBuffer buffer;

    /**
     * The 0-based index (inclusive) into the input, of the start of the current token. Reset to {@code 0} by
     * {@link #initScanner}. This base class does not otherwise assign it; presumably the generated scanner advances it
     * at the start of each token (TODO confirm against the generated code).
     */
    protected int startOffset;

    /**
     * The 1-based index (inclusive) into the input, of the start line of the current token. Reset to {@code 1} by
     * {@link #initScanner}. Used as start line of the position of tokens constructed by {@link #acceptOrError}.
     */
    protected int startLine;

    /**
     * The 1-based index (inclusive) into the input, of the start column of the current token. Reset to {@code 1} by
     * {@link #initScanner}. Used as start column of the position of tokens constructed by {@link #acceptOrError}.
     */
    protected int startColumn;

    /**
     * The 0-based index (inclusive) into the input, of the last accepted position for the current token (the current
     * longest match). Value {@code -1} indicates that we have not yet found any match for the current token.
     *
     * <p>
     * Note that: {@code acceptOffset == -1 || startOffset <= acceptOffset}
     * </p>
     *
     * <p>
     * Reset to {@code -1} by {@link #initScanner}. {@link #acceptOrError} reports a scanning failure if the value is
     * {@code -1}.
     * </p>
     */
    protected int acceptOffset;

    /**
     * The 1-based index (inclusive) into the input, of the line of the last accepted position for the current token
     * (the current longest match). Value {@code -1} indicates that we have not yet found any match for the current
     * token.
     *
     * <p>
     * Used as end line of the position of tokens constructed by {@link #acceptOrError}.
     * </p>
     */
    protected int acceptLine;

    /**
     * The 1-based index (inclusive) into the input, of the column of the last accepted position for the current token
     * (the current longest match). Value {@code -1} indicates that we have not yet found any match for the current
     * token.
     *
     * <p>
     * Used as end column of the position of tokens constructed by {@link #acceptOrError}.
     * </p>
     */
    protected int acceptColumn;

    /**
     * The 0-based index (inclusive) into the input, of the current offset (the current position to consider for
     * scanning).
     *
     * <p>
     * Note that:
     * <ul>
     * <li>{@code startOffset <= curOffset}</li>
     * <li>{@code acceptOffset == -1 || acceptOffset <= curOffset}</li>
     * </ul>
     * </p>
     *
     * <p>
     * Reset to {@code 0} by {@link #initScanner}. After accepting a match, {@link #acceptOrError} moves it to the
     * offset directly after the match ({@code acceptOffset + 1}).
     * </p>
     */
    protected int curOffset;

    /**
     * The 1-based index (inclusive) into the input, of the current line. After accepting a match,
     * {@link #acceptOrError} sets it to the line of the position directly after the match.
     */
    protected int curLine;

    /**
     * The 1-based index (inclusive) into the input, of the current column. After accepting a match,
     * {@link #acceptOrError} sets it to the column of the position directly after the match.
     */
    protected int curColumn;

    /**
     * The unique id for the terminal accepted at position {@link #acceptOffset}. Only valid if
     * {@code acceptOffset != -1}. The unique ids correspond to indices into {@link #terminals}.
     *
     * <p>
     * Reset to {@code -1} by {@link #initScanner}. The per code point debug output treats {@code -1} as 'nothing
     * accepted yet'.
     * </p>
     */
    protected int accept;

    /**
     * The unique id for the current scanner state. Note that the scanner state does <em>not</em> refer to the state of
     * the scanner's DFA, but to the states defined in the SeText specification.
     *
     * <p>
     * The default state is always state {@code 0}, for any scanner.
     * </p>
     *
     * <p>
     * Reset to {@code 0} by {@link #initScanner}. Used as index into {@link #scannerStates} for debug output.
     * </p>
     */
    protected int scannerState;

    /**
     * Whether to output debug information for the scanner. Set by {@link #initScanner}, based on the debug mode given
     * to that method.
     */
    protected boolean debugScanner;

    /**
     * Whether to optimize the scanner to skip tokens if they are not needed. Set by {@link #initScanner}.
     *
     * @see #shouldSkipToken
     */
    protected boolean optimizeScanner;

    /**
     * Textual representations of the scanner states, for debugging. Indexed by scanner state id, see
     * {@link #scannerState}. Not assigned by this base class; presumably provided by the generated scanner (TODO
     * confirm).
     */
    protected String[] scannerStates;

    /**
     * For each terminal, whether it needs post processing. Indexed by terminal id.
     *
     * @see #tokenAccepted
     */
    protected boolean[] terminalNeedsPost;

    /** Textual representations of the terminals, for debugging. Indexed by terminal id. */
    protected String[] terminals;

    /**
     * Names of the terminals (may be {@code null}), for exceptions. Indexed by terminal id. An entry that is not
     * {@code null} marks a named terminal; tokens for named terminals are never skipped by the scanner.
     */
    protected String[] terminalNames;

    /**
     * Descriptions of the terminals (may be {@code null}), for exceptions. Not used by this base class itself;
     * presumably indexed by terminal id, like the other per-terminal arrays (TODO confirm).
     */
    protected String[] terminalDescriptions;

    /**
     * Prepares the scanner for scanning an input. May be invoked again later, to reuse the scanner for another input.
     *
     * <p>
     * The position tracking is reset to the start of the input (offset {@code 0}, line {@code 1}, column {@code 1}),
     * any previous longest match is forgotten, a fresh code point buffer is created, and the scanner state is reset to
     * the default state {@code 0}. If scanner debugging is enabled, a line announcing the scan is written to the
     * output.
     * </p>
     *
     * @param reader The reader to use to read code points from the input.
     * @param location The location of the source file being scanned. Must be an absolute local file system path, with
     *     platform specific path separators. The path does not have to refer to an existing file.
     * @param src The source of the input data. This text is prefixed to exception messages. May be {@code null} to
     *     indicate no source information is available.
     * @param debug The debug mode. Scanner debug output is enabled for {@link DebugMode#SCANNER} and
     *     {@link DebugMode#BOTH}.
     * @param optimize Whether to optimize the scanner to skip tokens if they are not needed.
     */
    public void initScanner(CodePointReader reader, String location, String src, DebugMode debug, boolean optimize) {
        // Configuration flags. The debug flag is needed first, as it decides on the announcement below.
        this.optimizeScanner = optimize;
        this.debugScanner = (debug == DebugMode.SCANNER) || (debug == DebugMode.BOTH);
        if (this.debugScanner) {
            out("%s: Scanning...", getClass().getSimpleName());
        }

        // Input, and information about where it originates from.
        this.reader = reader;
        this.buffer = new CodePointBuffer();
        this.location = location;
        this.src = src;

        // Both the start of the first token and the current position are at the very start of the input.
        startOffset = curOffset = 0;
        startLine = curLine = 1;
        startColumn = curColumn = 1;

        // No longest match yet, and thus no accepted terminal either.
        acceptOffset = acceptLine = acceptColumn = accept = -1;

        // Start in the default scanner state.
        scannerState = 0;
    }

    /**
     * Provides the next code point to consider for scanning.
     *
     * <p>
     * Code points that are still available in the {@link #buffer} are served from there. Only if the buffer has nothing
     * left to read, a single additional code point is read from the input and added to the buffer first.
     * </p>
     *
     * @return The next code point to consider for scanning.
     * @throws IOException If an I/O error occurred.
     */
    protected int getNextCodePoint() throws IOException {
        // Fast path: the buffer still holds unread data.
        if (buffer.canRead()) {
            return buffer.read();
        }

        // Buffer exhausted: extend it with one more code point from the input, and serve that.
        int fromInput = reader.read();
        buffer.add(fromInput);
        return buffer.read();
    }

    /**
     * Scans the input until a token is recognized that is not skipped, and returns that token.
     *
     * <p>
     * Skipped tokens, as indicated by {@link #nextTokenInternal} returning {@link #SKIPPED_TOKEN}, are silently
     * dropped.
     * </p>
     *
     * @return The next token recognized.
     * @throws IOException If an I/O error occurred.
     * @throws SyntaxException If scanning failed.
     */
    public Token nextToken() throws IOException {
        Token recognized;
        do {
            // The skipped token is a singleton, so an identity comparison suffices.
            recognized = nextTokenInternal();
        } while (recognized == SKIPPED_TOKEN);
        return recognized;
    }

    /**
     * Scans the input and returns the next skipped or non-skipped token recognized.
     *
     * <p>
     * This method is invoked repeatedly by {@link #nextToken}, until it returns something other than
     * {@link #SKIPPED_TOKEN}. This base class provides {@link #getNextCodePoint} to obtain input and
     * {@link #acceptOrError} to turn the longest match into a token; presumably implementations are built on those
     * (TODO confirm against the generated scanners).
     * </p>
     *
     * @return The next token recognized, or {@link #SKIPPED_TOKEN} if {@link #optimizeScanner} is enabled and the token
     *     is to be {@link #shouldSkipToken skipped}.
     * @throws IOException If an I/O error occurred.
     * @throws SyntaxException If scanning failed.
     */
    public abstract Token nextTokenInternal() throws IOException;

    /**
     * Finishes the current token: accepts the longest match found so far, or reports a scanning failure if there is no
     * such match.
     *
     * <p>
     * On acceptance, the input that was read beyond the match is pushed back into the buffer, the matched text is
     * taken out of the buffer, and the current position is moved to directly after the match. Tokens that are not
     * skipped are post-processed via {@link #tokenAccepted} if their terminal requires it, and are reported in the
     * debug output if scanner debugging is enabled.
     * </p>
     *
     * @return A newly constructed token for the longest match, or {@link #SKIPPED_TOKEN} if the token is
     *     {@link #shouldSkipToken skipped}.
     * @throws SyntaxException If scanning failed (no match for the current token).
     */
    protected Token acceptOrError() {
        // Without a longest match, scanning fails at the current position.
        if (acceptOffset == -1) {
            TextPosition failurePos = new TextPosition(location, src, curLine, curColumn, curLine, curColumn,
                    curOffset, curOffset);

            // Step back one code point and read it again, to obtain the offending code point for the error.
            buffer.unread(1);
            throw new ScanException(buffer.read(), failurePos);
        }

        // Decide on skipping first, before the buffer is manipulated below.
        final boolean skipToken = shouldSkipToken(accept);

        // Push back everything that was read beyond the longest match.
        buffer.unread(curOffset - acceptOffset);

        // Take the matched text out of the buffer. For skipped tokens the text itself is not needed, only whether
        // the match ends with a new line.
        final int matchLength = acceptOffset - startOffset + 1;
        String matchedTxt = null;
        boolean endsWithNewLine;
        if (skipToken) {
            Character lastOfMatch = buffer.removePrefix(matchLength);
            endsWithNewLine = lastOfMatch != null && lastOfMatch == '\n';
        } else {
            matchedTxt = buffer.pollPrefix(matchLength);
            endsWithNewLine = matchedTxt != null && matchedTxt.endsWith("\n");
        }

        // Continue scanning directly after the match. A match that ends with a new line puts the current position
        // at the start of the next line.
        curOffset = acceptOffset + 1;
        if (endsWithNewLine) {
            curLine = acceptLine + 1;
            curColumn = 1;
        } else {
            curLine = acceptLine;
            curColumn = acceptColumn + 1;
        }

        // Skipped tokens get no position, no post-processing and no debug output.
        if (skipToken) {
            return SKIPPED_TOKEN;
        }

        // Construct the token, with the position of the match.
        TextPosition matchPos = new TextPosition(location, src, startLine, startColumn, acceptLine, acceptColumn,
                startOffset, acceptOffset);
        Token accepted = new Token(matchedTxt, accept, matchPos);

        // Post-process first, such that the debug output shows the token as it is returned.
        if (terminalNeedsPost[accept]) {
            tokenAccepted(accepted);
        }
        if (debugScanner) {
            debugScanner(accepted);
        }

        return accepted;
    }

    /**
     * Should the recognized token be skipped by the scanner?
     *
     * <p>
     * Tokens are only skipped if none of the following conditions hold:
     * <ul>
     * <li>Token optimization is disabled. See {@link #optimizeScanner}.</li>
     * <li>The token is a named token. See {@link #terminalNames}.</li>
     * <li>The token is an end-of-file token. See {@link #buffer} and {@link CodePointBuffer#peekEndOfFile}.</li>
     * <li>The token needs to be post processed. See {@link #terminalNeedsPost}.</li>
     * <li>Scanner debug output is enabled. See {@link #debugScanner}.</li>
     * </ul>
     * </p>
     *
     * <p>
     * The per-terminal checks are based on the given terminal id, not on the {@link #accept} field, such that the
     * answer always pertains to the terminal that is asked about.
     * </p>
     *
     * @param id The id of the recognized terminal. Must be a valid index into {@link #terminalNames} and
     *     {@link #terminalNeedsPost}.
     * @return {@code true} if the token should be skipped, {@code false} otherwise.
     */
    private boolean shouldSkipToken(int id) {
        // We only skip if we optimize.
        if (!optimizeScanner) {
            return false;
        }

        // We never skip tokens for named terminals.
        if (terminalNames[id] != null) {
            return false;
        }

        // We never skip tokens for end-of-file terminals.
        if (buffer.peekEndOfFile()) {
            return false;
        }

        // We never skip tokens for terminals that need post processing.
        if (terminalNeedsPost[id]) {
            return false;
        }

        // We never skip tokens if scanner debugging is enabled.
        if (debugScanner) {
            return false;
        }

        // Skip the token.
        return true;
    }

    /**
     * Callback method invoked for accepted, non-skipped tokens that need post processing.
     *
     * <p>
     * The default implementation does nothing. Derived classes may override it to:
     * <ul>
     * <li>change the scanner state based on the accepted terminal,</li>
     * <li>call the appropriate post-processing method for the terminal,</li>
     * <li>etc.</li>
     * </ul>
     * </p>
     *
     * <p>
     * Method is only invoked by scanner if the token needs post processing, see {@link #terminalNeedsPost}. It is
     * invoked by {@link #acceptOrError}, after the token is constructed and before the debug output for the accepted
     * token is produced.
     * </p>
     *
     * @param token The accepted token. The scanned text may be modified in-place.
     */
    protected void tokenAccepted(Token token) {
        // By default, does nothing.
    }

    /**
     * Writes a line of scanner debug output for a single processed code point. Must only be called if
     * {@link #debugScanner} is {@code true}.
     *
     * <p>
     * The line shows the code point (or {@code <eof>}), the current line and column, and the currently accepted
     * terminal, if any.
     * </p>
     *
     * @param codePoint The processed code point, or {@code -1} for end-of-file.
     * @param state The new DFA state of the scanner, or {@code -1} if not available or applicable. It is currently not
     *     included in the output.
     */
    protected void debugScanner(int codePoint, int state) {
        // Describe the processed code point: as Java string literal text, followed by its hexadecimal value.
        final String scannedTxt;
        if (codePoint != -1) {
            String javaTxt = Strings.stringToJava(Strings.codePointToStr(codePoint));
            scannedTxt = javaTxt + fmt(" (Unicode U+%s)", Integer.toHexString(codePoint));
        } else {
            scannedTxt = "<eof>";
        }

        // Describe the currently accepted terminal, if there is one.
        final String acceptSuffix = (accept == -1) ? "" : fmt(" (accept %d=%s)", accept, terminals[accept]);

        out("%s: Scanned text: %s @ line %d, column %d%s", getClass().getSimpleName(), scannedTxt, curLine, curColumn,
                acceptSuffix);
    }

    /**
     * Writes a line of scanner debug output for an accepted token. Must only be called if {@link #debugScanner} is
     * {@code true}.
     *
     * <p>
     * The line shows the token text (or {@code <eof>}), the position range of the token, the current scanner state, and
     * the terminal of the token.
     * </p>
     *
     * @param token The accepted token.
     */
    private void debugScanner(Token token) {
        // The text of the token, as Java string literal text. End-of-file has no meaningful text of its own.
        String tokenTxt;
        if (token.isEof()) {
            tokenTxt = "<eof>";
        } else {
            tokenTxt = Strings.stringToJava(token.text);
        }

        // The range covered by the token, as 'startLine:startColumn-endLine:endColumn'.
        String rangeTxt = fmt("%d:%d-%d:%d", token.position.startLine, token.position.startColumn,
                token.position.endLine, token.position.endColumn);

        // Scanner state and terminal are shown both by id and by their textual representation.
        out("%s: Accepted token: %s @ %s (state=%d=\"%s\") (terminal=%d=%s)", getClass().getSimpleName(), tokenTxt,
                rangeTxt, scannerState, scannerStates[scannerState], token.id, terminals[token.id]);
    }

    /**
     * Scans an in-memory text, from start to end, into a sequence of tokens. Does not provide source information for
     * scanner exceptions.
     *
     * <p>
     * The scanner is (re-)initialized for the given text, after which tokens are requested until the end-of-file token
     * is obtained.
     * </p>
     *
     * @param input The input to scan.
     * @param location The location of the source file being scanned. Must be an absolute local file system path, with
     *     platform specific path separators. The path does not have to refer to an existing file.
     * @param debug The debug mode to use.
     * @param optimize Whether to optimize the scanner to skip tokens if they are not needed.
     * @return The sequence of tokens. The last token is the end-of-file token.
     * @throws IOException If an I/O error occurred.
     * @throws SyntaxException If scanning failed.
     */
    public List<Token> scanString(String input, String location, DebugMode debug, boolean optimize) throws IOException {
        // Set up the scanner for the text. In-memory strings need no buffering by the reader, and there is no
        // source information to provide.
        initScanner(new CodePointReader(new StringReader(input), false), location, null, debug, optimize);

        // Collect tokens, up to and including the end-of-file token.
        List<Token> scanned = list();
        Token current;
        do {
            current = nextToken();
            scanned.add(current);
        } while (!current.isEof());

        return scanned;
    }
}
