/*
 *   Tokenizer - customizable tokenizer class for use with the intrinsic
 *   class 'grammar-production' parser.
 *   
 *   This tokenizer implementation is parameterized with a set of rules
 *   (see below); a basic set of rules is provided, but users can
 *   customize the tokenizer quite extensively simply by subclassing the
 *   Tokenizer class and overriding the 'rules_' property with a new set
 *   of rules declarations.  
 */

#include "tads.h"
#include "t3.h"
#include "dict.h"
#include "tok.h"

/* ------------------------------------------------------------------------ */
/*
 *   Tokenizer exceptions 
 */

/*
 *   Base class for all tokenizer errors.  Every exception defined in this
 *   module derives from this class, so a caller can write a single
 *   'catch (TokenizerError err)' to handle any tokenizer failure without
 *   listing the individual error classes.
 */
class TokenizerError: Exception;

/*
 *   Tokenizer error: no rule matches the text at the current position.
 *   The exception carries the portion of the input that was left over
 *   when matching failed.
 */
class TokErrorNoMatch: TokenizerError
    /* 
     *   The unmatched remainder of the input string.  Everything before
     *   this text was tokenized successfully; the first character of this
     *   text is where matching failed.  This is nil until the constructor
     *   stores a value.
     */
    remainingStr_ = nil

    /* create the exception, remembering the unmatched remainder 'str' */
    construct(str)
    {
        remainingStr_ = str;
    }
;

/* ------------------------------------------------------------------------ */
/*
 *   Basic token types
 */

/* word - the type used by the default Tokenizer 'words' rule */
enum token tokWord;

/* quoted string - the type used by the default Tokenizer 'strings' rule */
enum token tokString;

/* punctuation - the type used by the default Tokenizer punctuation rule */
enum token tokPunct;

/* integer number - the type used by the default Tokenizer integer rule */
enum token tokInt;


/* ------------------------------------------------------------------------ */
/*
 *   Some internal convenience macros.  Each one extracts one field from a
 *   tokenizer rule, which is a list of four elements indexed from 1:
 *   
 *   [1] the pattern to match; [2] the token type; [3] the rule flags;
 *   [4] the value computation rule.
 */
#define tokRulePat(rule)    (rule[1])
#define tokRuleType(rule)   (rule[2])
#define tokRuleFlags(rule)  (rule[3])
#define tokRuleVal(rule)    (rule[4])


/* ------------------------------------------------------------------------ */
/*
 *   Tokenizer base class.  The tokenizer is driven entirely by the rule
 *   list in 'rules_'; subclasses customize tokenization by overriding
 *   that property (and optionally adding value-conversion methods).
 */
class Tokenizer: object
    /*
     *   The rule table.  Rules are tried in the order listed, and the
     *   first rule whose pattern matches a non-empty prefix of the
     *   remaining input wins.  Each rule is a four-element list:
     *   
     *   [pattern, token type, flags, value rule]
     *   
     *   'pattern' is the regular expression to match at the start of the
     *   remaining text.  'token type' is the 'enum token' value recorded
     *   for a match.  'flags' is a bit mask; TOKFLAG_SKIP means the
     *   matched text is consumed without producing a token.
     *   
     *   'value rule' selects how the token value is derived from the
     *   matched text:
     *   
     *   nil - the matched text itself is the value;
     *   
     *   property ID - the property is invoked on self with the matched
     *   text as its argument, and the result is the value;
     *   
     *   string - the string is used as a replacement pattern, applied to
     *   the matched text via rexReplace with the rule's own pattern;
     *   
     *   anything else - the value rule itself is used as the value.
     */
    rules_ =
    [
        /* whitespace: consumed, but generates no token */
        ['[ \t]+', nil, TOKFLAG_SKIP, nil],

        /* a single punctuation mark */
        ['[.,;:?!]', tokPunct, 0, nil],

        /* words: the token value is the lower-case form of the text */
        ['[a-zA-Z][-\'a-zA-Z0-9]*', tokWord, 0, &tokCvtLower],

        /* 
         *   quoted strings
         *   
         *   NOTE(review): the '.*' here presumably matches as much as
         *   possible, so text containing several quoted strings would
         *   come back as one token - confirm whether that's intended.
         */
        ['[\'"](.*)[\'"]', tokString, 0, nil],

        /* integers: one or more decimal digits */
        ['[0-9]+', tokInt, 0, nil]
    ]

    /* value rule helper: return the lower-case version of 'str' */
    tokCvtLower(str)
    {
        return str.toLower();
    }

    /*
     *   Break 'str' into tokens using rules_.
     *   
     *   Returns a two-element list, [values, types].  'values' holds the
     *   token values in input order, and 'types' holds the corresponding
     *   token types; index N in each list describes the same token.
     *   
     *   Throws TokErrorNoMatch, carrying the unconsumed remainder of the
     *   string, if no rule matches a non-empty prefix of the remaining
     *   text.  
     */
    tokenize(str)
    {
        local vals = [];
        local types = [];

        /* consume the input one match at a time until nothing is left */
        while (str != '')
        {
            local rule = nil;
            local matchLen = nil;

            /* find the first rule matching a non-empty prefix */
            for (local idx = 1, local ruleCnt = rules_.length() ;
                 idx <= ruleCnt ; ++idx)
            {
                local candidate = rules_[idx];
                local len = rexMatch(tokRulePat(candidate), str);

                /* zero-length matches don't count - they consume nothing */
                if (len != nil && len > 0)
                {
                    rule = candidate;
                    matchLen = len;
                    break;
                }
            }

            /*
             *   If no rule applied, give up: hand the unmatched remainder
             *   to the caller via the exception so that it can report the
             *   problem however it likes.
             */
            if (matchLen == nil)
                throw new TokErrorNoMatch(str);

            /* SKIP rules consume text without generating a token */
            if (!(tokRuleFlags(rule) & TOKFLAG_SKIP))
            {
                local valRule = tokRuleVal(rule);
                local txt = str.substr(1, matchLen);
                local kind = dataType(valRule);
                local tokVal;

                /* note the token's type */
                types += tokRuleType(rule);

                /* derive the token's value according to the value rule */
                if (kind == TYPE_NIL)
                {
                    /* no value rule - the matched text is the value */
                    tokVal = txt;
                }
                else if (kind == TYPE_PROP)
                {
                    /* property - call it on self with the matched text */
                    tokVal = self.(valRule)(txt);
                }
                else if (kind == TYPE_SSTRING)
                {
                    /* string - apply it as a replacement pattern */
                    tokVal = rexReplace(tokRulePat(rule), txt, valRule,
                                        REPLACE_ONCE);
                }
                else
                {
                    /* any other type - the value rule is the value */
                    tokVal = valRule;
                }

                /*
                 *   Add the value to the result.
                 *   
                 *   NOTE(review): if a value can itself be a list, '+='
                 *   presumably appends its elements individually, which
                 *   would put the two result lists out of step - confirm.
                 */
                vals += tokVal;
            }

            /* drop the matched prefix and carry on with what remains */
            str = str.substr(matchLen + 1);
        }

        /* the whole string was consumed - return our value and type lists */
        return [vals, types];
    }
;

/* ------------------------------------------------------------------------ */
/*
 *   Test Section 
 */

#ifdef TOK_TEST

/*
 *   Interactive test driver.  Repeatedly reads a line of input, tokenizes
 *   it with the default Tokenizer rules, and displays each token value in
 *   parentheses.  The loop ends when the first token on a line is the
 *   word 'q' or 'quit' (words are lower-cased by the tokenizer, so the
 *   check is effectively case-insensitive), or when no more input is
 *   available.  'args' is not used.
 */
main(args)
{
    "Enter text to tokenize.  Type Q or QUIT when done. ";
    for (;;)
    {
        local str, toks;

        /* read a string */
        "\b>";
        str = inputLine();

        /*
         *   A nil result can't be tokenized; treat it as the end of the
         *   session.  (Presumably inputLine() returns nil at end of input
         *   - confirm against the I/O function set documentation.)
         */
        if (str == nil)
            break;

        /* catch tokenization errors */
        try
        {
            /* tokenize the string */
            toks = Tokenizer.tokenize(str);

            /*
             *   If the first token is 'quit', we're done.  Note that
             *   'toks' is always a two-element [values, types] list, so
             *   we must check the length of the value sublist to find out
             *   whether there are any tokens at all; an empty or
             *   all-whitespace line yields no tokens.
             */
            if (toks[1].length() > 0
                && toks[2][1] == tokWord
                && (toks[1][1] == 'quit' || toks[1][1] == 'q'))
            {
                /* they want to stop - exit the command loop */
                break;
            }

            /* display the tokens */
            for (local i = 1, local cnt = toks[1].length() ; i <= cnt ; ++i)
                "(<<toks[1][i]>>) ";
        }
        catch (TokErrorNoMatch err)
        {
            /* show the first character that we couldn't match */
            "Unrecognized punctuation: <<err.remainingStr_.substr(1, 1)>>";
        }
    }
}

#endif /* TOK_TEST */

