tok.t

#charset "us-ascii"

/*
 *   Tokenizer - customizable tokenizer class for use with the intrinsic
 *   class 'grammar-production' parser.
 *   
 *   This tokenizer implementation is parameterized with a set of rules
 *   (see below); a basic set of rules is provided, but users can
 *   customize the tokenizer quite extensively simply by subclassing the
 *   Tokenizer class and overriding the 'rules_' property with a new set
 *   of rules declarations.  
 */

#include "tads.h"
#include "t3.h"
#include "dict.h"
#include "tok.h"
#include "vector.h"

/* ------------------------------------------------------------------------ */
/*
 *   Tokenizer exceptions 
 */

/*
 *   base class for all tokenizer errors (to allow blanket 'catch') 
 */
class TokenizerError: Exception
    displayException() { "Tokenizer exception"; }
;

/*
 *   no match for token 
 */
class TokErrorNoMatch: TokenizerError
    construct(str)
    {
        /* remember the full remaining text */
        remainingStr_ = str;

        /* 
         *   for convenience, separately remember the single character
         *   that we don't recognize - this is simply the first character
         *   of the rest of the line 
         */
        curChar_ = str.substr(1, 1);
    }

    displayException()
        { "Tokenizer error: unexpected character '<<curChar_>>'"; }

    /* 
     *   The remainder of the string.  This is the part that couldn't be
     *   matched; we were successful in matching up to this point. 
     */
    remainingStr_ = nil

    /* current character (first character of remainingStr_) */
    curChar_ = nil
;
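
/*
 *   Since both tokenizer errors derive from TokenizerError, callers can
 *   catch the whole family with a single 'catch' clause.  A minimal
 *   sketch, where 'str' is the input string:
 *   
 *      try
 *      {
 *          local toks = Tokenizer.tokenize(str);
 *      }
 *      catch (TokenizerError err)
 *      {
 *          err.displayException();
 *      }
 */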

/* ------------------------------------------------------------------------ */
/*
 *   Basic token types
 */

/* word */
enum token tokWord;

/* quoted string */
enum token tokString;

/* punctuation */
enum token tokPunct;

/* integer number */
enum token tokInt;


/* ------------------------------------------------------------------------ */
/*
 *   Tokenizer base class
 */
class Tokenizer: object
    /*
     *   Tokenizing rules.  The subclass can override this to specify a
     *   list that defines different tokenization rules.  Each entry in the
     *   master rules_ list is one rule.  Each rule is a list consisting of
     *   the name of the rule; the pattern to match for the rule; the token
     *   type (an 'enum token') to use when the rule is matched; the value
     *   computation rule; and the value test rule.
     *   
     *   The name of a rule is just an arbitrary string to identify the
     *   rule.  This can be used to insert new rules in order relative to
     *   known existing rules, or to delete known existing rules.
     *   
     *   If the value computation rule is nil, we'll just use the matching
     *   text as the token value.  If the value rule is a string, we'll use
     *   the string as a replacement pattern (with rexReplace).  If it's a
     *   property ID, we'll invoke the property of self with the following
     *   arguments:
     *   
     *   txt, typ, toks
     *   
     *   'txt' is the matched text; 'typ' is the token type from the rule;
     *   and 'toks' is a vector to which the new token or tokens are to be
     *   added.  The routine is responsible for adding the appropriate
     *   values to the result list.  Note that the routine can add more
     *   than one token to the results if desired.
     *   
     *   If the value test rule is non-nil, it must be either a method or a
     *   function; we'll call the method or function to test to see if the
     *   matched value is valid.  We'll call the method (on self) with the
     *   matching text as the argument; if the method returns true, the
     *   rule matches, otherwise the rule fails, and we'll continue looking
     *   for another rule as though we hadn't matched the rule's regular
     *   expression in the first place.  This can be used for rules that
     *   require more than a simple regular expression match; for example,
     *   the value test can be used to look up the match in a dictionary,
     *   so that the rule only matches tokens that are defined in the
     *   dictionary.  
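     *   
     *   (A commented sketch of such a customization follows the default
     *   rules below.)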
     */
    rules_ = static
    [
        /* skip whitespace */
        ['whitespace', R'<Space>+', nil, &tokCvtSkip, nil],

        /* certain punctuation marks */
        ['punctuation', R'[.,;:?!]', tokPunct, nil, nil],

        /* 
         *   Words - note that we convert everything to lower-case.  A
         *   word must start with an alphabetic character, but can contain
         *   alphabetics, digits, hyphens, and apostrophes after that. 
         */
        ['word', R'<Alpha>(<AlphaNum>|[-\'])*', tokWord, &tokCvtLower, nil],

        /* strings */
        ['string single-quote', R'\'(.*)\'', tokString, nil, nil],
        ['string double-quote', R'"(.*)"', tokString, nil, nil],

        /* integer numbers */
        ['integer', R'[0-9]+', tokInt, nil, nil]
    ]
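
    /*
     *   A minimal customization sketch (not part of the library): a
     *   subclass can replace the rule list wholesale.  'DictTokenizer',
     *   'myDict', and 'myWordTest' are hypothetical names; isWordDefined()
     *   is the Dictionary intrinsic class method declared in dict.h.
     *   
     *      class DictTokenizer: Tokenizer
     *          rules_ = static
     *          [
     *              // skip whitespace, as in the base rules
     *              ['whitespace', R'<Space>+', nil, &tokCvtSkip, nil],
     *   
     *              // accept a word only if the value test finds it in
     *              // our dictionary
     *              ['word', R'<Alpha>+', tokWord, &tokCvtLower, &myWordTest],
     *   
     *              // integers
     *              ['integer', R'[0-9]+', tokInt, nil, nil]
     *          ]
     *   
     *          // value test: look the word up in a hypothetical dictionary
     *          myWordTest(txt) { return myDict.isWordDefined(txt.toLower()); }
     *      ;
     */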

    /*
     *   Insert a new rule before or after the existing rule with the name
     *   'curName'.  If 'curName' is nil, or no rule is found with the given
     *   name, we'll insert the new rule at the end of the list.  'rule'
     *   must be a list with the standard elements for a tokenizer rule.
     *   'after' is nil to insert the new rule before the given existing
     *   rule, true to insert after it.  
     */
    insertRule(rule, curName, after)
    {
        local idx;

        /* 
         *   if the name of an existing rule was supplied, find the
         *   existing rule with the given name 
         */
        idx = nil;
        if (curName != nil)
            idx = rules_.indexWhich({x: tokRuleName(x) == curName});

        /* if we didn't find curName, simply insert at the end of the list */
        if (idx == nil)
        {
            insertRuleAt(rule, rules_.length() + 1);
            return;
        }

        /* if we're inserting after the given element, adjust the index */
        if (after)
            ++idx;

        /* insert the new rule */
        insertRuleAt(rule, idx);
    }
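
    /*
     *   Usage sketch (hypothetical rule): insert an 'ellipsis' rule ahead
     *   of the default 'punctuation' rule, so that '...' is tokenized as
     *   one punctuation token rather than three separate periods ('%' is
     *   the regular-expression escape character):
     *   
     *      insertRule(['ellipsis', R'%.%.%.', tokPunct, nil, nil],
     *                 'punctuation', nil);
     */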

    /* 
     *   Insert a rule at the given index in our rules list.  'rule' must
     *   be a list with the standard elements for a tokenizer rule.  'idx'
     *   is the index of the new rule; we'll insert before the existing
     *   element at this index; so if 'idx' is 1, we'll insert before the
     *   first existing rule.  
     */
    insertRuleAt(rule, idx)
    {
        /* insert the rule */
        rules_ = rules_.insertAt(idx, rule);
    }

    /*
     *   Delete a rule by name.  This finds the rule with the given name
     *   and removes it from the list. 
     */
    deleteRule(name)
    {
        local idx;
        
        /* find the rule with the given name */
        idx = rules_.indexWhich({x: tokRuleName(x) == name});

        /* if we found the named element, remove it from the list */
        if (idx != nil)
            deleteRuleAt(idx);
    }

    /* delete the rule at the given index */
    deleteRuleAt(idx)
    {
        /* delete the rule */
        rules_ = rules_.removeElementAt(idx);
    }
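
    /*
     *   Usage sketch: a subclass or initialization routine that never
     *   needs single-quoted strings in input could simply drop the
     *   corresponding default rule by name:
     *   
     *      deleteRule('string single-quote');
     */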

    /* convert a string to lower-case (for value computation rules) */
    tokCvtLower(txt, typ, toks)
    {
        /* add the lower-cased version of the string to the result list */
        toks.append([txt.toLower(), typ, txt]);
    }

    /* 
     *   processing routine to skip a match - this is used for whitespace
     *   and other text that does not result in any tokens in the result
     *   list 
     */
    tokCvtSkip(txt, typ, toks)
    {
        /* simply skip the text without generating any new tokens */
    }
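
    /*
     *   Sketch of a custom value computation rule (hypothetical, not part
     *   of the library): a property named in a rule as '&tokCvtHyphenated'
     *   could split a hyphenated word into one word token per part.  A
     *   value computation rule is free to append any number of tokens to
     *   the result vector.
     *   
     *      tokCvtHyphenated(txt, typ, toks)
     *      {
     *          // emit one token per hyphen-separated part
     *          foreach (local part in txt.split('-'))
     *              toks.append([part.toLower(), typ, part]);
     *      }
     */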

    /*
     *   Tokenize a string.  If we find text that we can't match to any of
     *   the rules, we'll throw an exception (TokErrorNoMatch).  If we
     *   succeed in tokenizing the entire string, we'll return a list with
     *   one element per token.  Each element of the main list is a
     *   sublist with the following elements describing a token:
     *   
     *   - The first element gives the token's value.
     *   
     *   - The second element gives the token type (as a token type enum
     *   value).
     *   
     *   - The third element gives the original token text, before any
     *   conversions or evaluations were performed.  For example, this
     *   maintains the original case of strings that are lower-cased for
     *   the corresponding token values.
     */
    tokenize(str)
    {
        local toks = new Vector(32);
        local startIdx = 1;
        local len = str.length();
        
        /* keep going until we run out of string */
    mainLoop:
        while (startIdx <= len)
        {
            /* run through the rules in sequence until we match one */
        ruleLoop:
            for (local i = 1, local cnt = rules_.length() ; i <= cnt ; ++i)
            {
                local cur;
                local match;
                local val;
                        
                /* get the current rule */
                cur = rules_[i];

                /* check for a match to the rule's pattern */
                match = rexMatch(tokRulePat(cur), str, startIdx);
                if (match != nil && match > 0)
                {
                    local test;
                    local txt;
                    local typ;

                    /* get the matching text */
                    txt = str.substr(startIdx, match);

                    /* 
                     *   if there's a value test, invoke it to determine
                     *   if the token really matches 
                     */
                    if ((test = tokRuleTest(cur)) != nil)
                    {
                        local accept;

                        /* check what kind of test function we have */
                        switch (dataType(test))
                        {
                        case TypeFuncPtr:
                        case TypeObject:
                            /* it's a function or anonymous function */
                            accept = (test)(txt);
                            break;

                        case TypeProp:
                            /* it's a method */
                            accept = self.(test)(txt);
                            break;

                        default:
                            /* consider anything else to be accepted */
                            accept = true;
                            break;
                        }

                        /* 
                         *   if the value test failed, it means that the
                         *   token doesn't match this rule after all -
                         *   ignore the regex match and keep searching for
                         *   another rule 
                         */
                        if (!accept)
                            continue ruleLoop;
                    }

                    /* get the type of the token from the rule */
                    typ = tokRuleType(cur);
                    
                    /* get the value computation rule */
                    val = tokRuleVal(cur);

                    /* determine what value to use */
                    switch (dataTypeXlat(val))
                    {
                    case TypeNil:
                        /* use the matching text verbatim */
                        toks.append([txt, typ, txt]);
                        break;
                        
                    case TypeProp:
                        /* 
                         *   invoke the property - it's responsible for
                         *   adding the token or tokens to the results
                         *   lists 
                         */
                        self.(val)(txt, typ, toks);
                        break;
                        
                    case TypeSString:
                        /* it's a regular expression replacement */
                        toks.append(
                            [rexReplace(tokRulePat(cur),
                                        txt, val, ReplaceOnce),
                             typ, txt]);
                        break;

                    case TypeFuncPtr:
                        /* invoke the function */
                        (val)(txt, typ, toks);
                        break;

                    default:
                        /* 
                         *   use any other value exactly as given in
                         *   the rule 
                         */
                        toks.append([val, typ, txt]);
                        break;
                    }

                    /* 
                     *   continue the search at the next character after
                     *   the end of this token 
                     */
                    startIdx += match;

                    /* start over with the rest of the string */
                    continue mainLoop;
                }
            }

            /*
             *   We failed to find a match for this part of the string.
             *   Throw an exception and let the caller figure out what to
             *   do.  The exception parameter gives the rest of the
             *   string, so the caller can display a suitable error
             *   message if desired.  
             */
            throw new TokErrorNoMatch(str.substr(startIdx));
        }

        /* we're done with the string - return our list of tokens */
        return toks.toList();
    }
;
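
/*
 *   Usage sketch: tokenizing a string and reading back the results.
 *   getTokVal(), getTokType(), and getTokOrig() are the token accessor
 *   macros defined in tok.h.
 *   
 *      local toks = Tokenizer.tokenize('Hello, World!');
 *      // toks[1] is ['hello', tokWord, 'Hello']
 *      // toks[2] is [',', tokPunct, ',']
 *      foreach (local t in toks)
 *          "(<<getTokVal(t)>>) ";
 */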

/* ------------------------------------------------------------------------ */
/*
 *   Test Section 
 */

#ifdef TOK_TEST

main(args)
{
    "Enter text to tokenize.  Type Q or QUIT when done. ";
    for (;;)
    {
        local str, toks;

        /* read a string */
        "\b>";
        str = inputLine();

        /* catch tokenization errors */
        try
        {
            /* tokenize the string */
            toks = Tokenizer.tokenize(str);

            /* if the first token is 'quit', we're done */
            if (toks.length() > 0
                && getTokType(toks[1]) == tokWord
                && (getTokVal(toks[1]) == 'quit' || getTokVal(toks[1]) == 'q'))
            {
                /* they want to stop - exit the command loop */
                break;
            }

            /* display the tokens */
            for (local i = 1, local cnt = toks.length() ; i <= cnt ; ++i)
                "(<<getTokVal(toks[i])>>) ";
        }
        catch (TokErrorNoMatch err)
        {
            "Unrecognized punctuation: <<err.remainingStr_.substr(1, 1)>>";
        }
    }
}

#endif /* TOK_TEST */
