// Mini C scanner by Robert van Engelen
// A simple one-pass, syntax-directed translation of mini C to JVM bytecode
// Requires minic.l, minic.y, minic.hpp
// See minicdemo.c for a description of the mini C features

%top{
  #include "parser.hpp"   // generated by bison from minic.y
  #include "location.hpp" // generated by bison %locations
}

// lexer syntax: enable free-space mode regular expressions for clarity
%option freespace

// lexer optimization: generate fast scanner in direct code
%option fast

// lexer class: yy::Scanner
%option namespace=yy
%option lexer=Scanner

// lexer output files
%option outfile=scanner.cpp
%option header-file=scanner.hpp

// lexer errors: throw an exception in the scanner's default rule
%option exception="yy::Parser::syntax_error(location(), \"Unknown token.\")"

// parser integration: output code for the bison complete with locations parser
%option bison-complete
%option bison-locations
%option bison-cc-namespace=yy
%option bison-cc-parser=Parser

// Scanner class
%class{

 public:

  // intern str in the "symbol table": insert it if it is not already present
  // and return the address of the stored std::string entry either way
  ID symbol(const char *str)
  {
    // insert() yields the iterator to the new or already existing entry
    const std::string& entry = *symbols.insert(str).first;
    return &entry;
  }

 private:

  // one entry of the keyword table scanned by keyword_token()
  struct Keyword
  {
    const char *name;  // keyword spelling, NULL in the entry that terminates the table
    int         token; // parser token code returned for this keyword
  };

  // translate keyword to the corresponding token or return 0 if not a keyword
  int keyword_token(const char *str)
  {
    static const Keyword keywords[] =
    {
      { "break",    Parser::token::TOKEN_BREAK },
      { "case",     Parser::token::TOKEN_CASE },
      { "continue", Parser::token::TOKEN_CONTINUE },
      { "default",  Parser::token::TOKEN_DEFAULT },
      { "do",       Parser::token::TOKEN_DO },
      { "else",     Parser::token::TOKEN_ELSE },
      { "false",    Parser::token::TOKEN_FALSE },
      { "float",    Parser::token::TOKEN_FLOAT },
      { "for",      Parser::token::TOKEN_FOR },
      { "if",       Parser::token::TOKEN_IF },
      { "int",      Parser::token::TOKEN_INT },
      { "main",     Parser::token::TOKEN_MAIN },
      { "new",      Parser::token::TOKEN_NEW },
      { "print",    Parser::token::TOKEN_PRINT },
      { "println",  Parser::token::TOKEN_PRINTLN },
      { "return",   Parser::token::TOKEN_RETURN },
      { "string",   Parser::token::TOKEN_STRING },
      { "switch",   Parser::token::TOKEN_SWITCH },
      { "true",     Parser::token::TOKEN_TRUE },
      { "void",     Parser::token::TOKEN_VOID },
      { "while",    Parser::token::TOKEN_WHILE },
      { NULL,       0 }
    };

    for (const Keyword *keyword = keywords; keyword->name != NULL; ++keyword)
      if (strcmp(keyword->name, str) == 0)
        return keyword->token;

    return 0;
  }

  // intern the currently matched text(), which matches an {identifier}, in the
  // "symbol table" and return the resulting symbol
  ID symbol()
  {
    const char *matched = text();
    return symbol(matched);
  }

  // intern the matched text() of a quoted {string} in the "symbol table", with
  // the quotes removed and escapes translated, and return its character data
  CS string()
  {
    // the characters are owned by the symbol table entry, not by a temporary
    const std::string& entry = *symbols.insert(translate_escapes()).first;
    return entry.c_str();
  }

  // return the matched text() of a quoted {string} or {character} without its
  // enclosing quotes and with the escapes \a, \b, \t, \n, \v, \f, \r translated
  // to their control characters; any other escaped character, such as \\ and
  // \", is copied as is without the backslash
  std::string translate_escapes()
  {
    // escape letters in the order of their control codes: '\a' through '\r'
    // are consecutive, so the index of a letter is its offset from '\a'
    static const char escapes[] = "abtnvfr";
    std::string result;
    const char *cur = matcher().begin() + 1; // skip the opening quote
    const char *end = matcher().end() - 1;   // stop before the closing quote
    for (; cur < end; ++cur)
    {
      char c = *cur;
      if (c == '\\')
      {
        // the patterns guarantee that a character follows each backslash
        c = *++cur;
        // strchr() also matches the terminating NUL of escapes, so an escaped
        // NUL character must not be looked up or it would translate to code 14
        const char *esc = c != '\0' ? strchr(escapes, c) : NULL;
        if (esc != NULL)
          c = static_cast<char>('\a' + (esc - escapes));
      }
      result.push_back(c);
    }
    return result;
  }

  // return the token for matched text() matching {identifier}: the keyword
  // token when the text spells a keyword, otherwise an ID token that carries
  // the interned symbol
  Parser::symbol_type ID()
  {
    int token = keyword_token(text());
    if (token != 0)
      return Parser::symbol_type(token, location());
    return Parser::make_ID(symbol(), location());
  }

  // return the U8 token for matched text() matching {integer}, which is either
  // a sequence of decimal digits or a hexadecimal literal prefixed by 0x or 0X
  Parser::symbol_type U8()
  {
    const char *digits = text();
    // a hexadecimal literal must be converted in base 16: in base 10 the
    // conversion stops at the 'x' and every hexadecimal literal becomes 0;
    // base 16 makes strtoul() accept the 0x/0X prefix
    const bool hex = digits[0] == '0' && (digits[1] == 'x' || digits[1] == 'X');
    return Parser::make_U8(strtoul(digits, NULL, hex ? 16 : 10), location());
  }

  // return the F8 token for matched text() matching {float}, carrying the
  // value converted by strtod()
  Parser::symbol_type FP()
  {
    const double value = strtod(text(), NULL);
    return Parser::make_F8(value, location());
  }

  // return the U8 token for matched text() matching {character}, carrying the
  // first character between the quotes after escape translation
  Parser::symbol_type CH()
  {
    const std::string chars = translate_escapes();
    // chars[0] is the terminating null character when the literal is empty
    return Parser::make_U8(chars[0], location());
  }

  // return the CS token for matched text() matching {string}, carrying the
  // string() result: the text without quotes and with escapes translated,
  // stored in the "symbol table"
  Parser::symbol_type CS()
  {
    return Parser::make_CS(string(), location());
  }

  // file inclusion with #include: open the quoted file named in matched text()
  // and continue scanning from it; prints an error and exits the program when
  // the nesting is deeper than 9 or when the file cannot be opened
  void include()
  {
    // count this inclusion first, then enforce the nesting limit
    ++depth;
    if (depth > 9)
    {
      std::cerr << location() << ": too many nested #include, max is 9\n" << matcher().line() << "\n~\n";
      exit(EXIT_FAILURE);
    }

    // the file name is the text between the first quote and the closing quote
    // that ends the matched text
    const char *quote = strchr(text(), '"');
    const std::string name(quote + 1, strlen(quote) - 2);

    FILE *fd = fopen(name.c_str(), "r");
    if (fd == NULL)
    {
      std::cerr << location() << ": cannot open " << name << '\n' << matcher().line() << "\n~\n";
      exit(EXIT_FAILURE);
    }

    // remember the file name of the lexer so that end_of_file() can restore it
    filenames.push(filename);
    // report syntax errors against the included file from here on
    filename = name;
    // save the current matcher and scan the opened file with a new matcher
    push_matcher(new_matcher(fd));
  }

  // handle the end of the current input: return true when scanning should stop
  // because no #include file is open, otherwise leave the included file,
  // resume the file that included it, and return false to continue scanning
  bool end_of_file()
  {
    const bool included = depth > 0;
    if (included)
    {
      // close the FILE* that include() opened for the current input in()
      fclose(in());
      // delete the current matcher and resume with the previously pushed one
      pop_matcher();
      // restore the file name that was saved by include()
      filename = filenames.top();
      filenames.pop();
      --depth;
    }
    return !included;
  }

  // symbol table: the set of unique identifier names and translated string
  // literals inserted by symbol() and string(), which return pointers into it
  std::set<std::string> symbols;

  // stack of saved file names: include() pushes the current file name, and
  // end_of_file() pops it to restore it when an included file ends
  std::stack<std::string> filenames;

  // current #include nesting depth: 0 while scanning the top-level input,
  // incremented by include() and decremented by end_of_file()
  size_t depth;

}

// Scanner class constructor code: scanning starts with no #include file open
%init{
  depth = 0; // no nested #include is active yet
}

// named patterns, written with free-space mode spacing for readability:
//   identifier  a letter or underscore followed by letters, underscores, digits
//   integer     decimal digits, or hexadecimal digits prefixed with 0x or 0X
//   exp         an exponent: e or E, an optional sign, and decimal digits
//   float       digits, a decimal point, optional digits, and an optional exponent
//   character   single-quoted text in which a backslash escapes the next character
//   string      double-quoted text in which a backslash escapes the next character
//   include     an #include directive followed by a quoted file name
digit                           [0-9]
alpha                           [a-zA-Z_]
identifier                      {alpha} ( {alpha} | {digit} )*
integer                         {digit}+ | 0 [xX] [0-9a-fA-F]+
exp     	                [eE] [-+]? {digit}+
float                           {digit}+ \. {digit}* {exp}?
character                       \' ( \\. | [^\\'\n] )* \'
string                          \" ( \\. | [^\\"\n] )* \"
include                         "#" \h* "include" \h* {string}

%%

[[:space:]]+                    // skip white space between tokens, no token is returned
"//".*                          // ignore inline comment up to the end of the line
"/*"(.|\n)*?"*/"                // ignore multi-line comment, the lazy pattern stops at the first comment terminator
^ \h* {include}                 { include(); }
{identifier}                    { return ID(); }
{integer}                       { return U8(); }
{float}                         { return FP(); }
{character}                     { return CH(); }
{string}                        { return CS(); }
"+="                            { return Parser::symbol_type(Parser::token::TOKEN_PA, location()); }
"-="                            { return Parser::symbol_type(Parser::token::TOKEN_NA, location()); }
"*="                            { return Parser::symbol_type(Parser::token::TOKEN_TA, location()); }
"/="                            { return Parser::symbol_type(Parser::token::TOKEN_DA, location()); }
"%="                            { return Parser::symbol_type(Parser::token::TOKEN_MA, location()); }
"&="                            { return Parser::symbol_type(Parser::token::TOKEN_AA, location()); }
"^="                            { return Parser::symbol_type(Parser::token::TOKEN_XA, location()); }
"|="                            { return Parser::symbol_type(Parser::token::TOKEN_OA, location()); }
"<<="                           { return Parser::symbol_type(Parser::token::TOKEN_LA, location()); }
">>="                           { return Parser::symbol_type(Parser::token::TOKEN_RA, location()); }
"||"                            { return Parser::symbol_type(Parser::token::TOKEN_OR, location()); }
"&&"                            { return Parser::symbol_type(Parser::token::TOKEN_AN, location()); }
"=="                            { return Parser::symbol_type(Parser::token::TOKEN_EQ, location()); }
"!="                            { return Parser::symbol_type(Parser::token::TOKEN_NE, location()); }
"<="                            { return Parser::symbol_type(Parser::token::TOKEN_LE, location()); }
">="                            { return Parser::symbol_type(Parser::token::TOKEN_GE, location()); }
"<<"                            { return Parser::symbol_type(Parser::token::TOKEN_LS, location()); }
">>"                            { return Parser::symbol_type(Parser::token::TOKEN_RS, location()); }
"++"                            { return Parser::symbol_type(Parser::token::TOKEN_PP, location()); }
"--"                            { return Parser::symbol_type(Parser::token::TOKEN_NN, location()); }
"->"                            { return Parser::symbol_type(Parser::token::TOKEN_AR, location()); }
[!#$%&()*+,\-./:;<=>?\[\]^{|}~] { return Parser::symbol_type(chr(), location()); }
<<EOF>>                         { if (end_of_file()) return Parser::make_EOF(location()); }

%%
