(*
 * Lexer for the simple grammar.
 *
 * ----------------------------------------------------------------
 *
 * Copyright (C) 1999 Jason Hickey, Caltech
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * Author: Jason Hickey
 * jyh@cs.caltech.edu
 *)

{
open Printf
open Symbol
open Fc_ast_type
open Fc_ast_parse
open Fc_ast_state
open Fc_ast_exn

(*
 * File position.
 *)
let current_line = ref 1
let current_schar = ref 0

(*
 * Advance a line.
 *)
let set_next_line lexbuf =
   incr current_line;
   current_schar := Lexing.lexeme_end lexbuf

(*
 * Get the position of the current lexeme.
 * We assume it is all on one line.
 *)
let set_lexeme_position lexbuf =
   let line = !current_line in
   let schar = Lexing.lexeme_start lexbuf - !current_schar in
   let echar = Lexing.lexeme_end lexbuf - !current_schar in
   let file = current_file () in
   let pos = file, line, schar, line, echar in
      set_current_position pos;
      pos

(*
 * Provide a buffer for building strings.
 *)
let stringbuf = Buffer.create 19
let stringstart = ref (0, 0)

let pop_string lexbuf =
   let s = Buffer.contents stringbuf in
   let sline, schar = !stringstart in
   let eline = !current_line in
   let echar = Lexing.lexeme_end lexbuf - !current_schar in
   let pos = current_file (), sline, schar, eline, echar in
      Buffer.clear stringbuf;
      s, pos

let set_string_start lexbuf =
   stringstart := !current_line, Lexing.lexeme_start lexbuf - !current_schar

(*
 * Integer conversion functions.
 *)
let zero_code = Char.code '0'
let a_code    = Char.code 'a'
let ca_code   = Char.code 'A'

let int_of_octal s =
   let len = String.length s in
   let rec loop x i =
      if i = len then
         x
      else
         let j = Char.code s.[i] - zero_code in
            loop (x * 8 + j) (succ i)
   in
      loop 0 0

let int_of_hex s =
   let len = String.length s in
   let rec loop x i =
      if i = len then
         x
      else
         let c = s.[i] in
         let j =
            match c with
               '0' .. '9' ->
                  Char.code c - zero_code
             | 'a' .. 'f' ->
                  Char.code c - a_code
             | 'A' .. 'F' ->
                  Char.code c - ca_code
             | _ ->
                  raise (Invalid_argument "int_of_hex")
         in
            loop (x * 16 + j) (succ i)
   in
      loop 0 0

(*
 * Keyword table.
 *)
let special =
   ["typedef", (fun pos -> TokTypedef pos);
    "if",      (fun pos -> TokIf pos);
    "else",    (fun pos -> TokElse pos);
    "for",     (fun pos -> TokFor pos);
    "while",   (fun pos -> TokWhile pos);
    "return",  (fun pos -> TokReturn pos);
    "break",   (fun pos -> TokBreak pos);
    "struct",  (fun pos -> TokStruct pos);
    "(",       (fun pos -> TokLeftParen pos);
    ")",       (fun pos -> TokRightParen pos);
    "[",       (fun pos -> TokLeftBrack pos);
    "]",       (fun pos -> TokRightBrack pos);
    "{",       (fun pos -> TokLeftBrace pos);
    "}",       (fun pos -> TokRightBrace pos);
    ";",       (fun pos -> TokSemi pos);
    ",",       (fun pos -> TokComma pos);
    ".",       (fun pos -> TokDot pos);
    "?",       (fun pos -> TokQuest pos);
    ":",       (fun pos -> TokColon pos);
    "*",       (fun pos -> TokStar pos);
    "%",       (fun pos -> TokPercent pos);
    "<=",      (fun pos -> TokLe pos);
    ">=",      (fun pos -> TokGe pos);
    "!=",      (fun pos -> TokNotEq pos);
    "==",      (fun pos -> TokEqEq pos);
    "=",       (fun pos -> TokEq pos);
    "!",       (fun pos -> TokBang pos);
    "+",       (fun pos -> TokPlus pos);
    "-",       (fun pos -> TokMinus pos);
    "*",       (fun pos -> TokStar pos);
    "/",       (fun pos -> TokSlash pos);
    "<",       (fun pos -> TokLt pos);
    ">",       (fun pos -> TokGt pos);
    "->",      (fun pos -> TokRightArrow pos);
    "&",       (fun pos -> TokAmp pos);
    "|",       (fun pos -> TokPipe pos);
    "^",       (fun pos -> TokHat pos);
    "&&",      (fun pos -> TokLAnd pos);
    "||",      (fun pos -> TokLOr pos);
    "<<",      (fun pos -> TokLShift pos);
    ">>",      (fun pos -> TokRShift pos);
    "++",      (fun pos -> TokPlusPlus pos);
    "--",      (fun pos -> TokMinusMinus pos)]

let symtab =
   List.fold_left (fun table (name, f) ->
      SymbolTable.add table (Symbol.add name) f) (SymbolTable.create ()) special

(*
 * Look up the string, and return a symbol if
 * the lookup fails.
 *)
let lex_symbol s pos =
   let sym = Symbol.add s in
      try (SymbolTable.find symtab sym) pos with
         Not_found ->
            if is_type sym then
               TokTypeId (sym, pos)
            else
               TokId (sym, pos)

(*
 * Look up the operator.
 * Syntax error if the lookup fails.
 *)
let lex_operator s pos =
   let sym = Symbol.add s in
      try (SymbolTable.find symtab sym) pos with
         Not_found ->
            raise (ParseError (pos, "illegal operator"))
}

let white = [' ' '\t']+

let name_prefix = ['_' 'A'-'Z' 'a'-'z']
let name_suffix = ['_' 'A'-'Z' 'a'-'z' '0'-'9']
let name = name_prefix name_suffix *

let cspecial1 = ['(' ')' '[' ']' '{' '}' ';' ',' '.' '?' ':' '*' '%']
let especial = ['<' '>' '!' '='] '='
let cspecial2 = ['=' '!']
let mspecial = ['+' '-' '/' '<' '>' '&' '|']+

let decimal = ['0'-'9']+
let octal = '0' ['0'-'7']*
let hex = "0x" ['0'-'9' 'a'-'f' 'A'-'F']+
let float0 = ['0'-'9']+ '.' ['0'-'9']* (('e' | 'E') ('+' | '-')? decimal)?
let float1 = ['0'-'9']* '.' ['0'-'9']+ (('e' | 'E') ('+' | '-')? decimal)?
let float2 = ['0'-'9']+ (('e' | 'E') ('+' | '-')? decimal)

(*
 * Main lexer.
 *)
rule main =
   parse white
      { main lexbuf }
 | '\n'
      { set_next_line lexbuf; main lexbuf }
 | octal
      { let pos = set_lexeme_position lexbuf in
           TokInt (int_of_octal (Lexing.lexeme lexbuf), pos)
      }
 | hex
      { let pos = set_lexeme_position lexbuf in
           TokInt (int_of_hex (Lexing.lexeme lexbuf), pos)
      }
 | decimal
      { let pos = set_lexeme_position lexbuf in
           TokInt (int_of_string (Lexing.lexeme lexbuf), pos)
      }
 | float0 | float1 | float2
      { let pos = set_lexeme_position lexbuf in
           TokFloat (float_of_string (Lexing.lexeme lexbuf), pos)
      }
 | name
      { let pos = set_lexeme_position lexbuf in
        let id = Lexing.lexeme lexbuf in
           lex_symbol id pos
      }

   (*
    * Comments.
    *)
 | "//" [^ '\n']* '\n'
      { set_next_line lexbuf; main lexbuf }
 | "/*"
      { comment lexbuf; main lexbuf }

   (*
    * Strings and chars.
    *)
 | '"'
      { set_string_start lexbuf;
        string lexbuf
      }
 | '\''
      { set_string_start lexbuf;
        char lexbuf
      }

   (*
    * Special chars.
    *)
 | cspecial1
      { let pos = set_lexeme_position lexbuf in
           lex_operator (Lexing.lexeme lexbuf) pos
      }

   (*
    * Relations with equality.
    * We have to separate these because we don't
    * wan't to collect operators like "<<=" here
    * (those arer handled by the parser).
    *)
 | especial
     { let pos = set_lexeme_position lexbuf in
          lex_operator (Lexing.lexeme lexbuf) pos
     }

   (*
    * Back to special characters.
    *)
 | cspecial2
     { let pos = set_lexeme_position lexbuf in
          lex_operator (Lexing.lexeme lexbuf) pos
     }

   (*
    * Multi-character specials.
    *)
 | mspecial
      { let pos = set_lexeme_position lexbuf in
           lex_operator (Lexing.lexeme lexbuf) pos
      }

   (*
    * All other characters are a syntax error.
    *)
 | _
      { let pos = set_lexeme_position lexbuf in
           raise (ParseError (pos, Printf.sprintf "illegal char: '%s'"
              (String.escaped (Lexing.lexeme lexbuf))))
      }

 | eof
      { TokEof }

(*
 * Strings are delimited by double-quotes.
 * Chars may be escaped.
 *)
and string =
   parse '"'
      { let s, pos = pop_string lexbuf in
           TokString (s, pos)
      }
 | '\\'
      { escape lexbuf;
        string lexbuf
      }
 | _
      { let s = Lexing.lexeme lexbuf in
           if s.[0] = '\n' then
              set_next_line lexbuf;
           Buffer.add_string stringbuf s;
           string lexbuf
      }
 | eof
      { let s, pos = pop_string lexbuf in
           TokString (s, pos)
      }

and escape =
   parse '\n'
      { set_next_line lexbuf }
  | 'n'
      { Buffer.add_char stringbuf '\n' }
  | 't'
      { Buffer.add_char stringbuf '\t' }
  | 'r'
      { Buffer.add_char stringbuf '\r' }
  | _
      { Buffer.add_string stringbuf (Lexing.lexeme lexbuf) }

(*
 * Characters.
 *)
and char =
   parse '\\'
      { escape lexbuf;
        termchar lexbuf
      }
 | '\n'
      { let pos = set_lexeme_position lexbuf in
           set_next_line lexbuf;
           raise (ParseError (pos, "illegal character constant"))
      }
 | _
      { Buffer.add_string stringbuf (Lexing.lexeme lexbuf);
        termchar lexbuf
      }
 | eof
      { let pos = set_lexeme_position lexbuf in
           raise (ParseError (pos, "illegal character constant"))
      }

and termchar =
   parse '\''
      { let s, pos = pop_string lexbuf in
           TokChar (s.[0], pos)
      }
 | '\n'
      { let pos = set_lexeme_position lexbuf in
           set_next_line lexbuf;
           raise (ParseError (pos, "illegal character constant"))
      }
 | _
      { let pos = set_lexeme_position lexbuf in
           raise (ParseError (pos, "illegal character constant"))
      }
 | eof
      { let pos = set_lexeme_position lexbuf in
           raise (ParseError (pos, "illegal character constant"))
      }

(*
 * Caml-type comment may be nested.
 * C-style comments may not.
 *)
and comment =
   parse "(*"
      { comment lexbuf; comment lexbuf }
 | "*)" | "*/"
      { () }
 | eof
      { () }
 | '\n'
      { set_next_line lexbuf;
        comment lexbuf
      }
 | _
      { comment lexbuf }

(*
 * -*-
 * Local Variables:
 * Caml-master: "set"
 * End:
 * -*-
 *)