{
(*
 * fc_ast_lex.mll -- Lexer for functional C
 *
 * Ling Li, Xin Yu
 * Feb. 23, 2001, Caltech
 *)

open Symbol
open Fc_ast_parse
open Fc_ast_type

(*
 * File position.
 *)
let current_line = ref 1
let current_schar = ref 0

(*
 * Advance a line.
 *)
let set_next_line lexbuf =
   incr current_line;
   current_schar := Lexing.lexeme_end lexbuf

(*
 * Get the position of the current lexeme.
 * We assume it is all on one line.
 *)
let set_lexeme_position lexbuf =
   let line = !current_line in
   let schar = Lexing.lexeme_start lexbuf - !current_schar in
   let echar = Lexing.lexeme_end lexbuf - !current_schar in
   let file = Fc_ast_state.current_file () in
   let pos = file, line, schar, line, echar in
      Fc_ast_state.set_current_position pos;
      pos

(* Extract octal number from string. (Example: "017" -> 15) *)
let oct_of_string s = int_of_string ("0o" ^ s)

(* Convert special char (string) *)
let convert_char = function
 | "\\n"  -> '\n'
 | "\\b"  -> '\b'
 | "\\r"  -> '\r'
 | "\\t"  -> '\t'
 | "\\\\" -> '\\'
 | "\\\"" -> '"'
 | "\\'"  -> '\''
 | s      -> (* '\\' ['0'-'7'] ['0'-'7']? ['0'-'7']? *)
      let ord = oct_of_string (String.sub s 1 (String.length s - 1)) in
         char_of_int ord
}

(* Regular expressions *)
let white   = [' ' '\t' '\r']+      (* '\r': ^M in DOS format file *)
let newline = '\n'

let alpha   = ['A'-'Z' 'a'-'z']
let digit   = ['0'-'9']
let xdigit  = digit | ['a'-'f' 'A'-'F']
let alnum   = alpha | digit

(* Comments *)
let c_comment   = "/*"
let comm_end    = "*/"
let cpp_comment = "//" [^ '\n']*

(* Identifier *)
let ident = ('_' | alpha) ('_' | alnum)*

(* Numbers *)
let dec = digit+
let oct = '0' ['0'-'7']*
let hex = '0' ['x' 'X'] xdigit+
let exp = ['e' 'E'] ('+' | '-')? dec
let float1 = (dec '.' digit* | '.' dec) exp?
let float2 = dec exp

(* Character Constants *)
let single_char  = '\'' [^ '\n' '\''] '\''
let special_char = "\\n" | "\\b" | "\\r" | "\\t" | "\\\\" | "\\\"" | "\\'"
      | ('\\' ['0'-'7'] ['0'-'7']? ['0'-'7']?)

(* Special symbols *)
let special_sym = [';' '+' '-' '*' '/' '%']
    | ['=' '>' '<' '!' '|' '&' '^' '~']
    | ['(' ')' '[' ']' '{' '}' ',' '?' ':' '.']
    | "+=" | "-=" | "*=" | "/=" | "%=" | "&=" | "^=" | "|="
    | "==" | "!=" | ">=" | "<=" | "<<" | ">>"
    | "++" | "--" | "||" | "&&" | "->" | "<<=" | ">>="
    | "sizeof"

(* Main lexer *)
rule main = parse
   (* Ignore white space & comments*)
 | white | cpp_comment {
      main lexbuf
   }
 | newline {
      (* Keep track of the current line number *)
      set_next_line lexbuf;
      main lexbuf
   }
 | c_comment {
      (* Continue until match comm_end *)
      comment lexbuf
   }

   (* numbers *)
 | oct {
      let pos = set_lexeme_position lexbuf in
         TokInt (oct_of_string (Lexing.lexeme lexbuf), pos)
   }
 | dec | hex {
      let pos = set_lexeme_position lexbuf in
         TokInt (int_of_string (Lexing.lexeme lexbuf), pos)
   }
 | float1 | float2 {
      let pos = set_lexeme_position lexbuf in
         TokFloat (float_of_string (Lexing.lexeme lexbuf), pos)
   }

   (* Character constants *)
 | single_char {
      let pos = set_lexeme_position lexbuf in
         TokChar ((Lexing.lexeme lexbuf).[1], pos)
   }
 | '\'' special_char '\'' {
      let pos = set_lexeme_position lexbuf in
      let s = Lexing.lexeme lexbuf in
         TokChar (convert_char (String.sub s 1 (String.length s - 2)), pos)
   }

   (* String constants *)
 | '"' {
      let pos1 = set_lexeme_position lexbuf in
      let strn = string_lex lexbuf in
      let pos2 = set_lexeme_position lexbuf in
         TokString (strn, (Fc_ast_util.union_pos pos1 pos2))
   }

   (* Identifiers, including keywords & built-in types *)
 | ident {
      let pos = set_lexeme_position lexbuf in
         (* Keywords & built-in types *)
         match Lexing.lexeme lexbuf with
          | "typedef" -> TokTypedef pos
          | "if"      -> TokIf      pos
          | "else"    -> TokElse    pos
          | "for"     -> TokFor     pos
          | "while"   -> TokWhile   pos
          | "return"  -> TokReturn  pos
          | "break"   -> TokBreak   pos
          | "struct"  -> TokStruct  pos
          | "char"    -> TokTypeChar   pos
          | "int"     -> TokTypeInt    pos
          | "float"   -> TokTypeFloat  pos
          | "bool"    -> TokTypeBool   pos
          | id        -> (* Either a variable or a type name *)
               let sym = Symbol.add id in
                  if Fc_ast_state.is_type sym then
                     TokTypeId (sym, pos)
                  else
                     TokId (sym, pos)
   }

   (* Special symbols *)
 | special_sym {
      let pos = set_lexeme_position lexbuf in
         match Lexing.lexeme lexbuf with
          | ";"  -> TokSemi       pos
          | "+"  -> TokPlus       pos
          | "-"  -> TokMinus      pos
          | "*"  -> TokStar       pos
          | "/"  -> TokSlash      pos
          | "%"  -> TokMod        pos
          | "++" -> TokIncr       pos
          | "--" -> TokDecr       pos
          | "="  -> TokAssign     pos
          | "+=" -> TokPlusEq     pos
          | "-=" -> TokMinusEq    pos
          | "*=" -> TokStarEq     pos
          | "/=" -> TokSlashEq    pos
          | "%=" -> TokModEq      pos
          | "||" -> TokLOr        pos
          | "&&" -> TokLAnd       pos
          | "==" -> TokEq         pos
          | "!=" -> TokNotEq      pos
          | ">"  -> TokGt         pos
          | ">=" -> TokGe         pos
          | "<"  -> TokLt         pos
          | "<=" -> TokLe         pos
          | "!"  -> TokNot        pos
          | "|"  -> TokBOr        pos
          | "&"  -> TokBAnd       pos
          | "^"  -> TokBXor       pos
          | "~"  -> TokBNot       pos
          | "|=" -> TokBOrEq      pos
          | "&=" -> TokBAndEq     pos
          | "^=" -> TokBXorEq     pos
          | "<<" -> TokLShift     pos
          | ">>" -> TokRShift     pos
          | "<<="-> TokLShiftEq   pos
          | ">>="-> TokRShiftEq   pos
          | "("  -> TokLeftParen  pos
          | ")"  -> TokRightParen pos
          | "["  -> TokLeftBrack  pos
          | "]"  -> TokRightBrack pos
          | "{"  -> TokLeftBrace  pos
          | "}"  -> TokRightBrace pos
          | ","  -> TokComma      pos
          | "?"  -> TokQues       pos
          | ":"  -> TokColon      pos
          | "."  -> TokDot        pos
          | "->" -> TokArrow      pos
          | "sizeof" -> TokSizeof pos
          | s -> raise (ParseError (pos, s ^ " not implemented"))
   }

   (* If all else fails, this is an error. *)
 | _ {
      let pos = set_lexeme_position lexbuf in
      let s = Lexing.lexeme lexbuf in
         raise (ParseError (pos,
            Printf.sprintf "illegal character '%s'" (String.escaped s)))
   }

   (* End of file. *)
 | eof {
      let pos = set_lexeme_position lexbuf in
         TokEof pos
   }

and comment = parse
 | newline {
      set_next_line lexbuf;
      comment lexbuf
   }
 | comm_end { main lexbuf }
 | _ { comment lexbuf }
 | eof {
      let pos = set_lexeme_position lexbuf in
         raise (ParseError (pos, "unterminated comment"))
   }

and string_lex = parse
 | '"' '"' { "\"" ^ string_lex lexbuf }
 | '"' { "" }
 | special_char {
      let s = Lexing.lexeme lexbuf in
         String.make 1 (convert_char s) ^ string_lex lexbuf
   }
 | _ {
      let s = Lexing.lexeme lexbuf in
         s ^ string_lex lexbuf
   }