(* * Lexer for the simple grammar. * * ---------------------------------------------------------------- * * Copyright (C) 1999 Jason Hickey, Caltech * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * * Author: Jason Hickey * jyh@cs.caltech.edu *) { open Printf open Symbol open Fc_ast_type open Fc_ast_parse open Fc_ast_state open Fc_ast_exn (* * File position. *) let current_line = ref 1 let current_schar = ref 0 (* * Advance a line. *) let set_next_line lexbuf = incr current_line; current_schar := Lexing.lexeme_end lexbuf (* * Get the position of the current lexeme. * We assume it is all on one line. *) let set_lexeme_position lexbuf = let line = !current_line in let schar = Lexing.lexeme_start lexbuf - !current_schar in let echar = Lexing.lexeme_end lexbuf - !current_schar in let file = current_file () in let pos = file, line, schar, line, echar in set_current_position pos; pos (* * Provide a buffer for building strings. *) let stringbuf = Buffer.create 19 let stringstart = ref (0, 0) let pop_string lexbuf = let s = Buffer.contents stringbuf in let sline, schar = !stringstart in let eline = !current_line in let echar = Lexing.lexeme_end lexbuf - !current_schar in let pos = current_file (), sline, schar, eline, echar in Buffer.clear stringbuf; s, pos let set_string_start lexbuf = stringstart := !current_line, Lexing.lexeme_start lexbuf - !current_schar (* * Integer conversion functions. *) let zero_code = Char.code '0' let a_code = Char.code 'a' let ca_code = Char.code 'A' let int_of_octal s = let len = String.length s in let rec loop x i = if i = len then x else let j = Char.code s.[i] - zero_code in loop (x * 8 + j) (succ i) in loop 0 0 let int_of_hex s = let len = String.length s in let rec loop x i = if i = len then x else let c = s.[i] in let j = match c with '0' .. '9' -> Char.code c - zero_code | 'a' .. 'f' -> Char.code c - a_code | 'A' .. 'F' -> Char.code c - ca_code | _ -> raise (Invalid_argument "int_of_hex") in loop (x * 16 + j) (succ i) in loop 0 0 (* * Keyword table. *) let special = ["typedef", (fun pos -> TokTypedef pos); "if", (fun pos -> TokIf pos); "else", (fun pos -> TokElse pos); "for", (fun pos -> TokFor pos); "while", (fun pos -> TokWhile pos); "return", (fun pos -> TokReturn pos); "break", (fun pos -> TokBreak pos); "struct", (fun pos -> TokStruct pos); "(", (fun pos -> TokLeftParen pos); ")", (fun pos -> TokRightParen pos); "[", (fun pos -> TokLeftBrack pos); "]", (fun pos -> TokRightBrack pos); "{", (fun pos -> TokLeftBrace pos); "}", (fun pos -> TokRightBrace pos); ";", (fun pos -> TokSemi pos); ",", (fun pos -> TokComma pos); ".", (fun pos -> TokDot pos); "?", (fun pos -> TokQuest pos); ":", (fun pos -> TokColon pos); "*", (fun pos -> TokStar pos); "%", (fun pos -> TokPercent pos); "<=", (fun pos -> TokLe pos); ">=", (fun pos -> TokGe pos); "!=", (fun pos -> TokNotEq pos); "==", (fun pos -> TokEqEq pos); "=", (fun pos -> TokEq pos); "!", (fun pos -> TokBang pos); "+", (fun pos -> TokPlus pos); "-", (fun pos -> TokMinus pos); "*", (fun pos -> TokStar pos); "/", (fun pos -> TokSlash pos); "<", (fun pos -> TokLt pos); ">", (fun pos -> TokGt pos); "->", (fun pos -> TokRightArrow pos); "&", (fun pos -> TokAmp pos); "|", (fun pos -> TokPipe pos); "^", (fun pos -> TokHat pos); "&&", (fun pos -> TokLAnd pos); "||", (fun pos -> TokLOr pos); "<<", (fun pos -> TokLShift pos); ">>", (fun pos -> TokRShift pos); "++", (fun pos -> TokPlusPlus pos); "--", (fun pos -> TokMinusMinus pos)] let symtab = List.fold_left (fun table (name, f) -> SymbolTable.add table (Symbol.add name) f) (SymbolTable.create ()) special (* * Look up the string, and return a symbol if * the lookup fails. *) let lex_symbol s pos = let sym = Symbol.add s in try (SymbolTable.find symtab sym) pos with Not_found -> if is_type sym then TokTypeId (sym, pos) else TokId (sym, pos) (* * Look up the operator. * Syntax error if the lookup fails. *) let lex_operator s pos = let sym = Symbol.add s in try (SymbolTable.find symtab sym) pos with Not_found -> raise (ParseError (pos, "illegal operator")) } let white = [' ' '\t']+ let name_prefix = ['_' 'A'-'Z' 'a'-'z'] let name_suffix = ['_' 'A'-'Z' 'a'-'z' '0'-'9'] let name = name_prefix name_suffix * let cspecial1 = ['(' ')' '[' ']' '{' '}' ';' ',' '.' '?' ':' '*' '%'] let especial = ['<' '>' '!' '='] '=' let cspecial2 = ['=' '!'] let mspecial = ['+' '-' '/' '<' '>' '&' '|']+ let decimal = ['0'-'9']+ let octal = '0' ['0'-'7']* let hex = "0x" ['0'-'9' 'a'-'f' 'A'-'F']+ let float0 = ['0'-'9']+ '.' ['0'-'9']* (('e' | 'E') ('+' | '-')? decimal)? let float1 = ['0'-'9']* '.' ['0'-'9']+ (('e' | 'E') ('+' | '-')? decimal)? let float2 = ['0'-'9']+ (('e' | 'E') ('+' | '-')? decimal) (* * Main lexer. *) rule main = parse white { main lexbuf } | '\n' { set_next_line lexbuf; main lexbuf } | octal { let pos = set_lexeme_position lexbuf in TokInt (int_of_octal (Lexing.lexeme lexbuf), pos) } | hex { let pos = set_lexeme_position lexbuf in TokInt (int_of_hex (Lexing.lexeme lexbuf), pos) } | decimal { let pos = set_lexeme_position lexbuf in TokInt (int_of_string (Lexing.lexeme lexbuf), pos) } | float0 | float1 | float2 { let pos = set_lexeme_position lexbuf in TokFloat (float_of_string (Lexing.lexeme lexbuf), pos) } | name { let pos = set_lexeme_position lexbuf in let id = Lexing.lexeme lexbuf in lex_symbol id pos } (* * Comments. *) | "//" [^ '\n']* '\n' { set_next_line lexbuf; main lexbuf } | "/*" { comment lexbuf; main lexbuf } (* * Strings and chars. *) | '"' { set_string_start lexbuf; string lexbuf } | '\'' { set_string_start lexbuf; char lexbuf } (* * Special chars. *) | cspecial1 { let pos = set_lexeme_position lexbuf in lex_operator (Lexing.lexeme lexbuf) pos } (* * Relations with equality. * We have to separate these because we don't * wan't to collect operators like "<<=" here * (those arer handled by the parser). *) | especial { let pos = set_lexeme_position lexbuf in lex_operator (Lexing.lexeme lexbuf) pos } (* * Back to special characters. *) | cspecial2 { let pos = set_lexeme_position lexbuf in lex_operator (Lexing.lexeme lexbuf) pos } (* * Multi-character specials. *) | mspecial { let pos = set_lexeme_position lexbuf in lex_operator (Lexing.lexeme lexbuf) pos } (* * All other characters are a syntax error. *) | _ { let pos = set_lexeme_position lexbuf in raise (ParseError (pos, Printf.sprintf "illegal char: '%s'" (String.escaped (Lexing.lexeme lexbuf)))) } | eof { TokEof } (* * Strings are delimited by double-quotes. * Chars may be escaped. *) and string = parse '"' { let s, pos = pop_string lexbuf in TokString (s, pos) } | '\\' { escape lexbuf; string lexbuf } | _ { let s = Lexing.lexeme lexbuf in if s.[0] = '\n' then set_next_line lexbuf; Buffer.add_string stringbuf s; string lexbuf } | eof { let s, pos = pop_string lexbuf in TokString (s, pos) } and escape = parse '\n' { set_next_line lexbuf } | 'n' { Buffer.add_char stringbuf '\n' } | 't' { Buffer.add_char stringbuf '\t' } | 'r' { Buffer.add_char stringbuf '\r' } | _ { Buffer.add_string stringbuf (Lexing.lexeme lexbuf) } (* * Characters. *) and char = parse '\\' { escape lexbuf; termchar lexbuf } | '\n' { let pos = set_lexeme_position lexbuf in set_next_line lexbuf; raise (ParseError (pos, "illegal character constant")) } | _ { Buffer.add_string stringbuf (Lexing.lexeme lexbuf); termchar lexbuf } | eof { let pos = set_lexeme_position lexbuf in raise (ParseError (pos, "illegal character constant")) } and termchar = parse '\'' { let s, pos = pop_string lexbuf in TokChar (s.[0], pos) } | '\n' { let pos = set_lexeme_position lexbuf in set_next_line lexbuf; raise (ParseError (pos, "illegal character constant")) } | _ { let pos = set_lexeme_position lexbuf in raise (ParseError (pos, "illegal character constant")) } | eof { let pos = set_lexeme_position lexbuf in raise (ParseError (pos, "illegal character constant")) } (* * Caml-type comment may be nested. * C-style comments may not. *) and comment = parse "(*" { comment lexbuf; comment lexbuf } | "*)" | "*/" { () } | eof { () } | '\n' { set_next_line lexbuf; comment lexbuf } | _ { comment lexbuf } (* * -*- * Local Variables: * Caml-master: "set" * End: * -*- *)