(*********************************************************************) (* This structure defines a set of generic combinators for building *) (* parsers -- routines that pattern match a list of values and build *) (* results. This generalizes the recognizer in two critical ways: *) (* First, it returns a computed value instead of just matching *) (* against the input stream. Second, it can operate over arbitrary *) (* lists, instead of just lists of characters. This will make it *) (* much easier to break the process of parsing into modular, re- *) (* usable components. *) (*********************************************************************) structure GenericParsing = struct (* if [p] is a ('c,'a) parser, then informally, it consume lists of 'c * values & produces an 'a value. In practice, p applied to some list cs * will return [] if the parser fails to match the list, and [(a1,cs1),... * (an,csn)] if it succeeds, where each ai is an 'a value, and each csi is * a suffix of cs that was not consumed by the parser. *) type ('c,'a) parser = 'c list -> ('a * ('c list)) list (* the [always x] parser consumes no input and always returns x *) fun always(x:'a) : ('c,'a) parser = fn cs => [(x,cs)] (* the [never] parser fails, regardless of the input *) val never : ('c,'a) parser = fn cs => [] (* the [eof] parser succeeds only when we are at the end of the file *) val eof : ('c,unit) parser = fn cs => case cs of [] => [((),[])] | _ => [] (* the [alt(p1,p2)] parser first tries to parse with p1 and if it fails, * tries p2. *) fun alt(p1:('c,'a) parser, p2:('c,'a) parser) : ('c,'a) parser = fn cs => (p1 cs) @ (p2 cs) (* the parser [alts[p1,p2,...,pn]] generalizes alt to a list of parsers *) fun alts(ps:('c,'a) parser list) : ('c,'a) parser = foldr alt never ps (* the parser [satisfy pred] reads a token [h] from the input list -- if * [pred h] returns true, then the parser succeeds, consuming and return * [h] as the result. If [pred h] returns false, then the parser fails. *) fun satisfy(pred:'c -> bool) : ('c,'c) parser = fn cs => (case cs of [] => [] | h::t => if pred h then [(h,t)] else []) (* the parser [cat(p1,p2)] succeeds if the input list can be split into a * a prefix that matches p1, followed by a suffix that matches p2. *) fun seq(p1:('c,'a) parser, p2:('c,'b) parser) : ('c,'a * 'b) parser = fn cs => foldr (fn ((v1,cs1),a) => (List.map (fn (v2,cs2) => ((v1,v2),cs2)) (p2 cs1)) @ a) nil (p1 cs) (* if the parser [p] succeeds and returns value [v], then [map f p] succeeds * and returns [f v]. So map provides a way to transform a parser that * return values of one type to a parser that returns values of another * type. *) fun map(f:'a->'b)(p:('c,'a) parser) : ('c,'b) parser = fn cs => List.map (fn (v,cs) => (f v,cs)) (p cs) (* the parser [cons(p1,p2)] is like [seq(p1,p2)] but adds the value returned * by [p1] to the front of the list returned by [p2]. *) fun cons(p1:('c,'a) parser, p2:('c,'a list) parser) : ('c,'a list) parser = map (op ::) (seq (p1,p2)) (* the parser [L[p1,p2,...,pn]] is an abbreviation for * [cons(p1,(cons(p2,...,(cons pn,always nil))))]. *) fun L(ps:('c,'a) parser list) : ('c,'a list) parser = foldr cons (always nil) ps (* the parser [star p] matches zero or more occurrences of strings that p * matches, returning the result as a list. *) fun star(p:('c,'a) parser) : ('c,'a list) parser = fn cs => (alt(cons(p,star p),always nil)) cs (* [plus p] matches one or more occurrences of strings that p matches, * returning the result as a list. *) fun plus(p:('c,'a) parser) : ('c,'a list) parser = cons(p,star p) (* [opt p] matches an optional [p] *) fun opt(p:('c,'a) parser) : ('c,'a option) parser = alt (map SOME p, always NONE) (* [U p] matches when [p] does, but maps the result to the dummy value () *) fun U(p:('c,'a) parser) : ('c,unit) parser = map (fn _ => ()) p end (* struct Parsing *) (*********************************************************************) (* This structure uses the GenericParsing routines to build some *) (* useful parsing routines for lists of characters *) (*********************************************************************) structure CharParsing = struct (* inherit all of the definitions from GenericParsing *) open GenericParsing (* matches the character c and returns the character c *) fun C(c:char) : (char,char) parser = satisfy (fn h => c = h) (* matches any character x not equal to c and returns x *) fun notC(c:char) : (char,char) parser = satisfy (fn h => c <> h) (* this parser matches any character and returns it *) val anyC : (char,char) parser = satisfy (fn _ => true) (* [lc_alpha] matches any lower-case letter and returns it*) val lc_alpha : (char,char) parser = alts (List.map C (explode "abcdefghijklmnopqrstuvwxyz")) (* [uc_alpha] matches any upper-case letter and returns it*) val uc_alpha : (char,char) parser = alts (List.map C (explode "ABCDEFGHIJKLMNOPQRSTUVWXYZ")) (* [alpha] matches any lower-case or upper-case letter and returns it*) val alpha : (char,char) parser = alt (lc_alpha, uc_alpha) (* [dig] matches any digit, returning the result as a character *) val dig : (char,char) parser = alts (List.map C (explode "0123456789")) (* [digit] is similar to [dig], but maps the result to the * corresponding integer *) val digit : (char,int) parser = map (fn c => Char.ord c - Char.ord #"0") dig val underscore : (char,char) parser = C #"_" (* [identifier] matches strings that start with an alphabetic character or * underscore, followed by zero or more alphabetic characters, digits, or * underscores. The resulting string is returned as the result. *) val identifier : (char,string) parser = map implode (cons (alts [alpha,underscore], star (alts [alpha, dig, underscore]))) (* [integer] matches strings of digits and returns the corresponding integer * value *) val integer : (char,int) parser = map (foldl (fn (d,a) => a*10 + d) 0) (plus digit) (* [white] matches spaces, tabs, newlines, and carriage returns *) val white : (char,unit) parser = U(plus (alts [C #" ", C #"\t", C #"\n", C #"\r"])) (* [comment] matches ML-style comments that begin with (* and end with *). * However, note that unlike ML, this won't handle nested comments! Can * you transform it so it does? *) val comment : (char,unit) parser = U(seq ((seq(C #"(", C #"*")), seq(star (alts [U(notC #"*"), U(seq(C #"*",notC #")"))]), (seq(C #"*", C #")"))))) (* [whitespace] matches zero or more comments, spaces, tabs, etc. *) val whitespace = U(star (alt (comment,white))) (* [S "foo"] matches the string "foo" *) fun S(s:string) : (char,unit) parser = U(L(List.map C (explode s))) end (**********************************************************************) (* This structure uses the character parsing to build a tokenizer *) (* (a.k.a. a lexer), which maps strings to a list of tokens, removing *) (* the white-space (including comments) along the way. *) (**********************************************************************) structure Tokenizer = struct open CharParsing (* Our tokens will consist of integers, parentheses, and +, *, -, and / * symbols *) datatype token = INT of int (* 1234, 42, etc. *) | LET (* "let" *) | IN (* "in" *) | END (* "end" *) | ID of string (* greg, is, _this_works_too, greg42 *) | LPAREN (* "(" *) | RPAREN (* ")" *) | PLUS (* "+" *) | MINUS (* "-" *) | TIMES (* "*" *) | DIV (* "/" *) | EQ (* a parser that matches strings corresponding to tokens and returns * the appropriate [token] value. *) val token_parser : (char,token) parser = alts [map INT integer, map (fn _ => LET) (S "let"), map (fn _ => IN) (S "in"), map (fn _ => END) (S "end"), map ID identifier, map (fn _ => LPAREN) (C #"("), map (fn _ => RPAREN) (C #")"), map (fn _ => PLUS) (C #"+"), map (fn _ => MINUS) (C #"-"), map (fn _ => TIMES) (C #"*"), map (fn _ => DIV) (C #"/"), map (fn _ => EQ) (C #"=") ] fun fst x y = map (fn (x,y) => x) (seq(x,y)) fun snd x y = map (fn (x,y) => y) (seq(x,y)) (* tokens_parser parses a list of tokens, ignoring the whitespace between * them. *) val tokens_parser : (char,token list) parser = fst (fst (star (snd whitespace token_parser)) whitespace) eof (* we'll throw this exception when we run into an unexpected character *) exception LexicalError (* tokenize s first explodes s into a list of characters, then runs * the tokens_parser on it to try to get a list of tokens. We arbitrarily * return the first successful parse. *) fun tokenize(s:string) : token list = case (tokens_parser (explode s)) of [] => raise LexicalError | (ts,_)::_ => ts (* for fun, lets us see all the possible tokenizations of the string *) fun all_tokenize(s:string) : (token list) list = List.map (fn (ts,_) => ts) (List.filter (fn (ts,cs) => case cs of [] => true | _ => false) (tokens_parser (explode s))) end (* struct Tokenizer *) (**********************************************************************) (* In this structure, we build a parser and evaluator for a little *) (* calculator language with Scheme-like syntax. The parser uses the *) (* tokenize function to break the input into a useful list of *) (* tokens, so we don't have to worry about white space or this sort *) (* of thing. *) (**********************************************************************) structure Evaluator = struct open Tokenizer (* the int_p parser matches when the first token is an INT, and * returns the int value carried by the INT data constructor. *) val int_p : (token,int) parser = map (fn (INT i) => i) (satisfy (fn t => case t of INT _ => true | _ => false)) (* tok_p t matches when the first token is equal to t, returns unit *) fun tok_p(t:token) : (token,unit) parser = U (satisfy (fn h => t = h)) (* match the LPAREN token *) val lparen_p : (token,unit) parser = tok_p LPAREN (* match the RPAREN token *) val rparen_p : (token,unit) parser = tok_p RPAREN (* match the PLUS token, and map the result to the + function *) val plus_p : (token,int*int->int) parser = map (fn _ => op +) (tok_p PLUS) (* match the MINUS token, and map the result to the - function *) val minus_p : (token,int*int->int) parser = map (fn _ => op -) (tok_p MINUS) (* match the TIMES token, and map the result to the * function *) val times_p : (token,int*int->int) parser = map (fn _ => op * ) (tok_p TIMES) (* match the DIV token, and map the result to the div function *) val div_p : (token,int*int->int) parser = map (fn _ => op div) (tok_p DIV) (* [comp_exp_p(operator_p,exp_p)] matches "(" operator_p exp_p exp_p ")". * It returns the result of applying the operator to the two expressions. *) fun comp_exp_p(operator_p, exp_p) : (token,int) parser = map (fn (lp, (f, (e1, (e2, rp)))) => f(e1,e2)) (seq (lparen_p, seq (operator_p, seq (exp_p, seq (exp_p, rparen_p))))) (* the recursive [exp_p] matches either an integer or expressions of the * form "(" op exp exp ")" where op is either +, -, *, or /. *) val rec exp_p : (token,int) parser = fn ts => alts [ int_p, alts (List.map (fn p => comp_exp_p(p,exp_p)) [plus_p, minus_p, times_p, div_p]) ] ts exception ParseError (* parse a string into an expression and evaluate the expression to get an * integer. We start by using the [tokenize] function to break the string * into a list of tokens, then we run the [exp_p] parser on it, checking * to see that we consume all of the input. *) fun evaluate(s:string) : int = case exp_p (tokenize s) of (i,[])::_ => i | _ => raise ParseError fun evaluates(s:string) : int list = List.map (fn (i,_) => i) (exp_p (tokenize s)) end (* struct Evaluator *) structure MLEvaluator = struct open Evaluator type var = string datatype binop = Plus | Minus | Times | Div datatype exp = Int of int | Binop of exp * binop * exp | Var of var | Let of var * exp * exp val let_p = tok_p LET val in_p = tok_p IN val end_p = tok_p END val eq_p = tok_p EQ val lp_p = tok_p LPAREN val rp_p = tok_p RPAREN val id_p : (token,string) parser = map (fn (ID x) => x) (satisfy (fn t => case t of (ID _) => true | _ => false)) (* exp -> term | term + exp | term - exp *) val rec exp_p : (token,exp) parser = fn cs => alts [ term_p, map (fn (e1,e2) => Binop(e1,Plus,e2)) (seq(term_p,snd plus_p exp_p)), map (fn (e1,e2) => Binop(e1,Minus,e2)) (seq(term_p,snd minus_p exp_p)) ] cs (* term -> factor | factor * term | factor / term *) and term_p : (token,exp) parser = fn cs => alts [ factor_p, map (fn (e1,e2) => Binop(e1,Times,e2)) (seq(factor_p,snd times_p term_p)), map (fn (e1,e2) => Binop(e1,Div,e2)) (seq(factor_p,snd div_p term_p)) ] cs (* factor -> int | var | ( exp ) | LET var = exp IN exp END *) and factor_p : (token,exp) parser = fn cs => alts [ map Int int_p, map Var id_p, map (fn (_,(e,_)) => e) (seq(lp_p,seq(exp_p,rp_p))), map (fn (x,(e1,e2)) => Let(x,e1,e2)) (snd let_p (seq(id_p, snd eq_p (seq(exp_p, snd in_p (fst exp_p end_p)))))) ] cs val ml_p : (token,exp) parser = fst exp_p eof fun parse_ml_exp(s:string) : exp = case ml_p (tokenize s) of (e,[])::_ => e | _ => raise ParseError end