(* This is the same as recognizer one, except that it matches treats * alternation symmetrically. So, in particular, the "hmm" test at * the bottom succeeds. To do so, we have recognizers not just return * the unconsumed characters of the first possible match, but of all * possible matches. But thanks to suitable abstraction, we don't have * to change much code. *) structure Recognizer2 = struct (* Now a recognizer takes in a list [c1,...,cn] and returns a list * of lists of unconsumed characters. *) type recognizer = char list -> (char list) list (* returns true if any list in css is nil *) fun anynil(css:(char list) list) : bool = case css of [] => false | []::_ => true | (_::_)::rest => anynil rest (* r matches s if any of the possible matches consumes all of the * characters in s. *) fun matches (r:recognizer) (s:string) : bool = anynil (r (explode s)) (* always matches, consuming no input -- there's only one result, * which is the unconsumed list of cs. *) val always : recognizer = fn cs => [cs] (* never matches -- there are no results *) val never : recognizer = fn cs => [] (* matches when the first character satisfies predicate p -- only * one possible result. *) fun satisfy (p:char->bool) : recognizer = fn cs => (case cs of c::rest => if p c then [rest] else [] | [] => []) (* matches any alphabetic character *) val alpha : recognizer = satisfy Char.isAlpha (* matches any digit *) val digit : recognizer = satisfy Char.isDigit (* matches only the character c *) fun C (c:char) : recognizer = satisfy (fn c' => c' = c) val a : recognizer = C #"a" val b : recognizer = C #"b" (* matches any character other than c *) fun notC (c:char) : recognizer = satisfy (fn c' => c' <> c) val not_a : recognizer = notC #"a" (* matches when we are at the end of the list of characters *) val eof : recognizer = fn cs => (case cs of [] => [[]] | _::_ => []) (* matches if either r1 matches or r2 matches -- notice that * we concatenate the results of both parsers. *) fun alt (r1:recognizer) (r2:recognizer) : recognizer = fn cs => (r1 cs) @ (r2 cs) val a_or_b = alt a b val alpha_or_digit = alt alpha digit (* matches if r1 matches, and r2 matches the remainder -- this is * tricky because we must run r2 on all of the possible matches for * r1, and collect up all the results. *) fun seq (r1:recognizer) (r2:recognizer) : recognizer = fn cs => foldr (fn (cs,a) => (r2 cs) @ a) nil (r1 cs) val a_then_b = seq a b val alpha_then_digit_then_b = seq alpha (seq digit b) fun uncurry (f : 'a -> 'b -> 'c) : ('a * 'b) -> 'c = fn (x,y) => f x y fun alts (rs : recognizer list) : recognizer = foldr (uncurry alt) never rs val a_or_b_or_c = alts [a,b,C #"c"] val hex_digit = alts (map C (explode "0123456789abcdef")) fun seqs (rs : recognizer list) : recognizer = foldr (uncurry seq) always rs val a_then_b_then_c = seqs [a,b,C #"c"] (* Kleene's star: matches zero or more occurrences of r. * Because this is recursive, we must be careful... *) fun star (r:recognizer) : recognizer = fn cs => (alt (seq r (star r)) always) cs (* matches one or more occurrences of r *) fun plus (r:recognizer) : recognizer = seq r (star r) (* matches any string built from one or more digits *) val number = plus digit (* matches any string built from one or more alphabetic characters *) val simple_var = plus alpha (* a more realistic recognizer for ML identifiers -- they must * start with an alphabetic character, underscore, or single-quote, * and can then have zero or more alphabetic characters, underscores, * single-quotes, or digits. *) val var = let val non_dig = alts [alpha,C #"_", C #"'"] in seq non_dig (star (alt non_dig digit)) end (* C style comments -- they start with "/*", followed by any character, * terminated by "*/". *) val c_comment = seqs [C #"/", C #"*", star (alt (notC #"*") (seq (C #"*") (notC #"/"))), C #"*", C #"/"] (* white space -- e.g., zero or more spaces, tabs, newlines, linefeeds, * or comments. *) val white = star (alts [C #" ", C #"\n", C #"\t", C #"\r", c_comment]) val w = white (* Scheme-style s-expressions -- numbers, variables, or parentheses * wrapped around zero or more s-expressions. Note that we also must * account for white space. *) val rec sexp : recognizer = fn cs => seqs [w, alts [ number, simple_var, seqs [ C #"(", w, star sexp, w, C #")"] ], w] cs val hmm = seq (alt a (seq a a)) (alt b (seq b b)) val _ = matches hmm "abb" end