Contact the author Pierre.Weis@inria.fr

Created in January 1996.

One hundred lines of Caml

As an example of parsing using lazy streams, let me write a parser for the type of expressions.
The parsing process is seen as the composition of two phases: from a stream of characters obtain first a stream of lexems, and from this stream of lexems get the abstract syntax tree.

Thus, I first define the lexical analyzer, using the "genlex" lexer generator module provided with the standard library of Caml Light. I just declare the list of keywords to the make_lexer function to get the lexical analyzer.

(* The lexer using the genlex module from standard library *) 
#open "genlex";;
#let lexer = make_lexer ["let"; "="; "in"; "("; ")"; "+"; "-"; "*"; "/"];;
lexer : char stream -> token stream = <fun>

As an exemple, I create a stream of character from a string and try the lexical analyzer. It creates the (lazy) stream of lexems from the string and returns immediately. It is thus necessary to get the lexems by repeatedly accessing the head of the token stream. Once a token has been obtained that way, the stream is updated, that is, this token is automatically removed from the stream. When the stream becomes empty a Parse_failure exception is raised.

#let stream = lexer (stream_of_string "1x  + ");;
stream : token stream = <abstr>
#match stream with
  [< 'Int i >] -> i;;
- : int = 1
#match stream with
  [< '(Ident s as lexem); rest >] -> lexem;;
- : token = Ident "x"
#match stream with
  [< '(Kwd c as lexem); rest >] -> lexem;;
- : token = Kwd "+"
#match stream with
  [< '(Ident s as lexem); rest >] -> lexem;;
Uncaught exception: Parse_failure

The syntactic analyzer takes care of precedences between operators. I thus define a ``binary operator reader'', that given a list of operators with the same precedence, read a sequence of operations with those operators (note that read_binop uses its functional argument read_base to read the basic components of the operation).

(* Read an operator *)
#let read_operator operators = function
  | [< (stream_check
         (function Kwd op -> mem op operators | _ -> false)) lexem >] ->
      match lexem with Kwd op -> op | _ -> failwith "read_operator";;
read_operator : string list -> token stream -> string = <fun>
#let read_binop read_base operators =
 let rec read_rest e1 = function
 | [< (read_operator operators) op; read_base e2;
      (read_rest (Binop (op, e1, e2))) e >] -> e
 | [< >] -> e1 in
 function [< read_base e1; (read_rest e1) e >] -> e;;
read_binop :
 (token stream -> expression) -> string list -> token stream -> expression =
 <fun>

The parser is now a collection of functions that perform a recursive-descent like parsing: precedences are taken into account via a stratification of sub-parsers.

#let rec basic = function
  | [< 'Ident s >] -> Var s
  | [< 'Int i >] -> Num i
  | [< 'Kwd "("; expr e; 'Kwd ")" >] -> e

and minus = function
  | [< 'Kwd "-"; basic e >] ->
      begin match e with
      | Num i -> Num (-i)
      | _ -> failwith "unary minus"
      end
  | [< basic e >] -> e

and multiplicative stream = read_binop minus ["*"; "/"] stream
and additive stream = read_binop multiplicative ["+"; "-"] stream

and expr = function
  | [< 'Kwd "let"; 'Ident x; 'Kwd "="; expr e1; 'Kwd "in"; expr e2 >] ->
       Let (x, e1, e2)
  | [< additive e >] -> e;;
basic : token stream -> expression = <fun>
minus : token stream -> expression = <fun>
multiplicative : token stream -> expression = <fun>
additive : token stream -> expression = <fun>
expr : token stream -> expression = <fun>

#let parse_expr s = expr (lexer (stream_of_string s));;
parse_expr : string -> expression = <fun>

Using the parser, we easily type in complex expressions:

#eval [] (parse_expr "let x = 2 in x * x + 3");;
- : int = 7


Caml home page Last modified: Friday, March 27, 1998
Copyright © 1995 - 2004, INRIA all rights reserved.

Contact the author Pierre.Weis@inria.fr