Version française
Home     About     Download     Resources     Contact us    
Browse thread
ocamllex and python-style indentation
[ Home ] [ Index: by date | by threads ]
[ Search: ]

[ Message by date: previous | next ] [ Message in thread: previous | next ] [ Thread: previous | next ]
Date: -- (:)
From: Martin Jambon <martin.jambon@e...>
Subject: Re: [Caml-list] ocamllex and python-style indentation
Andrej Bauer wrote:
> Thanks to Andreas, I'll have a look at the "old" code.
> 
> I think I understand the general idea of inserting "virtual" tokens,
> but the details confuse me still. So starting with
> 
>> if True:
>>     x = 3
>>     y = (2 +
>>       4 + 5)
>> else:
>>     x = 5
>>     if False:
>>         x = 8
>>         z = 2
> 
> Martin suggests the following:
> 
>> {
>> if True:
>> ;
>>    {
>>    x = 3
>>    ;
>>    y = (2 +
>>    ;
>>      {
>>      4 + 5)
>>      }
>>    }
>> ;
>> else:
>> ;
>>    {
>>    x = 5
>>    ;
>>    if False:
>>    ;
>>        {
>>        x = 8
>>        ;
>>        z = 2
>>        }
>>    }
>> }
> 
> I have two questions. Notice that the { ... } and ( ... ) need not be
> correctly nested (in the top half), so how are we going to deal with
> this? The second question is, why are there the separators after and
> just before "else:". I would expect separators inside { .... }, but
> not around "else".

Original example:

if True:
    x = 3
    y = (2 +
      4 + 5)
else:
    x = 5
    if False:
        x = 8
        z = 2


For pure indentation concerns, it is equivalent to:

x
  x
  x
    x
x
  x
  x
    x
    x


Which is parsed into:

[
  Line;
  Block
    [
      Line;
      Line;
      Block
       [
         Line
       ]
    ];
  Line;
  Block
    [
      Line;
      Line;
      Block
        [
          Line;
          Line
        ]
    ]
]


I wrote the following code, which does the job.  You might want to use
ocamllex instead, in order to better manage newline characters (CRLF...) and
line number directives, and to allow input from something other than a file
or in_channel.


Note that the following must be rejected:

x
    x
  x (indentation here could be only 0, 4 or more)


But this is accepted:

x
    x
x
  x


You could also enforce that the indentation of a block must be the current
indentation + k, for example k=2 for the whole input.



(******************* indent_parser.ml **********************)

(** One non-blank input line, as produced by [parse_lines]: the source
    position of its first non-space character, paired with
    [(indentation, text)], where [indentation] is the number of leading
    spaces and [text] is the rest of the line starting at that character. *)
type indent_line = Lexing.position * (int * string)

(** Indentation structure built by [block_body].

    - [`Line (pos, text)] is a single line, with its leading spaces removed.
    - [`Block (pos, items)] is a group of lines that are more indented than
      the surrounding ones; [pos] is the position of the first line of the
      block and [items] are its lines and nested blocks, in input order. *)
type indent_tree =
    [ `Line of (Lexing.position * string)
    | `Block of (Lexing.position * indent_tree list) ]


(** [split s] separates the leading spaces of [s] from its content.

    Returns [Some (n, rest)], where [n] is the index of the first character
    of [s] that is not a space and [rest] is the suffix of [s] starting at
    that index.  Returns [None] when [s] is empty or made of spaces only.
    Only the space character counts as indentation: a tab is treated like
    any other character. *)
let split s =
  let len = String.length s in
  let rec first_non_space i =
    if i >= len then None
    else if s.[i] = ' ' then first_non_space (i + 1)
    else Some (i, String.sub s i (len - i))
  in
  first_non_space 0

(** [parse_lines fname ic] reads [ic] line by line until end of file and
    returns, in input order, one entry for every line that [split] does not
    report as blank.

    Each entry pairs the [(indentation, text)] result of [split] with a
    [Lexing.position] in which [pos_fname] is [fname], [pos_lnum] is the
    1-based line number (blank lines are counted too), [pos_bol] is the
    channel offset at which the line starts and [pos_cnum] is the offset of
    its first non-space character. *)
let parse_lines fname ic : indent_line list =
  let next_line () =
    (* The offset has to be sampled before the line is consumed. *)
    let bol = pos_in ic in
    try Some (bol, input_line ic) with End_of_file -> None
  in
  let rec loop lnum acc =
    match next_line () with
    | None -> List.rev acc
    | Some (bol, s) ->
        let lnum = lnum + 1 in
        (match split s with
         | None -> loop lnum acc
         | Some ((n, _) as x) ->
             let pos = {
               Lexing.pos_fname = fname;
               pos_lnum = lnum;
               pos_bol = bol;
               pos_cnum = bol + n;
             } in
             loop lnum ((pos, x) :: acc))
  in
  loop 0 []

(** [parse_lines_from_file fname] opens the file [fname], runs
    [parse_lines] on it and closes the channel before returning the result.

    If reading or closing raises, the channel is closed with
    [close_in_noerr] and the same exception is raised again. *)
let parse_lines_from_file fname =
  let ic = open_in fname in
  let read_and_close () =
    let lines = parse_lines fname ic in
    close_in ic;
    lines
  in
  try read_and_close ()
  with exn ->
    close_in_noerr ic;
    raise exn

(** [error pos msg] always raises [Failure].

    The exception text starts with a location header built from [pos]:
    file name, line number, and a character range that goes from 0 to the
    column of [pos] (that is, [pos_cnum - pos_bol]).  [msg] follows on the
    next line. *)
let error pos msg =
  let { Lexing.pos_fname = fname;
        pos_lnum = line;
        pos_bol = bol;
        pos_cnum = cnum } = pos in
  Printf.ksprintf failwith "File %S, line %i, characters %i-%i:\n%s"
    fname line 0 (cnum - bol) msg

(** [block_body cur_indent sub_indent cur_block l] parses the lines of the
    block whose indentation is [cur_indent].

    - [cur_block] holds the items of this block parsed so far, most recent
      first.
    - [sub_indent] is [Some k] when the previous item of this block was a
      nested block of indentation [k], and [None] otherwise.
    - [l] is the remaining input.

    Returns the items of the block in input order, together with the input
    that was not consumed, which starts at the first line indented less
    than [cur_indent].

    A line indented more than [cur_indent] opens a nested block.  When such
    a line comes directly after a nested block of a different indentation,
    it is a partial dedent that matches no enclosing block, and [error] is
    called with the message shown below, so [Failure] is raised.  For
    instance indentations 0, 4, 2 are rejected.

    Nested blocks separated by a line at [cur_indent] are independent and
    may use different indentations, so 0, 4, 0, 2 is accepted.  Earlier
    versions kept the first nested indentation for the whole block and
    rejected that input. *)
let rec block_body cur_indent sub_indent cur_block l :
    indent_tree list * indent_line list =
  match l with
  | [] -> (List.rev cur_block, [])
  | (pos, (n, s)) :: tl ->
      if n = cur_indent then
        (* A plain line ends the run of the previous nested block: the next
           nested block is free to choose its own indentation. *)
        block_body cur_indent None (`Line (pos, s) :: cur_block) tl
      else if n > cur_indent then begin
        (match sub_indent with
         | None -> ()
         | Some n' ->
             (* We just left a nested block of indentation n'; this line
                must either return to cur_indent or lower, or reopen a
                block at exactly n'. *)
             if n <> n' then error pos "Inconsistent indentation");
        let sub_block, remaining =
          block_body n None [ `Line (pos, s) ] tl
        in
        block_body cur_indent (Some n)
          (`Block (pos, sub_block) :: cur_block)
          remaining
      end
      else
        (* Dedent below this block: hand the line back to the caller. *)
        (List.rev cur_block, l)


(** [parse_indentation fname] reads the file [fname] and returns its
    indentation tree, taking indentation 0 as the top level.

    Exceptions raised while reading the file or by [block_body] are not
    caught here. *)
let parse_indentation fname =
  let tree, leftover =
    block_body 0 None [] (parse_lines_from_file fname)
  in
  (* Indentations are never negative, so the top-level block consumes
     every line. *)
  assert (leftover = []);
  tree


(** [test ()] writes a small Python-like sample to a temporary file, parses
    it with [parse_indentation] and returns the resulting tree.

    The temporary file is removed whether or not parsing succeeds, and the
    output channel is closed even if writing fails.  When a [Failure] is
    raised its message is printed on standard error.  In all cases the
    original exception is raised again after the clean-up. *)
let test () =
  let fname = Filename.temp_file "test" ".ind" in
  let result =
    try
      let oc = open_out fname in
      (try
         output_string oc "
if True:
    x = 3
    y = (2 +
      4 + 5)
else:
    x = 5
    if False:
        x = 8
        z = 2
";
         close_out oc
       with e ->
         close_out_noerr oc;
         raise e);
      parse_indentation fname
    with e ->
      (match e with
       | Failure msg -> Printf.eprintf "%s\n%!" msg
       | _ -> ());
      (* Best-effort removal: the exception being propagated is the one
         that matters, not a secondary failure to delete the file. *)
      (try Sys.remove fname with Sys_error _ -> ());
      raise e
  in
  Sys.remove fname;
  result


(*****************************************************************)






> Presumably the intermediate stage that I would preprocess the token
> stream would have to know about indentation levels. I have not tried
> this, but ocaml lexer will correctly match things like
> 
> | '\n' [' ' '\t']* -> { INDENTATION (compute_indentation (lexeme buf)) }
> 
> Yes?

Kind of.  Don't discard the rest of the line...
If you have a choice, reject tabs.
Beware of CRLF newlines (\r\n) and missing \n before the end of file.
Also ocamllex does not keep track of newlines automatically.  See the
documentation for Lexing.lexbuf.



Martin

-- 
http://mjambon.com/