This site is updated infrequently. For up-to-date information, please visit the new OCaml website at ocaml.org.

ocamllex and python-style indentation
[ Home ] [ Index: by date | by threads ]
[ Search: ]

[ Message by date: previous | next ] [ Message in thread: previous | next ] [ Thread: previous | next ]
 Date: -- (:) From: Martin Jambon Subject: Re: [Caml-list] ocamllex and python-style indentation
```Andrej Bauer wrote:
> Thanks to Andreas, I'll have a look at the "old" code.
>
> I think I understand the general idea of inserting "virtual" tokens,
> but the details confuse me still. So starting with
>
>> if True:
>>     x = 3
>>     y = (2 +
>>       4 + 5)
>> else:
>>     x = 5
>>     if False:
>>         x = 8
>>         z = 2
>
> Martin suggests the following:
>
>> {
>> if True:
>> ;
>>    {
>>    x = 3
>>    ;
>>    y = (2 +
>>    ;
>>      {
>>      4 + 5)
>>      }
>>    }
>> ;
>> else:
>> ;
>>    {
>>    x = 5
>>    ;
>>    if False:
>>    ;
>>        {
>>        x = 8
>>        ;
>>        z = 2
>>        }
>>    }
>> }
>
> I have two questions. Notice that the { ... } and ( ... ) need not be
> correctly nested (in the top half), so how are we going to deal with
> this? The second question is, why are there the separators after and
> just before "else:". I would expect separators inside { .... }, but
> not around "else".

Original example:

if True:
    x = 3
    y = (2 +
      4 + 5)
else:
    x = 5
    if False:
        x = 8
        z = 2

For pure indentation concerns, it is equivalent to:

x
    x
    x
      x
x
    x
    x
        x
        x

Which is parsed into:

[
  Line;
  Block
    [
      Line;
      Line;
      Block
        [
          Line
        ]
    ];
  Line;
  Block
    [
      Line;
      Line;
      Block
        [
          Line;
          Line
        ]
    ]
]

I wrote the following code, which does the job.  You might want to use
ocamllex instead in order to better manage newline characters (CRLF...) and
line number directives, and to allow input from something other than a file
or in_channel.

Note that the following must be rejected:

x
    x
  x (indentation here could be only 0, 4 or more)

But this is accepted:

x
x
x
x

You could also enforce that the indentation of a block must be the current
indentation + k, for example k=2 for the whole input.

(******************* indent_parser.ml **********************)

(** A non-blank input line as produced by [parse_lines]: the source
    position of its first non-space character, paired with
    [(indent, text)] where [indent] is the number of leading spaces and
    [text] is the remainder of the line starting at that character. *)
type indent_line = Lexing.position * (int * string)

(** Tree built by [block_body] from a list of [indent_line]s.
    - [`Line (pos, text)] is a single line at the indentation of the
      enclosing block.
    - [`Block (pos, children)] groups consecutive lines indented more
      deeply than the enclosing block; [pos] is the position of the
      first line of that group. *)
type indent_tree =
[ `Line of (Lexing.position * string)
| `Block of (Lexing.position * indent_tree list) ]

(** [split s] separates the leading run of space characters from the
    rest of [s].

    Returns [Some (i, rest)] where [i] is the index of the first
    character that is not [' '] and [rest] is the suffix of [s]
    starting there, or [None] when [s] is empty or consists only of
    spaces.  Only the space character counts as indentation; a tab is
    treated as ordinary content. *)
let split s =
  let len = String.length s in
  (* Index of the first non-space character at or after [i], if any. *)
  let rec first_non_space i =
    if i >= len then None
    else if s.[i] = ' ' then first_non_space (i + 1)
    else Some i
  in
  match first_non_space 0 with
  | None -> None
  | Some i -> Some (i, String.sub s i (len - i))

(** [parse_lines fname ic] reads [ic] line by line until end of file and
    returns, in input order, one [indent_line] per line that [split]
    does not reject (i.e. lines that are not empty or spaces-only).

    For each kept line the recorded position has [pos_fname = fname],
    [pos_lnum] equal to the 1-based line number (skipped lines are
    still counted), [pos_bol] equal to the value of [pos_in ic] taken
    just before the line was read, and [pos_cnum = pos_bol + indent]. *)
let parse_lines fname ic : indent_line list =
  let read_line () = try Some (input_line ic) with End_of_file -> None in
  let rec loop lnum acc =
    (* The channel offset must be sampled before the line is consumed. *)
    let bol = pos_in ic in
    match read_line () with
    | None -> List.rev acc
    | Some s ->
      let lnum = lnum + 1 in
      let acc =
        match split s with
        | None -> acc
        | Some ((n, _) as x) ->
          let pos =
            {
              Lexing.pos_fname = fname;
              pos_lnum = lnum;
              pos_bol = bol;
              pos_cnum = bol + n;
            }
          in
          (pos, x) :: acc
      in
      loop lnum acc
  in
  loop 0 []

(** [parse_lines_from_file fname] opens [fname], runs [parse_lines] on
    it and closes the channel before returning the result.

    If anything raises after the file has been opened, the channel is
    closed with [close_in_noerr] and the same exception is raised
    again. *)
let parse_lines_from_file fname =
  let chan = open_in fname in
  (* Best-effort close, then propagate the original exception. *)
  let close_and_reraise exn =
    close_in_noerr chan;
    raise exn
  in
  try
    let parsed = parse_lines fname chan in
    close_in chan;
    parsed
  with exn -> close_and_reraise exn

(** [error pos msg] never returns: it raises [Failure] with a message
    consisting of a compiler-style location header built from [pos]
    (file name, line number, and the character range from 0 to the
    column offset [pos_cnum - pos_bol]) followed by a newline and
    [msg].

    @raise Failure always. *)
let error pos msg =
  let { Lexing.pos_fname; pos_lnum; pos_bol; pos_cnum } = pos in
  Printf.ksprintf failwith "File %S, line %i, characters %i-%i:\n%s"
    pos_fname pos_lnum 0 (pos_cnum - pos_bol) msg

(** [block_body cur_indent sub_indent cur_block l] consumes lines from
    [l] that belong to the block indented at [cur_indent].

    - [cur_block] is the accumulator of items already collected for
      this block, most recent first.
    - [sub_indent] is [Some k] once a nested block indented at [k] has
      been seen inside this block, [None] before that; every later
      nested block of this block must start at the same [k].

    Returns the items of the block in input order together with the
    lines that were not consumed (the first line indented less than
    [cur_indent] and everything after it, or [[]] at end of input).

    Fails through [error] with ["Inconsistent indentation"] when a
    nested block starts at an indentation different from the recorded
    [sub_indent]. *)
let rec block_body cur_indent sub_indent cur_block l :
    indent_tree list * indent_line list =
  match l with
  | [] -> (List.rev cur_block, [])
  | (_, (n, _)) :: _ when n < cur_indent ->
    (* Dedent: this block is finished; hand the line back to the caller. *)
    (List.rev cur_block, l)
  | (pos, (n, s)) :: rest when n = cur_indent ->
    block_body cur_indent sub_indent (`Line (pos, s) :: cur_block) rest
  | (pos, (n, s)) :: rest ->
    (* Deeper line: it opens a nested block. *)
    (match sub_indent with
     | Some expected when expected <> n ->
       error pos "Inconsistent indentation"
     | Some _ | None -> ());
    let nested, after = block_body n None [ `Line (pos, s) ] rest in
    block_body cur_indent (Some n) (`Block (pos, nested) :: cur_block) after

(** [parse_indentation fname] reads the file [fname] and returns its
    indentation tree, obtained by running [block_body] at indentation 0
    over the lines returned by [parse_lines_from_file].

    Asserts that [block_body] consumed every line. *)
let parse_indentation fname =
  let tree, leftover =
    block_body 0 None [] (parse_lines_from_file fname)
  in
  assert (leftover = []);
  tree

(** [test ()] writes the sample program from the discussion to a
    temporary file, parses it with [parse_indentation] and returns the
    resulting tree.

    The temporary file is removed whether parsing succeeds or not.  If
    a [Failure] is raised, its message is printed on standard error
    before the exception is re-raised; any other exception is re-raised
    unchanged. *)
let test () =
  (* The leading spaces inside this literal are significant: they are
     the input the indentation parser is being exercised on. *)
  let sample = "
if True:
    x = 3
    y = (2 +
      4 + 5)
else:
    x = 5
    if False:
        x = 8
        z = 2
" in
  let fname = Filename.temp_file "test" ".ind" in
  try
    let oc = open_out fname in
    (try
       output_string oc sample;
       close_out oc
     with e ->
       close_out_noerr oc;
       raise e);
    let result = parse_indentation fname in
    Sys.remove fname;
    result
  with e ->
    (match e with
     | Failure msg -> Printf.eprintf "%s\n%!" msg
     | _ -> ());
    (* Best-effort cleanup: the file may already be gone, and the
       exception being propagated is the one worth reporting. *)
    (try Sys.remove fname with Sys_error _ -> ());
    raise e

(*****************************************************************)

> Presumably the intermediate stage that I would preprocess the token
> stream would have to know about indentation levels. I have not tried
> this, but ocaml lexer will correctly match things like
>
> | '\n' [' ' '\t']* -> { INDENTATION (compute_indentation (lexeme buf)) }
>
> Yes?

Kind of.  Don't discard the rest of the line...
If you have a choice, reject tabs.
Beware of CRLF newlines (\r\n) and missing \n before the end of file.
Also ocamllex does not keep track of newlines automatically.  See the
documentation for Lexing.lexbuf.

Martin

--
http://mjambon.com/

```