Version française
Home     About     Download     Resources     Contact us    
Browse thread
Patch to 3.10.0 compiler enabling simple spell-checking
[ Home ] [ Index: by date | by threads ]
[ Search: ]

[ Message by date: previous | next ] [ Message in thread: previous | next ] [ Thread: previous | next ]
Date: -- (:)
From: Yitzhak Mandelbaum <yitzhak@r...>
Subject: Re: [Caml-list] Patch to 3.10.0 compiler enabling simple spell-checking
Very cool! Do you think there's any way you could separate it from  
the compiler, like Learner et al.'s SEMINAL work, which separates  
type error messages from the compiler?. Separation could help ensure  
this (and any other, similar) ideas don't accidentally introduce bugs  
into the compiler, and make it much easier for you to maintain. A  
very simple hack might be tod wrap ocamlc in a script that parses  
such error messages and then tokenizes the source file, looking for  
similar strings?

Cheers,
Yitzhak


On Oct 29, 2007, at 5:11 PM, Edgar Friendly wrote:

> One random little feature of GNAT that comes in handy for me is its
> habit of, when I misspell an identifier, giving me a possible  
> correction
> in its compile error message.  Spending some time with the 3.10.0
> sources, I have created a "second draft" patch creating this
> functionality in my favored language.
>
> Example:
> ========
>
> # /home/thelema/Projects/ocaml-custom/bin/ocamlc -o coml -I +lablgtk2
> lablgtk.cma gtkInit.cmo coml.ml
> File "coml.ml", line 61, characters 16-25:
> Unbound value is_arcive, possible misspelling of is_archive
>
> Impacts:
> ========
>
> Efficiency in the case of finding a mistake should be quite good,
> although this shouldn't matter too much since the compiler quits  
> pretty
> early in compilation when it finds an unbound identifier.
>
> In the case of no unbound identifiers, the cost is an extra try/with
> block around the standard lookup.  I haven't made any benchmarks,  
> though.
>
> I expect this code to have little long term maintenance issues - the
> major source of code changes was adding a "* string list" to a  
> number of
> exceptions to carry the list of possible correct spellings to the  
> point
> they get output by the compiler.  These exceptions are still usable as
> before with an empty list in this spot.
>
> It's possible the code has created opportunities for uncaught  
> exceptions
> in the compiler as I only checked for instances of "Not_found" in a  
> few
> files -- those which dealt with the Unbound_* exceptions.  Someone who
> knows the internals better might find places the "Found_nearly"
> exception that carries possible corrections might escape into.
>
>
> Dedicated to:
> Yaron Minsky and the team at Jane Street
>
> E.
> diff --git a/typing/ident.ml b/typing/ident.ml
> index a30aa7a..8bbb395 100644
> --- a/typing/ident.ml
> +++ b/typing/ident.ml
> @@ -56,6 +56,61 @@ let same i1 i2 = i1 = i2
>         then i1.stamp = i2.stamp
>         else i2.stamp = 0 && i1.name = i2.name *)
>
> +(* does various heuristics to find spelling mistakes - algorithm  
> from GNAT Ada compiler - Lots of code, but hopefully about as  
> efficient as possible *)
> +let nearly_equal s i =
> +  let is_digit c = c >= '0' && c <= '9' in
> +  let rec tails_equal s1 i1 s2 i2 = i1 >= String.length s1 || (s1. 
> [i1] = s2.[i2] && tails_equal s1 (i1+1) s2 (i2+1)) in
> +
> +  let n1 = s and n2 = i.name in
> +  let l1 = String.length n1 and l2 = String.length n2 in
> +  if l1 = 0 then l2 = 0 (* both null -> equal *)
> +  else if l2 = 0 then false (* one of n1 or n2 null -> not equal *)
> +  else if n1.[0] != n2.[0] then false (* first characters don't  
> match -> not *)
> +  else if l1 < 3 && l2 < 3 then false (* short strings are all  
> different *)
> +  else if l1 = l2 then
> +    (* look for single errors, transpositions *)
> +    let rec find_diff c =
> +      if c < (l1 - 1) then
> +	if n1.[c] != n2.[c] then
> +	  (* mismatched digits -> not equal *)
> +	  if is_digit n1.[c] && is_digit n2.[c] then false
> +	    (* single error -> equal *)
> +	  else if n1.[c+1] = n2.[c+1] && tails_equal n1 (c+2) n2 (c+2)
> +	  then true
> +	    (* transposition -> equal *)
> +	  else if n1.[c] = n2.[c+1] && n1.[c+1] = n2.[c] &&
> +	  tails_equal n1 (c+2) n2 (c+2)
> +	  then true
> +	  else false
> +	else find_diff (c+1)
> +      else (* at the end of the strings *)
> +	if is_digit n1.[c] && is_digit n2.[c]
> +	  && n1.[c] != n2.[c]
> +	then false
> +	else true
> +    in
> +    find_diff 1
> +  else if l1 = l2 - 1 then (* short by one *)
> +    let rec find_del c =
> +      if c < l1 then
> +	if n1.[c] != n2.[c] then
> +	  tails_equal n1 c n2 (c+1)
> +	else find_del (c+1)
> +      else true (* last character was deleted *)
> +    in
> +    find_del 1
> +  else if l1 = l2 + 1 then (* too long by one *)
> +    let rec find_add c =
> +      if c < l2 then
> +	if n1.[c] != n2.[c] then
> +	  tails_equal n1 (c+1) n2 c
> +	else find_add (c+1)
> +      else true (* last character added *)
> +    in
> +    find_add 1
> +  else (* lengths totally different *)
> +    false
> +	
>  let binding_time i = i.stamp
>
>  let current_time() = !currentstamp
> @@ -97,6 +152,8 @@ and 'a data =
>
>  let empty = Empty
>
> +exception Found_nearly of string list
> +
>  (* Inline expansion of height for better speed
>   * let height = function
>   *     Empty -> 0
> @@ -182,3 +239,34 @@ let rec keys_aux stack accu = function
>        keys_aux (l :: stack) (k.ident :: accu) r
>
>  let keys tbl = keys_aux [] [] tbl
> +
> +let find_nearly_equal n tbl =
> +(*  List.filter (nearly_equal name) (keys tbl) -- optimized for  
> your enjoyment*)
> +  let rec find_nearly_equal_aux stack accu = function
> +      Empty ->
> +	begin match stack with
> +            [] -> accu
> +	  | a :: l -> find_nearly_equal_aux l accu a
> +	end
> +    | Node(l, k, r, _) ->
> +	let accu' =
> +	  if nearly_equal n k.ident
> +	  then k.ident.name :: accu
> +	  else accu
> +	in
> +	find_nearly_equal_aux (l :: stack) accu' r
> +  in
> +  find_nearly_equal_aux [] [] tbl
> +
> +let find_name_with_nearly name tbl =
> +  try find_name name tbl
> +  with Not_found ->
> +(*    prerr_string "FNE: "; prerr_endline name; *)
> +    let nearlies = find_nearly_equal name tbl in
> +(*    let rec pr_list to_str = function [] -> prerr_endline  
> "Empty" | [a] -> prerr_endline (to_str a) | h :: t -> prerr_string  
> (to_str h); prerr_string ", "; pr_list to_str t
> +    in
> +    prerr_string "nearlies: ";
> +    pr_list (fun i -> i) nearlies;
> +    prerr_string "whole table: ";
> +    pr_list (fun i -> i.name) (keys tbl); *)
> +    raise (Found_nearly nearlies)
> diff --git a/typing/ident.mli b/typing/ident.mli
> index 9f7372c..7ab9327 100644
> --- a/typing/ident.mli
> +++ b/typing/ident.mli
> @@ -52,8 +52,12 @@ val print: Format.formatter -> t -> unit
>  type 'a tbl
>          (* Association tables from identifiers to type 'a. *)
>
> +exception Found_nearly of string list
> +
>  val empty: 'a tbl
>  val add: t -> 'a -> 'a tbl -> 'a tbl
>  val find_same: t -> 'a tbl -> 'a
>  val find_name: string -> 'a tbl -> 'a
>  val keys: 'a tbl -> t list
> +
> +val find_name_with_nearly: string -> 'a tbl -> 'a
> diff --git a/typing/typecore.ml b/typing/typecore.ml
> index 691eb49..adff3ad 100644
> --- a/typing/typecore.ml
> +++ b/typing/typecore.ml
> @@ -23,9 +23,9 @@ open Btype
>  open Ctype
>
>  type error =
> -    Unbound_value of Longident.t
> -  | Unbound_constructor of Longident.t
> -  | Unbound_label of Longident.t
> +    Unbound_value of Longident.t * string list
> +  | Unbound_constructor of Longident.t * string list
> +  | Unbound_label of Longident.t * string list
>    | Polymorphic_label of Longident.t
>    | Constructor_arity_mismatch of Longident.t * int * int
>    | Label_mismatch of Longident.t * (type_expr * type_expr) list
> @@ -42,11 +42,11 @@ type error =
>    | Bad_conversion of string * int * char
>    | Undefined_method of type_expr * string
>    | Undefined_inherited_method of string
> -  | Unbound_class of Longident.t
> +  | Unbound_class of Longident.t * string list
>    | Virtual_class of Longident.t
>    | Private_type of type_expr
>    | Private_label of Longident.t * type_expr
> -  | Unbound_instance_variable of string
> +  | Unbound_instance_variable of string * string list
>    | Instance_variable_not_mutable of string
>    | Not_subtype of (type_expr * type_expr) list * (type_expr *  
> type_expr) list
>    | Outside_class
> @@ -276,7 +276,7 @@ let rec build_as_type env p =
>        unify_pat env {p2 with pat_type = ty2} ty1;
>        begin match path with None -> ()
>        | Some path ->
> -          let td = try Env.find_type path env with Not_found ->  
> assert false in
> +          let td = try Env.find_type path env with Not_found |  
> Ident.Found_nearly _ -> assert false in
>            let params = List.map (fun _ -> newvar()) td.type_params in
>            match expand_head env (newty (Tconstr (path, params, ref  
> Mnil)))
>            with {desc=Tvariant row} when static_row row ->
> @@ -291,7 +291,9 @@ let build_or_pat env loc lid =
>    let path, decl =
>      try Env.lookup_type lid env
>      with Not_found ->
> -      raise(Typetexp.Error(loc, Typetexp.Unbound_type_constructor  
> lid))
> +      raise(Typetexp.Error(loc, Typetexp.Unbound_type_constructor  
> (lid,[])))
> +      | Ident.Found_nearly l ->
> +      raise(Typetexp.Error(loc, Typetexp.Unbound_type_constructor  
> (lid,l)))
>    in
>    let tyl = List.map (fun _ -> newvar()) decl.type_params in
>    let fields =
> @@ -400,7 +402,10 @@ let rec type_pat env sp =
>          try
>            Env.lookup_constructor lid env
>          with Not_found ->
> -          raise(Error(sp.ppat_loc, Unbound_constructor lid)) in
> +          raise(Error(sp.ppat_loc, Unbound_constructor (lid,[])))
> +	  | Ident.Found_nearly l ->
> +	  raise(Error(sp.ppat_loc, Unbound_constructor (lid,l)))
> +      in
>        let sargs =
>          match sarg with
>            None -> []
> @@ -449,7 +454,10 @@ let rec type_pat env sp =
>            try
>              Env.lookup_label lid env
>            with Not_found ->
> -            raise(Error(sp.ppat_loc, Unbound_label lid)) in
> +            raise(Error(sp.ppat_loc, Unbound_label (lid,[])))
> +	    | Ident.Found_nearly l ->
> +	    raise(Error(sp.ppat_loc, Unbound_label (lid,l)))
> +	in
>          begin_def ();
>          let (vars, ty_arg, ty_res) = instance_label false label in
>          if vars = [] then end_def ();
> @@ -806,7 +814,7 @@ let rec approx_type env sty =
>          if List.length ctl <> decl.type_arity then raise Not_found;
>          let tyl = List.map (approx_type env) ctl in
>          newconstr path tyl
> -      with Not_found -> newvar ()
> +      with Not_found | Ident.Found_nearly _ -> newvar ()
>        end
>    | _ -> newvar ()
>
> @@ -923,7 +931,9 @@ let rec type_exp env sexp =
>            exp_type = instance desc.val_type;
>            exp_env = env }
>        with Not_found ->
> -        raise(Error(sexp.pexp_loc, Unbound_value lid))
> +        raise(Error(sexp.pexp_loc, Unbound_value (lid,[])))
> +	| Ident.Found_nearly l ->
> +	raise(Error(sexp.pexp_loc, Unbound_value (lid,l)))
>        end
>    | Pexp_constant cst ->
>        re {
> @@ -1021,7 +1031,10 @@ let rec type_exp env sexp =
>            try
>              Env.lookup_label lid env
>            with Not_found ->
> -            raise(Error(sexp.pexp_loc, Unbound_label lid)) in
> +            raise(Error(sexp.pexp_loc, Unbound_label (lid,[])))
> +	    | Ident.Found_nearly l ->
> +	    raise(Error(sexp.pexp_loc, Unbound_label (lid,l)))
> +	in
>          begin_def ();
>          if !Clflags.principal then begin_def ();
>          let (vars, ty_arg, ty_res) = instance_label true label in
> @@ -1098,7 +1111,10 @@ let rec type_exp env sexp =
>          try
>            Env.lookup_label lid env
>          with Not_found ->
> -          raise(Error(sexp.pexp_loc, Unbound_label lid)) in
> +          raise(Error(sexp.pexp_loc, Unbound_label (lid,[])))
> +	  | Ident.Found_nearly l ->
> +	  raise(Error(sexp.pexp_loc, Unbound_label (lid,l)))
> +      in
>        let (_, ty_arg, ty_res) = instance_label false label in
>        unify_exp env arg ty_res;
>        re {
> @@ -1112,7 +1128,10 @@ let rec type_exp env sexp =
>          try
>            Env.lookup_label lid env
>          with Not_found ->
> -          raise(Error(sexp.pexp_loc, Unbound_label lid)) in
> +          raise(Error(sexp.pexp_loc, Unbound_label (lid,[])))
> +	  | Ident.Found_nearly l ->
> +	  raise(Error(sexp.pexp_loc, Unbound_label (lid,l)))
> +      in
>        if label.lbl_mut = Immutable then
>          raise(Error(sexp.pexp_loc, Label_not_mutable lid));
>        begin_def ();
> @@ -1332,7 +1351,9 @@ let rec type_exp env sexp =
>    | Pexp_new cl ->
>        let (cl_path, cl_decl) =
>          try Env.lookup_class cl env with Not_found ->
> -          raise(Error(sexp.pexp_loc, Unbound_class cl))
> +          raise(Error(sexp.pexp_loc, Unbound_class (cl,[])))
> +	  | Ident.Found_nearly l ->
> +	  raise(Error(sexp.pexp_loc, Unbound_class (cl,l)))
>        in
>          begin match cl_decl.cty_new with
>            None ->
> @@ -1361,10 +1382,13 @@ let rec type_exp env sexp =
>          | Val_ivar _ ->
>              raise(Error(sexp.pexp_loc,  
> Instance_variable_not_mutable lab))
>          | _ ->
> -            raise(Error(sexp.pexp_loc, Unbound_instance_variable  
> lab))
> +            raise(Error(sexp.pexp_loc, Unbound_instance_variable  
> (lab,[])))
>        with
>          Not_found ->
> -          raise(Error(sexp.pexp_loc, Unbound_instance_variable lab))
> +          raise(Error(sexp.pexp_loc, Unbound_instance_variable  
> (lab,[])))
> +	| Ident.Found_nearly l ->
> +	  raise(Error(sexp.pexp_loc, Unbound_instance_variable (lab,l)))
> +
>        end
>    | Pexp_override lst ->
>        let _ =
> @@ -1380,7 +1404,7 @@ let rec type_exp env sexp =
>          try
>            Env.lookup_value (Longident.Lident "selfpat-*") env,
>            Env.lookup_value (Longident.Lident "self-*") env
> -        with Not_found ->
> +        with Not_found | Ident.Found_nearly _ ->
>            raise(Error(sexp.pexp_loc, Outside_class))
>        with
>          (_, {val_type = self_ty; val_kind = Val_self (_, vars, _,  
> _)}),
> @@ -1391,7 +1415,9 @@ let rec type_exp env sexp =
>                (Path.Pident id, type_expect env snewval (instance ty))
>              with
>                Not_found ->
> -                raise(Error(sexp.pexp_loc,  
> Unbound_instance_variable lab))
> +                raise(Error(sexp.pexp_loc,  
> Unbound_instance_variable (lab,[])))
> +	      | Ident.Found_nearly l ->
> +		raise(Error(sexp.pexp_loc, Unbound_instance_variable (lab,l)))
>              end
>            in
>            let modifs = List.map type_override lst in
> @@ -1637,7 +1663,7 @@ and type_application env funct sargs =
>                    may_warn sarg0.pexp_loc
>                      (Warnings.Not_principal "commuting this  
> argument");
>                  (l', sarg0, sargs1 @ sargs2, more_sargs)
> -              with Not_found ->
> +              with Not_found | Ident.Found_nearly _ ->
>                  let (l', sarg0, sargs1, sargs2) =
>                    extract_label name more_sargs in
>                  if sargs1 <> [] || sargs <> [] then
> @@ -1707,7 +1733,9 @@ and type_construct env loc lid sarg  
> explicit_arity ty_expected =
>      try
>        Env.lookup_constructor lid env
>      with Not_found ->
> -      raise(Error(loc, Unbound_constructor lid)) in
> +      raise(Error(loc, Unbound_constructor (lid,[])))
> +      | Ident.Found_nearly l ->
> +      raise(Error(loc, Unbound_constructor (lid,l))) in
>    let sargs =
>      match sarg with
>        None -> []
> @@ -2020,12 +2048,18 @@ open Format
>  open Printtyp
>
>  let report_error ppf = function
> -  | Unbound_value lid ->
> -      fprintf ppf "Unbound value %a" longident lid
> -  | Unbound_constructor lid ->
> +  | Unbound_value (lid,[]) ->
> +      fprintf ppf "Unbound value %a" longident lid
> +  | Unbound_value (lid,corr::_) ->
> +      fprintf ppf "Unbound value %a, possible misspelling of %s"  
> longident lid corr
> +  | Unbound_constructor (lid,[]) ->
>        fprintf ppf "Unbound constructor %a" longident lid
> -  | Unbound_label lid ->
> +  | Unbound_constructor (lid,corr::_) ->
> +      fprintf ppf "Unbound constructor %a, possible misspelling of  
> %s" longident lid corr
> +  | Unbound_label (lid,[]) ->
>        fprintf ppf "Unbound record field label %a" longident lid
> +  | Unbound_label (lid,corr::_) ->
> +      fprintf ppf "Unbound record field label %a, possible  
> misspelling of %s" longident lid corr
>    | Polymorphic_label lid ->
>        fprintf ppf "@[The record field label %a is polymorphic.@ %s@]"
>          longident lid "You cannot instantiate it in a pattern."
> @@ -2100,13 +2134,17 @@ let report_error ppf = function
>           It has no method %s@]" type_expr ty me
>    | Undefined_inherited_method me ->
>        fprintf ppf "This expression has no method %s" me
> -  | Unbound_class cl ->
> +  | Unbound_class (cl,[]) ->
>        fprintf ppf "Unbound class %a" longident cl
> +  | Unbound_class (cl,corr::_) ->
> +      fprintf ppf "Unbound class %a, possible misspelling of %s"  
> longident cl corr
>    | Virtual_class cl ->
>        fprintf ppf "One cannot create instances of the virtual  
> class %a"
>          longident cl
> -  | Unbound_instance_variable v ->
> +  | Unbound_instance_variable (v,[]) ->
>        fprintf ppf "Unbound instance variable %s" v
> +  | Unbound_instance_variable (v,corr::_) ->
> +      fprintf ppf "Unbound instance variable %s, possible  
> misspelling of %s" v corr
>    | Instance_variable_not_mutable v ->
>        fprintf ppf "The instance variable %s is not mutable" v
>    | Not_subtype(tr1, tr2) ->
> diff --git a/typing/typecore.mli b/typing/typecore.mli
> index 24aea7d..171e96e 100644
> --- a/typing/typecore.mli
> +++ b/typing/typecore.mli
> @@ -61,9 +61,9 @@ val force_delayed_checks: unit -> unit
>  val self_coercion : (Path.t * Location.t list ref) list ref
>
>  type error =
> -    Unbound_value of Longident.t
> -  | Unbound_constructor of Longident.t
> -  | Unbound_label of Longident.t
> +    Unbound_value of Longident.t * string list
> +  | Unbound_constructor of Longident.t * string list
> +  | Unbound_label of Longident.t * string list
>    | Polymorphic_label of Longident.t
>    | Constructor_arity_mismatch of Longident.t * int * int
>    | Label_mismatch of Longident.t * (type_expr * type_expr) list
> @@ -80,11 +80,11 @@ type error =
>    | Bad_conversion of string * int * char
>    | Undefined_method of type_expr * string
>    | Undefined_inherited_method of string
> -  | Unbound_class of Longident.t
> +  | Unbound_class of Longident.t * string list
>    | Virtual_class of Longident.t
>    | Private_type of type_expr
>    | Private_label of Longident.t * type_expr
> -  | Unbound_instance_variable of string
> +  | Unbound_instance_variable of string * string list
>    | Instance_variable_not_mutable of string
>    | Not_subtype of (type_expr * type_expr) list * (type_expr *  
> type_expr) list
>    | Outside_class
> _______________________________________________
> Caml-list mailing list. Subscription management:
> http://yquem.inria.fr/cgi-bin/mailman/listinfo/caml-list
> Archives: http://caml.inria.fr
> Beginner's list: http://groups.yahoo.com/group/ocaml_beginners
> Bug reports: http://caml.inria.fr/bin/caml-bugs

--------------------------------------------------
Yitzhak Mandelbaum
AT&T Labs - Research

http://www.research.att.com/~yitzhak