diff --git a/src/dune_project.ml b/src/dune_project.ml index 52b149a0..a24003ab 100644 --- a/src/dune_project.ml +++ b/src/dune_project.ml @@ -149,8 +149,7 @@ module Lang = struct in let ver = Sexp.Of_sexp.parse Syntax.Version.t Univ_map.empty - (Atom (ver_loc, Sexp.Atom.of_string ver)) - in + (Atom (ver_loc, Sexp.Atom.of_string ver)) in match Hashtbl.find langs name with | None -> Loc.fail name_loc "Unknown language %S.%s" name diff --git a/src/stdune/sexp.ml b/src/stdune/sexp.ml index 69dad943..e945cd85 100644 --- a/src/stdune/sexp.ml +++ b/src/stdune/sexp.ml @@ -298,13 +298,13 @@ module Of_sexp = struct basic "Integer" (fun s -> match int_of_string s with | x -> Ok x - | exception _ -> Error ()) + | exception _ -> Result.Error ()) let float = basic "Float" (fun s -> match float_of_string s with | x -> Ok x - | exception _ -> Error ()) + | exception _ -> Result.Error ()) let pair a b = enter @@ -333,7 +333,7 @@ module Of_sexp = struct let string_map t = list (pair string t) >>= fun bindings -> match String.Map.of_list bindings with - | Ok x -> return x + | Result.Ok x -> return x | Error (key, _v1, _v2) -> loc >>= fun loc -> of_sexp_errorf loc "key %s present multiple times" key diff --git a/src/usexp/atom.ml b/src/usexp/atom.ml new file mode 100644 index 00000000..307857ce --- /dev/null +++ b/src/usexp/atom.ml @@ -0,0 +1,50 @@ +type t = A of string [@@unboxed] + +let invalid_argf fmt = Printf.ksprintf invalid_arg fmt + +type syntax = Jbuild | Dune + +let is_valid_dune = + let rec loop s i len = + i = len || + match String.unsafe_get s i with + | '%' | '"' | '(' | ')' | ';' | '\000'..'\032' | '\127'..'\255' -> false + | _ -> loop s (i + 1) len + in + fun s -> + let len = String.length s in + len > 0 && loop s 0 len + +let is_valid_jbuild str = + let len = String.length str in + len > 0 && + let rec loop ix = + match str.[ix] with + | '"' | '(' | ')' | ';' -> true + | '|' -> ix > 0 && let next = ix - 1 in str.[next] = '#' || loop next + | '#' -> ix > 0 && let next = ix - 1 in str.[next] = '|' || loop next + | ' ' | '\t' | '\n' | '\012' | '\r' -> true + | _ -> ix > 0 && loop (ix - 1) + in + not (loop (len - 1)) + +let of_string s = A s +let to_string (A s) = s + +let is_valid (A t) = function + | Jbuild -> is_valid_jbuild t + | Dune -> is_valid_dune t + +let print ((A s) as t) syntax = + if is_valid t syntax then + s + else + match syntax with + | Jbuild -> invalid_argf "atom '%s' cannot be printed in jbuild syntax" s + | Dune -> invalid_argf "atom '%s' cannot be in dune syntax" s + +let of_int i = of_string (string_of_int i) +let of_float x = of_string (string_of_float x) +let of_bool x = of_string (string_of_bool x) +let of_digest d = of_string (Digest.to_hex d) +let of_int64 i = of_string (Int64.to_string i) diff --git a/src/usexp/atom.mli b/src/usexp/atom.mli new file mode 100644 index 00000000..9b63bc49 --- /dev/null +++ b/src/usexp/atom.mli @@ -0,0 +1,17 @@ +type t = private A of string [@@unboxed] + +type syntax = Jbuild | Dune + +val is_valid_dune : string -> bool +val is_valid : t -> syntax -> bool + +val of_string : string -> t +val to_string : t -> string + +val print : t -> syntax -> string + +val of_int : int -> t +val of_float : float -> t +val of_bool : bool -> t +val of_int64 : Int64.t -> t +val of_digest : Digest.t -> t diff --git a/src/usexp/lexer.mli b/src/usexp/lexer.mli index 5015e1ad..22407e45 100644 --- a/src/usexp/lexer.mli +++ b/src/usexp/lexer.mli @@ -1,7 +1,3 @@ -module Atom : sig - type t = A of string [@@unboxed] -end - module Token : sig type t = | Atom of Atom.t diff --git a/src/usexp/lexer.mll b/src/usexp/lexer.mll index 69912d31..2c9098b0 100644 --- a/src/usexp/lexer.mll +++ b/src/usexp/lexer.mll @@ -1,8 +1,4 @@ { -module Atom = struct - type t = A of string [@@unboxed] -end - module Token = struct type t = | Atom of Atom.t @@ -81,7 +77,7 @@ let hexdigit = ['0'-'9' 'a'-'f' 'A'-'F'] let atom_char_jbuild = [^ ';' '(' ')' '"' ' ' '\t' '\r' '\n' '\012'] let atom_char_dune = - [^ ';' '(' ')' '"' '\000'-'\032' '\127'-'\255'] + [^ '%' ';' '(' ')' '"' '\000'-'\032' '\127'-'\255'] (* rule for jbuild files *) rule jbuild_token = parse @@ -128,7 +124,7 @@ and jbuild_atom acc start = parse error lexbuf "Internal error in the S-expression parser, \ please report upstream."; lexbuf.lex_start_p <- start; - Token.Atom (A acc) + Token.Atom (Atom.of_string acc) } and quoted_string mode = parse @@ -248,7 +244,8 @@ and token = parse Quoted_string s } | atom_char_dune+ as s - { Token.Atom (A s) } + { Token.Atom (Atom.of_string s) } + | _ as c { error lexbuf (Printf.sprintf "Invalid atom character '%c'" c) } | eof { Eof } diff --git a/src/usexp/usexp.ml b/src/usexp/usexp.ml index 48a3dd32..e7d4a1e1 100644 --- a/src/usexp/usexp.ml +++ b/src/usexp/usexp.ml @@ -10,34 +10,7 @@ module Bytes = struct UnlabeledBytes.blit_string src src_pos dst dst_pos len end -module Atom = struct - type t = Lexer.Atom.t = A of string [@@unboxed] - - let is_valid = - let rec loop s i len = - i = len || - match String.unsafe_get s i with - | '"' | '(' | ')' | ';' | '\000'..'\032' | '\127'..'\255' -> false - | _ -> loop s (i + 1) len - in - fun s -> - let len = String.length s in - len > 0 && loop s 0 len - - (* XXX eventually we want to report a nice error message to the user - at the point the conversion is made. *) - let of_string s = - if is_valid s then A s - else invalid_arg(Printf.sprintf "Usexp.Atom.of_string: %S" s) - - let of_int i = A (string_of_int i) - let of_float x = A (string_of_float x) - let of_bool x = A (string_of_bool x) - let of_int64 i = A (Int64.to_string i) - let of_digest d = A (Digest.to_hex d) - - let to_string (A s) = s -end +module Atom = Atom type t = | Atom of Atom.t @@ -46,15 +19,15 @@ type t = type sexp = t -let atom s = - if Atom.is_valid s then Atom (A s) - else invalid_arg "Usexp.atom" +let atom s = Atom (Atom.of_string s) -let unsafe_atom_of_string s = Atom(A s) +let unsafe_atom_of_string s = atom s let atom_or_quoted_string s = - if Atom.is_valid s then Atom (A s) - else Quoted_string s + if Atom.is_valid_dune s then + Atom (Atom.of_string s) + else + Quoted_string s let quote_length s = let n = ref 0 in @@ -117,13 +90,13 @@ let quoted s = Bytes.unsafe_to_string s' let rec to_string = function - | Atom (A s) -> s + | Atom a -> Atom.print a Atom.Dune | Quoted_string s -> quoted s | List l -> Printf.sprintf "(%s)" (List.map l ~f:to_string |> String.concat ~sep:" ") let rec pp ppf = function - | Atom (A s) -> - Format.pp_print_string ppf s + | Atom s -> + Format.pp_print_string ppf (Atom.print s Atom.Dune) | Quoted_string s -> Format.pp_print_string ppf (quoted s) | List [] -> @@ -164,7 +137,7 @@ let pp_print_quoted_string ppf s = Format.pp_print_string ppf (quoted s) let rec pp_split_strings ppf = function - | Atom (A s) -> Format.pp_print_string ppf s + | Atom s -> Format.pp_print_string ppf (Atom.print s Atom.Dune) | Quoted_string s -> pp_print_quoted_string ppf s | List [] -> Format.pp_print_string ppf "()" @@ -249,8 +222,10 @@ module Ast = struct | List of Loc.t * t list let atom_or_quoted_string loc s = - if Atom.is_valid s then Atom (loc, A s) - else Quoted_string (loc, s) + match atom_or_quoted_string s with + | Atom a -> Atom (loc, a) + | Quoted_string s -> Quoted_string (loc, s) + | List _ -> assert false let loc (Atom (loc, _) | Quoted_string (loc, _) | List (loc, _)) = loc diff --git a/src/usexp/usexp.mli b/src/usexp/usexp.mli index 6ade200b..79792ced 100644 --- a/src/usexp/usexp.mli +++ b/src/usexp/usexp.mli @@ -4,23 +4,19 @@ module Atom : sig type t = private A of string [@@unboxed] - (** Acceptable atoms are composed of chars in the range ['!' .. '~'] excluding - [' ' '"' '(' ')' ';' '\\'], and must be nonempty. *) - val is_valid : string -> bool - (** [is_valid s] checks that [s] respects the constraints to be an atom. *) + type syntax = Jbuild | Dune - val of_string : string -> t - (** Convert a string to an atom. If the string contains invalid - characters, raise [Invalid_argument]. *) + val is_valid : t -> syntax -> bool + + val of_string : string -> t + val to_string : t -> string val of_int : int -> t val of_float : float -> t val of_bool : bool -> t val of_int64 : Int64.t -> t val of_digest : Digest.t -> t - - val to_string : t -> string end module Loc : sig diff --git a/test/unit-tests/sexp.mlt b/test/unit-tests/sexp.mlt index 67e516e1..79db5495 100644 --- a/test/unit-tests/sexp.mlt +++ b/test/unit-tests/sexp.mlt @@ -2,9 +2,21 @@ open Stdune;; open Sexp.Of_sexp;; -let pp_sexp_ast ppf sexp = - Sexp.pp ppf (Sexp.Ast.remove_locs sexp) +let pp_sexp_ast = + let rec subst_atoms ~f (s : Sexp.t) = + match s with + | Atom a -> f a + | Quoted_string _ -> s + | List xs -> List (List.map ~f:(subst_atoms ~f) xs) + in + fun ppf sexp -> + sexp + |> Sexp.Ast.remove_locs + |> subst_atoms ~f:(fun (A s) -> + List [(Sexp.atom "atom"); Sexp.atom_or_quoted_string s]) + |> Sexp.pp ppf ;; + #install_printer pp_sexp_ast;; [%%expect{| val pp_sexp_ast : Format.formatter -> Stdune.Sexp.Ast.t -> unit = @@ -20,7 +32,7 @@ let sexp = Sexp.parse_string ~fname:"" ~mode:Single {| (foo 2)) |} [%%expect{| -val sexp : Usexp.Ast.t = ((foo 1) (foo 2)) +val sexp : Usexp.Ast.t = (((atom foo) (atom 1)) ((atom foo) (atom 2))) |}] let of_sexp = record (field "foo" int) @@ -52,8 +64,9 @@ let parse s = let f ~lexer = try Ok (Sexp.parse_string ~fname:"" ~mode:Many ~lexer s) - with Sexp.Parse_error e -> - Error (Sexp.Parse_error.message e) + with + | Sexp.Parse_error e -> Error (Sexp.Parse_error.message e) + | Invalid_argument e -> Error e in let jbuild = f ~lexer:Sexp.Lexer.jbuild_token in let dune = f ~lexer:Sexp.Lexer.token in @@ -74,26 +87,34 @@ val parse : string -> parse_result = parse {| # ## x##y x||y a#b|c#d copy# |} [%%expect{| -- : parse_result = Same (Ok [#; ##; x##y; x||y; a#b|c#d; copy#]) +- : parse_result = +Same + (Ok + [(atom #); (atom ##); (atom x##y); (atom x||y); (atom a#b|c#d); + (atom copy#)]) |}] parse {|x #| comment |# y|} [%%expect{| - : parse_result = -Different {jbuild = Ok [x; y]; dune = Ok [x; #|; comment; |#; y]} +Different + {jbuild = Ok [(atom x); (atom y)]; + dune = Ok [(atom x); (atom #|); (atom comment); (atom |#); (atom y)]} |}] parse {|x#|y|} [%%expect{| - : parse_result = -Different {jbuild = Error "jbuild_atoms cannot contain #|"; dune = Ok [x#|y]} +Different + {jbuild = Error "jbuild_atoms cannot contain #|"; dune = Ok [(atom x#|y)]} |}] parse {|x|#y|} [%%expect{| - : parse_result = -Different {jbuild = Error "jbuild_atoms cannot contain |#"; dune = Ok [x|#y]} +Different + {jbuild = Error "jbuild_atoms cannot contain |#"; dune = Ok [(atom x|#y)]} |}] parse {|"\a"|} @@ -138,27 +159,38 @@ parse {|"$bar%foo%"|} - : parse_result = Same (Ok ["$bar%foo%"]) |}] -parse {|\%{foo}|} -[%%expect{| -- : parse_result = Same (Ok [\%{foo}]) -|}] - parse {|\${foo}|} [%%expect{| -- : parse_result = Same (Ok [\${foo}]) +- : parse_result = Same (Ok [(atom \${foo})]) +|}] + +parse {|\%{foo}|} +[%%expect{| +- : parse_result = +Different + {jbuild = Ok [(atom "\\%{foo}")]; dune = Error "Invalid atom character '%'"} |}] parse {|\$bar%foo%|} [%%expect{| -- : parse_result = Same (Ok [\$bar%foo%]) +- : parse_result = +Different + {jbuild = Ok [(atom "\\$bar%foo%")]; + dune = Error "Invalid atom character '%'"} |}] parse {|\$bar\%foo%|} [%%expect{| -- : parse_result = Same (Ok [\$bar\%foo%]) +- : parse_result = +Different + {jbuild = Ok [(atom "\\$bar\\%foo%")]; + dune = Error "Invalid atom character '%'"} |}] parse {|\$bar\%foo%{bar}|} [%%expect{| -- : parse_result = Same (Ok [\$bar\%foo%{bar}]) +- : parse_result = +Different + {jbuild = Ok [(atom "\\$bar\\%foo%{bar}")]; + dune = Error "Invalid atom character '%'"} |}] diff --git a/test/unit-tests/sexp_tests.ml b/test/unit-tests/sexp_tests.ml index 4fd2bc3a..adec99fe 100644 --- a/test/unit-tests/sexp_tests.ml +++ b/test/unit-tests/sexp_tests.ml @@ -4,42 +4,58 @@ let () = Printexc.record_backtrace true (* Test that all strings of length <= 3 such that [Usexp.Atom.is_valid s] are recignized as atoms by the parser *) + +let string_of_syntax (x : Usexp.Atom.syntax) = + match x with + | Dune -> "dune" + | Jbuild -> "jbuild" + let () = - for len = 0 to 3 do - let s = Bytes.create len in - for i = 0 to 1 lsl (len * 8) - 1 do - if len > 0 then Bytes.set s 0 (Char.chr ( i land 0xff)); - if len > 1 then Bytes.set s 1 (Char.chr ((i lsr 4) land 0xff)); - if len > 2 then Bytes.set s 2 (Char.chr ((i lsr 8) land 0xff)); - let s = Bytes.unsafe_to_string s in - let parser_recognizes_as_atom = - match Usexp.parse_string ~fname:"" ~mode:Single s with - | exception _ -> false - | Atom (_, A s') -> s = s' - | _ -> false - in - let printed_as_atom = - match Usexp.atom_or_quoted_string s with - | Atom _ -> true - | _ -> false - in - if Usexp.Atom.is_valid s <> parser_recognizes_as_atom then begin - Printf.eprintf - "Usexp.Atom.is_valid error:\n\ - - s = %S\n\ - - Usexp.Atom.is_valid s = %B\n\ - - parser_recognizes_as_atom = %B\n" - s (Usexp.Atom.is_valid s) parser_recognizes_as_atom; - exit 1 - end; - if printed_as_atom && not parser_recognizes_as_atom then begin - Printf.eprintf - "Usexp.Atom.atom_or_quoted_string error:\n\ - - s = %S\n\ - - printed_as_atom = %B\n\ - - parser_recognizes_as_atom = %B\n" - s printed_as_atom parser_recognizes_as_atom; - exit 1 - end + [ Usexp.Atom.Dune, Usexp.Lexer.token, (fun s -> Usexp.Atom.is_valid s Dune) + ; Jbuild, Usexp.Lexer.jbuild_token, (fun s -> Usexp.Atom.is_valid s Jbuild) + ] + |> List.iter ~f:(fun (syntax, lexer, validator) -> + for len = 0 to 3 do + let s = Bytes.create len in + for i = 0 to 1 lsl (len * 8) - 1 do + if len > 0 then Bytes.set s 0 (Char.chr ( i land 0xff)); + if len > 1 then Bytes.set s 1 (Char.chr ((i lsr 4) land 0xff)); + if len > 2 then Bytes.set s 2 (Char.chr ((i lsr 8) land 0xff)); + let s = Bytes.unsafe_to_string s in + let parser_recognizes_as_atom = + match Usexp.parse_string ~lexer ~fname:"" ~mode:Single s with + | exception _ -> false + | Atom (_, A s') -> s = s' + | _ -> false + in + let printed_as_atom = + match Usexp.atom_or_quoted_string s with + | Atom _ -> true + | _ -> false + in + let valid_dune_atom = validator (Usexp.Atom.of_string s) in + if valid_dune_atom <> parser_recognizes_as_atom then begin + Printf.eprintf + "Usexp.Atom.is_valid error:\n\ + - syntax = %s\n\ + - s = %S\n\ + - Usexp.Atom.is_valid s = %B\n\ + - parser_recognizes_as_atom = %B\n" + (string_of_syntax syntax) s valid_dune_atom + parser_recognizes_as_atom; + exit 1 + end; + if printed_as_atom && not parser_recognizes_as_atom then begin + Printf.eprintf + "Usexp.Atom.atom_or_quoted_string error:\n\ + - syntax = %s\n\ + - s = %S\n\ + - printed_as_atom = %B\n\ + - parser_recognizes_as_atom = %B\n" + (string_of_syntax syntax) s printed_as_atom + parser_recognizes_as_atom; + exit 1 + end + done done - done + )