Simplify the parser

Replace the current generated parser by an ocamllex lexer + a simple
parser.

The new code is:
- much simpler and smaller
- shouldn't cause ocamlopt to stack overflow anymore on BSD systems
- slightly slower but not that much

Signed-off-by: Jeremie Dimino <jdimino@janestreet.com>
This commit is contained in:
Jeremie Dimino 2018-05-30 16:16:22 +01:00 committed by Jérémie Dimino
parent 115ee93dd6
commit 39e74826f4
22 changed files with 410 additions and 2556 deletions

View File

@ -54,9 +54,6 @@ livedoc:
update-jbuilds: $(BIN)
$(BIN) build --dev @doc/runtest --auto-promote
update-sexp-parser:
$(BIN) build --dev @update-sexp-parser --auto-promote
.PHONY: default install uninstall reinstall clean test doc
.PHONY: promote accept-corrections opam-release

View File

@ -12,7 +12,7 @@
ocaml_config))
(synopsis "Internal Dune library, do not use!")))
(ocamllex (meta_lexer glob_lexer))
(ocamllex (meta_lexer glob_lexer dune_lexer))
(rule
((targets (setup.ml))

3
src/dune_lexer.mll Normal file
View File

@ -0,0 +1,3 @@
rule is_script = parse
| "(* -*- tuareg -*- *)" { true }
| "" { false }

View File

@ -16,9 +16,6 @@ module Dune_file = struct
| Plain x -> x.path
| Ocaml_script p -> p
let ocaml_script_prefix = "(* -*- tuareg -*- *)"
let ocaml_script_prefix_len = String.length ocaml_script_prefix
let extract_ignored_subdirs =
let stanza =
let open Sexp.Of_sexp in
@ -50,39 +47,14 @@ module Dune_file = struct
in
(ignored_subdirs, sexps)
let load file =
Io.with_file_in file ~f:(fun ic ->
let open Sexp in
let state = Parser.create ~fname:(Path.to_string file) ~mode:Many in
let buf = Bytes.create Io.buf_len in
let rec loop stack =
match input ic buf 0 Io.buf_len with
| 0 -> stack
| n -> loop (Parser.feed_subbytes state buf ~pos:0 ~len:n stack)
in
let finish stack =
let sexps = Parser.feed_eoi state stack in
let load ?lexer file =
Io.with_lexbuf_from_file file ~f:(fun lb ->
if Dune_lexer.is_script lb then
(Ocaml_script file, String.Set.empty)
else
let sexps = Usexp.Parser.parse lb ?lexer ~mode:Many in
let ignored_subdirs, sexps = extract_ignored_subdirs sexps in
(Plain { path = file; sexps },
ignored_subdirs)
in
let rec loop0 stack i =
match input ic buf i (Io.buf_len - i) with
| 0 ->
finish (Parser.feed_subbytes state buf ~pos:0 ~len:i stack)
| n ->
let i = i + n in
if i < ocaml_script_prefix_len then
loop0 stack i
else if Bytes.sub_string buf 0 ocaml_script_prefix_len
[@warning "-6"]
= ocaml_script_prefix then
(Ocaml_script file, String.Set.empty)
else
let stack = Parser.feed_subbytes state buf ~pos:0 ~len:i stack in
finish (loop stack)
in
loop0 Parser.Stack.empty 0)
(Plain { path = file; sexps }, ignored_subdirs))
end
let load_jbuild_ignore path =

View File

@ -46,11 +46,17 @@ let report_with_backtrace exn =
hint on candidates)
in
{ p with loc = Some loc; pp }
| Usexp.Parser.Error e ->
let pos = Usexp.Parser.Error.position e in
let msg = Usexp.Parser.Error.message e in
let pos = { pos with pos_fname = !map_fname pos.pos_fname } in
let loc = { Loc. start = pos; stop = pos } in
| Sexp.Parse_error e ->
let loc = Sexp.Parse_error.loc e in
let msg = Sexp.Parse_error.message e in
let map_pos (pos : Lexing.position) =
{ pos with pos_fname = !map_fname pos.pos_fname }
in
let loc : Loc.t =
{ start = map_pos loc.start
; stop = map_pos loc.stop
}
in
{ p with
loc = Some loc
; pp = fun ppf -> Format.fprintf ppf "@{<error>Error@}: %s\n" msg

View File

@ -84,19 +84,11 @@ let buf_len = 65_536
module Sexp = struct
open Sexp
let load path ~mode =
with_file_in path ~f:(fun ic ->
let state = Parser.create ~fname:(Path.to_string path) ~mode in
let buf = Bytes.create buf_len in
let rec loop stack =
match input ic buf 0 buf_len with
| 0 -> Parser.feed_eoi state stack
| n -> loop (Parser.feed_subbytes state buf ~pos:0 ~len:n stack)
in
loop Parser.Stack.empty)
let load ?lexer path ~mode =
with_lexbuf_from_file path ~f:(Usexp.Parser.parse ~mode ?lexer)
let load_many_as_one path =
match load path ~mode:Many with
let load_many_as_one ?lexer path =
match load ?lexer path ~mode:Many with
| [] -> Ast.List (Loc.in_file (Path.to_string path), [])
| x :: l ->
let last = Option.value (List.last l) ~default:x in

View File

@ -27,8 +27,8 @@ val copy_file : src:Path.t -> dst:Path.t -> unit
val read_all : in_channel -> string
module Sexp : sig
val load : Path.t -> mode:'a Sexp.Parser.Mode.t -> 'a
val load_many_as_one : Path.t -> Sexp.Ast.t
val load : ?lexer:Usexp.Lexer.t -> Path.t -> mode:'a Sexp.Parser.Mode.t -> 'a
val load_many_as_one : ?lexer:Usexp.Lexer.t -> Path.t -> Sexp.Ast.t
end
(**/**)

View File

@ -5,9 +5,4 @@
(synopsis "[Internal] S-expression library")
(public_name dune.usexp)))
(rule
(with-stdout-to table.ml.gen (run gen/gen_parser_automaton.exe)))
(alias
((name update-sexp-parser)
(action (diff table.ml table.ml.gen))))
(ocamllex (lexer))

View File

@ -1,4 +0,0 @@
(executable
((name gen_parser_automaton)
(libraries (base stdio gen_parsexp_lib))
(preprocess (pps (ppx_sexp_conv ppx_compare)))))

View File

@ -1,112 +0,0 @@
(* Parsing of S-expression. The parsing is written as an automaton for which
we provide different implementations of actions.
*)
open Base
open Stdio
open Gen_parsexp_lib.Automaton
open Gen_parsexp_lib.Automaton.Table
(* Sharing of transitions *)
module Sharing = struct
let create_assign_id () =
let cache = Hashtbl.Poly.create () in
(cache,
fun x ->
if not (Hashtbl.mem cache x) then
Hashtbl.add_exn cache ~key:x ~data:(Hashtbl.length cache))
let share (table : t) =
let transitions, assign_transition_id =
create_assign_id ()
in
let transitions_eoi, assign_transition_eoi_id =
create_assign_id ()
in
Array.iter table.transitions ~f:assign_transition_id;
Array.iter table.transitions_eoi ~f:assign_transition_eoi_id;
(transitions, transitions_eoi)
end
let gen_code oc (t : t) =
let (named_transitions, named_transitions_eoi) = Sharing.share t in
let pr fmt = Out_channel.fprintf oc (Caml.(^^) fmt "\n") in
pr "(* generated by %s *)" Caml.Sys.argv.(0);
pr "";
pr "open Parser_automaton_internal";
pr "";
pr "let raise = Parser_automaton_internal.Error.raise";
pr "";
let ordered_ids tbl =
Hashtbl.fold tbl ~init:[] ~f:(fun ~key:x ~data:id acc -> (id, x) :: acc)
|> List.sort ~cmp:(fun (id1, _) (id2, _) -> compare id1 id2)
in
List.iter (ordered_ids named_transitions)
~f:(fun (id, tr) ->
match tr with
| Error error ->
pr "let tr_%02d _state _char _stack =" id;
pr " raise _state ~at_eof:false %s" (Error.to_string error)
| Ok { action = (eps_actions, action); goto; advance } ->
let eps_actions =
List.filter_map ~f:Epsilon_action.to_runtime_function eps_actions
in
let action = Action.to_runtime_function action in
pr "let tr_%02d state %schar stack =" id
(if Option.is_none action &&
not ([%compare.equal: goto_state] goto End_block_comment) then
"_"
else
"");
List.iter eps_actions ~f:(pr " let stack = %s state stack in");
(match action with
| None -> ()
| Some s -> pr " let stack = %s state char stack in" s);
(match goto with
| State n -> pr " set_automaton_state state %d;" n
| End_block_comment ->
pr " let stack = end_block_comment state char stack in";
pr " set_automaton_state state \
(if block_comment_depth state <> 0 then %d else %d);"
(State.to_int (Block_comment Normal)) (State.to_int Whitespace));
pr " %s state;"
(match advance with
| Advance -> "advance"
| Advance_eol -> "advance_eol");
pr " stack"
);
pr "";
List.iter (ordered_ids named_transitions_eoi) ~f:(fun (id, tr) ->
match tr with
| Error error ->
pr "let tr_eoi_%02d state _stack =" id;
pr " raise state ~at_eof:true %s" (Error.to_string error)
| Ok eps_actions ->
pr "let tr_eoi_%02d state stack =" id;
let eps_actions =
List.filter_map eps_actions ~f:Epsilon_action.to_runtime_function
in
List.iter eps_actions ~f:(pr " let stack = %s state stack in");
pr " eps_eoi_check state stack");
pr "";
let pr_table ?(per_line=1) suffix tbl ids =
pr "let transitions%s =" suffix;
let len = Array.length tbl in
let lines = len / per_line in
assert (per_line * lines = len);
for l = 0 to lines - 1 do
Out_channel.fprintf oc (if l = 0 then " [|" else " ;");
for col = 0 to per_line - 1 do
if col > 0 then Out_channel.fprintf oc ";";
let i = l * per_line + col in
Out_channel.fprintf oc " tr%s_%02d" suffix (Hashtbl.find_exn ids tbl.(i))
done;
Out_channel.fprintf oc "\n"
done;
pr " |]";
pr "";
in
pr_table "" t.transitions named_transitions ~per_line:8;
pr_table "_eoi" t.transitions_eoi named_transitions_eoi
let () = gen_code Caml.stdout table

View File

@ -1,383 +0,0 @@
(* Abstract version of the parsing automaton.
It is used in two places:
- to define the automaton. At runtime, we only use integer for states and a table of
functions for transitions
- for tests
*)
open Base
module State = struct
module Quoted_string = struct
type t =
| Normal
| After_backslash
| After_backslash_cr
| After_backslash_digit
| After_backslash_2digits
| After_backslash_x
| After_backslash_x_hex
| Ignoring_blanks
[@@deriving enumerate, compare, sexp_of]
end
module Block_comment = struct
type t =
| Normal
| After_pipe
| After_hash
| Quoted_string of Quoted_string.t
[@@deriving enumerate, compare, sexp_of]
end
module Unquoted_string = struct
type t =
| Normal
| After_hash
| After_pipe
[@@deriving enumerate, compare, sexp_of]
end
type t =
| Whitespace
| Error
| After_cr
| Unquoted_string of Unquoted_string.t
| Line_comment
| After_hash
| Quoted_string of Quoted_string.t
| Block_comment of Block_comment.t
[@@deriving enumerate, compare, sexp_of]
let to_int t =
let rec loop i t l =
match l with
| [] -> assert false
| x :: l -> if [%compare.equal: t] t x then i else loop (i + 1) t l
in
loop 0 t all
let of_int i = List.nth_exn all i
let count = List.length all
let initial = to_int Whitespace
let () = assert (initial = 0) (* This is assumed in parser_automaton_internal.ml *)
let () = assert (to_int Error = 1) (* This is assumed in parser_automaton_internal.ml *)
let old_parser_approx_cont_state = function
| Whitespace -> "Parsing_toplevel_whitespace"
| After_cr -> "Parsing_nested_whitespace"
| Unquoted_string _
| Quoted_string _ -> "Parsing_atom"
| After_hash -> "Parsing_atom"
| Block_comment _ -> "Parsing_block_comment"
| Line_comment -> "Parsing_toplevel_whitespace"
(* This cannot happen with the old parser so the result is a dummy value *)
| Error -> "Parsing_toplevel_whitespace"
end
module Error = struct
(* Subset of the [Parser_automaton_internal.Public.Error.Reason.t] type *)
type t =
| Unexpected_char_parsing_hex_escape
| Unexpected_char_parsing_dec_escape
| Unterminated_quoted_string
| Unterminated_block_comment
| Comment_token_in_unquoted_atom
| Unexpected_character_after_cr
| Automaton_in_error_state
[@@deriving compare, sexp_of, hash, variants]
let to_string = Variants.to_name
end
(* Action associated to transitions. Actions correspond to the similarly named functions
in ../parser_automaton_internal.mli. *)
module Action = struct
type t =
| Nop
| Opening
| Closing
| Add_atom_char
| Add_quoted_atom_char
| Add_first_char
| Add_escaped
| Add_hex_escape_char
| Add_dec_escape_char
| Add_last_hex_escape_char
| Add_last_dec_escape_char
| Add_token_char
| Comment_add_last_dec_escape_char
| Push_quoted_atom
| Start_quoted_string
| Start_block_comment
| Start_sexp_comment
| Start_line_comment
[@@deriving compare, sexp_of, hash, variants]
let to_runtime_function = function
| Nop -> None
| t -> Some (String.uncapitalize (Variants.to_name t))
end
(* Action associated to epsilon transitions, i.e. transitions that do not consume a
character.
Having epsilon actions makes the definition of the automaton much simpler. *)
module Epsilon_action = struct
type t =
| Nop
| Push_atom
| Add_first_char_hash
| Add_escaped_cr
| End_line_comment
[@@deriving compare, sexp_of, hash, variants]
let to_runtime_function = function
| Nop -> None
| End_line_comment -> Some "end_line_comment"
| t -> Some ("eps_" ^ String.uncapitalize (Variants.to_name t))
end
module Transition = struct
type t =
| T of Action.t * State.t
| E of Epsilon_action.t * State.t
| Error of Error.t
| End_block_comment (* can't be a normal transition, as the new state isn't known
statically *)
[@@deriving compare]
end
module Final_transition = struct
type t =
| Eoi_check
| E of Epsilon_action.t * State.t
| Error of Error.t
end
module type Automaton = sig
val transition : State.t * char -> Transition.t
val transition_eoi : State.t -> Final_transition.t
end
(* Definition of the automaton, compiled later to a transition table. *)
module Automaton : Automaton = struct
module Quoted_string_transition = struct
type t =
| T of Action.t * State.Quoted_string.t
| E of Epsilon_action.t * State.Quoted_string.t
| Error of Error.t
| End_of_quoted_string
end
type context = In_block_comment | In_atom
let quoted_string_transition
: context -> State.Quoted_string.t * char -> Quoted_string_transition.t
= fun context x ->
(* Distinguising atom and block comments is to optimize block comments. But
we musn't optimize the exception on things like \321. *)
let if_atom then_ else_ : Action.t =
match context with
| In_atom -> then_
| In_block_comment -> else_
in
let if_atom_eps then_ else_ : Epsilon_action.t =
match context with
| In_atom -> then_
| In_block_comment -> else_
in
match x with
| Normal, '"' -> End_of_quoted_string
| Normal, '\\' -> T (Add_token_char, After_backslash)
| Normal, _ -> T (if_atom Add_quoted_atom_char Add_token_char, Normal)
| After_backslash, '\n' ->
T (Add_token_char, Ignoring_blanks)
| After_backslash, '\r' ->
T (Add_token_char, After_backslash_cr)
| After_backslash, 'x' ->
T (Add_token_char, After_backslash_x)
| After_backslash, '0'..'9' ->
T (Add_dec_escape_char, After_backslash_digit)
| After_backslash, _ ->
T (if_atom Add_escaped Add_token_char, Normal)
| After_backslash_cr, '\n' ->
T (Add_token_char, Ignoring_blanks)
| After_backslash_cr, _ ->
E (if_atom_eps Add_escaped_cr Nop, Normal)
| After_backslash_x, ('0'..'9' | 'a'..'f' | 'A'..'F') ->
T (if_atom Add_hex_escape_char Add_token_char , After_backslash_x_hex)
| After_backslash_x, _ ->
Error Unexpected_char_parsing_hex_escape
| After_backslash_x_hex, ('0'..'9' | 'a'..'f' | 'A'..'F') ->
T (if_atom Add_last_hex_escape_char Add_token_char, Normal)
| After_backslash_x_hex, _ ->
Error Unexpected_char_parsing_hex_escape
| After_backslash_digit, '0'..'9' ->
T (Add_dec_escape_char, After_backslash_2digits)
| After_backslash_digit, _ ->
Error Unexpected_char_parsing_dec_escape
| After_backslash_2digits, '0'..'9' ->
T (if_atom Add_last_dec_escape_char Comment_add_last_dec_escape_char, Normal)
| After_backslash_2digits, _ ->
Error Unexpected_char_parsing_dec_escape
| Ignoring_blanks, (' '|'\t') ->
T (Add_token_char, Ignoring_blanks)
| Ignoring_blanks, _ ->
E (Nop, Normal)
module Block_comment_transition = struct
type t =
| T of Action.t * State.Block_comment.t
| E of Epsilon_action.t * State.Block_comment.t
| Error of Error.t
| End_comment
end
let block_comment_transition
: State.Block_comment.t * char -> Block_comment_transition.t
= function
| Quoted_string state, c -> (
match quoted_string_transition In_block_comment (state, c) with
| End_of_quoted_string -> T (Add_token_char, Normal)
| T (action, state) -> T (action, Quoted_string state)
| E (action, state) -> E (action, Quoted_string state)
| Error error -> Error error
)
| After_hash, '|' -> T (Start_block_comment, Normal)
| After_pipe, '#' -> End_comment
| _, '"' -> T (Add_token_char, Quoted_string Normal)
| _, '|' -> T (Add_token_char, After_pipe)
| _, '#' -> T (Add_token_char, After_hash)
| _, _ -> T (Add_token_char, Normal)
let transition : State.t * char -> Transition.t = function
| Whitespace, '(' -> T (Opening, Whitespace)
| Whitespace, ')' -> T (Closing, Whitespace)
| Whitespace, '\r' -> T (Nop, After_cr)
| Whitespace, (' ' | '\t' | '\012' | '\n') -> T (Nop, Whitespace)
| Whitespace, ';' -> T (Start_line_comment, Line_comment)
| Whitespace, '"' -> T (Start_quoted_string, Quoted_string (Normal))
| Whitespace, '#' -> T (Nop, After_hash)
| Whitespace, '|' -> T (Add_first_char, Unquoted_string After_pipe)
| Whitespace, _ -> T (Add_first_char, Unquoted_string Normal)
| After_cr, '\n' -> T (Nop, Whitespace)
| After_cr, _ -> Error Unexpected_character_after_cr
| Unquoted_string _, (';'|'('|')'|'"'|' '|'\t'|'\012'|'\r'|'\n') ->
E (Push_atom, Whitespace)
| Unquoted_string After_hash, '|'
| Unquoted_string After_pipe, '#' -> Error Comment_token_in_unquoted_atom
| Unquoted_string _, '#' -> T (Add_atom_char, Unquoted_string After_hash)
| Unquoted_string _, '|' -> T (Add_atom_char, Unquoted_string After_pipe)
| Unquoted_string _, _ -> T (Add_atom_char, Unquoted_string Normal)
| Line_comment, ('\r' | '\n') -> E (End_line_comment, Whitespace)
| Line_comment, _ -> T (Add_token_char, Line_comment)
| After_hash, ';' -> T (Start_sexp_comment, Whitespace)
| After_hash, '|' -> T (Start_block_comment, Block_comment Normal)
| After_hash, _ -> E (Add_first_char_hash, Unquoted_string Normal)
| Quoted_string state, c -> (
match quoted_string_transition In_atom (state, c) with
| End_of_quoted_string -> T (Push_quoted_atom, Whitespace)
| T (action, state) -> T (action, Quoted_string state)
| E (action, state) -> E (action, Quoted_string state)
| Error error -> Error error
)
| Block_comment state, c -> (
match block_comment_transition (state, c) with
| T (action, state) -> T (action, Block_comment state)
| E (action, state) -> E (action, Block_comment state)
| End_comment -> End_block_comment
| Error error -> Error error
)
| Error, _ -> Error Automaton_in_error_state
let transition_eoi : State.t -> Final_transition.t = function
| Whitespace -> Eoi_check
| After_cr -> Error Unexpected_character_after_cr
| Unquoted_string _ -> E (Push_atom, Whitespace)
| Line_comment -> E (End_line_comment, Whitespace)
| After_hash -> E (Add_first_char_hash, Unquoted_string Normal)
| Quoted_string _ -> Error Unterminated_quoted_string
| Block_comment _ -> Error Unterminated_block_comment
| Error -> Error Automaton_in_error_state
end
module Table = struct
type action = Epsilon_action.t list * Action.t [@@deriving compare, sexp_of, hash]
type goto_state = State of int | End_block_comment [@@deriving compare, sexp_of, hash]
type advance = Advance | Advance_eol [@@deriving compare, sexp_of, hash]
type transition = { action : action; goto : goto_state; advance : advance }
[@@deriving compare, sexp_of, hash]
type 'a or_error =
| Ok of 'a
| Error of Error.t
[@@deriving compare, sexp_of, hash]
type t =
{ transitions : transition or_error array
; transitions_eoi : Epsilon_action.t list or_error array
}
let advance = function
| '\n' -> Advance_eol
| _ -> Advance
let compile (module A : Automaton) =
let rec squash acc state c =
match A.transition (state, c) with
| T (action, state) -> Ok { action = (List.rev acc, action)
; goto = State (State.to_int state)
; advance = advance c
}
| E (action, state) -> squash (action :: acc) state c
| Error error -> Error error
| End_block_comment -> Ok { action = (List.rev acc, Nop)
; goto = End_block_comment
; advance = advance c
}
in
let rec squash_eoi acc state =
match A.transition_eoi state with
| Eoi_check -> Ok (List.rev acc)
| E (eps_action, state) -> squash_eoi (eps_action :: acc) state
| Error error -> Error error
in
let transitions =
Array.create ~len:(State.count * 256)
(Ok { action = ([], Action.Nop)
; goto = State 0
; advance = Advance
})
in
let transitions_eoi = Array.create (Ok []) ~len:State.count in
for s = 0 to State.count - 1 do
let state = State.of_int s in
for c = 0 to 255 do
transitions.(s * 256 + c) <- squash [] state (Char.of_int_exn c);
done;
transitions_eoi.(s) <- squash_eoi [] state
done;
{ transitions; transitions_eoi }
end
let table = Table.compile (module Automaton)

View File

@ -1,10 +0,0 @@
(library
((name gen_parsexp_lib)
(libraries (base))
(preprocess (pps (ppx_sexp_conv
ppx_compare
ppx_enumerate
ppx_hash
ppx_variants_conv)))))
(jbuild_version 1)

27
src/usexp/lexer.mli Normal file
View File

@ -0,0 +1,27 @@
module Atom : sig
type t = A of string [@@unboxed]
end
module Token : sig
type t =
| Atom of Atom.t
| Quoted_string of string
| Lparen
| Rparen
| Sexp_comment
| Eof
end
type t = Lexing.lexbuf -> Token.t
val token : t
module Error : sig
type t =
{ start : Lexing.position
; stop : Lexing.position
; message : string
}
end
exception Error of Error.t

208
src/usexp/lexer.mll Normal file
View File

@ -0,0 +1,208 @@
{
module Atom = struct
type t = A of string [@@unboxed]
end
module Token = struct
type t =
| Atom of Atom.t
| Quoted_string of string
| Lparen
| Rparen
| Sexp_comment
| Eof
end
type t = Lexing.lexbuf -> Token.t
module Error = struct
type t =
{ start : Lexing.position
; stop : Lexing.position
; message : string
}
end
exception Error of Error.t
let error ?(delta=0) lexbuf message =
let start = Lexing.lexeme_start_p lexbuf in
raise
(Error { start = { start with pos_cnum = start.pos_cnum + delta }
; stop = Lexing.lexeme_end_p lexbuf
; message
})
let eval_decimal_char c = Char.code c - Char.code '0'
let eval_decimal_escape c1 c2 c3 =
(eval_decimal_char c1) * 100 +
(eval_decimal_char c2) * 10 +
(eval_decimal_char c3)
let eval_hex_char c =
match c with
| '0'..'9' -> Char.code c - Char.code '0'
| 'a'..'f' -> Char.code c - Char.code 'a' + 10
| 'A'..'F' -> Char.code c - Char.code 'A' + 10
| _ -> -1
let eval_hex_escape c1 c2 =
(eval_hex_char c1) * 16 +
(eval_hex_char c2)
type escape_sequence =
| Newline
| Other
let escaped_buf = Buffer.create 256
}
let comment = ';' [^ '\n' '\r']*
let newline = '\r'? '\n'
let blank = [' ' '\t' '\012']
let atom_char = [^ ';' '(' ')' '"' ' ' '\t' '\r' '\n' '\012' '|' '#']
let digit = ['0'-'9']
let hexdigit = ['0'-'9' 'a'-'f' 'A'-'F']
rule token = parse
| newline
{ Lexing.new_line lexbuf; token lexbuf }
| blank+ | comment
{ token lexbuf }
| '('
{ Token.Lparen }
| ')'
{ Rparen }
| '"'
{ Buffer.clear escaped_buf;
let start = Lexing.lexeme_start_p lexbuf in
let s = quoted_string true lexbuf in
lexbuf.lex_start_p <- start;
Quoted_string s
}
| "#|"
{ block_comment lexbuf }
| "#;"
{ Sexp_comment }
| eof
{ Eof }
| ""
{ atom "" (Lexing.lexeme_start_p lexbuf) lexbuf }
and atom acc start = parse
| '#'+ '|'
{ lexbuf.lex_start_p <- start;
error lexbuf "atoms cannot contain #|"
}
| '|'+ '#'
{ lexbuf.lex_start_p <- start;
error lexbuf "atoms cannot contain |#"
}
| ('#'+ | '|'+ | atom_char+) as s
{ atom (if acc = "" then s else acc ^ s) start lexbuf
}
| ""
{ if acc = "" then
error lexbuf "Internal error in the S-expression parser, \
please report upstream.";
lexbuf.lex_start_p <- start;
Token.Atom (A acc)
}
(* If [strict] is false, ignore errors *)
and quoted_string strict = parse
| '"'
{ Buffer.contents escaped_buf }
| '\\'
{ match escape_sequence strict lexbuf with
| Newline -> quoted_string_after_escaped_newline strict lexbuf
| Other -> quoted_string strict lexbuf
}
| newline as s
{ Lexing.new_line lexbuf;
Buffer.add_string escaped_buf s;
quoted_string strict lexbuf
}
| _ as c
{ Buffer.add_char escaped_buf c;
quoted_string strict lexbuf
}
| eof
{ if strict then
error lexbuf "unterminated quoted string";
Buffer.contents escaped_buf
}
and quoted_string_after_escaped_newline strict = parse
| [' ' '\t']*
{ quoted_string strict lexbuf }
and escape_sequence strict = parse
| newline
{ Newline }
| ['\\' '\'' '"' 'n' 't' 'b' 'r'] as c
{ let c =
match c with
| 'n' -> '\n'
| 'r' -> '\r'
| 'b' -> '\b'
| 't' -> '\t'
| _ -> c
in
Buffer.add_char escaped_buf c;
Other
}
| (digit as c1) (digit as c2) (digit as c3)
{ let v = eval_decimal_escape c1 c2 c3 in
if strict && v > 255 then
error lexbuf "escape sequence in quoted string out of range"
~delta:(-1);
Buffer.add_char escaped_buf (Char.chr v);
Other
}
| digit* as s
{ if strict then
error lexbuf "unterminated decimal escape sequence" ~delta:(-1);
Buffer.add_char escaped_buf '\\';
Buffer.add_string escaped_buf s;
Other
}
| 'x' (hexdigit as c1) (hexdigit as c2)
{ let v = eval_hex_escape c1 c2 in
Buffer.add_char escaped_buf (Char.chr v);
Other
}
| 'x' hexdigit* as s
{ if strict then
error lexbuf "unterminated hexadecimal escape sequence" ~delta:(-1);
Buffer.add_char escaped_buf '\\';
Buffer.add_string escaped_buf s;
Other
}
| _ as c
{ Buffer.add_char escaped_buf '\\';
Buffer.add_char escaped_buf c;
Other
}
| eof
{ if strict then
error lexbuf "unterminated escape sequence" ~delta:(-1);
Other
}
and block_comment = parse
| '"'
{ Buffer.clear escaped_buf;
ignore (quoted_string false lexbuf : string);
block_comment lexbuf
}
| "|#"
{ token lexbuf
}
| eof
{ error lexbuf "unterminated block comment"
}
| _
{ block_comment lexbuf
}

View File

@ -1,377 +0,0 @@
type stack =
| Empty
| Open of Lexing.position * stack
| Sexp of Sexp_ast.t * stack
let empty_stack = Empty
type 'a mode =
| Single : Sexp_ast.t mode
| Many : Sexp_ast.t list mode
type 'a state =
{ mutable automaton_state : int
; mutable depth : int
; (* Number of opened #| when parsing a block comment *)
mutable block_comment_depth : int
; (* Number of full sexp to ignore. This correspond to the number of consecutive #;.
We don't have to track nested #; because they do not change the result. *)
mutable ignoring : int
; (* Only meaningful when [ignoring] > 0. Number of opened parentheses of the
outermost sexp comment. *)
mutable ignoring_depth : int
; (* When parsing an escape sequence of the form "\\NNN" or "\\XX", this accumulates
the computed number *)
mutable escaped_value : int
; (* Buffer for accumulating atoms *)
atom_buffer : Buffer.t
; fname : string
; mutable full_sexps : int
; mutable offset : int (* global offset *)
; mutable line_number : int
; mutable bol_offset : int (* offset of beginning of line *)
; (* Starting positions of the current token *)
mutable token_start_pos : Lexing.position
; mode : 'a mode
}
(* these magic numbers are checked in gen_parser_automaton.ml:
let () = assert (initial = 0)
let () = assert (to_int Error = 1) *)
let initial_state = 0
let error_state = 1
let new_state ~fname mode =
{ depth = 0
; automaton_state = initial_state
; block_comment_depth = 0
; ignoring = 0
; ignoring_depth = 0
; escaped_value = 0
; atom_buffer = Buffer.create 128
; mode = mode
; full_sexps = 0
; offset = 0
; line_number = 1
; bol_offset = 0
; fname = fname
; token_start_pos = { pos_fname = fname; pos_cnum = 0; pos_lnum = 1; pos_bol = 0 }
}
let mode t = t.mode
let offset t = t.offset
let line t = t.line_number
let column t = t.offset - t.bol_offset
let position t =
{ Lexing.
pos_fname = t.fname
; pos_cnum = t.offset
; pos_lnum = t.line_number
; pos_bol = t.bol_offset
}
let has_unclosed_paren state = state.depth > 0
let set_error_state state = state.automaton_state <- error_state
let sexp_of_stack : stack -> Sexp_ast.t = function
| Sexp (sexp, Empty) -> sexp
| _ -> failwith "Parser_automaton.sexp_of_stack"
let sexps_of_stack =
let rec loop acc : stack -> Sexp_ast.t list = function
| Empty -> acc
| Open _ -> failwith "Parser_automaton.sexps_of_stack"
| Sexp (sexp, stack) -> loop (sexp :: acc) stack
in
fun stack -> loop [] stack
let automaton_state state = state.automaton_state
module Error = struct
type t =
{ position : Lexing.position
; message : string
}
exception Parse_error of t
module Reason = struct
(* To be kept in sync with the Error module in gen/gen_parser_automaton.ml *)
type t =
| Unexpected_char_parsing_hex_escape
| Unexpected_char_parsing_dec_escape
| Unterminated_quoted_string
| Unterminated_block_comment
| Escape_sequence_out_of_range
| Unclosed_paren
| Too_many_sexps
| Closed_paren_without_opened
| Comment_token_in_unquoted_atom
| Sexp_comment_without_sexp
| Unexpected_character_after_cr
| No_sexp_found_in_input
| Automaton_in_error_state
end
let raise state ~at_eof (reason : Reason.t) =
set_error_state state;
let message =
(* These messages where choosen such that we can build the various Sexplib parsing
functions on top of Parsexp and keep the same exceptions.
At the time of writing this, a simple layer on top of parsexp to implement the
sexplib API is passing all the sexplib tests.
Note that parsexp matches the semantic of Sexp.parse which is slightly
different from the ocamllex/ocamlyacc based parser of Sexplib. The latter one
is less tested and assumed to be less used. *)
match reason with
| Unexpected_char_parsing_hex_escape ->
"unterminated hexadecimal escape sequence"
| Unexpected_char_parsing_dec_escape ->
"unterminated decimal escape sequence"
| Unterminated_quoted_string ->
"unterminated quoted string"
| Unterminated_block_comment ->
"unterminated block comment"
| Escape_sequence_out_of_range ->
"escape sequence in quoted string out of range"
| Unclosed_paren ->
"unclosed parentheses at end of input"
| Too_many_sexps ->
"s-expression followed by data"
| Closed_paren_without_opened ->
"unexpected character: ')'"
| Comment_token_in_unquoted_atom ->
if Buffer.contents state.atom_buffer = "|" then
"illegal end of comment"
else
"comment tokens in unquoted atom"
| Sexp_comment_without_sexp ->
"unterminated sexp comment"
| Unexpected_character_after_cr ->
if at_eof then
"unexpected end of input after carriage return"
else
"unexpected character after carriage return"
| No_sexp_found_in_input ->
"no s-expression found in input"
| Automaton_in_error_state ->
failwith "Parsexp.Parser_automaton: parser is dead"
in
let position = position state in
raise (Parse_error { position; message })
let position t = t.position
let message t = t.message
end
exception Parse_error = Error.Parse_error
type 'a action = 'a state -> char -> stack -> stack
type 'a epsilon_action = 'a state -> stack -> stack
let current_pos ?(delta=0) state : Lexing.position =
let offset = state.offset + delta in
{ pos_fname = state.fname
; pos_lnum = state.line_number
; pos_cnum = offset
; pos_bol = state.bol_offset
}
let set_automaton_state state x = state.automaton_state <- x
let advance state = state.offset <- state.offset + 1
let advance_eol state =
let newline_offset = state.offset in
state.offset <- newline_offset + 1;
state.bol_offset <- state.offset;
state.line_number <- state.line_number + 1
let block_comment_depth state = state.block_comment_depth
let add_token_char _state _char stack = stack
let add_atom_char state c stack =
Buffer.add_char state.atom_buffer c;
stack
let add_quoted_atom_char state c stack =
Buffer.add_char state.atom_buffer c;
add_token_char state c stack
let check_new_sexp_allowed : type a. a state -> unit = fun state ->
let is_single = match state.mode with Single -> true | _ -> false in
if is_single && state.full_sexps > 0 && state.ignoring = 0 then
Error.raise state ~at_eof:false Too_many_sexps
let add_first_char state char stack =
check_new_sexp_allowed state;
Buffer.add_char state.atom_buffer char;
state.token_start_pos <- current_pos state;
stack
let eps_add_first_char_hash state stack =
check_new_sexp_allowed state;
Buffer.add_char state.atom_buffer '#';
state.token_start_pos <- current_pos state ~delta:(-1);
stack
let start_quoted_string state _char stack =
check_new_sexp_allowed state;
state.token_start_pos <- current_pos state;
stack
let add_escaped state c stack =
let c' =
match c with
| 'n' -> '\n'
| 'r' -> '\r'
| 'b' -> '\b'
| 't' -> '\t'
| '\\' | '\'' | '"' -> c
| _ -> Buffer.add_char state.atom_buffer '\\'; c
in
Buffer.add_char state.atom_buffer c';
add_token_char state c stack
let eps_add_escaped_cr state stack =
Buffer.add_char state.atom_buffer '\r';
stack
let dec_val c = Char.code c - Char.code '0'
let hex_val c =
match c with
| '0'..'9' -> Char.code c - Char.code '0'
| 'a'..'f' -> Char.code c - Char.code 'a' + 10
| _ -> Char.code c - Char.code 'A' + 10
let add_dec_escape_char state c stack =
state.escaped_value <- state.escaped_value * 10 + dec_val c;
add_token_char state c stack
let add_last_dec_escape_char state c stack =
let value = state.escaped_value * 10 + dec_val c in
state.escaped_value <- 0;
if value > 255 then Error.raise state ~at_eof:false Escape_sequence_out_of_range;
Buffer.add_char state.atom_buffer (Char.unsafe_chr value);
add_token_char state c stack
let comment_add_last_dec_escape_char state c stack =
let value = state.escaped_value * 10 + dec_val c in
state.escaped_value <- 0;
if value > 255 then Error.raise state ~at_eof:false Escape_sequence_out_of_range;
add_token_char state c stack
let add_hex_escape_char state c stack =
state.escaped_value <- (state.escaped_value lsl 4) lor hex_val c;
add_token_char state c stack
let add_last_hex_escape_char state c stack =
let value = (state.escaped_value lsl 4) lor hex_val c in
state.escaped_value <- 0;
Buffer.add_char state.atom_buffer (Char.unsafe_chr value);
add_token_char state c stack
let opening state _char stack =
check_new_sexp_allowed state;
state.depth <- state.depth + 1;
if state.ignoring = 0 then
Open (current_pos state, stack)
else
stack
let sexp_or_comment_added ~is_comment state stack ~delta:_ =
if state.ignoring lor state.depth <> 0 then
stack
else begin
if not is_comment then state.full_sexps <- state.full_sexps + 1;
stack
end
let sexp_added state stack ~delta =
let stack = sexp_or_comment_added ~is_comment:false state stack ~delta in
if state.ignoring <> 0 && state.ignoring_depth = state.depth then begin
state.ignoring <- state.ignoring - 1;
stack
end else stack
let rec make_list stop acc : stack -> stack = function
| Empty -> assert false
| Open (start, stack) -> Sexp (List ({ start; stop }, acc), stack)
| Sexp (sexp, stack) -> make_list stop (sexp :: acc) stack
let closing state _char stack =
if state.depth > 0 then begin
let stack =
if state.ignoring = 0 then
make_list (current_pos state ~delta:1) [] stack
else
stack
in
if state.ignoring <> 0 && state.ignoring_depth = state.depth then
Error.raise state ~at_eof:false Sexp_comment_without_sexp;
state.depth <- state.depth - 1;
sexp_added state stack ~delta:1
end else
Error.raise state ~at_eof:false Closed_paren_without_opened
let make_loc ?(delta=0) state : Sexp_ast.Loc.t =
{ start = state.token_start_pos
; stop = current_pos state ~delta
}
let eps_push_atom state stack =
let str = Buffer.contents state.atom_buffer in
Buffer.clear state.atom_buffer;
let stack =
if state.ignoring = 0 then
Sexp (Atom (make_loc state, A str), stack)
else
stack
in
sexp_added state stack ~delta:0
let push_quoted_atom state _char stack =
let str = Buffer.contents state.atom_buffer in
Buffer.clear state.atom_buffer;
let stack =
if state.ignoring = 0 then
Sexp (Quoted_string (make_loc state ~delta:1, str), stack)
else
stack
in
sexp_added state stack ~delta:1
let start_sexp_comment state _char stack =
if state.ignoring = 0 || state.ignoring_depth = state.depth then begin
state.ignoring <- state.ignoring + 1;
state.ignoring_depth <- state.depth
end;
stack
let start_block_comment state _char stack =
state.block_comment_depth <- state.block_comment_depth + 1;
stack
let end_block_comment state _char stack =
state.block_comment_depth <- state.block_comment_depth - 1;
stack
let start_line_comment _state _char stack = stack
let end_line_comment _state stack = stack
let eps_eoi_check : type a. a state -> stack -> stack = fun state stack ->
if state.depth > 0 then Error.raise state ~at_eof:true Unclosed_paren;
if state.ignoring > 0 then Error.raise state ~at_eof:true Sexp_comment_without_sexp;
if state.full_sexps = 0 then (
match state.mode with
| Many -> ()
| Single ->
Error.raise state ~at_eof:true No_sexp_found_in_input
);
stack

View File

@ -1,132 +0,0 @@
(** Internal bits used by the generated automaton, not part of the public API *)
(** Internal state of the automaton *)
type 'a state
type 'a mode =
| Single : Sexp_ast.t mode
| Many : Sexp_ast.t list mode
type stack
val empty_stack : stack
val new_state
: fname:string
-> 'a mode
-> 'a state
val mode : 'a state -> 'a mode
(** Number of characters fed to the parser *)
val offset : _ state -> int
(** Position in the text *)
val line : _ state -> int
val column : _ state -> int
(** Whether there are some unclosed parentheses *)
val has_unclosed_paren : _ state -> bool
val set_error_state : _ state -> unit
val sexp_of_stack : stack -> Sexp_ast.t
val sexps_of_stack : stack -> Sexp_ast.t list
val automaton_state : _ state -> int
module Error : sig
type t
val position : t -> Lexing.position
val message : t -> string
module Reason : sig
type t =
| Unexpected_char_parsing_hex_escape
| Unexpected_char_parsing_dec_escape
| Unterminated_quoted_string
| Unterminated_block_comment
| Escape_sequence_out_of_range
| Unclosed_paren
| Too_many_sexps
| Closed_paren_without_opened
| Comment_token_in_unquoted_atom
| Sexp_comment_without_sexp
| Unexpected_character_after_cr
| No_sexp_found_in_input
| Automaton_in_error_state
end
val raise : _ state -> at_eof:bool -> Reason.t -> _
end
exception Parse_error of Error.t
val set_automaton_state : _ state -> int -> unit
(** Advance the position counters. [advance_eol] is for when we read a newline
character. *)
val advance : _ state -> unit
val advance_eol : _ state -> unit
(** Number of opened #| *)
val block_comment_depth : _ state -> int
type 'a action = 'a state -> char -> stack -> stack
type 'a epsilon_action = 'a state -> stack -> stack
(** Add a character to the atom buffer. [add_quoted_atom_char] does the same for quoted
atoms *)
val add_atom_char : _ action
val add_quoted_atom_char : _ action
(** Add a character that just follows a '\\' and the '\\' itself if necessary. *)
val add_escaped : _ action
(** [escaped_value <- escaped_value * 10 + (char - '0')]
These functions make the assumption that [char] is between '0' and '9'.
[add_dec_escape_char] also assumes the result doesn't overflow. The automaton
definition must make sure this is the case.
[add_last_dec_escape_char] also adds the resulting character to the atom buffer.
*)
val add_dec_escape_char : _ action
val add_last_dec_escape_char : _ action
(** Same but for quoted strings inside comments. Useful because it can fail. *)
val comment_add_last_dec_escape_char : _ action
(** Same as [add_dec_escape_char] but for hexadicemal escape sequences *)
val add_hex_escape_char : _ action
val add_last_hex_escape_char : _ action
(** Ignore one more full sexp to come *)
val start_sexp_comment : _ action
(** Add the first char of an unquoted atom. *)
val add_first_char : _ action
val start_quoted_string : _ action
(** Takes note of a control character in quoted atoms or the uninterpreted characters of
comments, for which there is no corresponding [add_*] call (a backslash and the x in
"\xff" or any character in a line comment). This does not get called for the opening
([start_quoted_string]) or closing ([push_quoted_atom]) quotes themselves.
*)
val add_token_char : _ action
val opening : _ action
val closing : _ action
val push_quoted_atom : _ action
val start_block_comment : _ action
val end_block_comment : _ action
val start_line_comment : _ action
val end_line_comment : _ epsilon_action
val eps_push_atom : _ epsilon_action
val eps_add_first_char_hash : _ epsilon_action
val eps_eoi_check : _ epsilon_action
val eps_add_escaped_cr : _ epsilon_action

View File

@ -1,13 +0,0 @@
module Loc = struct
type t =
{ start : Lexing.position
; stop : Lexing.position
}
end
type atom = A of string [@@unboxed]
type t =
| Atom of Loc.t * atom
| Quoted_string of Loc.t * string
| List of Loc.t * t list

File diff suppressed because it is too large Load Diff

View File

@ -1,2 +0,0 @@
val transitions : Obj.t Parser_automaton_internal.action array
val transitions_eoi : Obj.t Parser_automaton_internal.epsilon_action array

View File

@ -10,10 +10,8 @@ module Bytes = struct
UnlabeledBytes.blit_string src src_pos dst dst_pos len
end
module A = Parser_automaton_internal
module Atom = struct
type t = Sexp_ast.atom = A of string [@@unboxed]
type t = Lexer.Atom.t = A of string [@@unboxed]
let is_valid str =
let len = String.length str in
@ -241,7 +239,10 @@ let prepare_formatter ppf =
}
module Loc = struct
include Sexp_ast.Loc
type t =
{ start : Lexing.position
; stop : Lexing.position
}
let in_file fn =
let pos : Lexing.position =
@ -257,7 +258,7 @@ module Loc = struct
end
module Ast = struct
type t = Sexp_ast.t =
type t =
| Atom of Loc.t * Atom.t
| Quoted_string of Loc.t * string
| List of Loc.t * t list
@ -272,32 +273,6 @@ let loc (Atom (loc, _) | Quoted_string (loc, _) | List (loc, _)) = loc
| Atom (_, s) -> Atom s
| Quoted_string (_, s) -> Quoted_string s
| List (_, l) -> List (List.map l ~f:remove_locs)
module Token = struct
type t =
| Atom of Loc.t * Atom.t
| String of Loc.t * string
| Lparen of Loc.t
| Rparen of Loc.t
end
let tokenize =
let rec loop acc t =
match t with
| Atom (loc, s) -> Token.Atom (loc, s) :: acc
| Quoted_string (loc, s) -> Token.String (loc, s) :: acc
| List (loc, l) ->
let shift (pos : Lexing.position) delta =
{ pos with pos_cnum = pos.pos_cnum + delta }
in
let l_loc = { loc with stop = shift loc.start 1 } in
let r_loc = { loc with start = shift loc.stop (-1) } in
let acc = Token.Lparen l_loc :: acc in
let acc = List.fold_left l ~init:acc ~f:loop in
let acc = Token.Rparen r_loc :: acc in
acc
in
fun t -> loop [] t |> List.rev
end
let rec add_loc t ~loc : Ast.t =
@ -306,78 +281,89 @@ let rec add_loc t ~loc : Ast.t =
| Quoted_string s -> Quoted_string (loc, s)
| List l -> List (loc, List.map l ~f:(add_loc ~loc))
module Parse_error = struct
include Lexer.Error
let loc t : Loc.t = { start = t.start; stop = t.stop }
let message t = t.message
end
exception Parse_error = Lexer.Error
module Lexer = Lexer
module Parser = struct
module Error = A.Error
exception Error = A.Parse_error
let error (loc : Loc.t) message =
raise (Parse_error
{ start = loc.start
; stop = loc.stop
; message
})
let make_loc lexbuf : Loc.t =
{ start = Lexing.lexeme_start_p lexbuf
; stop = Lexing.lexeme_end_p lexbuf
}
module Mode = struct
type 'a t = 'a A.mode =
type 'a t =
| Single : Ast.t t
| Many : Ast.t list t
let make_result : type a. a t -> Lexing.lexbuf -> Ast.t list -> a
= fun t lexbuf sexps ->
match t with
| Many -> sexps
| Single ->
match sexps with
| [sexp] -> sexp
| [] -> error (make_loc lexbuf) "no s-expression found in input"
| _ :: sexp :: _ ->
error (Ast.loc sexp) "too many s-expressions found in input"
end
module Stack = struct
type t = A.stack
let empty = A.empty_stack
end
let rec loop depth lexer lexbuf acc =
match (lexer lexbuf : Lexer.Token.t) with
| Atom a ->
let loc = make_loc lexbuf in
loop depth lexer lexbuf (Ast.Atom (loc, a) :: acc)
| Quoted_string s ->
let loc = make_loc lexbuf in
loop depth lexer lexbuf (Quoted_string (loc, s) :: acc)
| Lparen ->
let start = Lexing.lexeme_start_p lexbuf in
let sexps = loop (depth + 1) lexer lexbuf [] in
let stop = Lexing.lexeme_end_p lexbuf in
loop depth lexer lexbuf (List ({ start; stop }, sexps) :: acc)
| Rparen ->
if depth = 0 then
error (make_loc lexbuf)
"right parenthesis without matching left parenthesis";
List.rev acc
| Sexp_comment ->
let sexps =
let loc = make_loc lexbuf in
match loop depth lexer lexbuf [] with
| _ :: sexps -> sexps
| [] -> error loc "s-expression missing after #;"
in
List.rev_append acc sexps
| Eof ->
if depth > 0 then
error (make_loc lexbuf)
"unclosed parenthesis at end of input";
List.rev acc
type 'a t = 'a A.state
let create ~fname ~mode = A.new_state ~fname mode
let feed : type a. a A.action = fun state char stack ->
let idx = (A.automaton_state state lsl 8) lor (Char.code char) in
(* We need an Obj.magic as the type of the array can't be generalized.
This problem will go away when we get immutable arrays. *)
(Obj.magic (Table.transitions.(idx) : Obj.t A.action) : a A.action) state char stack
[@@inline always]
let feed_eoi : type a. a t -> Stack.t -> a = fun state stack ->
let stack =
(Obj.magic (Table.transitions_eoi.(A.automaton_state state)
: Obj.t A.epsilon_action)
: a A.epsilon_action) state stack
in
A.set_error_state state;
match A.mode state with
| Mode.Single -> A.sexp_of_stack stack
| Mode.Many -> A.sexps_of_stack stack
let rec feed_substring_unsafe str state stack i stop =
if i < stop then
let c = String.unsafe_get str i in
let stack = feed state c stack in
feed_substring_unsafe str state stack (i + 1) stop
else
stack
let rec feed_subbytes_unsafe str state stack i stop =
if i < stop then
let c = Bytes.unsafe_get str i in
let stack = feed state c stack in
feed_subbytes_unsafe str state stack (i + 1) stop
else
stack
let feed_substring state str ~pos ~len stack =
let str_len = String.length str in
if pos < 0 || len < 0 || pos > str_len - len then
invalid_arg "Jbuilder_sexp.feed_substring";
feed_substring_unsafe str state stack pos (pos + len)
let feed_subbytes state str ~pos ~len stack =
let str_len = Bytes.length str in
if pos < 0 || len < 0 || pos > str_len - len then
invalid_arg "Jbuilder_sexp.feed_subbytes";
feed_subbytes_unsafe str state stack pos (pos + len)
let feed_string state str stack =
feed_substring_unsafe str state stack 0 (String.length str)
let feed_bytes state str stack =
feed_subbytes_unsafe str state stack 0 (Bytes.length str)
let parse ~mode ?(lexer=Lexer.token) lexbuf =
loop 0 lexer lexbuf []
|> Mode.make_result mode lexbuf
end
let parse_string ~fname ~mode str =
let p = Parser.create ~fname ~mode in
let stack = Parser.feed_string p str Parser.Stack.empty in
Parser.feed_eoi p stack
let parse_string ~fname ~mode ?lexer str =
let lb = Lexing.from_string str in
lb.lex_curr_p <-
{ pos_fname = fname
; pos_lnum = 1
; pos_bol = 0
; pos_cnum = 0
};
Parser.parse ~mode ?lexer lb

View File

@ -74,63 +74,53 @@ module Ast : sig
val loc : t -> Loc.t
val remove_locs : t -> sexp
module Token : sig
type t =
| Atom of Loc.t * Atom.t
| String of Loc.t * string
| Lparen of Loc.t
| Rparen of Loc.t
end
val tokenize : t -> Token.t list
end with type sexp := t
val add_loc : t -> loc:Loc.t -> Ast.t
module Parser : sig
module Error : sig
type t
module Parse_error : sig
type t
val position : t -> Lexing.position
val message : t -> string
val loc : t -> Loc.t
val message : t -> string
end
(** Exception raised in case of a parsing error *)
exception Parse_error of Parse_error.t
module Lexer : sig
module Token : sig
type t =
| Atom of Atom.t
| Quoted_string of string
| Lparen
| Rparen
| Sexp_comment (** "#;", only used in the jbuild syntax *)
| Eof
end
(** Exception raised in case of a parsing error *)
exception Error of Error.t
type t = Lexing.lexbuf -> Token.t
val token : t
end
module Parser : sig
module Mode : sig
type sexp = t
type 'a t =
| Single : Ast.t t
| Many : Ast.t list t
end with type sexp := t
module Stack : sig
(** Parser stack. The stack is not in [state] for optimization purposes. *)
type t
val empty : t
end
type 'a t
(** Create a new parser state. [fname] is the filename the input is from. *)
val create : fname:string -> mode:'a Mode.t -> 'a t
(** Feed one character to the parser. In case of error, it raises [Parse_error] *)
val feed : _ t -> char -> Stack.t -> Stack.t
(** Instruct the parser that the end of input was reached. In case of error, it raises
[Parse_error] *)
val feed_eoi : 'a t -> Stack.t -> 'a
(** {3 Convenience functions} *)
val feed_string : _ t -> string -> Stack.t -> Stack.t
val feed_substring : _ t -> string -> pos:int -> len:int -> Stack.t -> Stack.t
val feed_bytes : _ t -> bytes -> Stack.t -> Stack.t
val feed_subbytes : _ t -> bytes -> pos:int -> len:int -> Stack.t -> Stack.t
val parse
: mode:'a Mode.t
-> ?lexer:Lexer.t
-> Lexing.lexbuf
-> 'a
end
val parse_string : fname:string -> mode:'a Parser.Mode.t -> string -> 'a
val parse_string
: fname:string
-> mode:'a Parser.Mode.t
-> ?lexer:Lexer.t
-> string
-> 'a

View File

@ -38,3 +38,33 @@ let x = of_sexp sexp
val of_sexp : int list Stdune.Sexp.Of_sexp.t = <fun>
val x : int list = [1; 2]
|}]
let parse s =
try
Sexp.parse_string ~fname:"" ~mode:Many s
with Sexp.Parse_error e ->
failwith (Sexp.Parse_error.message e)
[%%expect{|
val parse : string -> Usexp.Ast.t list = <fun>
|}]
parse {| # ## x##y x||y a#b|c#d copy# |}
[%%expect{|
- : Usexp.Ast.t list = [#; ##; x##y; x||y; a#b|c#d; copy#]
|}]
parse {|x #| comment |# y|}
[%%expect{|
- : Usexp.Ast.t list = [x; y]
|}]
parse {|x#|y|}
[%%expect{|
Exception: Failure "atoms cannot contain #|".
|}]
parse {|x|#y|}
[%%expect{|
Exception: Failure "atoms cannot contain |#".
|}]