Import the Sexp parser generator (#608)

This commit is contained in:
Jérémie Dimino 2018-03-13 19:06:34 +00:00 committed by GitHub
parent 338f4c9ff2
commit 8e4c9fce4c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 521 additions and 2 deletions

View File

@ -54,5 +54,8 @@ livedoc:
update-jbuilds: $(BIN)
$(BIN) build --dev @doc/runtest --auto-promote
update-sexp-parser:
$(BIN) build --dev @update-sexp-parser --auto-promote
.PHONY: default install uninstall reinstall clean test doc
.PHONY: promote accept-corrections

View File

@ -0,0 +1,112 @@
(* Parsing of S-expression. The parsing is written as an automaton for which
we provide different implementations of actions.
*)
open Base
open Stdio
open Gen_parsexp_lib.Automaton
open Gen_parsexp_lib.Automaton.Table
(* Sharing of transitions *)
module Sharing = struct
let create_assign_id () =
let cache = Hashtbl.Poly.create () in
(cache,
fun x ->
if not (Hashtbl.mem cache x) then
Hashtbl.add_exn cache ~key:x ~data:(Hashtbl.length cache))
let share (table : t) =
let transitions, assign_transition_id =
create_assign_id ()
in
let transitions_eoi, assign_transition_eoi_id =
create_assign_id ()
in
Array.iter table.transitions ~f:assign_transition_id;
Array.iter table.transitions_eoi ~f:assign_transition_eoi_id;
(transitions, transitions_eoi)
end
let gen_code oc (t : t) =
let (named_transitions, named_transitions_eoi) = Sharing.share t in
let pr fmt = Out_channel.fprintf oc (Caml.(^^) fmt "\n") in
pr "(* generated by %s *)" Caml.Sys.argv.(0);
pr "";
pr "open Parser_automaton_internal";
pr "";
pr "let raise = Parser_automaton_internal.Error.raise";
pr "";
let ordered_ids tbl =
Hashtbl.fold tbl ~init:[] ~f:(fun ~key:x ~data:id acc -> (id, x) :: acc)
|> List.sort ~cmp:(fun (id1, _) (id2, _) -> compare id1 id2)
in
List.iter (ordered_ids named_transitions)
~f:(fun (id, tr) ->
match tr with
| Error error ->
pr "let tr_%02d _state _char _stack =" id;
pr " raise _state ~at_eof:false %s" (Error.to_string error)
| Ok { action = (eps_actions, action); goto; advance } ->
let eps_actions =
List.filter_map ~f:Epsilon_action.to_runtime_function eps_actions
in
let action = Action.to_runtime_function action in
pr "let tr_%02d state %schar stack =" id
(if Option.is_none action &&
not ([%compare.equal: goto_state] goto End_block_comment) then
"_"
else
"");
List.iter eps_actions ~f:(pr " let stack = %s state stack in");
(match action with
| None -> ()
| Some s -> pr " let stack = %s state char stack in" s);
(match goto with
| State n -> pr " set_automaton_state state %d;" n
| End_block_comment ->
pr " let stack = end_block_comment state char stack in";
pr " set_automaton_state state \
(if block_comment_depth state <> 0 then %d else %d);"
(State.to_int (Block_comment Normal)) (State.to_int Whitespace));
pr " %s state;"
(match advance with
| Advance -> "advance"
| Advance_eol -> "advance_eol");
pr " stack"
);
pr "";
List.iter (ordered_ids named_transitions_eoi) ~f:(fun (id, tr) ->
match tr with
| Error error ->
pr "let tr_eoi_%02d state _stack =" id;
pr " raise state ~at_eof:true %s" (Error.to_string error)
| Ok eps_actions ->
pr "let tr_eoi_%02d state stack =" id;
let eps_actions =
List.filter_map eps_actions ~f:Epsilon_action.to_runtime_function
in
List.iter eps_actions ~f:(pr " let stack = %s state stack in");
pr " eps_eoi_check state stack");
pr "";
let pr_table ?(per_line=1) suffix tbl ids =
pr "let transitions%s =" suffix;
let len = Array.length tbl in
let lines = len / per_line in
assert (per_line * lines = len);
for l = 0 to lines - 1 do
Out_channel.fprintf oc (if l = 0 then " [|" else " ;");
for col = 0 to per_line - 1 do
if col > 0 then Out_channel.fprintf oc ";";
let i = l * per_line + col in
Out_channel.fprintf oc " tr%s_%02d" suffix (Hashtbl.find_exn ids tbl.(i))
done;
Out_channel.fprintf oc "\n"
done;
pr " |]";
pr "";
in
pr_table "" t.transitions named_transitions ~per_line:8;
pr_table "_eoi" t.transitions_eoi named_transitions_eoi
let () = gen_code Caml.stdout table

4
src/usexp/gen/jbuild Normal file
View File

@ -0,0 +1,4 @@
(executable
((name gen_parser_automaton)
(libraries (base stdio gen_parsexp_lib))
(preprocess (pps (ppx_sexp_conv ppx_compare)))))

View File

@ -0,0 +1,383 @@
(* Abstract version of the parsing automaton.
It is used in two places:
- to define the automaton. At runtime, we only use integer for states and a table of
functions for transitions
- for tests
*)
open Base
module State = struct
module Quoted_string = struct
type t =
| Normal
| After_backslash
| After_backslash_cr
| After_backslash_digit
| After_backslash_2digits
| After_backslash_x
| After_backslash_x_hex
| Ignoring_blanks
[@@deriving enumerate, compare, sexp_of]
end
module Block_comment = struct
type t =
| Normal
| After_pipe
| After_hash
| Quoted_string of Quoted_string.t
[@@deriving enumerate, compare, sexp_of]
end
module Unquoted_string = struct
type t =
| Normal
| After_hash
| After_pipe
[@@deriving enumerate, compare, sexp_of]
end
type t =
| Whitespace
| Error
| After_cr
| Unquoted_string of Unquoted_string.t
| Line_comment
| After_hash
| Quoted_string of Quoted_string.t
| Block_comment of Block_comment.t
[@@deriving enumerate, compare, sexp_of]
let to_int t =
let rec loop i t l =
match l with
| [] -> assert false
| x :: l -> if [%compare.equal: t] t x then i else loop (i + 1) t l
in
loop 0 t all
let of_int i = List.nth_exn all i
let count = List.length all
let initial = to_int Whitespace
let () = assert (initial = 0) (* This is assumed in parser_automaton_internal.ml *)
let () = assert (to_int Error = 1) (* This is assumed in parser_automaton_internal.ml *)
let old_parser_approx_cont_state = function
| Whitespace -> "Parsing_toplevel_whitespace"
| After_cr -> "Parsing_nested_whitespace"
| Unquoted_string _
| Quoted_string _ -> "Parsing_atom"
| After_hash -> "Parsing_atom"
| Block_comment _ -> "Parsing_block_comment"
| Line_comment -> "Parsing_toplevel_whitespace"
(* This cannot happen with the old parser so the result is a dummy value *)
| Error -> "Parsing_toplevel_whitespace"
end
module Error = struct
(* Subset of the [Parser_automaton_internal.Public.Error.Reason.t] type *)
type t =
| Unexpected_char_parsing_hex_escape
| Unexpected_char_parsing_dec_escape
| Unterminated_quoted_string
| Unterminated_block_comment
| Comment_token_in_unquoted_atom
| Unexpected_character_after_cr
| Automaton_in_error_state
[@@deriving compare, sexp_of, hash, variants]
let to_string = Variants.to_name
end
(* Action associated to transitions. Actions correspond to the similarly named functions
in ../parser_automaton_internal.mli. *)
module Action = struct
type t =
| Nop
| Opening
| Closing
| Add_atom_char
| Add_quoted_atom_char
| Add_first_char
| Add_escaped
| Add_hex_escape_char
| Add_dec_escape_char
| Add_last_hex_escape_char
| Add_last_dec_escape_char
| Add_token_char
| Comment_add_last_dec_escape_char
| Push_quoted_atom
| Start_quoted_string
| Start_block_comment
| Start_sexp_comment
| Start_line_comment
[@@deriving compare, sexp_of, hash, variants]
let to_runtime_function = function
| Nop -> None
| t -> Some (String.uncapitalize (Variants.to_name t))
end
(* Action associated to epsilon transitions, i.e. transitions that do not consume a
character.
Having epsilon actions makes the definition of the automaton much simpler. *)
module Epsilon_action = struct
type t =
| Nop
| Push_atom
| Add_first_char_hash
| Add_escaped_cr
| End_line_comment
[@@deriving compare, sexp_of, hash, variants]
let to_runtime_function = function
| Nop -> None
| End_line_comment -> Some "end_line_comment"
| t -> Some ("eps_" ^ String.uncapitalize (Variants.to_name t))
end
module Transition = struct
type t =
| T of Action.t * State.t
| E of Epsilon_action.t * State.t
| Error of Error.t
| End_block_comment (* can't be a normal transition, as the new state isn't known
statically *)
[@@deriving compare]
end
module Final_transition = struct
type t =
| Eoi_check
| E of Epsilon_action.t * State.t
| Error of Error.t
end
module type Automaton = sig
val transition : State.t * char -> Transition.t
val transition_eoi : State.t -> Final_transition.t
end
(* Definition of the automaton, compiled later to a transition table. *)
module Automaton : Automaton = struct
module Quoted_string_transition = struct
type t =
| T of Action.t * State.Quoted_string.t
| E of Epsilon_action.t * State.Quoted_string.t
| Error of Error.t
| End_of_quoted_string
end
type context = In_block_comment | In_atom
let quoted_string_transition
: context -> State.Quoted_string.t * char -> Quoted_string_transition.t
= fun context x ->
(* Distinguising atom and block comments is to optimize block comments. But
we musn't optimize the exception on things like \321. *)
let if_atom then_ else_ : Action.t =
match context with
| In_atom -> then_
| In_block_comment -> else_
in
let if_atom_eps then_ else_ : Epsilon_action.t =
match context with
| In_atom -> then_
| In_block_comment -> else_
in
match x with
| Normal, '"' -> End_of_quoted_string
| Normal, '\\' -> T (Add_token_char, After_backslash)
| Normal, _ -> T (if_atom Add_quoted_atom_char Add_token_char, Normal)
| After_backslash, '\n' ->
T (Add_token_char, Ignoring_blanks)
| After_backslash, '\r' ->
T (Add_token_char, After_backslash_cr)
| After_backslash, 'x' ->
T (Add_token_char, After_backslash_x)
| After_backslash, '0'..'9' ->
T (Add_dec_escape_char, After_backslash_digit)
| After_backslash, _ ->
T (if_atom Add_escaped Add_token_char, Normal)
| After_backslash_cr, '\n' ->
T (Add_token_char, Ignoring_blanks)
| After_backslash_cr, _ ->
E (if_atom_eps Add_escaped_cr Nop, Normal)
| After_backslash_x, ('0'..'9' | 'a'..'f' | 'A'..'F') ->
T (if_atom Add_hex_escape_char Add_token_char , After_backslash_x_hex)
| After_backslash_x, _ ->
Error Unexpected_char_parsing_hex_escape
| After_backslash_x_hex, ('0'..'9' | 'a'..'f' | 'A'..'F') ->
T (if_atom Add_last_hex_escape_char Add_token_char, Normal)
| After_backslash_x_hex, _ ->
Error Unexpected_char_parsing_hex_escape
| After_backslash_digit, '0'..'9' ->
T (Add_dec_escape_char, After_backslash_2digits)
| After_backslash_digit, _ ->
Error Unexpected_char_parsing_dec_escape
| After_backslash_2digits, '0'..'9' ->
T (if_atom Add_last_dec_escape_char Comment_add_last_dec_escape_char, Normal)
| After_backslash_2digits, _ ->
Error Unexpected_char_parsing_dec_escape
| Ignoring_blanks, (' '|'\t') ->
T (Add_token_char, Ignoring_blanks)
| Ignoring_blanks, _ ->
E (Nop, Normal)
module Block_comment_transition = struct
type t =
| T of Action.t * State.Block_comment.t
| E of Epsilon_action.t * State.Block_comment.t
| Error of Error.t
| End_comment
end
let block_comment_transition
: State.Block_comment.t * char -> Block_comment_transition.t
= function
| Quoted_string state, c -> (
match quoted_string_transition In_block_comment (state, c) with
| End_of_quoted_string -> T (Add_token_char, Normal)
| T (action, state) -> T (action, Quoted_string state)
| E (action, state) -> E (action, Quoted_string state)
| Error error -> Error error
)
| After_hash, '|' -> T (Start_block_comment, Normal)
| After_pipe, '#' -> End_comment
| _, '"' -> T (Add_token_char, Quoted_string Normal)
| _, '|' -> T (Add_token_char, After_pipe)
| _, '#' -> T (Add_token_char, After_hash)
| _, _ -> T (Add_token_char, Normal)
let transition : State.t * char -> Transition.t = function
| Whitespace, '(' -> T (Opening, Whitespace)
| Whitespace, ')' -> T (Closing, Whitespace)
| Whitespace, '\r' -> T (Nop, After_cr)
| Whitespace, (' ' | '\t' | '\012' | '\n') -> T (Nop, Whitespace)
| Whitespace, ';' -> T (Start_line_comment, Line_comment)
| Whitespace, '"' -> T (Start_quoted_string, Quoted_string (Normal))
| Whitespace, '#' -> T (Nop, After_hash)
| Whitespace, '|' -> T (Add_first_char, Unquoted_string After_pipe)
| Whitespace, _ -> T (Add_first_char, Unquoted_string Normal)
| After_cr, '\n' -> T (Nop, Whitespace)
| After_cr, _ -> Error Unexpected_character_after_cr
| Unquoted_string _, (';'|'('|')'|'"'|' '|'\t'|'\012'|'\r'|'\n') ->
E (Push_atom, Whitespace)
| Unquoted_string After_hash, '|'
| Unquoted_string After_pipe, '#' -> Error Comment_token_in_unquoted_atom
| Unquoted_string _, '#' -> T (Add_atom_char, Unquoted_string After_hash)
| Unquoted_string _, '|' -> T (Add_atom_char, Unquoted_string After_pipe)
| Unquoted_string _, _ -> T (Add_atom_char, Unquoted_string Normal)
| Line_comment, ('\r' | '\n') -> E (End_line_comment, Whitespace)
| Line_comment, _ -> T (Add_token_char, Line_comment)
| After_hash, ';' -> T (Start_sexp_comment, Whitespace)
| After_hash, '|' -> T (Start_block_comment, Block_comment Normal)
| After_hash, _ -> E (Add_first_char_hash, Unquoted_string Normal)
| Quoted_string state, c -> (
match quoted_string_transition In_atom (state, c) with
| End_of_quoted_string -> T (Push_quoted_atom, Whitespace)
| T (action, state) -> T (action, Quoted_string state)
| E (action, state) -> E (action, Quoted_string state)
| Error error -> Error error
)
| Block_comment state, c -> (
match block_comment_transition (state, c) with
| T (action, state) -> T (action, Block_comment state)
| E (action, state) -> E (action, Block_comment state)
| End_comment -> End_block_comment
| Error error -> Error error
)
| Error, _ -> Error Automaton_in_error_state
let transition_eoi : State.t -> Final_transition.t = function
| Whitespace -> Eoi_check
| After_cr -> Error Unexpected_character_after_cr
| Unquoted_string _ -> E (Push_atom, Whitespace)
| Line_comment -> E (End_line_comment, Whitespace)
| After_hash -> E (Add_first_char_hash, Unquoted_string Normal)
| Quoted_string _ -> Error Unterminated_quoted_string
| Block_comment _ -> Error Unterminated_block_comment
| Error -> Error Automaton_in_error_state
end
module Table = struct
type action = Epsilon_action.t list * Action.t [@@deriving compare, sexp_of, hash]
type goto_state = State of int | End_block_comment [@@deriving compare, sexp_of, hash]
type advance = Advance | Advance_eol [@@deriving compare, sexp_of, hash]
type transition = { action : action; goto : goto_state; advance : advance }
[@@deriving compare, sexp_of, hash]
type 'a or_error =
| Ok of 'a
| Error of Error.t
[@@deriving compare, sexp_of, hash]
type t =
{ transitions : transition or_error array
; transitions_eoi : Epsilon_action.t list or_error array
}
let advance = function
| '\n' -> Advance_eol
| _ -> Advance
let compile (module A : Automaton) =
let rec squash acc state c =
match A.transition (state, c) with
| T (action, state) -> Ok { action = (List.rev acc, action)
; goto = State (State.to_int state)
; advance = advance c
}
| E (action, state) -> squash (action :: acc) state c
| Error error -> Error error
| End_block_comment -> Ok { action = (List.rev acc, Nop)
; goto = End_block_comment
; advance = advance c
}
in
let rec squash_eoi acc state =
match A.transition_eoi state with
| Eoi_check -> Ok (List.rev acc)
| E (eps_action, state) -> squash_eoi (eps_action :: acc) state
| Error error -> Error error
in
let transitions =
Array.create ~len:(State.count * 256)
(Ok { action = ([], Action.Nop)
; goto = State 0
; advance = Advance
})
in
let transitions_eoi = Array.create (Ok []) ~len:State.count in
for s = 0 to State.count - 1 do
let state = State.of_int s in
for c = 0 to 255 do
transitions.(s * 256 + c) <- squash [] state (Char.of_int_exn c);
done;
transitions_eoi.(s) <- squash_eoi [] state
done;
{ transitions; transitions_eoi }
end
let table = Table.compile (module Automaton)

10
src/usexp/gen/lib/jbuild Normal file
View File

@ -0,0 +1,10 @@
(library
((name gen_parsexp_lib)
(libraries (base))
(preprocess (pps (ppx_sexp_conv
ppx_compare
ppx_enumerate
ppx_hash
ppx_variants_conv)))))
(jbuild_version 1)

View File

@ -1,3 +1,10 @@
(jbuild_version 1)
(library ((name usexp)))
(rule
(with-stdout-to table.ml.gen (run gen/gen_parser_automaton.exe)))
(alias
((name update-sexp-parser)
(action (diff table.ml table.ml.gen))))

View File

@ -1,5 +1,4 @@
(* This file was copied and adapted from the src/parser_automaton.ml file of
https://github.com/janestreet/parsexp *)
(* generated by ./gen/gen_parser_automaton.exe *)
open Parser_automaton_internal
@ -1317,3 +1316,4 @@ let transitions_eoi =
; tr_eoi_07
; tr_eoi_07
|]