Merge branch 'master' into faster-bootstrap

This commit is contained in:
Rudi Grinberg 2018-06-20 21:09:17 +06:30 committed by GitHub
commit c723150026
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 439 additions and 321 deletions

View File

@ -3,4 +3,4 @@
(synopsis "[Internal] S-expression library")
(public_name dune.usexp)))
(ocamllex (lexer))
(ocamllex (dune_lexer jbuild_lexer))

1
src/usexp/dune_lexer.mli Normal file
View File

@ -0,0 +1 @@
val token : Lexer_shared.t

175
src/usexp/dune_lexer.mll Normal file
View File

@ -0,0 +1,175 @@
{
open Lexer_shared
type block_string_line_kind =
| With_escape_sequences
| Raw
}
let comment = ';' [^ '\n' '\r']*
let newline = '\r'? '\n'
let blank = [' ' '\t' '\012']
let digit = ['0'-'9']
let hexdigit = ['0'-'9' 'a'-'f' 'A'-'F']
let atom_char = [^ '%' ';' '(' ')' '"' '\000'-'\032' '\127'-'\255']
rule token = parse
| newline
{ Lexing.new_line lexbuf; token lexbuf }
| blank+ | comment
{ token lexbuf }
| '('
{ Token.Lparen }
| ')'
{ Rparen }
| '"'
{ Buffer.clear escaped_buf;
let start = Lexing.lexeme_start_p lexbuf in
let s = start_quoted_string lexbuf in
lexbuf.lex_start_p <- start;
Quoted_string s
}
| atom_char+ as s
{ Token.Atom (Atom.of_string s) }
| _ as c { error lexbuf (Printf.sprintf "Invalid atom character '%c'" c) }
| eof
{ Eof }
and start_quoted_string = parse
| "\\|"
{ block_string_start With_escape_sequences lexbuf }
| "\\>"
{ block_string_start Raw lexbuf }
| ""
{ quoted_string lexbuf }
and block_string_start kind = parse
| newline as s
{ Lexing.new_line lexbuf;
Buffer.add_string escaped_buf s;
block_string_after_newline lexbuf
}
| ' '
{ match kind with
| With_escape_sequences -> block_string lexbuf
| Raw -> raw_block_string lexbuf
}
| eof
{ Buffer.contents escaped_buf
}
| _
{ error lexbuf "There must be at least one space after \"\\|"
}
and block_string = parse
| newline as s
{ Lexing.new_line lexbuf;
Buffer.add_string escaped_buf s;
block_string_after_newline lexbuf
}
| '\\'
{ match escape_sequence lexbuf with
| Newline -> block_string_after_newline lexbuf
| Other -> block_string lexbuf
}
| _ as c
{ Buffer.add_char escaped_buf c;
block_string lexbuf
}
| eof
{ Buffer.contents escaped_buf
}
and block_string_after_newline = parse
| blank* "\"\\|"
{ block_string_start With_escape_sequences lexbuf }
| blank* "\"\\>"
{ block_string_start Raw lexbuf }
| ""
{ Buffer.contents escaped_buf
}
and raw_block_string = parse
| newline as s
{ Lexing.new_line lexbuf;
Buffer.add_string escaped_buf s;
block_string_after_newline lexbuf
}
| _ as c
{ Buffer.add_char escaped_buf c;
raw_block_string lexbuf
}
| eof
{ Buffer.contents escaped_buf
}
and quoted_string = parse
| '"'
{ Buffer.contents escaped_buf }
| '\\'
{ match escape_sequence lexbuf with
| Newline -> quoted_string_after_escaped_newline lexbuf
| Other -> quoted_string lexbuf
}
| newline as s
{ Lexing.new_line lexbuf;
Buffer.add_string escaped_buf s;
quoted_string lexbuf
}
| _ as c
{ Buffer.add_char escaped_buf c;
quoted_string lexbuf
}
| eof
{ error lexbuf "unterminated quoted string"
}
and escape_sequence = parse
| newline
{ Lexing.new_line lexbuf;
Newline }
| ['\\' '\'' '"' 'n' 't' 'b' 'r'] as c
{ let c =
match c with
| 'n' -> '\n'
| 'r' -> '\r'
| 'b' -> '\b'
| 't' -> '\t'
| _ -> c
in
Buffer.add_char escaped_buf c;
Other
}
| (digit as c1) (digit as c2) (digit as c3)
{ let v = eval_decimal_escape c1 c2 c3 in
if v > 255 then
error lexbuf "escape sequence in quoted string out of range"
~delta:(-1);
Buffer.add_char escaped_buf (Char.chr v);
Other
}
| digit digit digit
{ error lexbuf "escape sequence in quoted string out of range" ~delta:(-1);
}
| digit*
{ error lexbuf "unterminated decimal escape sequence" ~delta:(-1);
}
| 'x' (hexdigit as c1) (hexdigit as c2)
{ let v = eval_hex_escape c1 c2 in
Buffer.add_char escaped_buf (Char.chr v);
Other
}
| 'x' hexdigit*
{ error lexbuf "unterminated hexadecimal escape sequence" ~delta:(-1);
}
| _
{ error lexbuf "unknown escape sequence" ~delta:(-1);
}
| eof
{ error lexbuf "unterminated escape sequence" ~delta:(-1);
}
and quoted_string_after_escaped_newline = parse
| [' ' '\t']*
{ quoted_string lexbuf }

View File

@ -0,0 +1 @@
let token _ = failwith "Unused during bootstrap"

View File

@ -0,0 +1 @@
val token : Lexer_shared.t

166
src/usexp/jbuild_lexer.mll Normal file
View File

@ -0,0 +1,166 @@
{
open Lexer_shared
(* The difference between the old and new syntax is that the old
syntax allows backslash following by any characters other than 'n',
'x', ... and interpret it as it. The new syntax is stricter in
order to allow introducing new escape sequence in the future if
needed. *)
type escape_mode =
| In_block_comment (* Inside #|...|# comments (old syntax) *)
| In_quoted_string
}
let comment = ';' [^ '\n' '\r']*
let newline = '\r'? '\n'
let blank = [' ' '\t' '\012']
let digit = ['0'-'9']
let hexdigit = ['0'-'9' 'a'-'f' 'A'-'F']
let atom_char =
[^ ';' '(' ')' '"' ' ' '\t' '\r' '\n' '\012']
(* rule for jbuild files *)
rule token = parse
| newline
{ Lexing.new_line lexbuf; token lexbuf }
| blank+ | comment
{ token lexbuf }
| '('
{ Token.Lparen }
| ')'
{ Rparen }
| '"'
{ Buffer.clear escaped_buf;
let start = Lexing.lexeme_start_p lexbuf in
let s = quoted_string In_quoted_string lexbuf in
lexbuf.lex_start_p <- start;
Quoted_string s
}
| "#|"
{ block_comment lexbuf;
token lexbuf
}
| "#;"
{ Sexp_comment }
| eof
{ Eof }
| ""
{ atom "" (Lexing.lexeme_start_p lexbuf) lexbuf }
and atom acc start = parse
| '#'+ '|'
{ lexbuf.lex_start_p <- start;
error lexbuf "jbuild atoms cannot contain #|"
}
| '|'+ '#'
{ lexbuf.lex_start_p <- start;
error lexbuf "jbuild atoms cannot contain |#"
}
| ('#'+ | '|'+ | (atom_char # ['|' '#'])) as s
{ atom (if acc = "" then s else acc ^ s) start lexbuf
}
| ""
{ if acc = "" then
error lexbuf "Internal error in the S-expression parser, \
please report upstream.";
lexbuf.lex_start_p <- start;
Token.Atom (Atom.of_string acc)
}
and quoted_string mode = parse
| '"'
{ Buffer.contents escaped_buf }
| '\\'
{ match escape_sequence mode lexbuf with
| Newline -> quoted_string_after_escaped_newline mode lexbuf
| Other -> quoted_string mode lexbuf
}
| newline as s
{ Lexing.new_line lexbuf;
Buffer.add_string escaped_buf s;
quoted_string mode lexbuf
}
| _ as c
{ Buffer.add_char escaped_buf c;
quoted_string mode lexbuf
}
| eof
{ if mode = In_block_comment then
error lexbuf "unterminated quoted string";
Buffer.contents escaped_buf
}
and quoted_string_after_escaped_newline mode = parse
| [' ' '\t']*
{ quoted_string mode lexbuf }
and block_comment = parse
| '"'
{ Buffer.clear escaped_buf;
ignore (quoted_string In_block_comment lexbuf : string);
block_comment lexbuf
}
| "|#"
{ ()
}
| eof
{ error lexbuf "unterminated block comment"
}
| _
{ block_comment lexbuf
}
and escape_sequence mode = parse
| newline
{ Lexing.new_line lexbuf;
Newline }
| ['\\' '\'' '"' 'n' 't' 'b' 'r'] as c
{ let c =
match c with
| 'n' -> '\n'
| 'r' -> '\r'
| 'b' -> '\b'
| 't' -> '\t'
| _ -> c
in
Buffer.add_char escaped_buf c;
Other
}
| (digit as c1) (digit as c2) (digit as c3)
{ let v = eval_decimal_escape c1 c2 c3 in
if mode = In_quoted_string && v > 255 then
error lexbuf "escape sequence in quoted string out of range"
~delta:(-1);
Buffer.add_char escaped_buf (Char.chr v);
Other
}
| digit* as s
{ if mode = In_quoted_string then
error lexbuf "unterminated decimal escape sequence" ~delta:(-1);
Buffer.add_char escaped_buf '\\';
Buffer.add_string escaped_buf s;
Other
}
| 'x' (hexdigit as c1) (hexdigit as c2)
{ let v = eval_hex_escape c1 c2 in
Buffer.add_char escaped_buf (Char.chr v);
Other
}
| 'x' hexdigit* as s
{ if mode = In_quoted_string then
error lexbuf "unterminated hexadecimal escape sequence" ~delta:(-1);
Buffer.add_char escaped_buf '\\';
Buffer.add_string escaped_buf s;
Other
}
| _ as c
{ Buffer.add_char escaped_buf '\\';
Buffer.add_char escaped_buf c;
Other
}
| eof
{ if mode = In_quoted_string then
error lexbuf "unterminated escape sequence" ~delta:(-1);
Other
}

4
src/usexp/lexer.ml Normal file
View File

@ -0,0 +1,4 @@
include Lexer_shared
let token = Dune_lexer.token
let jbuild_token = Jbuild_lexer.token

View File

@ -1,318 +0,0 @@
{
module Token = struct
type t =
| Atom of Atom.t
| Quoted_string of string
| Lparen
| Rparen
| Sexp_comment
| Eof
end
type t = Lexing.lexbuf -> Token.t
module Error = struct
type t =
{ start : Lexing.position
; stop : Lexing.position
; message : string
}
end
exception Error of Error.t
let error ?(delta=0) lexbuf message =
let start = Lexing.lexeme_start_p lexbuf in
raise
(Error { start = { start with pos_cnum = start.pos_cnum + delta }
; stop = Lexing.lexeme_end_p lexbuf
; message
})
(* The difference between the old and new syntax is that the old
syntax allows backslash following by any characters other than 'n',
'x', ... and interpret it as it. The new syntax is stricter in
order to allow introducing new escape sequence in the future if
needed. *)
type escape_mode =
| In_block_comment (* Inside #|...|# comments (old syntax) *)
| Old_syntax
| New_syntax
let eval_decimal_char c = Char.code c - Char.code '0'
let eval_decimal_escape c1 c2 c3 =
(eval_decimal_char c1) * 100 +
(eval_decimal_char c2) * 10 +
(eval_decimal_char c3)
let eval_hex_char c =
match c with
| '0'..'9' -> Char.code c - Char.code '0'
| 'a'..'f' -> Char.code c - Char.code 'a' + 10
| 'A'..'F' -> Char.code c - Char.code 'A' + 10
| _ -> -1
let eval_hex_escape c1 c2 =
(eval_hex_char c1) * 16 +
(eval_hex_char c2)
type escape_sequence =
| Newline
| Other
let escaped_buf = Buffer.create 256
type block_string_line_kind =
| With_escape_sequences
| Raw
}
let comment = ';' [^ '\n' '\r']*
let newline = '\r'? '\n'
let blank = [' ' '\t' '\012']
let digit = ['0'-'9']
let hexdigit = ['0'-'9' 'a'-'f' 'A'-'F']
let atom_char_jbuild =
[^ ';' '(' ')' '"' ' ' '\t' '\r' '\n' '\012']
let atom_char_dune =
[^ '%' ';' '(' ')' '"' '\000'-'\032' '\127'-'\255']
(* rule for jbuild files *)
rule jbuild_token = parse
| newline
{ Lexing.new_line lexbuf; jbuild_token lexbuf }
| blank+ | comment
{ jbuild_token lexbuf }
| '('
{ Token.Lparen }
| ')'
{ Rparen }
| '"'
{ Buffer.clear escaped_buf;
let start = Lexing.lexeme_start_p lexbuf in
let s = quoted_string Old_syntax lexbuf in
lexbuf.lex_start_p <- start;
Quoted_string s
}
| "#|"
{ jbuild_block_comment lexbuf;
jbuild_token lexbuf
}
| "#;"
{ Sexp_comment }
| eof
{ Eof }
| ""
{ jbuild_atom "" (Lexing.lexeme_start_p lexbuf) lexbuf }
and jbuild_atom acc start = parse
| '#'+ '|'
{ lexbuf.lex_start_p <- start;
error lexbuf "jbuild_atoms cannot contain #|"
}
| '|'+ '#'
{ lexbuf.lex_start_p <- start;
error lexbuf "jbuild_atoms cannot contain |#"
}
| ('#'+ | '|'+ | (atom_char_jbuild # ['|' '#'])) as s
{ jbuild_atom (if acc = "" then s else acc ^ s) start lexbuf
}
| ""
{ if acc = "" then
error lexbuf "Internal error in the S-expression parser, \
please report upstream.";
lexbuf.lex_start_p <- start;
Token.Atom (Atom.of_string acc)
}
and quoted_string mode = parse
| '"'
{ Buffer.contents escaped_buf }
| '\\'
{ match escape_sequence mode lexbuf with
| Newline -> quoted_string_after_escaped_newline mode lexbuf
| Other -> quoted_string mode lexbuf
}
| newline as s
{ Lexing.new_line lexbuf;
Buffer.add_string escaped_buf s;
quoted_string mode lexbuf
}
| _ as c
{ Buffer.add_char escaped_buf c;
quoted_string mode lexbuf
}
| eof
{ if mode <> In_block_comment then
error lexbuf "unterminated quoted string";
Buffer.contents escaped_buf
}
and quoted_string_after_escaped_newline mode = parse
| [' ' '\t']*
{ quoted_string mode lexbuf }
and escape_sequence mode = parse
| newline
{ Lexing.new_line lexbuf;
Newline }
| ['\\' '\'' '"' 'n' 't' 'b' 'r'] as c
{ let c =
match c with
| 'n' -> '\n'
| 'r' -> '\r'
| 'b' -> '\b'
| 't' -> '\t'
| _ -> c
in
Buffer.add_char escaped_buf c;
Other
}
| (digit as c1) (digit as c2) (digit as c3)
{ let v = eval_decimal_escape c1 c2 c3 in
if mode <> In_block_comment && v > 255 then
error lexbuf "escape sequence in quoted string out of range"
~delta:(-1);
Buffer.add_char escaped_buf (Char.chr v);
Other
}
| digit* as s
{ if mode <> In_block_comment then
error lexbuf "unterminated decimal escape sequence" ~delta:(-1);
Buffer.add_char escaped_buf '\\';
Buffer.add_string escaped_buf s;
Other
}
| 'x' (hexdigit as c1) (hexdigit as c2)
{ let v = eval_hex_escape c1 c2 in
Buffer.add_char escaped_buf (Char.chr v);
Other
}
| 'x' hexdigit* as s
{ if mode <> In_block_comment then
error lexbuf "unterminated hexadecimal escape sequence" ~delta:(-1);
Buffer.add_char escaped_buf '\\';
Buffer.add_string escaped_buf s;
Other
}
| _ as c
{ if mode = New_syntax then
error lexbuf "unknown escape sequence" ~delta:(-1);
Buffer.add_char escaped_buf '\\';
Buffer.add_char escaped_buf c;
Other
}
| eof
{ if mode <> In_block_comment then
error lexbuf "unterminated escape sequence" ~delta:(-1);
Other
}
and jbuild_block_comment = parse
| '"'
{ Buffer.clear escaped_buf;
ignore (quoted_string In_block_comment lexbuf : string);
jbuild_block_comment lexbuf
}
| "|#"
{ ()
}
| eof
{ error lexbuf "unterminated block comment"
}
| _
{ jbuild_block_comment lexbuf
}
(* rule for dune files *)
and token = parse
| newline
{ Lexing.new_line lexbuf; token lexbuf }
| blank+ | comment
{ token lexbuf }
| '('
{ Token.Lparen }
| ')'
{ Rparen }
| '"'
{ Buffer.clear escaped_buf;
let start = Lexing.lexeme_start_p lexbuf in
let s = dune_quoted_string lexbuf in
lexbuf.lex_start_p <- start;
Quoted_string s
}
| atom_char_dune+ as s
{ Token.Atom (Atom.of_string s) }
| _ as c { error lexbuf (Printf.sprintf "Invalid atom character '%c'" c) }
| eof
{ Eof }
and dune_quoted_string = parse
| "\\|"
{ block_string_start With_escape_sequences lexbuf }
| "\\>"
{ block_string_start Raw lexbuf }
| ""
{ quoted_string New_syntax lexbuf }
and block_string_start kind = parse
| newline as s
{ Lexing.new_line lexbuf;
Buffer.add_string escaped_buf s;
block_string_after_newline lexbuf
}
| ' '
{ match kind with
| With_escape_sequences -> block_string lexbuf
| Raw -> raw_block_string lexbuf
}
| eof
{ Buffer.contents escaped_buf
}
| _
{ error lexbuf "There must be at least one space after \"\\|"
}
and block_string = parse
| newline as s
{ Lexing.new_line lexbuf;
Buffer.add_string escaped_buf s;
block_string_after_newline lexbuf
}
| '\\'
{ match escape_sequence New_syntax lexbuf with
| Newline -> block_string_after_newline lexbuf
| Other -> block_string lexbuf
}
| _ as c
{ Buffer.add_char escaped_buf c;
block_string lexbuf
}
| eof
{ Buffer.contents escaped_buf
}
and block_string_after_newline = parse
| blank* "\"\\|"
{ block_string_start With_escape_sequences lexbuf }
| blank* "\"\\>"
{ block_string_start Raw lexbuf }
| ""
{ Buffer.contents escaped_buf
}
and raw_block_string = parse
| newline as s
{ Lexing.new_line lexbuf;
Buffer.add_string escaped_buf s;
block_string_after_newline lexbuf
}
| _ as c
{ Buffer.add_char escaped_buf c;
raw_block_string lexbuf
}
| eof
{ Buffer.contents escaped_buf
}

53
src/usexp/lexer_shared.ml Normal file
View File

@ -0,0 +1,53 @@
module Token = struct
type t =
| Atom of Atom.t
| Quoted_string of string
| Lparen
| Rparen
| Sexp_comment
| Eof
end
type t = Lexing.lexbuf -> Token.t
module Error = struct
type t =
{ start : Lexing.position
; stop : Lexing.position
; message : string
}
end
exception Error of Error.t
let error ?(delta=0) lexbuf message =
let start = Lexing.lexeme_start_p lexbuf in
raise
(Error { start = { start with pos_cnum = start.pos_cnum + delta }
; stop = Lexing.lexeme_end_p lexbuf
; message
})
let escaped_buf = Buffer.create 256
type escape_sequence =
| Newline
| Other
let eval_decimal_char c = Char.code c - Char.code '0'
let eval_decimal_escape c1 c2 c3 =
(eval_decimal_char c1) * 100 +
(eval_decimal_char c2) * 10 +
(eval_decimal_char c3)
let eval_hex_char c =
match c with
| '0'..'9' -> Char.code c - Char.code '0'
| 'a'..'f' -> Char.code c - Char.code 'a' + 10
| 'A'..'F' -> Char.code c - Char.code 'A' + 10
| _ -> -1
let eval_hex_escape c1 c2 =
(eval_hex_char c1) * 16 +
(eval_hex_char c2)

View File

@ -0,0 +1,35 @@
module Token : sig
type t =
| Atom of Atom.t
| Quoted_string of string
| Lparen
| Rparen
| Sexp_comment
| Eof
end
type t = Lexing.lexbuf -> Token.t
module Error : sig
type t =
{ start : Lexing.position
; stop : Lexing.position
; message : string
}
end
val error : ?delta:int -> Lexing.lexbuf -> string -> 'a
val escaped_buf : Buffer.t
exception Error of Error.t
type escape_sequence =
| Newline
| Other
val eval_decimal_char : char -> int
val eval_decimal_escape : char -> char -> char -> int
val eval_hex_escape : char -> char -> int

View File

@ -107,14 +107,14 @@ parse {|x#|y|}
[%%expect{|
- : parse_result =
Different
{jbuild = Error "jbuild_atoms cannot contain #|"; dune = Ok [(atom x#|y)]}
{jbuild = Error "jbuild atoms cannot contain #|"; dune = Ok [(atom x#|y)]}
|}]
parse {|x|#y|}
[%%expect{|
- : parse_result =
Different
{jbuild = Error "jbuild_atoms cannot contain |#"; dune = Ok [(atom x|#y)]}
{jbuild = Error "jbuild atoms cannot contain |#"; dune = Ok [(atom x|#y)]}
|}]
parse {|"\a"|}