Split dune and jbuild lexers

Signed-off-by: Rudi Grinberg <rudi.grinberg@gmail.com>
This commit is contained in:
Rudi Grinberg 2018-06-20 19:29:16 +06:30
parent 411552e4c7
commit e3aa13424d
6 changed files with 361 additions and 218 deletions

View File

@ -3,4 +3,4 @@
(synopsis "[Internal] S-expression library")
(public_name dune.usexp)))
(ocamllex (lexer))
(ocamllex (dune_lexer jbuild_lexer))

View File

@ -1,71 +1,5 @@
{
module Token = struct
type t =
| Atom of Atom.t
| Quoted_string of string
| Lparen
| Rparen
| Sexp_comment
| Eof
end
type t = Lexing.lexbuf -> Token.t
module Error = struct
type t =
{ start : Lexing.position
; stop : Lexing.position
; message : string
}
end
exception Error of Error.t
let error ?(delta=0) lexbuf message =
let start = Lexing.lexeme_start_p lexbuf in
raise
(Error { start = { start with pos_cnum = start.pos_cnum + delta }
; stop = Lexing.lexeme_end_p lexbuf
; message
})
(* The difference between the old and new syntax is that the old
syntax allows backslash following by any characters other than 'n',
'x', ... and interpret it as it. The new syntax is stricter in
order to allow introducing new escape sequence in the future if
needed. *)
type escape_mode =
| In_block_comment (* Inside #|...|# comments (old syntax) *)
| Old_syntax
| New_syntax
let eval_decimal_char c = Char.code c - Char.code '0'
let eval_decimal_escape c1 c2 c3 =
(eval_decimal_char c1) * 100 +
(eval_decimal_char c2) * 10 +
(eval_decimal_char c3)
let eval_hex_char c =
match c with
| '0'..'9' -> Char.code c - Char.code '0'
| 'a'..'f' -> Char.code c - Char.code 'a' + 10
| 'A'..'F' -> Char.code c - Char.code 'A' + 10
| _ -> -1
let eval_hex_escape c1 c2 =
(eval_hex_char c1) * 16 +
(eval_hex_char c2)
type escape_sequence =
| Newline
| Other
let escaped_buf = Buffer.create 256
type block_string_line_kind =
| With_escape_sequences
| Raw
open Lexer0
}
let comment = ';' [^ '\n' '\r']*
@ -74,160 +8,11 @@ let blank = [' ' '\t' '\012']
let digit = ['0'-'9']
let hexdigit = ['0'-'9' 'a'-'f' 'A'-'F']
let atom_char_jbuild =
[^ ';' '(' ')' '"' ' ' '\t' '\r' '\n' '\012']
let atom_char_dune =
[^ '%' ';' '(' ')' '"' '\000'-'\032' '\127'-'\255']
(* rule for jbuild files *)
rule jbuild_token = parse
| newline
{ Lexing.new_line lexbuf; jbuild_token lexbuf }
| blank+ | comment
{ jbuild_token lexbuf }
| '('
{ Token.Lparen }
| ')'
{ Rparen }
| '"'
{ Buffer.clear escaped_buf;
let start = Lexing.lexeme_start_p lexbuf in
let s = quoted_string Old_syntax lexbuf in
lexbuf.lex_start_p <- start;
Quoted_string s
}
| "#|"
{ jbuild_block_comment lexbuf;
jbuild_token lexbuf
}
| "#;"
{ Sexp_comment }
| eof
{ Eof }
| ""
{ jbuild_atom "" (Lexing.lexeme_start_p lexbuf) lexbuf }
and jbuild_atom acc start = parse
| '#'+ '|'
{ lexbuf.lex_start_p <- start;
error lexbuf "jbuild_atoms cannot contain #|"
}
| '|'+ '#'
{ lexbuf.lex_start_p <- start;
error lexbuf "jbuild_atoms cannot contain |#"
}
| ('#'+ | '|'+ | (atom_char_jbuild # ['|' '#'])) as s
{ jbuild_atom (if acc = "" then s else acc ^ s) start lexbuf
}
| ""
{ if acc = "" then
error lexbuf "Internal error in the S-expression parser, \
please report upstream.";
lexbuf.lex_start_p <- start;
Token.Atom (Atom.of_string acc)
}
and quoted_string mode = parse
| '"'
{ Buffer.contents escaped_buf }
| '\\'
{ match escape_sequence mode lexbuf with
| Newline -> quoted_string_after_escaped_newline mode lexbuf
| Other -> quoted_string mode lexbuf
}
| newline as s
{ Lexing.new_line lexbuf;
Buffer.add_string escaped_buf s;
quoted_string mode lexbuf
}
| _ as c
{ Buffer.add_char escaped_buf c;
quoted_string mode lexbuf
}
| eof
{ if mode <> In_block_comment then
error lexbuf "unterminated quoted string";
Buffer.contents escaped_buf
}
and quoted_string_after_escaped_newline mode = parse
| [' ' '\t']*
{ quoted_string mode lexbuf }
and escape_sequence mode = parse
| newline
{ Lexing.new_line lexbuf;
Newline }
| ['\\' '\'' '"' 'n' 't' 'b' 'r'] as c
{ let c =
match c with
| 'n' -> '\n'
| 'r' -> '\r'
| 'b' -> '\b'
| 't' -> '\t'
| _ -> c
in
Buffer.add_char escaped_buf c;
Other
}
| (digit as c1) (digit as c2) (digit as c3)
{ let v = eval_decimal_escape c1 c2 c3 in
if mode <> In_block_comment && v > 255 then
error lexbuf "escape sequence in quoted string out of range"
~delta:(-1);
Buffer.add_char escaped_buf (Char.chr v);
Other
}
| digit* as s
{ if mode <> In_block_comment then
error lexbuf "unterminated decimal escape sequence" ~delta:(-1);
Buffer.add_char escaped_buf '\\';
Buffer.add_string escaped_buf s;
Other
}
| 'x' (hexdigit as c1) (hexdigit as c2)
{ let v = eval_hex_escape c1 c2 in
Buffer.add_char escaped_buf (Char.chr v);
Other
}
| 'x' hexdigit* as s
{ if mode <> In_block_comment then
error lexbuf "unterminated hexadecimal escape sequence" ~delta:(-1);
Buffer.add_char escaped_buf '\\';
Buffer.add_string escaped_buf s;
Other
}
| _ as c
{ if mode = New_syntax then
error lexbuf "unknown escape sequence" ~delta:(-1);
Buffer.add_char escaped_buf '\\';
Buffer.add_char escaped_buf c;
Other
}
| eof
{ if mode <> In_block_comment then
error lexbuf "unterminated escape sequence" ~delta:(-1);
Other
}
and jbuild_block_comment = parse
| '"'
{ Buffer.clear escaped_buf;
ignore (quoted_string In_block_comment lexbuf : string);
jbuild_block_comment lexbuf
}
| "|#"
{ ()
}
| eof
{ error lexbuf "unterminated block comment"
}
| _
{ jbuild_block_comment lexbuf
}
(* rule for dune files *)
and token = parse
rule token = parse
| newline
{ Lexing.new_line lexbuf; token lexbuf }
| blank+ | comment
@ -316,3 +101,86 @@ and raw_block_string = parse
| eof
{ Buffer.contents escaped_buf
}
and quoted_string mode = parse
| '"'
{ Buffer.contents escaped_buf }
| '\\'
{ match escape_sequence mode lexbuf with
| Newline -> quoted_string_after_escaped_newline mode lexbuf
| Other -> quoted_string mode lexbuf
}
| newline as s
{ Lexing.new_line lexbuf;
Buffer.add_string escaped_buf s;
quoted_string mode lexbuf
}
| _ as c
{ Buffer.add_char escaped_buf c;
quoted_string mode lexbuf
}
| eof
{ if mode <> In_block_comment then
error lexbuf "unterminated quoted string";
Buffer.contents escaped_buf
}
and escape_sequence mode = parse
| newline
{ Lexing.new_line lexbuf;
Newline }
| ['\\' '\'' '"' 'n' 't' 'b' 'r'] as c
{ let c =
match c with
| 'n' -> '\n'
| 'r' -> '\r'
| 'b' -> '\b'
| 't' -> '\t'
| _ -> c
in
Buffer.add_char escaped_buf c;
Other
}
| (digit as c1) (digit as c2) (digit as c3)
{ let v = eval_decimal_escape c1 c2 c3 in
if mode <> In_block_comment && v > 255 then
error lexbuf "escape sequence in quoted string out of range"
~delta:(-1);
Buffer.add_char escaped_buf (Char.chr v);
Other
}
| digit* as s
{ if mode <> In_block_comment then
error lexbuf "unterminated decimal escape sequence" ~delta:(-1);
Buffer.add_char escaped_buf '\\';
Buffer.add_string escaped_buf s;
Other
}
| 'x' (hexdigit as c1) (hexdigit as c2)
{ let v = eval_hex_escape c1 c2 in
Buffer.add_char escaped_buf (Char.chr v);
Other
}
| 'x' hexdigit* as s
{ if mode <> In_block_comment then
error lexbuf "unterminated hexadecimal escape sequence" ~delta:(-1);
Buffer.add_char escaped_buf '\\';
Buffer.add_string escaped_buf s;
Other
}
| _ as c
{ if mode = New_syntax then
error lexbuf "unknown escape sequence" ~delta:(-1);
Buffer.add_char escaped_buf '\\';
Buffer.add_char escaped_buf c;
Other
}
| eof
{ if mode <> In_block_comment then
error lexbuf "unterminated escape sequence" ~delta:(-1);
Other
}
and quoted_string_after_escaped_newline mode = parse
| [' ' '\t']*
{ quoted_string mode lexbuf }

159
src/usexp/jbuild_lexer.mll Normal file
View File

@ -0,0 +1,159 @@
{
open Lexer0
}
let comment = ';' [^ '\n' '\r']*
let newline = '\r'? '\n'
let blank = [' ' '\t' '\012']
let digit = ['0'-'9']
let hexdigit = ['0'-'9' 'a'-'f' 'A'-'F']
let atom_char_jbuild =
[^ ';' '(' ')' '"' ' ' '\t' '\r' '\n' '\012']
(* rule for jbuild files *)
rule token = parse
| newline
{ Lexing.new_line lexbuf; token lexbuf }
| blank+ | comment
{ token lexbuf }
| '('
{ Token.Lparen }
| ')'
{ Rparen }
| '"'
{ Buffer.clear escaped_buf;
let start = Lexing.lexeme_start_p lexbuf in
let s = quoted_string Old_syntax lexbuf in
lexbuf.lex_start_p <- start;
Quoted_string s
}
| "#|"
{ jbuild_block_comment lexbuf;
token lexbuf
}
| "#;"
{ Sexp_comment }
| eof
{ Eof }
| ""
{ jbuild_atom "" (Lexing.lexeme_start_p lexbuf) lexbuf }
and jbuild_atom acc start = parse
| '#'+ '|'
{ lexbuf.lex_start_p <- start;
error lexbuf "jbuild_atoms cannot contain #|"
}
| '|'+ '#'
{ lexbuf.lex_start_p <- start;
error lexbuf "jbuild_atoms cannot contain |#"
}
| ('#'+ | '|'+ | (atom_char_jbuild # ['|' '#'])) as s
{ jbuild_atom (if acc = "" then s else acc ^ s) start lexbuf
}
| ""
{ if acc = "" then
error lexbuf "Internal error in the S-expression parser, \
please report upstream.";
lexbuf.lex_start_p <- start;
Token.Atom (Atom.of_string acc)
}
and quoted_string mode = parse
| '"'
{ Buffer.contents escaped_buf }
| '\\'
{ match escape_sequence mode lexbuf with
| Newline -> quoted_string_after_escaped_newline mode lexbuf
| Other -> quoted_string mode lexbuf
}
| newline as s
{ Lexing.new_line lexbuf;
Buffer.add_string escaped_buf s;
quoted_string mode lexbuf
}
| _ as c
{ Buffer.add_char escaped_buf c;
quoted_string mode lexbuf
}
| eof
{ if mode <> In_block_comment then
error lexbuf "unterminated quoted string";
Buffer.contents escaped_buf
}
and quoted_string_after_escaped_newline mode = parse
| [' ' '\t']*
{ quoted_string mode lexbuf }
and jbuild_block_comment = parse
| '"'
{ Buffer.clear escaped_buf;
ignore (quoted_string In_block_comment lexbuf : string);
jbuild_block_comment lexbuf
}
| "|#"
{ ()
}
| eof
{ error lexbuf "unterminated block comment"
}
| _
{ jbuild_block_comment lexbuf
}
and escape_sequence mode = parse
| newline
{ Lexing.new_line lexbuf;
Newline }
| ['\\' '\'' '"' 'n' 't' 'b' 'r'] as c
{ let c =
match c with
| 'n' -> '\n'
| 'r' -> '\r'
| 'b' -> '\b'
| 't' -> '\t'
| _ -> c
in
Buffer.add_char escaped_buf c;
Other
}
| (digit as c1) (digit as c2) (digit as c3)
{ let v = eval_decimal_escape c1 c2 c3 in
if mode <> In_block_comment && v > 255 then
error lexbuf "escape sequence in quoted string out of range"
~delta:(-1);
Buffer.add_char escaped_buf (Char.chr v);
Other
}
| digit* as s
{ if mode <> In_block_comment then
error lexbuf "unterminated decimal escape sequence" ~delta:(-1);
Buffer.add_char escaped_buf '\\';
Buffer.add_string escaped_buf s;
Other
}
| 'x' (hexdigit as c1) (hexdigit as c2)
{ let v = eval_hex_escape c1 c2 in
Buffer.add_char escaped_buf (Char.chr v);
Other
}
| 'x' hexdigit* as s
{ if mode <> In_block_comment then
error lexbuf "unterminated hexadecimal escape sequence" ~delta:(-1);
Buffer.add_char escaped_buf '\\';
Buffer.add_string escaped_buf s;
Other
}
| _ as c
{ if mode = New_syntax then
error lexbuf "unknown escape sequence" ~delta:(-1);
Buffer.add_char escaped_buf '\\';
Buffer.add_char escaped_buf c;
Other
}
| eof
{ if mode <> In_block_comment then
error lexbuf "unterminated escape sequence" ~delta:(-1);
Other
}

4
src/usexp/lexer.ml Normal file
View File

@ -0,0 +1,4 @@
include Lexer0
let token = Dune_lexer.token
let jbuild_token = Jbuild_lexer.token

68
src/usexp/lexer0.ml Normal file
View File

@ -0,0 +1,68 @@
module Token = struct
type t =
| Atom of Atom.t
| Quoted_string of string
| Lparen
| Rparen
| Sexp_comment
| Eof
end
type t = Lexing.lexbuf -> Token.t
module Error = struct
type t =
{ start : Lexing.position
; stop : Lexing.position
; message : string
}
end
exception Error of Error.t
let error ?(delta=0) lexbuf message =
let start = Lexing.lexeme_start_p lexbuf in
raise
(Error { start = { start with pos_cnum = start.pos_cnum + delta }
; stop = Lexing.lexeme_end_p lexbuf
; message
})
let escaped_buf = Buffer.create 256
(* The difference between the old and new syntax is that the old
syntax allows backslash following by any characters other than 'n',
'x', ... and interpret it as it. The new syntax is stricter in
order to allow introducing new escape sequence in the future if
needed. *)
type escape_mode =
| In_block_comment (* Inside #|...|# comments (old syntax) *)
| Old_syntax
| New_syntax
type escape_sequence =
| Newline
| Other
type block_string_line_kind =
| With_escape_sequences
| Raw
let eval_decimal_char c = Char.code c - Char.code '0'
let eval_decimal_escape c1 c2 c3 =
(eval_decimal_char c1) * 100 +
(eval_decimal_char c2) * 10 +
(eval_decimal_char c3)
let eval_hex_char c =
match c with
| '0'..'9' -> Char.code c - Char.code '0'
| 'a'..'f' -> Char.code c - Char.code 'a' + 10
| 'A'..'F' -> Char.code c - Char.code 'A' + 10
| _ -> -1
let eval_hex_escape c1 c2 =
(eval_hex_char c1) * 16 +
(eval_hex_char c2)

44
src/usexp/lexer0.mli Normal file
View File

@ -0,0 +1,44 @@
module Token : sig
type t =
| Atom of Atom.t
| Quoted_string of string
| Lparen
| Rparen
| Sexp_comment
| Eof
end
type t = Lexing.lexbuf -> Token.t
module Error : sig
type t =
{ start : Lexing.position
; stop : Lexing.position
; message : string
}
end
val error : ?delta:int -> Lexing.lexbuf -> string -> 'a
val escaped_buf : Buffer.t
exception Error of Error.t
type escape_mode =
| In_block_comment (* Inside #|...|# comments (old syntax) *)
| Old_syntax
| New_syntax
type escape_sequence =
| Newline
| Other
type block_string_line_kind =
| With_escape_sequences
| Raw
val eval_decimal_char : char -> int
val eval_decimal_escape : char -> char -> char -> int
val eval_hex_escape : char -> char -> int