From e3aa13424d612ceb616c2323dded69434f8a271a Mon Sep 17 00:00:00 2001 From: Rudi Grinberg Date: Wed, 20 Jun 2018 19:29:16 +0630 Subject: [PATCH 1/7] Split dune and jbuild lexers Signed-off-by: Rudi Grinberg --- src/usexp/dune | 2 +- src/usexp/{lexer.mll => dune_lexer.mll} | 302 +++++++----------------- src/usexp/jbuild_lexer.mll | 159 +++++++++++++ src/usexp/lexer.ml | 4 + src/usexp/lexer0.ml | 68 ++++++ src/usexp/lexer0.mli | 44 ++++ 6 files changed, 361 insertions(+), 218 deletions(-) rename src/usexp/{lexer.mll => dune_lexer.mll} (59%) create mode 100644 src/usexp/jbuild_lexer.mll create mode 100644 src/usexp/lexer.ml create mode 100644 src/usexp/lexer0.ml create mode 100644 src/usexp/lexer0.mli diff --git a/src/usexp/dune b/src/usexp/dune index a26d42f0..bd71f71a 100644 --- a/src/usexp/dune +++ b/src/usexp/dune @@ -3,4 +3,4 @@ (synopsis "[Internal] S-expression library") (public_name dune.usexp))) -(ocamllex (lexer)) +(ocamllex (dune_lexer jbuild_lexer)) diff --git a/src/usexp/lexer.mll b/src/usexp/dune_lexer.mll similarity index 59% rename from src/usexp/lexer.mll rename to src/usexp/dune_lexer.mll index 2c9098b0..3bf74724 100644 --- a/src/usexp/lexer.mll +++ b/src/usexp/dune_lexer.mll @@ -1,71 +1,5 @@ { -module Token = struct - type t = - | Atom of Atom.t - | Quoted_string of string - | Lparen - | Rparen - | Sexp_comment - | Eof -end - -type t = Lexing.lexbuf -> Token.t - -module Error = struct - type t = - { start : Lexing.position - ; stop : Lexing.position - ; message : string - } -end - -exception Error of Error.t - -let error ?(delta=0) lexbuf message = - let start = Lexing.lexeme_start_p lexbuf in - raise - (Error { start = { start with pos_cnum = start.pos_cnum + delta } - ; stop = Lexing.lexeme_end_p lexbuf - ; message - }) - -(* The difference between the old and new syntax is that the old - syntax allows backslash following by any characters other than 'n', - 'x', ... and interpret it as it. The new syntax is stricter in - order to allow introducing new escape sequence in the future if - needed. *) -type escape_mode = - | In_block_comment (* Inside #|...|# comments (old syntax) *) - | Old_syntax - | New_syntax - -let eval_decimal_char c = Char.code c - Char.code '0' - -let eval_decimal_escape c1 c2 c3 = - (eval_decimal_char c1) * 100 + - (eval_decimal_char c2) * 10 + - (eval_decimal_char c3) - -let eval_hex_char c = - match c with - | '0'..'9' -> Char.code c - Char.code '0' - | 'a'..'f' -> Char.code c - Char.code 'a' + 10 - | 'A'..'F' -> Char.code c - Char.code 'A' + 10 - | _ -> -1 - -let eval_hex_escape c1 c2 = - (eval_hex_char c1) * 16 + - (eval_hex_char c2) - -type escape_sequence = - | Newline - | Other - -let escaped_buf = Buffer.create 256 - -type block_string_line_kind = - | With_escape_sequences - | Raw + open Lexer0 } let comment = ';' [^ '\n' '\r']* @@ -74,160 +8,11 @@ let blank = [' ' '\t' '\012'] let digit = ['0'-'9'] let hexdigit = ['0'-'9' 'a'-'f' 'A'-'F'] -let atom_char_jbuild = - [^ ';' '(' ')' '"' ' ' '\t' '\r' '\n' '\012'] let atom_char_dune = [^ '%' ';' '(' ')' '"' '\000'-'\032' '\127'-'\255'] -(* rule for jbuild files *) -rule jbuild_token = parse - | newline - { Lexing.new_line lexbuf; jbuild_token lexbuf } - | blank+ | comment - { jbuild_token lexbuf } - | '(' - { Token.Lparen } - | ')' - { Rparen } - | '"' - { Buffer.clear escaped_buf; - let start = Lexing.lexeme_start_p lexbuf in - let s = quoted_string Old_syntax lexbuf in - lexbuf.lex_start_p <- start; - Quoted_string s - } - | "#|" - { jbuild_block_comment lexbuf; - jbuild_token lexbuf - } - | "#;" - { Sexp_comment } - | eof - { Eof } - | "" - { jbuild_atom "" (Lexing.lexeme_start_p lexbuf) lexbuf } - -and jbuild_atom acc start = parse - | '#'+ '|' - { lexbuf.lex_start_p <- start; - error lexbuf "jbuild_atoms cannot contain #|" - } - | '|'+ '#' - { lexbuf.lex_start_p <- start; - error lexbuf "jbuild_atoms cannot contain |#" - } - | ('#'+ | '|'+ | (atom_char_jbuild # ['|' '#'])) as s - { jbuild_atom (if acc = "" then s else acc ^ s) start lexbuf - } - | "" - { if acc = "" then - error lexbuf "Internal error in the S-expression parser, \ - please report upstream."; - lexbuf.lex_start_p <- start; - Token.Atom (Atom.of_string acc) - } - -and quoted_string mode = parse - | '"' - { Buffer.contents escaped_buf } - | '\\' - { match escape_sequence mode lexbuf with - | Newline -> quoted_string_after_escaped_newline mode lexbuf - | Other -> quoted_string mode lexbuf - } - | newline as s - { Lexing.new_line lexbuf; - Buffer.add_string escaped_buf s; - quoted_string mode lexbuf - } - | _ as c - { Buffer.add_char escaped_buf c; - quoted_string mode lexbuf - } - | eof - { if mode <> In_block_comment then - error lexbuf "unterminated quoted string"; - Buffer.contents escaped_buf - } - -and quoted_string_after_escaped_newline mode = parse - | [' ' '\t']* - { quoted_string mode lexbuf } - -and escape_sequence mode = parse - | newline - { Lexing.new_line lexbuf; - Newline } - | ['\\' '\'' '"' 'n' 't' 'b' 'r'] as c - { let c = - match c with - | 'n' -> '\n' - | 'r' -> '\r' - | 'b' -> '\b' - | 't' -> '\t' - | _ -> c - in - Buffer.add_char escaped_buf c; - Other - } - | (digit as c1) (digit as c2) (digit as c3) - { let v = eval_decimal_escape c1 c2 c3 in - if mode <> In_block_comment && v > 255 then - error lexbuf "escape sequence in quoted string out of range" - ~delta:(-1); - Buffer.add_char escaped_buf (Char.chr v); - Other - } - | digit* as s - { if mode <> In_block_comment then - error lexbuf "unterminated decimal escape sequence" ~delta:(-1); - Buffer.add_char escaped_buf '\\'; - Buffer.add_string escaped_buf s; - Other - } - | 'x' (hexdigit as c1) (hexdigit as c2) - { let v = eval_hex_escape c1 c2 in - Buffer.add_char escaped_buf (Char.chr v); - Other - } - | 'x' hexdigit* as s - { if mode <> In_block_comment then - error lexbuf "unterminated hexadecimal escape sequence" ~delta:(-1); - Buffer.add_char escaped_buf '\\'; - Buffer.add_string escaped_buf s; - Other - } - | _ as c - { if mode = New_syntax then - error lexbuf "unknown escape sequence" ~delta:(-1); - Buffer.add_char escaped_buf '\\'; - Buffer.add_char escaped_buf c; - Other - } - | eof - { if mode <> In_block_comment then - error lexbuf "unterminated escape sequence" ~delta:(-1); - Other - } - -and jbuild_block_comment = parse - | '"' - { Buffer.clear escaped_buf; - ignore (quoted_string In_block_comment lexbuf : string); - jbuild_block_comment lexbuf - } - | "|#" - { () - } - | eof - { error lexbuf "unterminated block comment" - } - | _ - { jbuild_block_comment lexbuf - } - (* rule for dune files *) -and token = parse +rule token = parse | newline { Lexing.new_line lexbuf; token lexbuf } | blank+ | comment @@ -316,3 +101,86 @@ and raw_block_string = parse | eof { Buffer.contents escaped_buf } + +and quoted_string mode = parse + | '"' + { Buffer.contents escaped_buf } + | '\\' + { match escape_sequence mode lexbuf with + | Newline -> quoted_string_after_escaped_newline mode lexbuf + | Other -> quoted_string mode lexbuf + } + | newline as s + { Lexing.new_line lexbuf; + Buffer.add_string escaped_buf s; + quoted_string mode lexbuf + } + | _ as c + { Buffer.add_char escaped_buf c; + quoted_string mode lexbuf + } + | eof + { if mode <> In_block_comment then + error lexbuf "unterminated quoted string"; + Buffer.contents escaped_buf + } + +and escape_sequence mode = parse + | newline + { Lexing.new_line lexbuf; + Newline } + | ['\\' '\'' '"' 'n' 't' 'b' 'r'] as c + { let c = + match c with + | 'n' -> '\n' + | 'r' -> '\r' + | 'b' -> '\b' + | 't' -> '\t' + | _ -> c + in + Buffer.add_char escaped_buf c; + Other + } + | (digit as c1) (digit as c2) (digit as c3) + { let v = eval_decimal_escape c1 c2 c3 in + if mode <> In_block_comment && v > 255 then + error lexbuf "escape sequence in quoted string out of range" + ~delta:(-1); + Buffer.add_char escaped_buf (Char.chr v); + Other + } + | digit* as s + { if mode <> In_block_comment then + error lexbuf "unterminated decimal escape sequence" ~delta:(-1); + Buffer.add_char escaped_buf '\\'; + Buffer.add_string escaped_buf s; + Other + } + | 'x' (hexdigit as c1) (hexdigit as c2) + { let v = eval_hex_escape c1 c2 in + Buffer.add_char escaped_buf (Char.chr v); + Other + } + | 'x' hexdigit* as s + { if mode <> In_block_comment then + error lexbuf "unterminated hexadecimal escape sequence" ~delta:(-1); + Buffer.add_char escaped_buf '\\'; + Buffer.add_string escaped_buf s; + Other + } + | _ as c + { if mode = New_syntax then + error lexbuf "unknown escape sequence" ~delta:(-1); + Buffer.add_char escaped_buf '\\'; + Buffer.add_char escaped_buf c; + Other + } + | eof + { if mode <> In_block_comment then + error lexbuf "unterminated escape sequence" ~delta:(-1); + Other + } + +and quoted_string_after_escaped_newline mode = parse + | [' ' '\t']* + { quoted_string mode lexbuf } diff --git a/src/usexp/jbuild_lexer.mll b/src/usexp/jbuild_lexer.mll new file mode 100644 index 00000000..51376bd9 --- /dev/null +++ b/src/usexp/jbuild_lexer.mll @@ -0,0 +1,159 @@ +{ + open Lexer0 +} + +let comment = ';' [^ '\n' '\r']* +let newline = '\r'? '\n' +let blank = [' ' '\t' '\012'] +let digit = ['0'-'9'] +let hexdigit = ['0'-'9' 'a'-'f' 'A'-'F'] + +let atom_char_jbuild = + [^ ';' '(' ')' '"' ' ' '\t' '\r' '\n' '\012'] + +(* rule for jbuild files *) +rule token = parse + | newline + { Lexing.new_line lexbuf; token lexbuf } + | blank+ | comment + { token lexbuf } + | '(' + { Token.Lparen } + | ')' + { Rparen } + | '"' + { Buffer.clear escaped_buf; + let start = Lexing.lexeme_start_p lexbuf in + let s = quoted_string Old_syntax lexbuf in + lexbuf.lex_start_p <- start; + Quoted_string s + } + | "#|" + { jbuild_block_comment lexbuf; + token lexbuf + } + | "#;" + { Sexp_comment } + | eof + { Eof } + | "" + { jbuild_atom "" (Lexing.lexeme_start_p lexbuf) lexbuf } + +and jbuild_atom acc start = parse + | '#'+ '|' + { lexbuf.lex_start_p <- start; + error lexbuf "jbuild_atoms cannot contain #|" + } + | '|'+ '#' + { lexbuf.lex_start_p <- start; + error lexbuf "jbuild_atoms cannot contain |#" + } + | ('#'+ | '|'+ | (atom_char_jbuild # ['|' '#'])) as s + { jbuild_atom (if acc = "" then s else acc ^ s) start lexbuf + } + | "" + { if acc = "" then + error lexbuf "Internal error in the S-expression parser, \ + please report upstream."; + lexbuf.lex_start_p <- start; + Token.Atom (Atom.of_string acc) + } + +and quoted_string mode = parse + | '"' + { Buffer.contents escaped_buf } + | '\\' + { match escape_sequence mode lexbuf with + | Newline -> quoted_string_after_escaped_newline mode lexbuf + | Other -> quoted_string mode lexbuf + } + | newline as s + { Lexing.new_line lexbuf; + Buffer.add_string escaped_buf s; + quoted_string mode lexbuf + } + | _ as c + { Buffer.add_char escaped_buf c; + quoted_string mode lexbuf + } + | eof + { if mode <> In_block_comment then + error lexbuf "unterminated quoted string"; + Buffer.contents escaped_buf + } + +and quoted_string_after_escaped_newline mode = parse + | [' ' '\t']* + { quoted_string mode lexbuf } + +and jbuild_block_comment = parse + | '"' + { Buffer.clear escaped_buf; + ignore (quoted_string In_block_comment lexbuf : string); + jbuild_block_comment lexbuf + } + | "|#" + { () + } + | eof + { error lexbuf "unterminated block comment" + } + | _ + { jbuild_block_comment lexbuf + } + +and escape_sequence mode = parse + | newline + { Lexing.new_line lexbuf; + Newline } + | ['\\' '\'' '"' 'n' 't' 'b' 'r'] as c + { let c = + match c with + | 'n' -> '\n' + | 'r' -> '\r' + | 'b' -> '\b' + | 't' -> '\t' + | _ -> c + in + Buffer.add_char escaped_buf c; + Other + } + | (digit as c1) (digit as c2) (digit as c3) + { let v = eval_decimal_escape c1 c2 c3 in + if mode <> In_block_comment && v > 255 then + error lexbuf "escape sequence in quoted string out of range" + ~delta:(-1); + Buffer.add_char escaped_buf (Char.chr v); + Other + } + | digit* as s + { if mode <> In_block_comment then + error lexbuf "unterminated decimal escape sequence" ~delta:(-1); + Buffer.add_char escaped_buf '\\'; + Buffer.add_string escaped_buf s; + Other + } + | 'x' (hexdigit as c1) (hexdigit as c2) + { let v = eval_hex_escape c1 c2 in + Buffer.add_char escaped_buf (Char.chr v); + Other + } + | 'x' hexdigit* as s + { if mode <> In_block_comment then + error lexbuf "unterminated hexadecimal escape sequence" ~delta:(-1); + Buffer.add_char escaped_buf '\\'; + Buffer.add_string escaped_buf s; + Other + } + | _ as c + { if mode = New_syntax then + error lexbuf "unknown escape sequence" ~delta:(-1); + Buffer.add_char escaped_buf '\\'; + Buffer.add_char escaped_buf c; + Other + } + | eof + { if mode <> In_block_comment then + error lexbuf "unterminated escape sequence" ~delta:(-1); + Other + } diff --git a/src/usexp/lexer.ml b/src/usexp/lexer.ml new file mode 100644 index 00000000..aff4e6d2 --- /dev/null +++ b/src/usexp/lexer.ml @@ -0,0 +1,4 @@ +include Lexer0 + +let token = Dune_lexer.token +let jbuild_token = Jbuild_lexer.token diff --git a/src/usexp/lexer0.ml b/src/usexp/lexer0.ml new file mode 100644 index 00000000..5ebc545a --- /dev/null +++ b/src/usexp/lexer0.ml @@ -0,0 +1,68 @@ + +module Token = struct + type t = + | Atom of Atom.t + | Quoted_string of string + | Lparen + | Rparen + | Sexp_comment + | Eof +end + +type t = Lexing.lexbuf -> Token.t + +module Error = struct + type t = + { start : Lexing.position + ; stop : Lexing.position + ; message : string + } +end + +exception Error of Error.t + +let error ?(delta=0) lexbuf message = + let start = Lexing.lexeme_start_p lexbuf in + raise + (Error { start = { start with pos_cnum = start.pos_cnum + delta } + ; stop = Lexing.lexeme_end_p lexbuf + ; message + }) + +let escaped_buf = Buffer.create 256 + +(* The difference between the old and new syntax is that the old + syntax allows backslash following by any characters other than 'n', + 'x', ... and interpret it as it. The new syntax is stricter in + order to allow introducing new escape sequence in the future if + needed. *) +type escape_mode = + | In_block_comment (* Inside #|...|# comments (old syntax) *) + | Old_syntax + | New_syntax + +type escape_sequence = + | Newline + | Other + +type block_string_line_kind = + | With_escape_sequences + | Raw + +let eval_decimal_char c = Char.code c - Char.code '0' + +let eval_decimal_escape c1 c2 c3 = + (eval_decimal_char c1) * 100 + + (eval_decimal_char c2) * 10 + + (eval_decimal_char c3) + +let eval_hex_char c = + match c with + | '0'..'9' -> Char.code c - Char.code '0' + | 'a'..'f' -> Char.code c - Char.code 'a' + 10 + | 'A'..'F' -> Char.code c - Char.code 'A' + 10 + | _ -> -1 + +let eval_hex_escape c1 c2 = + (eval_hex_char c1) * 16 + + (eval_hex_char c2) diff --git a/src/usexp/lexer0.mli b/src/usexp/lexer0.mli new file mode 100644 index 00000000..fc15c06a --- /dev/null +++ b/src/usexp/lexer0.mli @@ -0,0 +1,44 @@ +module Token : sig + type t = + | Atom of Atom.t + | Quoted_string of string + | Lparen + | Rparen + | Sexp_comment + | Eof +end + +type t = Lexing.lexbuf -> Token.t + +module Error : sig + type t = + { start : Lexing.position + ; stop : Lexing.position + ; message : string + } +end + +val error : ?delta:int -> Lexing.lexbuf -> string -> 'a + +val escaped_buf : Buffer.t + +exception Error of Error.t + +type escape_mode = + | In_block_comment (* Inside #|...|# comments (old syntax) *) + | Old_syntax + | New_syntax + +type escape_sequence = + | Newline + | Other + +type block_string_line_kind = + | With_escape_sequences + | Raw + +val eval_decimal_char : char -> int + +val eval_decimal_escape : char -> char -> char -> int + +val eval_hex_escape : char -> char -> int From d7ab3d962c9aedbd0f79e919c05e3c65cf6404f5 Mon Sep 17 00:00:00 2001 From: Rudi Grinberg Date: Wed, 20 Jun 2018 19:37:08 +0630 Subject: [PATCH 2/7] Harmonize names in dune and jbuild lexers Signed-off-by: Rudi Grinberg --- src/usexp/dune_lexer.mll | 9 ++++----- src/usexp/jbuild_lexer.mll | 22 +++++++++++----------- test/unit-tests/sexp.mlt | 4 ++-- 3 files changed, 17 insertions(+), 18 deletions(-) diff --git a/src/usexp/dune_lexer.mll b/src/usexp/dune_lexer.mll index 3bf74724..2e43b3da 100644 --- a/src/usexp/dune_lexer.mll +++ b/src/usexp/dune_lexer.mll @@ -8,10 +8,9 @@ let blank = [' ' '\t' '\012'] let digit = ['0'-'9'] let hexdigit = ['0'-'9' 'a'-'f' 'A'-'F'] -let atom_char_dune = +let atom_char = [^ '%' ';' '(' ')' '"' '\000'-'\032' '\127'-'\255'] -(* rule for dune files *) rule token = parse | newline { Lexing.new_line lexbuf; token lexbuf } @@ -24,17 +23,17 @@ rule token = parse | '"' { Buffer.clear escaped_buf; let start = Lexing.lexeme_start_p lexbuf in - let s = dune_quoted_string lexbuf in + let s = start_quoted_string lexbuf in lexbuf.lex_start_p <- start; Quoted_string s } - | atom_char_dune+ as s + | atom_char+ as s { Token.Atom (Atom.of_string s) } | _ as c { error lexbuf (Printf.sprintf "Invalid atom character '%c'" c) } | eof { Eof } -and dune_quoted_string = parse +and start_quoted_string = parse | "\\|" { block_string_start With_escape_sequences lexbuf } | "\\>" diff --git a/src/usexp/jbuild_lexer.mll b/src/usexp/jbuild_lexer.mll index 51376bd9..4a2d546e 100644 --- a/src/usexp/jbuild_lexer.mll +++ b/src/usexp/jbuild_lexer.mll @@ -8,7 +8,7 @@ let blank = [' ' '\t' '\012'] let digit = ['0'-'9'] let hexdigit = ['0'-'9' 'a'-'f' 'A'-'F'] -let atom_char_jbuild = +let atom_char = [^ ';' '(' ')' '"' ' ' '\t' '\r' '\n' '\012'] (* rule for jbuild files *) @@ -29,7 +29,7 @@ rule token = parse Quoted_string s } | "#|" - { jbuild_block_comment lexbuf; + { block_comment lexbuf; token lexbuf } | "#;" @@ -37,19 +37,19 @@ rule token = parse | eof { Eof } | "" - { jbuild_atom "" (Lexing.lexeme_start_p lexbuf) lexbuf } + { atom "" (Lexing.lexeme_start_p lexbuf) lexbuf } -and jbuild_atom acc start = parse +and atom acc start = parse | '#'+ '|' { lexbuf.lex_start_p <- start; - error lexbuf "jbuild_atoms cannot contain #|" + error lexbuf "jbuild atoms cannot contain #|" } | '|'+ '#' { lexbuf.lex_start_p <- start; - error lexbuf "jbuild_atoms cannot contain |#" + error lexbuf "jbuild atoms cannot contain |#" } - | ('#'+ | '|'+ | (atom_char_jbuild # ['|' '#'])) as s - { jbuild_atom (if acc = "" then s else acc ^ s) start lexbuf + | ('#'+ | '|'+ | (atom_char # ['|' '#'])) as s + { atom (if acc = "" then s else acc ^ s) start lexbuf } | "" { if acc = "" then @@ -86,11 +86,11 @@ and quoted_string_after_escaped_newline mode = parse | [' ' '\t']* { quoted_string mode lexbuf } -and jbuild_block_comment = parse +and block_comment = parse | '"' { Buffer.clear escaped_buf; ignore (quoted_string In_block_comment lexbuf : string); - jbuild_block_comment lexbuf + block_comment lexbuf } | "|#" { () @@ -99,7 +99,7 @@ and jbuild_block_comment = parse { error lexbuf "unterminated block comment" } | _ - { jbuild_block_comment lexbuf + { block_comment lexbuf } and escape_sequence mode = parse diff --git a/test/unit-tests/sexp.mlt b/test/unit-tests/sexp.mlt index 79db5495..cc42cb6d 100644 --- a/test/unit-tests/sexp.mlt +++ b/test/unit-tests/sexp.mlt @@ -107,14 +107,14 @@ parse {|x#|y|} [%%expect{| - : parse_result = Different - {jbuild = Error "jbuild_atoms cannot contain #|"; dune = Ok [(atom x#|y)]} + {jbuild = Error "jbuild atoms cannot contain #|"; dune = Ok [(atom x#|y)]} |}] parse {|x|#y|} [%%expect{| - : parse_result = Different - {jbuild = Error "jbuild_atoms cannot contain |#"; dune = Ok [(atom x|#y)]} + {jbuild = Error "jbuild atoms cannot contain |#"; dune = Ok [(atom x|#y)]} |}] parse {|"\a"|} From 1e5dc322e2b7a32bb6ee2c03c1c9c857ea4b4973 Mon Sep 17 00:00:00 2001 From: Rudi Grinberg Date: Wed, 20 Jun 2018 19:50:58 +0630 Subject: [PATCH 3/7] Remove some legacy stuff from the new dune lexer Signed-off-by: Rudi Grinberg --- src/usexp/dune_lexer.mll | 62 ++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 38 deletions(-) diff --git a/src/usexp/dune_lexer.mll b/src/usexp/dune_lexer.mll index 2e43b3da..e6e4dcf0 100644 --- a/src/usexp/dune_lexer.mll +++ b/src/usexp/dune_lexer.mll @@ -8,8 +8,7 @@ let blank = [' ' '\t' '\012'] let digit = ['0'-'9'] let hexdigit = ['0'-'9' 'a'-'f' 'A'-'F'] -let atom_char = - [^ '%' ';' '(' ')' '"' '\000'-'\032' '\127'-'\255'] +let atom_char = [^ '%' ';' '(' ')' '"' '\000'-'\032' '\127'-'\255'] rule token = parse | newline @@ -39,7 +38,7 @@ and start_quoted_string = parse | "\\>" { block_string_start Raw lexbuf } | "" - { quoted_string New_syntax lexbuf } + { quoted_string lexbuf } and block_string_start kind = parse | newline as s @@ -66,7 +65,7 @@ and block_string = parse block_string_after_newline lexbuf } | '\\' - { match escape_sequence New_syntax lexbuf with + { match escape_sequence lexbuf with | Newline -> block_string_after_newline lexbuf | Other -> block_string lexbuf } @@ -101,30 +100,28 @@ and raw_block_string = parse { Buffer.contents escaped_buf } -and quoted_string mode = parse +and quoted_string = parse | '"' { Buffer.contents escaped_buf } | '\\' - { match escape_sequence mode lexbuf with - | Newline -> quoted_string_after_escaped_newline mode lexbuf - | Other -> quoted_string mode lexbuf + { match escape_sequence lexbuf with + | Newline -> quoted_string_after_escaped_newline lexbuf + | Other -> quoted_string lexbuf } | newline as s { Lexing.new_line lexbuf; Buffer.add_string escaped_buf s; - quoted_string mode lexbuf + quoted_string lexbuf } | _ as c { Buffer.add_char escaped_buf c; - quoted_string mode lexbuf + quoted_string lexbuf } | eof - { if mode <> In_block_comment then - error lexbuf "unterminated quoted string"; - Buffer.contents escaped_buf + { error lexbuf "unterminated quoted string" } -and escape_sequence mode = parse +and escape_sequence = parse | newline { Lexing.new_line lexbuf; Newline } @@ -142,44 +139,33 @@ and escape_sequence mode = parse } | (digit as c1) (digit as c2) (digit as c3) { let v = eval_decimal_escape c1 c2 c3 in - if mode <> In_block_comment && v > 255 then + if v > 255 then error lexbuf "escape sequence in quoted string out of range" ~delta:(-1); Buffer.add_char escaped_buf (Char.chr v); Other } - | digit* as s - { if mode <> In_block_comment then - error lexbuf "unterminated decimal escape sequence" ~delta:(-1); - Buffer.add_char escaped_buf '\\'; - Buffer.add_string escaped_buf s; - Other + | digit digit digit + { error lexbuf "escape sequence in quoted string out of range" ~delta:(-1); + } + | digit* + { error lexbuf "unterminated decimal escape sequence" ~delta:(-1); } | 'x' (hexdigit as c1) (hexdigit as c2) { let v = eval_hex_escape c1 c2 in Buffer.add_char escaped_buf (Char.chr v); Other } - | 'x' hexdigit* as s - { if mode <> In_block_comment then - error lexbuf "unterminated hexadecimal escape sequence" ~delta:(-1); - Buffer.add_char escaped_buf '\\'; - Buffer.add_string escaped_buf s; - Other + | 'x' hexdigit* + { error lexbuf "unterminated hexadecimal escape sequence" ~delta:(-1); } - | _ as c - { if mode = New_syntax then - error lexbuf "unknown escape sequence" ~delta:(-1); - Buffer.add_char escaped_buf '\\'; - Buffer.add_char escaped_buf c; - Other + | _ + { error lexbuf "unknown escape sequence" ~delta:(-1); } | eof - { if mode <> In_block_comment then - error lexbuf "unterminated escape sequence" ~delta:(-1); - Other + { error lexbuf "unterminated escape sequence" ~delta:(-1); } -and quoted_string_after_escaped_newline mode = parse +and quoted_string_after_escaped_newline = parse | [' ' '\t']* - { quoted_string mode lexbuf } + { quoted_string lexbuf } From cd6d5dba53efd0d5037d8a55affc7b6d52f7fc80 Mon Sep 17 00:00:00 2001 From: Rudi Grinberg Date: Wed, 20 Jun 2018 20:00:35 +0630 Subject: [PATCH 4/7] Simplify jbuild lexer and move types around The types should exist in the lexer module where they're used Signed-off-by: Rudi Grinberg --- src/usexp/dune_lexer.mll | 6 +++++- src/usexp/jbuild_lexer.mll | 25 ++++++++++++++++--------- src/usexp/lexer0.ml | 14 -------------- src/usexp/lexer0.mli | 9 --------- 4 files changed, 21 insertions(+), 33 deletions(-) diff --git a/src/usexp/dune_lexer.mll b/src/usexp/dune_lexer.mll index e6e4dcf0..2c3a10a2 100644 --- a/src/usexp/dune_lexer.mll +++ b/src/usexp/dune_lexer.mll @@ -1,5 +1,9 @@ { - open Lexer0 +open Lexer0 + +type block_string_line_kind = + | With_escape_sequences + | Raw } let comment = ';' [^ '\n' '\r']* diff --git a/src/usexp/jbuild_lexer.mll b/src/usexp/jbuild_lexer.mll index 4a2d546e..4cfade63 100644 --- a/src/usexp/jbuild_lexer.mll +++ b/src/usexp/jbuild_lexer.mll @@ -1,5 +1,14 @@ { open Lexer0 + +(* The difference between the old and new syntax is that the old + syntax allows backslash following by any characters other than 'n', + 'x', ... and interpret it as it. The new syntax is stricter in + order to allow introducing new escape sequence in the future if + needed. *) +type escape_mode = + | In_block_comment (* Inside #|...|# comments (old syntax) *) + | In_quoted_string } let comment = ';' [^ '\n' '\r']* @@ -24,7 +33,7 @@ rule token = parse | '"' { Buffer.clear escaped_buf; let start = Lexing.lexeme_start_p lexbuf in - let s = quoted_string Old_syntax lexbuf in + let s = quoted_string In_quoted_string lexbuf in lexbuf.lex_start_p <- start; Quoted_string s } @@ -77,7 +86,7 @@ and quoted_string mode = parse quoted_string mode lexbuf } | eof - { if mode <> In_block_comment then + { if mode = In_block_comment then error lexbuf "unterminated quoted string"; Buffer.contents escaped_buf } @@ -120,14 +129,14 @@ and escape_sequence mode = parse } | (digit as c1) (digit as c2) (digit as c3) { let v = eval_decimal_escape c1 c2 c3 in - if mode <> In_block_comment && v > 255 then + if mode = In_quoted_string && v > 255 then error lexbuf "escape sequence in quoted string out of range" ~delta:(-1); Buffer.add_char escaped_buf (Char.chr v); Other } | digit* as s - { if mode <> In_block_comment then + { if mode = In_quoted_string then error lexbuf "unterminated decimal escape sequence" ~delta:(-1); Buffer.add_char escaped_buf '\\'; Buffer.add_string escaped_buf s; @@ -139,21 +148,19 @@ and escape_sequence mode = parse Other } | 'x' hexdigit* as s - { if mode <> In_block_comment then + { if mode = In_quoted_string then error lexbuf "unterminated hexadecimal escape sequence" ~delta:(-1); Buffer.add_char escaped_buf '\\'; Buffer.add_string escaped_buf s; Other } | _ as c - { if mode = New_syntax then - error lexbuf "unknown escape sequence" ~delta:(-1); - Buffer.add_char escaped_buf '\\'; + { Buffer.add_char escaped_buf '\\'; Buffer.add_char escaped_buf c; Other } | eof - { if mode <> In_block_comment then + { if mode = In_quoted_string then error lexbuf "unterminated escape sequence" ~delta:(-1); Other } diff --git a/src/usexp/lexer0.ml b/src/usexp/lexer0.ml index 5ebc545a..b361acc2 100644 --- a/src/usexp/lexer0.ml +++ b/src/usexp/lexer0.ml @@ -31,24 +31,10 @@ let error ?(delta=0) lexbuf message = let escaped_buf = Buffer.create 256 -(* The difference between the old and new syntax is that the old - syntax allows backslash following by any characters other than 'n', - 'x', ... and interpret it as it. The new syntax is stricter in - order to allow introducing new escape sequence in the future if - needed. *) -type escape_mode = - | In_block_comment (* Inside #|...|# comments (old syntax) *) - | Old_syntax - | New_syntax - type escape_sequence = | Newline | Other -type block_string_line_kind = - | With_escape_sequences - | Raw - let eval_decimal_char c = Char.code c - Char.code '0' let eval_decimal_escape c1 c2 c3 = diff --git a/src/usexp/lexer0.mli b/src/usexp/lexer0.mli index fc15c06a..b442888d 100644 --- a/src/usexp/lexer0.mli +++ b/src/usexp/lexer0.mli @@ -24,19 +24,10 @@ val escaped_buf : Buffer.t exception Error of Error.t -type escape_mode = - | In_block_comment (* Inside #|...|# comments (old syntax) *) - | Old_syntax - | New_syntax - type escape_sequence = | Newline | Other -type block_string_line_kind = - | With_escape_sequences - | Raw - val eval_decimal_char : char -> int val eval_decimal_escape : char -> char -> char -> int From d6778a50a24b8d15efd4b0288d8eb144f4618da1 Mon Sep 17 00:00:00 2001 From: Rudi Grinberg Date: Wed, 20 Jun 2018 20:16:12 +0630 Subject: [PATCH 5/7] Rename Lexer0 to Lexer_shared Signed-off-by: Rudi Grinberg --- src/usexp/dune_lexer.mll | 2 +- src/usexp/jbuild_lexer.mll | 2 +- src/usexp/lexer.ml | 2 +- src/usexp/{lexer0.ml => lexer_shared.ml} | 1 - src/usexp/{lexer0.mli => lexer_shared.mli} | 0 5 files changed, 3 insertions(+), 4 deletions(-) rename src/usexp/{lexer0.ml => lexer_shared.ml} (99%) rename src/usexp/{lexer0.mli => lexer_shared.mli} (100%) diff --git a/src/usexp/dune_lexer.mll b/src/usexp/dune_lexer.mll index 2c3a10a2..4803c4de 100644 --- a/src/usexp/dune_lexer.mll +++ b/src/usexp/dune_lexer.mll @@ -1,5 +1,5 @@ { -open Lexer0 +open Lexer_shared type block_string_line_kind = | With_escape_sequences diff --git a/src/usexp/jbuild_lexer.mll b/src/usexp/jbuild_lexer.mll index 4cfade63..0cc57a5f 100644 --- a/src/usexp/jbuild_lexer.mll +++ b/src/usexp/jbuild_lexer.mll @@ -1,5 +1,5 @@ { - open Lexer0 + open Lexer_shared (* The difference between the old and new syntax is that the old syntax allows backslash following by any characters other than 'n', diff --git a/src/usexp/lexer.ml b/src/usexp/lexer.ml index aff4e6d2..50f62541 100644 --- a/src/usexp/lexer.ml +++ b/src/usexp/lexer.ml @@ -1,4 +1,4 @@ -include Lexer0 +include Lexer_shared let token = Dune_lexer.token let jbuild_token = Jbuild_lexer.token diff --git a/src/usexp/lexer0.ml b/src/usexp/lexer_shared.ml similarity index 99% rename from src/usexp/lexer0.ml rename to src/usexp/lexer_shared.ml index b361acc2..ee1cf5ba 100644 --- a/src/usexp/lexer0.ml +++ b/src/usexp/lexer_shared.ml @@ -1,4 +1,3 @@ - module Token = struct type t = | Atom of Atom.t diff --git a/src/usexp/lexer0.mli b/src/usexp/lexer_shared.mli similarity index 100% rename from src/usexp/lexer0.mli rename to src/usexp/lexer_shared.mli From 15ffc107ff12dbe5fba85ba7a3f70c783ef92aed Mon Sep 17 00:00:00 2001 From: Rudi Grinberg Date: Wed, 20 Jun 2018 20:21:23 +0630 Subject: [PATCH 6/7] Add mli's to {dune,jbuild}_lexer Signed-off-by: Rudi Grinberg --- src/usexp/dune_lexer.mli | 1 + src/usexp/jbuild_lexer.mli | 1 + 2 files changed, 2 insertions(+) create mode 100644 src/usexp/dune_lexer.mli create mode 100644 src/usexp/jbuild_lexer.mli diff --git a/src/usexp/dune_lexer.mli b/src/usexp/dune_lexer.mli new file mode 100644 index 00000000..5d1e70f7 --- /dev/null +++ b/src/usexp/dune_lexer.mli @@ -0,0 +1 @@ +val token : Lexer_shared.t diff --git a/src/usexp/jbuild_lexer.mli b/src/usexp/jbuild_lexer.mli new file mode 100644 index 00000000..5d1e70f7 --- /dev/null +++ b/src/usexp/jbuild_lexer.mli @@ -0,0 +1 @@ +val token : Lexer_shared.t From 5a46dc1998a18f145f9b7d0df0ac98400181f5ed Mon Sep 17 00:00:00 2001 From: Rudi Grinberg Date: Wed, 20 Jun 2018 20:21:40 +0630 Subject: [PATCH 7/7] Add jbuild_lexer.boot to speed up bootstrap Signed-off-by: Rudi Grinberg --- src/usexp/jbuild_lexer.boot.ml | 1 + 1 file changed, 1 insertion(+) create mode 100644 src/usexp/jbuild_lexer.boot.ml diff --git a/src/usexp/jbuild_lexer.boot.ml b/src/usexp/jbuild_lexer.boot.ml new file mode 100644 index 00000000..e0671e6d --- /dev/null +++ b/src/usexp/jbuild_lexer.boot.ml @@ -0,0 +1 @@ +let token _ = failwith "Unused during bootstrap"