From 8727eb6c3cbbd0fbb6a1bd47712b115ce21cb60a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20Dimino?= Date: Mon, 11 Jun 2018 16:21:31 +0100 Subject: [PATCH] Strengthen the lexing of escape sequences (#872) Things like \a are no longer allowed. Before they would be interpreted as a literal \a. This will allow to introduce new escape sequences in the future if needed. Signed-off-by: Jeremie Dimino --- src/usexp/lexer.mll | 51 ++++++++++++++++++++++++---------------- test/unit-tests/sexp.mlt | 12 ++++++++++ 2 files changed, 43 insertions(+), 20 deletions(-) diff --git a/src/usexp/lexer.mll b/src/usexp/lexer.mll index 8e6a032b..cb5d6343 100644 --- a/src/usexp/lexer.mll +++ b/src/usexp/lexer.mll @@ -33,6 +33,16 @@ let error ?(delta=0) lexbuf message = ; message }) +(* The difference between the old and new syntax is that the old + syntax allows backslash following by any characters other than 'n', + 'x', ... and interpret it as it. The new syntax is stricter in + order to allow introducing new escape sequence in the future if + needed. *) +type escape_mode = + | In_block_comment (* Inside #|...|# comments (old syntax) *) + | Old_syntax + | New_syntax + let eval_decimal_char c = Char.code c - Char.code '0' let eval_decimal_escape c1 c2 c3 = @@ -82,7 +92,7 @@ rule jbuild_token = parse | '"' { Buffer.clear escaped_buf; let start = Lexing.lexeme_start_p lexbuf in - let s = quoted_string true lexbuf in + let s = quoted_string Old_syntax lexbuf in lexbuf.lex_start_p <- start; Quoted_string s } @@ -117,35 +127,34 @@ and jbuild_atom acc start = parse Token.Atom (A acc) } -(* If [strict] is false, ignore errors *) -and quoted_string strict = parse +and quoted_string mode = parse | '"' { Buffer.contents escaped_buf } | '\\' - { match escape_sequence strict lexbuf with - | Newline -> quoted_string_after_escaped_newline strict lexbuf - | Other -> quoted_string strict lexbuf + { match escape_sequence mode lexbuf with + | Newline -> quoted_string_after_escaped_newline mode lexbuf + | Other -> quoted_string mode lexbuf } | newline as s { Lexing.new_line lexbuf; Buffer.add_string escaped_buf s; - quoted_string strict lexbuf + quoted_string mode lexbuf } | _ as c { Buffer.add_char escaped_buf c; - quoted_string strict lexbuf + quoted_string mode lexbuf } | eof - { if strict then + { if mode <> In_block_comment then error lexbuf "unterminated quoted string"; Buffer.contents escaped_buf } -and quoted_string_after_escaped_newline strict = parse +and quoted_string_after_escaped_newline mode = parse | [' ' '\t']* - { quoted_string strict lexbuf } + { quoted_string mode lexbuf } -and escape_sequence strict = parse +and escape_sequence mode = parse | newline { Lexing.new_line lexbuf; Newline } @@ -163,14 +172,14 @@ and escape_sequence strict = parse } | (digit as c1) (digit as c2) (digit as c3) { let v = eval_decimal_escape c1 c2 c3 in - if strict && v > 255 then + if mode <> In_block_comment && v > 255 then error lexbuf "escape sequence in quoted string out of range" ~delta:(-1); Buffer.add_char escaped_buf (Char.chr v); Other } | digit* as s - { if strict then + { if mode <> In_block_comment then error lexbuf "unterminated decimal escape sequence" ~delta:(-1); Buffer.add_char escaped_buf '\\'; Buffer.add_string escaped_buf s; @@ -182,19 +191,21 @@ and escape_sequence strict = parse Other } | 'x' hexdigit* as s - { if strict then + { if mode <> In_block_comment then error lexbuf "unterminated hexadecimal escape sequence" ~delta:(-1); Buffer.add_char escaped_buf '\\'; Buffer.add_string escaped_buf s; Other } | _ as c - { Buffer.add_char escaped_buf '\\'; + { if mode = New_syntax then + error lexbuf "unknown escape sequence" ~delta:(-1); + Buffer.add_char escaped_buf '\\'; Buffer.add_char escaped_buf c; Other } | eof - { if strict then + { if mode <> In_block_comment then error lexbuf "unterminated escape sequence" ~delta:(-1); Other } @@ -202,7 +213,7 @@ and escape_sequence strict = parse and jbuild_block_comment = parse | '"' { Buffer.clear escaped_buf; - ignore (quoted_string false lexbuf : string); + ignore (quoted_string In_block_comment lexbuf : string); jbuild_block_comment lexbuf } | "|#" @@ -243,7 +254,7 @@ and dune_quoted_string = parse | "\\>" { block_string_start Raw lexbuf } | "" - { quoted_string true lexbuf } + { quoted_string New_syntax lexbuf } and block_string_start kind = parse | newline as s @@ -270,7 +281,7 @@ and block_string = parse block_string_after_newline lexbuf } | '\\' - { match escape_sequence true lexbuf with + { match escape_sequence New_syntax lexbuf with | Newline -> block_string_after_newline lexbuf | Other -> block_string lexbuf } diff --git a/test/unit-tests/sexp.mlt b/test/unit-tests/sexp.mlt index 4ecf751a..2fb9aeaf 100644 --- a/test/unit-tests/sexp.mlt +++ b/test/unit-tests/sexp.mlt @@ -95,3 +95,15 @@ parse {|x|#y|} - : parse_result = Different {jbuild = Error "jbuild_atoms cannot contain |#"; dune = Ok [x|#y]} |}] + +parse {|"\a"|} +[%%expect{| +- : parse_result = +Different {jbuild = Ok ["\\a"]; dune = Error "unknown escape sequence"} +|}] + +parse {|"\%{x}"|} +[%%expect{| +- : parse_result = +Different {jbuild = Ok ["\\%{x}"]; dune = Error "unknown escape sequence"} +|}]