Strengthen the lexing of escape sequences (#872)

Things like \a are no longer allowed. Before they would be interpreted as a literal \a. This will allow to introduce new escape sequences in the future if needed. Signed-off-by: Jeremie Dimino <jeremie@dimino.org>
2018-06-11 16:21:31 +01:00 · 2018-06-11 16:21:31 +01:00 · 8727eb6c3c
parent 58a47e4ff8
commit 8727eb6c3c
2 changed files with 43 additions and 20 deletions
--- a/src/usexp/lexer.mll
+++ b/src/usexp/lexer.mll
@ -33,6 +33,16 @@ let error ?(delta=0) lexbuf message =
           ; message
           })
 (* The difference between the old and new syntax is that the old
   syntax allows backslash following by any characters other than 'n',
   'x', ... and interpret it as it. The new syntax is stricter in
   order to allow introducing new escape sequence in the future if
   needed. *)
 type escape_mode =
  | In_block_comment (* Inside #|...|# comments (old syntax) *)
  | Old_syntax
  | New_syntax
 let eval_decimal_char c = Char.code c - Char.code '0'
 let eval_decimal_escape c1 c2 c3 =
@ -82,7 +92,7 @@ rule jbuild_token = parse
  | '"'
    { Buffer.clear escaped_buf;
      let start = Lexing.lexeme_start_p lexbuf in
-      let s = quoted_string true lexbuf in
+      let s = quoted_string Old_syntax lexbuf in
      lexbuf.lex_start_p <- start;
      Quoted_string s
    }
@ -117,35 +127,34 @@ and jbuild_atom acc start = parse
      Token.Atom (A acc)
    }
-(* If [strict] is false, ignore errors *)
+and quoted_string mode = parse
 and quoted_string strict = parse
  | '"'
    { Buffer.contents escaped_buf }
  | '\\'
-    { match escape_sequence strict lexbuf with
+    { match escape_sequence mode lexbuf with
-      | Newline -> quoted_string_after_escaped_newline strict lexbuf
+      | Newline -> quoted_string_after_escaped_newline mode lexbuf
-      | Other   -> quoted_string                       strict lexbuf
+      | Other   -> quoted_string                       mode lexbuf
    }
  | newline as s
    { Lexing.new_line lexbuf;
      Buffer.add_string escaped_buf s;
-      quoted_string strict lexbuf
+      quoted_string mode lexbuf
    }
  | _ as c
    { Buffer.add_char escaped_buf c;
-      quoted_string strict lexbuf
+      quoted_string mode lexbuf
    }
  | eof
-    { if strict then
+    { if mode <> In_block_comment then
        error lexbuf "unterminated quoted string";
      Buffer.contents escaped_buf
    }
-and quoted_string_after_escaped_newline strict = parse
+and quoted_string_after_escaped_newline mode = parse
  | [' ' '\t']*
-    { quoted_string strict lexbuf }
+    { quoted_string mode lexbuf }
-and escape_sequence strict = parse
+and escape_sequence mode = parse
  | newline
    { Lexing.new_line lexbuf;
      Newline }
@ -163,14 +172,14 @@ and escape_sequence strict = parse
    }
  | (digit as c1) (digit as c2) (digit as c3)
    { let v = eval_decimal_escape c1 c2 c3 in
-      if strict && v > 255 then
+      if mode <> In_block_comment && v > 255 then
        error lexbuf "escape sequence in quoted string out of range"
          ~delta:(-1);
      Buffer.add_char escaped_buf (Char.chr v);
      Other
    }
  | digit* as s
-    { if strict then
+    { if mode <> In_block_comment then
        error lexbuf "unterminated decimal escape sequence" ~delta:(-1);
      Buffer.add_char escaped_buf '\\';
      Buffer.add_string escaped_buf s;
@ -182,19 +191,21 @@ and escape_sequence strict = parse
      Other
    }
  | 'x' hexdigit* as s
-    { if strict then
+    { if mode <> In_block_comment then
        error lexbuf "unterminated hexadecimal escape sequence" ~delta:(-1);
      Buffer.add_char escaped_buf '\\';
      Buffer.add_string escaped_buf s;
      Other
    }
  | _ as c
-    { Buffer.add_char escaped_buf '\\';
+    { if mode = New_syntax then
        error lexbuf "unknown escape sequence" ~delta:(-1);
      Buffer.add_char escaped_buf '\\';
      Buffer.add_char escaped_buf c;
      Other
    }
  | eof
-    { if strict then
+    { if mode <> In_block_comment then
        error lexbuf "unterminated escape sequence" ~delta:(-1);
      Other
    }
@ -202,7 +213,7 @@ and escape_sequence strict = parse
 and jbuild_block_comment = parse
  | '"'
    { Buffer.clear escaped_buf;
-      ignore (quoted_string false lexbuf : string);
+      ignore (quoted_string In_block_comment lexbuf : string);
      jbuild_block_comment lexbuf
    }
  | "|#"
@ -243,7 +254,7 @@ and dune_quoted_string = parse
  | "\\>"
    { block_string_start Raw lexbuf }
  | ""
-    { quoted_string true lexbuf }
+    { quoted_string New_syntax lexbuf }
 and block_string_start kind = parse
  | newline as s
@ -270,7 +281,7 @@ and block_string = parse
      block_string_after_newline lexbuf
    }
  | '\\'
-    { match escape_sequence true lexbuf with
+    { match escape_sequence New_syntax lexbuf with
      | Newline -> block_string_after_newline lexbuf
      | Other   -> block_string               lexbuf
    }
--- a/test/unit-tests/sexp.mlt
+++ b/test/unit-tests/sexp.mlt
@ -95,3 +95,15 @@ parse {|x|#y|}
 - : parse_result =
 Different {jbuild = Error "jbuild_atoms cannot contain |#"; dune = Ok [x|#y]}
 |}]
 parse {|"\a"|}
 [%%expect{|
 - : parse_result =
 Different {jbuild = Ok ["\\a"]; dune = Error "unknown escape sequence"}
 |}]
 parse {|"\%{x}"|}
 [%%expect{|
 - : parse_result =
 Different {jbuild = Ok ["\\%{x}"]; dune = Error "unknown escape sequence"}
 |}]