From e3aa13424d612ceb616c2323dded69434f8a271a Mon Sep 17 00:00:00 2001
From: Rudi Grinberg <rudi.grinberg@gmail.com>
Date: Wed, 20 Jun 2018 19:29:16 +0630
Subject: [PATCH 1/7] Split dune and jbuild lexers

Signed-off-by: Rudi Grinberg <rudi.grinberg@gmail.com>
---
 src/usexp/dune                          |   2 +-
 src/usexp/{lexer.mll => dune_lexer.mll} | 302 +++++++-----------------
 src/usexp/jbuild_lexer.mll              | 159 +++++++++++++
 src/usexp/lexer.ml                      |   4 +
 src/usexp/lexer0.ml                     |  68 ++++++
 src/usexp/lexer0.mli                    |  44 ++++
 6 files changed, 361 insertions(+), 218 deletions(-)
 rename src/usexp/{lexer.mll => dune_lexer.mll} (59%)
 create mode 100644 src/usexp/jbuild_lexer.mll
 create mode 100644 src/usexp/lexer.ml
 create mode 100644 src/usexp/lexer0.ml
 create mode 100644 src/usexp/lexer0.mli

diff --git a/src/usexp/dune b/src/usexp/dune
index a26d42f0..bd71f71a 100644
--- a/src/usexp/dune
+++ b/src/usexp/dune
@@ -3,4 +3,4 @@
   (synopsis "[Internal] S-expression library")
   (public_name dune.usexp)))
 
-(ocamllex (lexer))
+(ocamllex (dune_lexer jbuild_lexer))
diff --git a/src/usexp/lexer.mll b/src/usexp/dune_lexer.mll
similarity index 59%
rename from src/usexp/lexer.mll
rename to src/usexp/dune_lexer.mll
index 2c9098b0..3bf74724 100644
--- a/src/usexp/lexer.mll
+++ b/src/usexp/dune_lexer.mll
@@ -1,71 +1,5 @@
 {
-module Token = struct
-  type t =
-    | Atom          of Atom.t
-    | Quoted_string of string
-    | Lparen
-    | Rparen
-    | Sexp_comment
-    | Eof
-end
-
-type t = Lexing.lexbuf -> Token.t
-
-module Error = struct
-  type t =
-    { start   : Lexing.position
-    ; stop    : Lexing.position
-    ; message : string
-    }
-end
-
-exception Error of Error.t
-
-let error ?(delta=0) lexbuf message =
-  let start = Lexing.lexeme_start_p lexbuf in
-  raise
-    (Error { start = { start with pos_cnum = start.pos_cnum + delta }
-           ; stop  = Lexing.lexeme_end_p   lexbuf
-           ; message
-           })
-
-(* The difference between the old and new syntax is that the old
-   syntax allows backslash following by any characters other than 'n',
-   'x', ... and interpret it as it. The new syntax is stricter in
-   order to allow introducing new escape sequence in the future if
-   needed. *)
-type escape_mode =
-  | In_block_comment (* Inside #|...|# comments (old syntax) *)
-  | Old_syntax
-  | New_syntax
-
-let eval_decimal_char c = Char.code c - Char.code '0'
-
-let eval_decimal_escape c1 c2 c3 =
-  (eval_decimal_char c1) * 100 +
-  (eval_decimal_char c2) * 10  +
-  (eval_decimal_char c3)
-
-let eval_hex_char c =
-  match c with
-  | '0'..'9' -> Char.code c - Char.code '0'
-  | 'a'..'f' -> Char.code c - Char.code 'a' + 10
-  | 'A'..'F' -> Char.code c - Char.code 'A' + 10
-  | _ -> -1
-
-let eval_hex_escape c1 c2 =
-  (eval_hex_char c1) * 16 +
-  (eval_hex_char c2)
-
-type escape_sequence =
-  | Newline
-  | Other
-
-let escaped_buf = Buffer.create 256
-
-type block_string_line_kind =
-  | With_escape_sequences
-  | Raw
+  open Lexer0
 }
 
 let comment   = ';' [^ '\n' '\r']*
@@ -74,160 +8,11 @@ let blank     = [' ' '\t' '\012']
 let digit     = ['0'-'9']
 let hexdigit  = ['0'-'9' 'a'-'f' 'A'-'F']
 
-let atom_char_jbuild =
-  [^ ';' '(' ')' '"' ' ' '\t' '\r' '\n' '\012']
 let atom_char_dune =
   [^ '%' ';' '(' ')' '"' '\000'-'\032' '\127'-'\255']
 
-(* rule for jbuild files *)
-rule jbuild_token = parse
-  | newline
-    { Lexing.new_line lexbuf; jbuild_token lexbuf }
-  | blank+ | comment
-    { jbuild_token lexbuf }
-  | '('
-    { Token.Lparen }
-  | ')'
-    { Rparen }
-  | '"'
-    { Buffer.clear escaped_buf;
-      let start = Lexing.lexeme_start_p lexbuf in
-      let s = quoted_string Old_syntax lexbuf in
-      lexbuf.lex_start_p <- start;
-      Quoted_string s
-    }
-  | "#|"
-    { jbuild_block_comment lexbuf;
-      jbuild_token lexbuf
-    }
-  | "#;"
-    { Sexp_comment }
-  | eof
-    { Eof }
-  | ""
-    { jbuild_atom "" (Lexing.lexeme_start_p lexbuf) lexbuf }
-
-and jbuild_atom acc start = parse
-  | '#'+ '|'
-    { lexbuf.lex_start_p <- start;
-      error lexbuf "jbuild_atoms cannot contain #|"
-    }
-  | '|'+ '#'
-    { lexbuf.lex_start_p <- start;
-      error lexbuf "jbuild_atoms cannot contain |#"
-    }
-  | ('#'+ | '|'+ | (atom_char_jbuild # ['|' '#'])) as s
-    { jbuild_atom (if acc = "" then s else acc ^ s) start lexbuf
-    }
-  | ""
-    { if acc = "" then
-        error lexbuf "Internal error in the S-expression parser, \
-                      please report upstream.";
-      lexbuf.lex_start_p <- start;
-      Token.Atom (Atom.of_string acc)
-    }
-
-and quoted_string mode = parse
-  | '"'
-    { Buffer.contents escaped_buf }
-  | '\\'
-    { match escape_sequence mode lexbuf with
-      | Newline -> quoted_string_after_escaped_newline mode lexbuf
-      | Other   -> quoted_string                       mode lexbuf
-    }
-  | newline as s
-    { Lexing.new_line lexbuf;
-      Buffer.add_string escaped_buf s;
-      quoted_string mode lexbuf
-    }
-  | _ as c
-    { Buffer.add_char escaped_buf c;
-      quoted_string mode lexbuf
-    }
-  | eof
-    { if mode <> In_block_comment then
-        error lexbuf "unterminated quoted string";
-      Buffer.contents escaped_buf
-    }
-
-and quoted_string_after_escaped_newline mode = parse
-  | [' ' '\t']*
-    { quoted_string mode lexbuf }
-
-and escape_sequence mode = parse
-  | newline
-    { Lexing.new_line lexbuf;
-      Newline }
-  | ['\\' '\'' '"' 'n' 't' 'b' 'r'] as c
-    { let c =
-        match c with
-        | 'n' -> '\n'
-        | 'r' -> '\r'
-        | 'b' -> '\b'
-        | 't' -> '\t'
-        | _   -> c
-      in
-      Buffer.add_char escaped_buf c;
-      Other
-    }
-  | (digit as c1) (digit as c2) (digit as c3)
-    { let v = eval_decimal_escape c1 c2 c3 in
-      if mode <> In_block_comment && v > 255 then
-        error lexbuf "escape sequence in quoted string out of range"
-          ~delta:(-1);
-      Buffer.add_char escaped_buf (Char.chr v);
-      Other
-    }
-  | digit* as s
-    { if mode <> In_block_comment then
-        error lexbuf "unterminated decimal escape sequence" ~delta:(-1);
-      Buffer.add_char escaped_buf '\\';
-      Buffer.add_string escaped_buf s;
-      Other
-    }
-  | 'x' (hexdigit as c1) (hexdigit as c2)
-    { let v = eval_hex_escape c1 c2 in
-      Buffer.add_char escaped_buf (Char.chr v);
-      Other
-    }
-  | 'x' hexdigit* as s
-    { if mode <> In_block_comment then
-        error lexbuf "unterminated hexadecimal escape sequence" ~delta:(-1);
-      Buffer.add_char escaped_buf '\\';
-      Buffer.add_string escaped_buf s;
-      Other
-    }
-  | _ as c
-    { if mode = New_syntax then
-        error lexbuf "unknown escape sequence" ~delta:(-1);
-      Buffer.add_char escaped_buf '\\';
-      Buffer.add_char escaped_buf c;
-      Other
-    }
-  | eof
-    { if mode <> In_block_comment then
-        error lexbuf "unterminated escape sequence" ~delta:(-1);
-      Other
-    }
-
-and jbuild_block_comment = parse
-  | '"'
-    { Buffer.clear escaped_buf;
-      ignore (quoted_string In_block_comment lexbuf : string);
-      jbuild_block_comment lexbuf
-    }
-  | "|#"
-    { ()
-    }
-  | eof
-    { error lexbuf "unterminated block comment"
-    }
-  | _
-    { jbuild_block_comment lexbuf
-    }
-
 (* rule for dune files *)
-and token = parse
+rule token = parse
   | newline
     { Lexing.new_line lexbuf; token lexbuf }
   | blank+ | comment
@@ -316,3 +101,86 @@ and raw_block_string = parse
   | eof
     { Buffer.contents escaped_buf
     }
+
+and quoted_string mode = parse
+  | '"'
+    { Buffer.contents escaped_buf }
+  | '\\'
+    { match escape_sequence mode lexbuf with
+      | Newline -> quoted_string_after_escaped_newline mode lexbuf
+      | Other   -> quoted_string                       mode lexbuf
+    }
+  | newline as s
+    { Lexing.new_line lexbuf;
+      Buffer.add_string escaped_buf s;
+      quoted_string mode lexbuf
+    }
+  | _ as c
+    { Buffer.add_char escaped_buf c;
+      quoted_string mode lexbuf
+    }
+  | eof
+    { if mode <> In_block_comment then
+        error lexbuf "unterminated quoted string";
+      Buffer.contents escaped_buf
+    }
+
+and escape_sequence mode = parse
+  | newline
+    { Lexing.new_line lexbuf;
+      Newline }
+  | ['\\' '\'' '"' 'n' 't' 'b' 'r'] as c
+    { let c =
+        match c with
+        | 'n' -> '\n'
+        | 'r' -> '\r'
+        | 'b' -> '\b'
+        | 't' -> '\t'
+        | _   -> c
+      in
+      Buffer.add_char escaped_buf c;
+      Other
+    }
+  | (digit as c1) (digit as c2) (digit as c3)
+    { let v = eval_decimal_escape c1 c2 c3 in
+      if mode <> In_block_comment && v > 255 then
+        error lexbuf "escape sequence in quoted string out of range"
+          ~delta:(-1);
+      Buffer.add_char escaped_buf (Char.chr v);
+      Other
+    }
+  | digit* as s
+    { if mode <> In_block_comment then
+        error lexbuf "unterminated decimal escape sequence" ~delta:(-1);
+      Buffer.add_char escaped_buf '\\';
+      Buffer.add_string escaped_buf s;
+      Other
+    }
+  | 'x' (hexdigit as c1) (hexdigit as c2)
+    { let v = eval_hex_escape c1 c2 in
+      Buffer.add_char escaped_buf (Char.chr v);
+      Other
+    }
+  | 'x' hexdigit* as s
+    { if mode <> In_block_comment then
+        error lexbuf "unterminated hexadecimal escape sequence" ~delta:(-1);
+      Buffer.add_char escaped_buf '\\';
+      Buffer.add_string escaped_buf s;
+      Other
+    }
+  | _ as c
+    { if mode = New_syntax then
+        error lexbuf "unknown escape sequence" ~delta:(-1);
+      Buffer.add_char escaped_buf '\\';
+      Buffer.add_char escaped_buf c;
+      Other
+    }
+  | eof
+    { if mode <> In_block_comment then
+        error lexbuf "unterminated escape sequence" ~delta:(-1);
+      Other
+    }
+
+and quoted_string_after_escaped_newline mode = parse
+  | [' ' '\t']*
+    { quoted_string mode lexbuf }
diff --git a/src/usexp/jbuild_lexer.mll b/src/usexp/jbuild_lexer.mll
new file mode 100644
index 00000000..51376bd9
--- /dev/null
+++ b/src/usexp/jbuild_lexer.mll
@@ -0,0 +1,159 @@
+{
+  open Lexer0
+}
+
+let comment   = ';' [^ '\n' '\r']*
+let newline   = '\r'? '\n'
+let blank     = [' ' '\t' '\012']
+let digit     = ['0'-'9']
+let hexdigit  = ['0'-'9' 'a'-'f' 'A'-'F']
+
+let atom_char_jbuild =
+  [^ ';' '(' ')' '"' ' ' '\t' '\r' '\n' '\012']
+
+(* rule for jbuild files *)
+rule token = parse
+  | newline
+    { Lexing.new_line lexbuf; token lexbuf }
+  | blank+ | comment
+    { token lexbuf }
+  | '('
+    { Token.Lparen }
+  | ')'
+    { Rparen }
+  | '"'
+    { Buffer.clear escaped_buf;
+      let start = Lexing.lexeme_start_p lexbuf in
+      let s = quoted_string Old_syntax lexbuf in
+      lexbuf.lex_start_p <- start;
+      Quoted_string s
+    }
+  | "#|"
+    { jbuild_block_comment lexbuf;
+      token lexbuf
+    }
+  | "#;"
+    { Sexp_comment }
+  | eof
+    { Eof }
+  | ""
+    { jbuild_atom "" (Lexing.lexeme_start_p lexbuf) lexbuf }
+
+and jbuild_atom acc start = parse
+  | '#'+ '|'
+    { lexbuf.lex_start_p <- start;
+      error lexbuf "jbuild_atoms cannot contain #|"
+    }
+  | '|'+ '#'
+    { lexbuf.lex_start_p <- start;
+      error lexbuf "jbuild_atoms cannot contain |#"
+    }
+  | ('#'+ | '|'+ | (atom_char_jbuild # ['|' '#'])) as s
+    { jbuild_atom (if acc = "" then s else acc ^ s) start lexbuf
+    }
+  | ""
+    { if acc = "" then
+        error lexbuf "Internal error in the S-expression parser, \
+                      please report upstream.";
+      lexbuf.lex_start_p <- start;
+      Token.Atom (Atom.of_string acc)
+    }
+
+and quoted_string mode = parse
+  | '"'
+    { Buffer.contents escaped_buf }
+  | '\\'
+    { match escape_sequence mode lexbuf with
+      | Newline -> quoted_string_after_escaped_newline mode lexbuf
+      | Other   -> quoted_string                       mode lexbuf
+    }
+  | newline as s
+    { Lexing.new_line lexbuf;
+      Buffer.add_string escaped_buf s;
+      quoted_string mode lexbuf
+    }
+  | _ as c
+    { Buffer.add_char escaped_buf c;
+      quoted_string mode lexbuf
+    }
+  | eof
+    { if mode <> In_block_comment then
+        error lexbuf "unterminated quoted string";
+      Buffer.contents escaped_buf
+    }
+
+and quoted_string_after_escaped_newline mode = parse
+  | [' ' '\t']*
+    { quoted_string mode lexbuf }
+
+and jbuild_block_comment = parse
+  | '"'
+    { Buffer.clear escaped_buf;
+      ignore (quoted_string In_block_comment lexbuf : string);
+      jbuild_block_comment lexbuf
+    }
+  | "|#"
+    { ()
+    }
+  | eof
+    { error lexbuf "unterminated block comment"
+    }
+  | _
+    { jbuild_block_comment lexbuf
+    }
+
+and escape_sequence mode = parse
+  | newline
+    { Lexing.new_line lexbuf;
+      Newline }
+  | ['\\' '\'' '"' 'n' 't' 'b' 'r'] as c
+    { let c =
+        match c with
+        | 'n' -> '\n'
+        | 'r' -> '\r'
+        | 'b' -> '\b'
+        | 't' -> '\t'
+        | _   -> c
+      in
+      Buffer.add_char escaped_buf c;
+      Other
+    }
+  | (digit as c1) (digit as c2) (digit as c3)
+    { let v = eval_decimal_escape c1 c2 c3 in
+      if mode <> In_block_comment && v > 255 then
+        error lexbuf "escape sequence in quoted string out of range"
+          ~delta:(-1);
+      Buffer.add_char escaped_buf (Char.chr v);
+      Other
+    }
+  | digit* as s
+    { if mode <> In_block_comment then
+        error lexbuf "unterminated decimal escape sequence" ~delta:(-1);
+      Buffer.add_char escaped_buf '\\';
+      Buffer.add_string escaped_buf s;
+      Other
+    }
+  | 'x' (hexdigit as c1) (hexdigit as c2)
+    { let v = eval_hex_escape c1 c2 in
+      Buffer.add_char escaped_buf (Char.chr v);
+      Other
+    }
+  | 'x' hexdigit* as s
+    { if mode <> In_block_comment then
+        error lexbuf "unterminated hexadecimal escape sequence" ~delta:(-1);
+      Buffer.add_char escaped_buf '\\';
+      Buffer.add_string escaped_buf s;
+      Other
+    }
+  | _ as c
+    { if mode = New_syntax then
+        error lexbuf "unknown escape sequence" ~delta:(-1);
+      Buffer.add_char escaped_buf '\\';
+      Buffer.add_char escaped_buf c;
+      Other
+    }
+  | eof
+    { if mode <> In_block_comment then
+        error lexbuf "unterminated escape sequence" ~delta:(-1);
+      Other
+    }
diff --git a/src/usexp/lexer.ml b/src/usexp/lexer.ml
new file mode 100644
index 00000000..aff4e6d2
--- /dev/null
+++ b/src/usexp/lexer.ml
@@ -0,0 +1,4 @@
+include Lexer0
+
+let token = Dune_lexer.token
+let jbuild_token = Jbuild_lexer.token
diff --git a/src/usexp/lexer0.ml b/src/usexp/lexer0.ml
new file mode 100644
index 00000000..5ebc545a
--- /dev/null
+++ b/src/usexp/lexer0.ml
@@ -0,0 +1,68 @@
+
+module Token = struct
+  type t =
+    | Atom          of Atom.t
+    | Quoted_string of string
+    | Lparen
+    | Rparen
+    | Sexp_comment
+    | Eof
+end
+
+type t = Lexing.lexbuf -> Token.t
+
+module Error = struct
+  type t =
+    { start   : Lexing.position
+    ; stop    : Lexing.position
+    ; message : string
+    }
+end
+
+exception Error of Error.t
+
+let error ?(delta=0) lexbuf message =
+  let start = Lexing.lexeme_start_p lexbuf in
+  raise
+    (Error { start = { start with pos_cnum = start.pos_cnum + delta }
+           ; stop  = Lexing.lexeme_end_p   lexbuf
+           ; message
+           })
+
+let escaped_buf = Buffer.create 256
+
+(* The difference between the old and new syntax is that the old
+   syntax allows backslash following by any characters other than 'n',
+   'x', ... and interpret it as it. The new syntax is stricter in
+   order to allow introducing new escape sequence in the future if
+   needed. *)
+type escape_mode =
+  | In_block_comment (* Inside #|...|# comments (old syntax) *)
+  | Old_syntax
+  | New_syntax
+
+type escape_sequence =
+  | Newline
+  | Other
+
+type block_string_line_kind =
+  | With_escape_sequences
+  | Raw
+
+let eval_decimal_char c = Char.code c - Char.code '0'
+
+let eval_decimal_escape c1 c2 c3 =
+  (eval_decimal_char c1) * 100 +
+  (eval_decimal_char c2) * 10  +
+  (eval_decimal_char c3)
+
+let eval_hex_char c =
+  match c with
+  | '0'..'9' -> Char.code c - Char.code '0'
+  | 'a'..'f' -> Char.code c - Char.code 'a' + 10
+  | 'A'..'F' -> Char.code c - Char.code 'A' + 10
+  | _ -> -1
+
+let eval_hex_escape c1 c2 =
+  (eval_hex_char c1) * 16 +
+  (eval_hex_char c2)
diff --git a/src/usexp/lexer0.mli b/src/usexp/lexer0.mli
new file mode 100644
index 00000000..fc15c06a
--- /dev/null
+++ b/src/usexp/lexer0.mli
@@ -0,0 +1,44 @@
+module Token : sig
+  type t =
+    | Atom          of Atom.t
+    | Quoted_string of string
+    | Lparen
+    | Rparen
+    | Sexp_comment
+    | Eof
+end
+
+type t = Lexing.lexbuf -> Token.t
+
+module Error : sig
+  type t =
+    { start   : Lexing.position
+    ; stop    : Lexing.position
+    ; message : string
+    }
+end
+
+val error : ?delta:int -> Lexing.lexbuf -> string -> 'a
+
+val escaped_buf : Buffer.t
+
+exception Error of Error.t
+
+type escape_mode =
+  | In_block_comment (* Inside #|...|# comments (old syntax) *)
+  | Old_syntax
+  | New_syntax
+
+type escape_sequence =
+  | Newline
+  | Other
+
+type block_string_line_kind =
+  | With_escape_sequences
+  | Raw
+
+val eval_decimal_char : char -> int
+
+val eval_decimal_escape : char -> char -> char -> int
+
+val eval_hex_escape : char -> char -> int

From d7ab3d962c9aedbd0f79e919c05e3c65cf6404f5 Mon Sep 17 00:00:00 2001
From: Rudi Grinberg <rudi.grinberg@gmail.com>
Date: Wed, 20 Jun 2018 19:37:08 +0630
Subject: [PATCH 2/7] Harmonize names in dune and jbuild lexers

Signed-off-by: Rudi Grinberg <rudi.grinberg@gmail.com>
---
 src/usexp/dune_lexer.mll   |  9 ++++-----
 src/usexp/jbuild_lexer.mll | 22 +++++++++++-----------
 test/unit-tests/sexp.mlt   |  4 ++--
 3 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/src/usexp/dune_lexer.mll b/src/usexp/dune_lexer.mll
index 3bf74724..2e43b3da 100644
--- a/src/usexp/dune_lexer.mll
+++ b/src/usexp/dune_lexer.mll
@@ -8,10 +8,9 @@ let blank     = [' ' '\t' '\012']
 let digit     = ['0'-'9']
 let hexdigit  = ['0'-'9' 'a'-'f' 'A'-'F']
 
-let atom_char_dune =
+let atom_char =
   [^ '%' ';' '(' ')' '"' '\000'-'\032' '\127'-'\255']
 
-(* rule for dune files *)
 rule token = parse
   | newline
     { Lexing.new_line lexbuf; token lexbuf }
@@ -24,17 +23,17 @@ rule token = parse
   | '"'
     { Buffer.clear escaped_buf;
       let start = Lexing.lexeme_start_p lexbuf in
-      let s = dune_quoted_string lexbuf in
+      let s = start_quoted_string lexbuf in
       lexbuf.lex_start_p <- start;
       Quoted_string s
     }
-  | atom_char_dune+ as s
+  | atom_char+ as s
     { Token.Atom (Atom.of_string s) }
   | _ as c { error lexbuf (Printf.sprintf "Invalid atom character '%c'" c) }
   | eof
     { Eof }
 
-and dune_quoted_string = parse
+and start_quoted_string = parse
   | "\\|"
     { block_string_start With_escape_sequences lexbuf }
   | "\\>"
diff --git a/src/usexp/jbuild_lexer.mll b/src/usexp/jbuild_lexer.mll
index 51376bd9..4a2d546e 100644
--- a/src/usexp/jbuild_lexer.mll
+++ b/src/usexp/jbuild_lexer.mll
@@ -8,7 +8,7 @@ let blank     = [' ' '\t' '\012']
 let digit     = ['0'-'9']
 let hexdigit  = ['0'-'9' 'a'-'f' 'A'-'F']
 
-let atom_char_jbuild =
+let atom_char =
   [^ ';' '(' ')' '"' ' ' '\t' '\r' '\n' '\012']
 
 (* rule for jbuild files *)
@@ -29,7 +29,7 @@ rule token = parse
       Quoted_string s
     }
   | "#|"
-    { jbuild_block_comment lexbuf;
+    { block_comment lexbuf;
       token lexbuf
     }
   | "#;"
@@ -37,19 +37,19 @@ rule token = parse
   | eof
     { Eof }
   | ""
-    { jbuild_atom "" (Lexing.lexeme_start_p lexbuf) lexbuf }
+    { atom "" (Lexing.lexeme_start_p lexbuf) lexbuf }
 
-and jbuild_atom acc start = parse
+and atom acc start = parse
   | '#'+ '|'
     { lexbuf.lex_start_p <- start;
-      error lexbuf "jbuild_atoms cannot contain #|"
+      error lexbuf "jbuild atoms cannot contain #|"
     }
   | '|'+ '#'
     { lexbuf.lex_start_p <- start;
-      error lexbuf "jbuild_atoms cannot contain |#"
+      error lexbuf "jbuild atoms cannot contain |#"
     }
-  | ('#'+ | '|'+ | (atom_char_jbuild # ['|' '#'])) as s
-    { jbuild_atom (if acc = "" then s else acc ^ s) start lexbuf
+  | ('#'+ | '|'+ | (atom_char # ['|' '#'])) as s
+    { atom (if acc = "" then s else acc ^ s) start lexbuf
     }
   | ""
     { if acc = "" then
@@ -86,11 +86,11 @@ and quoted_string_after_escaped_newline mode = parse
   | [' ' '\t']*
     { quoted_string mode lexbuf }
 
-and jbuild_block_comment = parse
+and block_comment = parse
   | '"'
     { Buffer.clear escaped_buf;
       ignore (quoted_string In_block_comment lexbuf : string);
-      jbuild_block_comment lexbuf
+      block_comment lexbuf
     }
   | "|#"
     { ()
@@ -99,7 +99,7 @@ and jbuild_block_comment = parse
     { error lexbuf "unterminated block comment"
     }
   | _
-    { jbuild_block_comment lexbuf
+    { block_comment lexbuf
     }
 
 and escape_sequence mode = parse
diff --git a/test/unit-tests/sexp.mlt b/test/unit-tests/sexp.mlt
index 79db5495..cc42cb6d 100644
--- a/test/unit-tests/sexp.mlt
+++ b/test/unit-tests/sexp.mlt
@@ -107,14 +107,14 @@ parse {|x#|y|}
 [%%expect{|
 - : parse_result =
 Different
- {jbuild = Error "jbuild_atoms cannot contain #|"; dune = Ok [(atom x#|y)]}
+ {jbuild = Error "jbuild atoms cannot contain #|"; dune = Ok [(atom x#|y)]}
 |}]
 
 parse {|x|#y|}
 [%%expect{|
 - : parse_result =
 Different
- {jbuild = Error "jbuild_atoms cannot contain |#"; dune = Ok [(atom x|#y)]}
+ {jbuild = Error "jbuild atoms cannot contain |#"; dune = Ok [(atom x|#y)]}
 |}]
 
 parse {|"\a"|}

From 1e5dc322e2b7a32bb6ee2c03c1c9c857ea4b4973 Mon Sep 17 00:00:00 2001
From: Rudi Grinberg <rudi.grinberg@gmail.com>
Date: Wed, 20 Jun 2018 19:50:58 +0630
Subject: [PATCH 3/7] Remove some legacy stuff from the new dune lexer

Signed-off-by: Rudi Grinberg <rudi.grinberg@gmail.com>
---
 src/usexp/dune_lexer.mll | 62 ++++++++++++++++------------------------
 1 file changed, 24 insertions(+), 38 deletions(-)

diff --git a/src/usexp/dune_lexer.mll b/src/usexp/dune_lexer.mll
index 2e43b3da..e6e4dcf0 100644
--- a/src/usexp/dune_lexer.mll
+++ b/src/usexp/dune_lexer.mll
@@ -8,8 +8,7 @@ let blank     = [' ' '\t' '\012']
 let digit     = ['0'-'9']
 let hexdigit  = ['0'-'9' 'a'-'f' 'A'-'F']
 
-let atom_char =
-  [^ '%' ';' '(' ')' '"' '\000'-'\032' '\127'-'\255']
+let atom_char = [^ '%' ';' '(' ')' '"' '\000'-'\032' '\127'-'\255']
 
 rule token = parse
   | newline
@@ -39,7 +38,7 @@ and start_quoted_string = parse
   | "\\>"
     { block_string_start Raw lexbuf }
   | ""
-    { quoted_string New_syntax lexbuf }
+    { quoted_string lexbuf }
 
 and block_string_start kind = parse
   | newline as s
@@ -66,7 +65,7 @@ and block_string = parse
       block_string_after_newline lexbuf
     }
   | '\\'
-    { match escape_sequence New_syntax lexbuf with
+    { match escape_sequence lexbuf with
       | Newline -> block_string_after_newline lexbuf
       | Other   -> block_string               lexbuf
     }
@@ -101,30 +100,28 @@ and raw_block_string = parse
     { Buffer.contents escaped_buf
     }
 
-and quoted_string mode = parse
+and quoted_string = parse
   | '"'
     { Buffer.contents escaped_buf }
   | '\\'
-    { match escape_sequence mode lexbuf with
-      | Newline -> quoted_string_after_escaped_newline mode lexbuf
-      | Other   -> quoted_string                       mode lexbuf
+    { match escape_sequence lexbuf with
+      | Newline -> quoted_string_after_escaped_newline lexbuf
+      | Other   -> quoted_string                       lexbuf
     }
   | newline as s
     { Lexing.new_line lexbuf;
       Buffer.add_string escaped_buf s;
-      quoted_string mode lexbuf
+      quoted_string lexbuf
     }
   | _ as c
     { Buffer.add_char escaped_buf c;
-      quoted_string mode lexbuf
+      quoted_string lexbuf
     }
   | eof
-    { if mode <> In_block_comment then
-        error lexbuf "unterminated quoted string";
-      Buffer.contents escaped_buf
+    { error lexbuf "unterminated quoted string"
     }
 
-and escape_sequence mode = parse
+and escape_sequence = parse
   | newline
     { Lexing.new_line lexbuf;
       Newline }
@@ -142,44 +139,33 @@ and escape_sequence mode = parse
     }
   | (digit as c1) (digit as c2) (digit as c3)
     { let v = eval_decimal_escape c1 c2 c3 in
-      if mode <> In_block_comment && v > 255 then
+      if v > 255 then
         error lexbuf "escape sequence in quoted string out of range"
           ~delta:(-1);
       Buffer.add_char escaped_buf (Char.chr v);
       Other
     }
-  | digit* as s
-    { if mode <> In_block_comment then
-        error lexbuf "unterminated decimal escape sequence" ~delta:(-1);
-      Buffer.add_char escaped_buf '\\';
-      Buffer.add_string escaped_buf s;
-      Other
+  | digit digit digit
+    { error lexbuf "escape sequence in quoted string out of range" ~delta:(-1);
+    }
+  | digit*
+    { error lexbuf "unterminated decimal escape sequence" ~delta:(-1);
     }
   | 'x' (hexdigit as c1) (hexdigit as c2)
     { let v = eval_hex_escape c1 c2 in
       Buffer.add_char escaped_buf (Char.chr v);
       Other
     }
-  | 'x' hexdigit* as s
-    { if mode <> In_block_comment then
-        error lexbuf "unterminated hexadecimal escape sequence" ~delta:(-1);
-      Buffer.add_char escaped_buf '\\';
-      Buffer.add_string escaped_buf s;
-      Other
+  | 'x' hexdigit*
+    { error lexbuf "unterminated hexadecimal escape sequence" ~delta:(-1);
     }
-  | _ as c
-    { if mode = New_syntax then
-        error lexbuf "unknown escape sequence" ~delta:(-1);
-      Buffer.add_char escaped_buf '\\';
-      Buffer.add_char escaped_buf c;
-      Other
+  | _
+    { error lexbuf "unknown escape sequence" ~delta:(-1);
     }
   | eof
-    { if mode <> In_block_comment then
-        error lexbuf "unterminated escape sequence" ~delta:(-1);
-      Other
+    { error lexbuf "unterminated escape sequence" ~delta:(-1);
     }
 
-and quoted_string_after_escaped_newline mode = parse
+and quoted_string_after_escaped_newline = parse
   | [' ' '\t']*
-    { quoted_string mode lexbuf }
+    { quoted_string lexbuf }

From cd6d5dba53efd0d5037d8a55affc7b6d52f7fc80 Mon Sep 17 00:00:00 2001
From: Rudi Grinberg <rudi.grinberg@gmail.com>
Date: Wed, 20 Jun 2018 20:00:35 +0630
Subject: [PATCH 4/7] Simplify jbuild lexer and move types around

The types should exist in the lexer module where they're used

Signed-off-by: Rudi Grinberg <rudi.grinberg@gmail.com>
---
 src/usexp/dune_lexer.mll   |  6 +++++-
 src/usexp/jbuild_lexer.mll | 25 ++++++++++++++++---------
 src/usexp/lexer0.ml        | 14 --------------
 src/usexp/lexer0.mli       |  9 ---------
 4 files changed, 21 insertions(+), 33 deletions(-)

diff --git a/src/usexp/dune_lexer.mll b/src/usexp/dune_lexer.mll
index e6e4dcf0..2c3a10a2 100644
--- a/src/usexp/dune_lexer.mll
+++ b/src/usexp/dune_lexer.mll
@@ -1,5 +1,9 @@
 {
-  open Lexer0
+open Lexer0
+
+type block_string_line_kind =
+  | With_escape_sequences
+  | Raw
 }
 
 let comment   = ';' [^ '\n' '\r']*
diff --git a/src/usexp/jbuild_lexer.mll b/src/usexp/jbuild_lexer.mll
index 4a2d546e..4cfade63 100644
--- a/src/usexp/jbuild_lexer.mll
+++ b/src/usexp/jbuild_lexer.mll
@@ -1,5 +1,14 @@
 {
   open Lexer0
+
+(* The difference between the old and new syntax is that the old
+   syntax allows backslash following by any characters other than 'n',
+   'x', ... and interpret it as it. The new syntax is stricter in
+   order to allow introducing new escape sequence in the future if
+   needed. *)
+type escape_mode =
+  | In_block_comment (* Inside #|...|# comments (old syntax) *)
+  | In_quoted_string
 }
 
 let comment   = ';' [^ '\n' '\r']*
@@ -24,7 +33,7 @@ rule token = parse
   | '"'
     { Buffer.clear escaped_buf;
       let start = Lexing.lexeme_start_p lexbuf in
-      let s = quoted_string Old_syntax lexbuf in
+      let s = quoted_string In_quoted_string lexbuf in
       lexbuf.lex_start_p <- start;
       Quoted_string s
     }
@@ -77,7 +86,7 @@ and quoted_string mode = parse
       quoted_string mode lexbuf
     }
   | eof
-    { if mode <> In_block_comment then
+    { if mode = In_block_comment then
         error lexbuf "unterminated quoted string";
       Buffer.contents escaped_buf
     }
@@ -120,14 +129,14 @@ and escape_sequence mode = parse
     }
   | (digit as c1) (digit as c2) (digit as c3)
     { let v = eval_decimal_escape c1 c2 c3 in
-      if mode <> In_block_comment && v > 255 then
+      if mode = In_quoted_string && v > 255 then
         error lexbuf "escape sequence in quoted string out of range"
           ~delta:(-1);
       Buffer.add_char escaped_buf (Char.chr v);
       Other
     }
   | digit* as s
-    { if mode <> In_block_comment then
+    { if mode = In_quoted_string then
         error lexbuf "unterminated decimal escape sequence" ~delta:(-1);
       Buffer.add_char escaped_buf '\\';
       Buffer.add_string escaped_buf s;
@@ -139,21 +148,19 @@ and escape_sequence mode = parse
       Other
     }
   | 'x' hexdigit* as s
-    { if mode <> In_block_comment then
+    { if mode = In_quoted_string then
         error lexbuf "unterminated hexadecimal escape sequence" ~delta:(-1);
       Buffer.add_char escaped_buf '\\';
       Buffer.add_string escaped_buf s;
       Other
     }
   | _ as c
-    { if mode = New_syntax then
-        error lexbuf "unknown escape sequence" ~delta:(-1);
-      Buffer.add_char escaped_buf '\\';
+    { Buffer.add_char escaped_buf '\\';
       Buffer.add_char escaped_buf c;
       Other
     }
   | eof
-    { if mode <> In_block_comment then
+    { if mode = In_quoted_string then
         error lexbuf "unterminated escape sequence" ~delta:(-1);
       Other
     }
diff --git a/src/usexp/lexer0.ml b/src/usexp/lexer0.ml
index 5ebc545a..b361acc2 100644
--- a/src/usexp/lexer0.ml
+++ b/src/usexp/lexer0.ml
@@ -31,24 +31,10 @@ let error ?(delta=0) lexbuf message =
 
 let escaped_buf = Buffer.create 256
 
-(* The difference between the old and new syntax is that the old
-   syntax allows backslash following by any characters other than 'n',
-   'x', ... and interpret it as it. The new syntax is stricter in
-   order to allow introducing new escape sequence in the future if
-   needed. *)
-type escape_mode =
-  | In_block_comment (* Inside #|...|# comments (old syntax) *)
-  | Old_syntax
-  | New_syntax
-
 type escape_sequence =
   | Newline
   | Other
 
-type block_string_line_kind =
-  | With_escape_sequences
-  | Raw
-
 let eval_decimal_char c = Char.code c - Char.code '0'
 
 let eval_decimal_escape c1 c2 c3 =
diff --git a/src/usexp/lexer0.mli b/src/usexp/lexer0.mli
index fc15c06a..b442888d 100644
--- a/src/usexp/lexer0.mli
+++ b/src/usexp/lexer0.mli
@@ -24,19 +24,10 @@ val escaped_buf : Buffer.t
 
 exception Error of Error.t
 
-type escape_mode =
-  | In_block_comment (* Inside #|...|# comments (old syntax) *)
-  | Old_syntax
-  | New_syntax
-
 type escape_sequence =
   | Newline
   | Other
 
-type block_string_line_kind =
-  | With_escape_sequences
-  | Raw
-
 val eval_decimal_char : char -> int
 
 val eval_decimal_escape : char -> char -> char -> int

From d6778a50a24b8d15efd4b0288d8eb144f4618da1 Mon Sep 17 00:00:00 2001
From: Rudi Grinberg <rudi.grinberg@gmail.com>
Date: Wed, 20 Jun 2018 20:16:12 +0630
Subject: [PATCH 5/7] Rename Lexer0 to Lexer_shared

Signed-off-by: Rudi Grinberg <rudi.grinberg@gmail.com>
---
 src/usexp/dune_lexer.mll                   | 2 +-
 src/usexp/jbuild_lexer.mll                 | 2 +-
 src/usexp/lexer.ml                         | 2 +-
 src/usexp/{lexer0.ml => lexer_shared.ml}   | 1 -
 src/usexp/{lexer0.mli => lexer_shared.mli} | 0
 5 files changed, 3 insertions(+), 4 deletions(-)
 rename src/usexp/{lexer0.ml => lexer_shared.ml} (99%)
 rename src/usexp/{lexer0.mli => lexer_shared.mli} (100%)

diff --git a/src/usexp/dune_lexer.mll b/src/usexp/dune_lexer.mll
index 2c3a10a2..4803c4de 100644
--- a/src/usexp/dune_lexer.mll
+++ b/src/usexp/dune_lexer.mll
@@ -1,5 +1,5 @@
 {
-open Lexer0
+open Lexer_shared
 
 type block_string_line_kind =
   | With_escape_sequences
diff --git a/src/usexp/jbuild_lexer.mll b/src/usexp/jbuild_lexer.mll
index 4cfade63..0cc57a5f 100644
--- a/src/usexp/jbuild_lexer.mll
+++ b/src/usexp/jbuild_lexer.mll
@@ -1,5 +1,5 @@
 {
-  open Lexer0
+  open Lexer_shared
 
 (* The difference between the old and new syntax is that the old
    syntax allows backslash following by any characters other than 'n',
diff --git a/src/usexp/lexer.ml b/src/usexp/lexer.ml
index aff4e6d2..50f62541 100644
--- a/src/usexp/lexer.ml
+++ b/src/usexp/lexer.ml
@@ -1,4 +1,4 @@
-include Lexer0
+include Lexer_shared
 
 let token = Dune_lexer.token
 let jbuild_token = Jbuild_lexer.token
diff --git a/src/usexp/lexer0.ml b/src/usexp/lexer_shared.ml
similarity index 99%
rename from src/usexp/lexer0.ml
rename to src/usexp/lexer_shared.ml
index b361acc2..ee1cf5ba 100644
--- a/src/usexp/lexer0.ml
+++ b/src/usexp/lexer_shared.ml
@@ -1,4 +1,3 @@
-
 module Token = struct
   type t =
     | Atom          of Atom.t
diff --git a/src/usexp/lexer0.mli b/src/usexp/lexer_shared.mli
similarity index 100%
rename from src/usexp/lexer0.mli
rename to src/usexp/lexer_shared.mli

From 15ffc107ff12dbe5fba85ba7a3f70c783ef92aed Mon Sep 17 00:00:00 2001
From: Rudi Grinberg <rudi.grinberg@gmail.com>
Date: Wed, 20 Jun 2018 20:21:23 +0630
Subject: [PATCH 6/7] Add mli's to {dune,jbuild}_lexer

Signed-off-by: Rudi Grinberg <rudi.grinberg@gmail.com>
---
 src/usexp/dune_lexer.mli   | 1 +
 src/usexp/jbuild_lexer.mli | 1 +
 2 files changed, 2 insertions(+)
 create mode 100644 src/usexp/dune_lexer.mli
 create mode 100644 src/usexp/jbuild_lexer.mli

diff --git a/src/usexp/dune_lexer.mli b/src/usexp/dune_lexer.mli
new file mode 100644
index 00000000..5d1e70f7
--- /dev/null
+++ b/src/usexp/dune_lexer.mli
@@ -0,0 +1 @@
+val token : Lexer_shared.t
diff --git a/src/usexp/jbuild_lexer.mli b/src/usexp/jbuild_lexer.mli
new file mode 100644
index 00000000..5d1e70f7
--- /dev/null
+++ b/src/usexp/jbuild_lexer.mli
@@ -0,0 +1 @@
+val token : Lexer_shared.t

From 5a46dc1998a18f145f9b7d0df0ac98400181f5ed Mon Sep 17 00:00:00 2001
From: Rudi Grinberg <rudi.grinberg@gmail.com>
Date: Wed, 20 Jun 2018 20:21:40 +0630
Subject: [PATCH 7/7] Add jbuild_lexer.boot to speed up bootstrap

Signed-off-by: Rudi Grinberg <rudi.grinberg@gmail.com>
---
 src/usexp/jbuild_lexer.boot.ml | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 src/usexp/jbuild_lexer.boot.ml

diff --git a/src/usexp/jbuild_lexer.boot.ml b/src/usexp/jbuild_lexer.boot.ml
new file mode 100644
index 00000000..e0671e6d
--- /dev/null
+++ b/src/usexp/jbuild_lexer.boot.ml
@@ -0,0 +1 @@
+let token _ = failwith "Unused during bootstrap"