Split dune and jbuild lexers

Signed-off-by: Rudi Grinberg <rudi.grinberg@gmail.com>
2018-06-20 19:29:16 +06:30 · 2018-06-20 19:29:16 +06:30 · e3aa13424d
parent 411552e4c7
commit e3aa13424d
6 changed files with 361 additions and 218 deletions
--- a/src/usexp/dune
+++ b/src/usexp/dune
@ -3,4 +3,4 @@
  (synopsis "[Internal] S-expression library")
  (public_name dune.usexp)))

-(ocamllex (lexer))
+(ocamllex (dune_lexer jbuild_lexer))
--- a/src/usexp/dune_lexer.mll
+++ b/src/usexp/dune_lexer.mll
@ -1,71 +1,5 @@
 {
-module Token = struct
-  type t =
-    | Atom          of Atom.t
-    | Quoted_string of string
-    | Lparen
-    | Rparen
-    | Sexp_comment
-    | Eof
-end
-
-type t = Lexing.lexbuf -> Token.t
-
-module Error = struct
-  type t =
-    { start   : Lexing.position
-    ; stop    : Lexing.position
-    ; message : string
-    }
-end
-
-exception Error of Error.t
-
-let error ?(delta=0) lexbuf message =
-  let start = Lexing.lexeme_start_p lexbuf in
-  raise
-    (Error { start = { start with pos_cnum = start.pos_cnum + delta }
-           ; stop  = Lexing.lexeme_end_p   lexbuf
-           ; message
-           })
-
-(* The difference between the old and new syntax is that the old
-   syntax allows backslash following by any characters other than 'n',
-   'x', ... and interpret it as it. The new syntax is stricter in
-   order to allow introducing new escape sequence in the future if
-   needed. *)
-type escape_mode =
-  | In_block_comment (* Inside #|...|# comments (old syntax) *)
-  | Old_syntax
-  | New_syntax
-
-let eval_decimal_char c = Char.code c - Char.code '0'
-
-let eval_decimal_escape c1 c2 c3 =
-  (eval_decimal_char c1) * 100 +
-  (eval_decimal_char c2) * 10  +
-  (eval_decimal_char c3)
-
-let eval_hex_char c =
-  match c with
-  | '0'..'9' -> Char.code c - Char.code '0'
-  | 'a'..'f' -> Char.code c - Char.code 'a' + 10
-  | 'A'..'F' -> Char.code c - Char.code 'A' + 10
-  | _ -> -1
-
-let eval_hex_escape c1 c2 =
-  (eval_hex_char c1) * 16 +
-  (eval_hex_char c2)
-
-type escape_sequence =
-  | Newline
-  | Other
-
-let escaped_buf = Buffer.create 256
-
-type block_string_line_kind =
-  | With_escape_sequences
-  | Raw
+  open Lexer0
 }

 let comment   = ';' [^ '\n' '\r']*
@ -74,160 +8,11 @@ let blank     = [' ' '\t' '\012']
 let digit     = ['0'-'9']
 let hexdigit  = ['0'-'9' 'a'-'f' 'A'-'F']

-let atom_char_jbuild =
-  [^ ';' '(' ')' '"' ' ' '\t' '\r' '\n' '\012']
 let atom_char_dune =
  [^ '%' ';' '(' ')' '"' '\000'-'\032' '\127'-'\255']

-(* rule for jbuild files *)
-rule jbuild_token = parse
-  | newline
-    { Lexing.new_line lexbuf; jbuild_token lexbuf }
-  | blank+ | comment
-    { jbuild_token lexbuf }
-  | '('
-    { Token.Lparen }
-  | ')'
-    { Rparen }
-  | '"'
-    { Buffer.clear escaped_buf;
-      let start = Lexing.lexeme_start_p lexbuf in
-      let s = quoted_string Old_syntax lexbuf in
-      lexbuf.lex_start_p <- start;
-      Quoted_string s
-    }
-  | "#|"
-    { jbuild_block_comment lexbuf;
-      jbuild_token lexbuf
-    }
-  | "#;"
-    { Sexp_comment }
-  | eof
-    { Eof }
-  | ""
-    { jbuild_atom "" (Lexing.lexeme_start_p lexbuf) lexbuf }
-
-and jbuild_atom acc start = parse
-  | '#'+ '|'
-    { lexbuf.lex_start_p <- start;
-      error lexbuf "jbuild_atoms cannot contain #|"
-    }
-  | '|'+ '#'
-    { lexbuf.lex_start_p <- start;
-      error lexbuf "jbuild_atoms cannot contain |#"
-    }
-  | ('#'+ | '|'+ | (atom_char_jbuild # ['|' '#'])) as s
-    { jbuild_atom (if acc = "" then s else acc ^ s) start lexbuf
-    }
-  | ""
-    { if acc = "" then
-        error lexbuf "Internal error in the S-expression parser, \
-                      please report upstream.";
-      lexbuf.lex_start_p <- start;
-      Token.Atom (Atom.of_string acc)
-    }
-
-and quoted_string mode = parse
-  | '"'
-    { Buffer.contents escaped_buf }
-  | '\\'
-    { match escape_sequence mode lexbuf with
-      | Newline -> quoted_string_after_escaped_newline mode lexbuf
-      | Other   -> quoted_string                       mode lexbuf
-    }
-  | newline as s
-    { Lexing.new_line lexbuf;
-      Buffer.add_string escaped_buf s;
-      quoted_string mode lexbuf
-    }
-  | _ as c
-    { Buffer.add_char escaped_buf c;
-      quoted_string mode lexbuf
-    }
-  | eof
-    { if mode <> In_block_comment then
-        error lexbuf "unterminated quoted string";
-      Buffer.contents escaped_buf
-    }
-
-and quoted_string_after_escaped_newline mode = parse
-  | [' ' '\t']*
-    { quoted_string mode lexbuf }
-
-and escape_sequence mode = parse
-  | newline
-    { Lexing.new_line lexbuf;
-      Newline }
-  | ['\\' '\'' '"' 'n' 't' 'b' 'r'] as c
-    { let c =
-        match c with
-        | 'n' -> '\n'
-        | 'r' -> '\r'
-        | 'b' -> '\b'
-        | 't' -> '\t'
-        | _   -> c
-      in
-      Buffer.add_char escaped_buf c;
-      Other
-    }
-  | (digit as c1) (digit as c2) (digit as c3)
-    { let v = eval_decimal_escape c1 c2 c3 in
-      if mode <> In_block_comment && v > 255 then
-        error lexbuf "escape sequence in quoted string out of range"
-          ~delta:(-1);
-      Buffer.add_char escaped_buf (Char.chr v);
-      Other
-    }
-  | digit* as s
-    { if mode <> In_block_comment then
-        error lexbuf "unterminated decimal escape sequence" ~delta:(-1);
-      Buffer.add_char escaped_buf '\\';
-      Buffer.add_string escaped_buf s;
-      Other
-    }
-  | 'x' (hexdigit as c1) (hexdigit as c2)
-    { let v = eval_hex_escape c1 c2 in
-      Buffer.add_char escaped_buf (Char.chr v);
-      Other
-    }
-  | 'x' hexdigit* as s
-    { if mode <> In_block_comment then
-        error lexbuf "unterminated hexadecimal escape sequence" ~delta:(-1);
-      Buffer.add_char escaped_buf '\\';
-      Buffer.add_string escaped_buf s;
-      Other
-    }
-  | _ as c
-    { if mode = New_syntax then
-        error lexbuf "unknown escape sequence" ~delta:(-1);
-      Buffer.add_char escaped_buf '\\';
-      Buffer.add_char escaped_buf c;
-      Other
-    }
-  | eof
-    { if mode <> In_block_comment then
-        error lexbuf "unterminated escape sequence" ~delta:(-1);
-      Other
-    }
-
-and jbuild_block_comment = parse
-  | '"'
-    { Buffer.clear escaped_buf;
-      ignore (quoted_string In_block_comment lexbuf : string);
-      jbuild_block_comment lexbuf
-    }
-  | "|#"
-    { ()
-    }
-  | eof
-    { error lexbuf "unterminated block comment"
-    }
-  | _
-    { jbuild_block_comment lexbuf
-    }
-
 (* rule for dune files *)
-and token = parse
+rule token = parse
  | newline
    { Lexing.new_line lexbuf; token lexbuf }
  | blank+ | comment
@ -316,3 +101,86 @@ and raw_block_string = parse
  | eof
    { Buffer.contents escaped_buf
    }
+
+and quoted_string mode = parse
+  | '"'
+    { Buffer.contents escaped_buf }
+  | '\\'
+    { match escape_sequence mode lexbuf with
+      | Newline -> quoted_string_after_escaped_newline mode lexbuf
+      | Other   -> quoted_string                       mode lexbuf
+    }
+  | newline as s
+    { Lexing.new_line lexbuf;
+      Buffer.add_string escaped_buf s;
+      quoted_string mode lexbuf
+    }
+  | _ as c
+    { Buffer.add_char escaped_buf c;
+      quoted_string mode lexbuf
+    }
+  | eof
+    { if mode <> In_block_comment then
+        error lexbuf "unterminated quoted string";
+      Buffer.contents escaped_buf
+    }
+
+and escape_sequence mode = parse
+  | newline
+    { Lexing.new_line lexbuf;
+      Newline }
+  | ['\\' '\'' '"' 'n' 't' 'b' 'r'] as c
+    { let c =
+        match c with
+        | 'n' -> '\n'
+        | 'r' -> '\r'
+        | 'b' -> '\b'
+        | 't' -> '\t'
+        | _   -> c
+      in
+      Buffer.add_char escaped_buf c;
+      Other
+    }
+  | (digit as c1) (digit as c2) (digit as c3)
+    { let v = eval_decimal_escape c1 c2 c3 in
+      if mode <> In_block_comment && v > 255 then
+        error lexbuf "escape sequence in quoted string out of range"
+          ~delta:(-1);
+      Buffer.add_char escaped_buf (Char.chr v);
+      Other
+    }
+  | digit* as s
+    { if mode <> In_block_comment then
+        error lexbuf "unterminated decimal escape sequence" ~delta:(-1);
+      Buffer.add_char escaped_buf '\\';
+      Buffer.add_string escaped_buf s;
+      Other
+    }
+  | 'x' (hexdigit as c1) (hexdigit as c2)
+    { let v = eval_hex_escape c1 c2 in
+      Buffer.add_char escaped_buf (Char.chr v);
+      Other
+    }
+  | 'x' hexdigit* as s
+    { if mode <> In_block_comment then
+        error lexbuf "unterminated hexadecimal escape sequence" ~delta:(-1);
+      Buffer.add_char escaped_buf '\\';
+      Buffer.add_string escaped_buf s;
+      Other
+    }
+  | _ as c
+    { if mode = New_syntax then
+        error lexbuf "unknown escape sequence" ~delta:(-1);
+      Buffer.add_char escaped_buf '\\';
+      Buffer.add_char escaped_buf c;
+      Other
+    }
+  | eof
+    { if mode <> In_block_comment then
+        error lexbuf "unterminated escape sequence" ~delta:(-1);
+      Other
+    }
+
+and quoted_string_after_escaped_newline mode = parse
+  | [' ' '\t']*
+    { quoted_string mode lexbuf }
--- a/src/usexp/jbuild_lexer.mll
+++ b/src/usexp/jbuild_lexer.mll
@ -0,0 +1,159 @@
+{
+  open Lexer0
+}
+
+let comment   = ';' [^ '\n' '\r']*
+let newline   = '\r'? '\n'
+let blank     = [' ' '\t' '\012']
+let digit     = ['0'-'9']
+let hexdigit  = ['0'-'9' 'a'-'f' 'A'-'F']
+
+let atom_char_jbuild =
+  [^ ';' '(' ')' '"' ' ' '\t' '\r' '\n' '\012']
+
+(* rule for jbuild files *)
+rule token = parse
+  | newline
+    { Lexing.new_line lexbuf; token lexbuf }
+  | blank+ | comment
+    { token lexbuf }
+  | '('
+    { Token.Lparen }
+  | ')'
+    { Rparen }
+  | '"'
+    { Buffer.clear escaped_buf;
+      let start = Lexing.lexeme_start_p lexbuf in
+      let s = quoted_string Old_syntax lexbuf in
+      lexbuf.lex_start_p <- start;
+      Quoted_string s
+    }
+  | "#|"
+    { jbuild_block_comment lexbuf;
+      token lexbuf
+    }
+  | "#;"
+    { Sexp_comment }
+  | eof
+    { Eof }
+  | ""
+    { jbuild_atom "" (Lexing.lexeme_start_p lexbuf) lexbuf }
+
+and jbuild_atom acc start = parse
+  | '#'+ '|'
+    { lexbuf.lex_start_p <- start;
+      error lexbuf "jbuild_atoms cannot contain #|"
+    }
+  | '|'+ '#'
+    { lexbuf.lex_start_p <- start;
+      error lexbuf "jbuild_atoms cannot contain |#"
+    }
+  | ('#'+ | '|'+ | (atom_char_jbuild # ['|' '#'])) as s
+    { jbuild_atom (if acc = "" then s else acc ^ s) start lexbuf
+    }
+  | ""
+    { if acc = "" then
+        error lexbuf "Internal error in the S-expression parser, \
+                      please report upstream.";
+      lexbuf.lex_start_p <- start;
+      Token.Atom (Atom.of_string acc)
+    }
+
+and quoted_string mode = parse
+  | '"'
+    { Buffer.contents escaped_buf }
+  | '\\'
+    { match escape_sequence mode lexbuf with
+      | Newline -> quoted_string_after_escaped_newline mode lexbuf
+      | Other   -> quoted_string                       mode lexbuf
+    }
+  | newline as s
+    { Lexing.new_line lexbuf;
+      Buffer.add_string escaped_buf s;
+      quoted_string mode lexbuf
+    }
+  | _ as c
+    { Buffer.add_char escaped_buf c;
+      quoted_string mode lexbuf
+    }
+  | eof
+    { if mode <> In_block_comment then
+        error lexbuf "unterminated quoted string";
+      Buffer.contents escaped_buf
+    }
+
+and quoted_string_after_escaped_newline mode = parse
+  | [' ' '\t']*
+    { quoted_string mode lexbuf }
+
+and jbuild_block_comment = parse
+  | '"'
+    { Buffer.clear escaped_buf;
+      ignore (quoted_string In_block_comment lexbuf : string);
+      jbuild_block_comment lexbuf
+    }
+  | "|#"
+    { ()
+    }
+  | eof
+    { error lexbuf "unterminated block comment"
+    }
+  | _
+    { jbuild_block_comment lexbuf
+    }
+
+and escape_sequence mode = parse
+  | newline
+    { Lexing.new_line lexbuf;
+      Newline }
+  | ['\\' '\'' '"' 'n' 't' 'b' 'r'] as c
+    { let c =
+        match c with
+        | 'n' -> '\n'
+        | 'r' -> '\r'
+        | 'b' -> '\b'
+        | 't' -> '\t'
+        | _   -> c
+      in
+      Buffer.add_char escaped_buf c;
+      Other
+    }
+  | (digit as c1) (digit as c2) (digit as c3)
+    { let v = eval_decimal_escape c1 c2 c3 in
+      if mode <> In_block_comment && v > 255 then
+        error lexbuf "escape sequence in quoted string out of range"
+          ~delta:(-1);
+      Buffer.add_char escaped_buf (Char.chr v);
+      Other
+    }
+  | digit* as s
+    { if mode <> In_block_comment then
+        error lexbuf "unterminated decimal escape sequence" ~delta:(-1);
+      Buffer.add_char escaped_buf '\\';
+      Buffer.add_string escaped_buf s;
+      Other
+    }
+  | 'x' (hexdigit as c1) (hexdigit as c2)
+    { let v = eval_hex_escape c1 c2 in
+      Buffer.add_char escaped_buf (Char.chr v);
+      Other
+    }
+  | 'x' hexdigit* as s
+    { if mode <> In_block_comment then
+        error lexbuf "unterminated hexadecimal escape sequence" ~delta:(-1);
+      Buffer.add_char escaped_buf '\\';
+      Buffer.add_string escaped_buf s;
+      Other
+    }
+  | _ as c
+    { if mode = New_syntax then
+        error lexbuf "unknown escape sequence" ~delta:(-1);
+      Buffer.add_char escaped_buf '\\';
+      Buffer.add_char escaped_buf c;
+      Other
+    }
+  | eof
+    { if mode <> In_block_comment then
+        error lexbuf "unterminated escape sequence" ~delta:(-1);
+      Other
+    }
--- a/src/usexp/lexer.ml
+++ b/src/usexp/lexer.ml
@ -0,0 +1,4 @@
+include Lexer0
+
+let token = Dune_lexer.token
+let jbuild_token = Jbuild_lexer.token
--- a/src/usexp/lexer0.ml
+++ b/src/usexp/lexer0.ml
@ -0,0 +1,68 @@
+
+module Token = struct
+  type t =
+    | Atom          of Atom.t
+    | Quoted_string of string
+    | Lparen
+    | Rparen
+    | Sexp_comment
+    | Eof
+end
+
+type t = Lexing.lexbuf -> Token.t
+
+module Error = struct
+  type t =
+    { start   : Lexing.position
+    ; stop    : Lexing.position
+    ; message : string
+    }
+end
+
+exception Error of Error.t
+
+let error ?(delta=0) lexbuf message =
+  let start = Lexing.lexeme_start_p lexbuf in
+  raise
+    (Error { start = { start with pos_cnum = start.pos_cnum + delta }
+           ; stop  = Lexing.lexeme_end_p   lexbuf
+           ; message
+           })
+
+let escaped_buf = Buffer.create 256
+
+(* The difference between the old and new syntax is that the old
+   syntax allows backslash following by any characters other than 'n',
+   'x', ... and interpret it as it. The new syntax is stricter in
+   order to allow introducing new escape sequence in the future if
+   needed. *)
+type escape_mode =
+  | In_block_comment (* Inside #|...|# comments (old syntax) *)
+  | Old_syntax
+  | New_syntax
+
+type escape_sequence =
+  | Newline
+  | Other
+
+type block_string_line_kind =
+  | With_escape_sequences
+  | Raw
+
+let eval_decimal_char c = Char.code c - Char.code '0'
+
+let eval_decimal_escape c1 c2 c3 =
+  (eval_decimal_char c1) * 100 +
+  (eval_decimal_char c2) * 10  +
+  (eval_decimal_char c3)
+
+let eval_hex_char c =
+  match c with
+  | '0'..'9' -> Char.code c - Char.code '0'
+  | 'a'..'f' -> Char.code c - Char.code 'a' + 10
+  | 'A'..'F' -> Char.code c - Char.code 'A' + 10
+  | _ -> -1
+
+let eval_hex_escape c1 c2 =
+  (eval_hex_char c1) * 16 +
+  (eval_hex_char c2)
--- a/src/usexp/lexer0.mli
+++ b/src/usexp/lexer0.mli
@ -0,0 +1,44 @@
+module Token : sig
+  type t =
+    | Atom          of Atom.t
+    | Quoted_string of string
+    | Lparen
+    | Rparen
+    | Sexp_comment
+    | Eof
+end
+
+type t = Lexing.lexbuf -> Token.t
+
+module Error : sig
+  type t =
+    { start   : Lexing.position
+    ; stop    : Lexing.position
+    ; message : string
+    }
+end
+
+val error : ?delta:int -> Lexing.lexbuf -> string -> 'a
+
+val escaped_buf : Buffer.t
+
+exception Error of Error.t
+
+type escape_mode =
+  | In_block_comment (* Inside #|...|# comments (old syntax) *)
+  | Old_syntax
+  | New_syntax
+
+type escape_sequence =
+  | Newline
+  | Other
+
+type block_string_line_kind =
+  | With_escape_sequences
+  | Raw
+
+val eval_decimal_char : char -> int
+
+val eval_decimal_escape : char -> char -> char -> int
+
+val eval_hex_escape : char -> char -> int