Split dune and jbuild lexers

Signed-off-by: Rudi Grinberg <rudi.grinberg@gmail.com>
2018-06-20 19:29:16 +06:30 · 2018-06-20 19:29:16 +06:30 · e3aa13424d
parent 411552e4c7
commit e3aa13424d
6 changed files with 361 additions and 218 deletions
--- a/src/usexp/dune
+++ b/src/usexp/dune
@ -3,4 +3,4 @@
  (synopsis "[Internal] S-expression library")
  (public_name dune.usexp)))
-(ocamllex (lexer))
+(ocamllex (dune_lexer jbuild_lexer))
--- a/src/usexp/dune_lexer.mll
+++ b/src/usexp/dune_lexer.mll
@ -1,71 +1,5 @@
 {
-module Token = struct
+  open Lexer0
  type t =
    | Atom          of Atom.t
    | Quoted_string of string
    | Lparen
    | Rparen
    | Sexp_comment
    | Eof
 end
 type t = Lexing.lexbuf -> Token.t
 module Error = struct
  type t =
    { start   : Lexing.position
    ; stop    : Lexing.position
    ; message : string
    }
 end
 exception Error of Error.t
 let error ?(delta=0) lexbuf message =
  let start = Lexing.lexeme_start_p lexbuf in
  raise
    (Error { start = { start with pos_cnum = start.pos_cnum + delta }
           ; stop  = Lexing.lexeme_end_p   lexbuf
           ; message
           })
 (* The difference between the old and new syntax is that the old
   syntax allows backslash following by any characters other than 'n',
   'x', ... and interpret it as it. The new syntax is stricter in
   order to allow introducing new escape sequence in the future if
   needed. *)
 type escape_mode =
  | In_block_comment (* Inside #|...|# comments (old syntax) *)
  | Old_syntax
  | New_syntax
 let eval_decimal_char c = Char.code c - Char.code '0'
 let eval_decimal_escape c1 c2 c3 =
  (eval_decimal_char c1) * 100 +
  (eval_decimal_char c2) * 10  +
  (eval_decimal_char c3)
 let eval_hex_char c =
  match c with
  | '0'..'9' -> Char.code c - Char.code '0'
  | 'a'..'f' -> Char.code c - Char.code 'a' + 10
  | 'A'..'F' -> Char.code c - Char.code 'A' + 10
  | _ -> -1
 let eval_hex_escape c1 c2 =
  (eval_hex_char c1) * 16 +
  (eval_hex_char c2)
 type escape_sequence =
  | Newline
  | Other
 let escaped_buf = Buffer.create 256
 type block_string_line_kind =
  | With_escape_sequences
  | Raw
 }
 let comment   = ';' [^ '\n' '\r']*
@ -74,160 +8,11 @@ let blank     = [' ' '\t' '\012']
 let digit     = ['0'-'9']
 let hexdigit  = ['0'-'9' 'a'-'f' 'A'-'F']
 let atom_char_jbuild =
  [^ ';' '(' ')' '"' ' ' '\t' '\r' '\n' '\012']
 let atom_char_dune =
  [^ '%' ';' '(' ')' '"' '\000'-'\032' '\127'-'\255']
 (* rule for jbuild files *)
 rule jbuild_token = parse
  | newline
    { Lexing.new_line lexbuf; jbuild_token lexbuf }
  | blank+ | comment
    { jbuild_token lexbuf }
  | '('
    { Token.Lparen }
  | ')'
    { Rparen }
  | '"'
    { Buffer.clear escaped_buf;
      let start = Lexing.lexeme_start_p lexbuf in
      let s = quoted_string Old_syntax lexbuf in
      lexbuf.lex_start_p <- start;
      Quoted_string s
    }
  | "#|"
    { jbuild_block_comment lexbuf;
      jbuild_token lexbuf
    }
  | "#;"
    { Sexp_comment }
  | eof
    { Eof }
  | ""
    { jbuild_atom "" (Lexing.lexeme_start_p lexbuf) lexbuf }
 and jbuild_atom acc start = parse
  | '#'+ '|'
    { lexbuf.lex_start_p <- start;
      error lexbuf "jbuild_atoms cannot contain #|"
    }
  | '|'+ '#'
    { lexbuf.lex_start_p <- start;
      error lexbuf "jbuild_atoms cannot contain |#"
    }
  | ('#'+ | '|'+ | (atom_char_jbuild # ['|' '#'])) as s
    { jbuild_atom (if acc = "" then s else acc ^ s) start lexbuf
    }
  | ""
    { if acc = "" then
        error lexbuf "Internal error in the S-expression parser, \
                      please report upstream.";
      lexbuf.lex_start_p <- start;
      Token.Atom (Atom.of_string acc)
    }
 and quoted_string mode = parse
  | '"'
    { Buffer.contents escaped_buf }
  | '\\'
    { match escape_sequence mode lexbuf with
      | Newline -> quoted_string_after_escaped_newline mode lexbuf
      | Other   -> quoted_string                       mode lexbuf
    }
  | newline as s
    { Lexing.new_line lexbuf;
      Buffer.add_string escaped_buf s;
      quoted_string mode lexbuf
    }
  | _ as c
    { Buffer.add_char escaped_buf c;
      quoted_string mode lexbuf
    }
  | eof
    { if mode <> In_block_comment then
        error lexbuf "unterminated quoted string";
      Buffer.contents escaped_buf
    }
 and quoted_string_after_escaped_newline mode = parse
  | [' ' '\t']*
    { quoted_string mode lexbuf }
 and escape_sequence mode = parse
  | newline
    { Lexing.new_line lexbuf;
      Newline }
  | ['\\' '\'' '"' 'n' 't' 'b' 'r'] as c
    { let c =
        match c with
        | 'n' -> '\n'
        | 'r' -> '\r'
        | 'b' -> '\b'
        | 't' -> '\t'
        | _   -> c
      in
      Buffer.add_char escaped_buf c;
      Other
    }
  | (digit as c1) (digit as c2) (digit as c3)
    { let v = eval_decimal_escape c1 c2 c3 in
      if mode <> In_block_comment && v > 255 then
        error lexbuf "escape sequence in quoted string out of range"
          ~delta:(-1);
      Buffer.add_char escaped_buf (Char.chr v);
      Other
    }
  | digit* as s
    { if mode <> In_block_comment then
        error lexbuf "unterminated decimal escape sequence" ~delta:(-1);
      Buffer.add_char escaped_buf '\\';
      Buffer.add_string escaped_buf s;
      Other
    }
  | 'x' (hexdigit as c1) (hexdigit as c2)
    { let v = eval_hex_escape c1 c2 in
      Buffer.add_char escaped_buf (Char.chr v);
      Other
    }
  | 'x' hexdigit* as s
    { if mode <> In_block_comment then
        error lexbuf "unterminated hexadecimal escape sequence" ~delta:(-1);
      Buffer.add_char escaped_buf '\\';
      Buffer.add_string escaped_buf s;
      Other
    }
  | _ as c
    { if mode = New_syntax then
        error lexbuf "unknown escape sequence" ~delta:(-1);
      Buffer.add_char escaped_buf '\\';
      Buffer.add_char escaped_buf c;
      Other
    }
  | eof
    { if mode <> In_block_comment then
        error lexbuf "unterminated escape sequence" ~delta:(-1);
      Other
    }
 and jbuild_block_comment = parse
  | '"'
    { Buffer.clear escaped_buf;
      ignore (quoted_string In_block_comment lexbuf : string);
      jbuild_block_comment lexbuf
    }
  | "|#"
    { ()
    }
  | eof
    { error lexbuf "unterminated block comment"
    }
  | _
    { jbuild_block_comment lexbuf
    }
 (* rule for dune files *)
-and token = parse
+rule token = parse
  | newline
    { Lexing.new_line lexbuf; token lexbuf }
  | blank+ | comment
@ -316,3 +101,86 @@ and raw_block_string = parse
  | eof
    { Buffer.contents escaped_buf
    }
 and quoted_string mode = parse
  | '"'
    { Buffer.contents escaped_buf }
  | '\\'
    { match escape_sequence mode lexbuf with
      | Newline -> quoted_string_after_escaped_newline mode lexbuf
      | Other   -> quoted_string                       mode lexbuf
    }
  | newline as s
    { Lexing.new_line lexbuf;
      Buffer.add_string escaped_buf s;
      quoted_string mode lexbuf
    }
  | _ as c
    { Buffer.add_char escaped_buf c;
      quoted_string mode lexbuf
    }
  | eof
    { if mode <> In_block_comment then
        error lexbuf "unterminated quoted string";
      Buffer.contents escaped_buf
    }
 and escape_sequence mode = parse
  | newline
    { Lexing.new_line lexbuf;
      Newline }
  | ['\\' '\'' '"' 'n' 't' 'b' 'r'] as c
    { let c =
        match c with
        | 'n' -> '\n'
        | 'r' -> '\r'
        | 'b' -> '\b'
        | 't' -> '\t'
        | _   -> c
      in
      Buffer.add_char escaped_buf c;
      Other
    }
  | (digit as c1) (digit as c2) (digit as c3)
    { let v = eval_decimal_escape c1 c2 c3 in
      if mode <> In_block_comment && v > 255 then
        error lexbuf "escape sequence in quoted string out of range"
          ~delta:(-1);
      Buffer.add_char escaped_buf (Char.chr v);
      Other
    }
  | digit* as s
    { if mode <> In_block_comment then
        error lexbuf "unterminated decimal escape sequence" ~delta:(-1);
      Buffer.add_char escaped_buf '\\';
      Buffer.add_string escaped_buf s;
      Other
    }
  | 'x' (hexdigit as c1) (hexdigit as c2)
    { let v = eval_hex_escape c1 c2 in
      Buffer.add_char escaped_buf (Char.chr v);
      Other
    }
  | 'x' hexdigit* as s
    { if mode <> In_block_comment then
        error lexbuf "unterminated hexadecimal escape sequence" ~delta:(-1);
      Buffer.add_char escaped_buf '\\';
      Buffer.add_string escaped_buf s;
      Other
    }
  | _ as c
    { if mode = New_syntax then
        error lexbuf "unknown escape sequence" ~delta:(-1);
      Buffer.add_char escaped_buf '\\';
      Buffer.add_char escaped_buf c;
      Other
    }
  | eof
    { if mode <> In_block_comment then
        error lexbuf "unterminated escape sequence" ~delta:(-1);
      Other
    }
 and quoted_string_after_escaped_newline mode = parse
  | [' ' '\t']*
    { quoted_string mode lexbuf }
--- a/src/usexp/jbuild_lexer.mll
+++ b/src/usexp/jbuild_lexer.mll
@ -0,0 +1,159 @@
 {
  open Lexer0
 }
 let comment   = ';' [^ '\n' '\r']*
 let newline   = '\r'? '\n'
 let blank     = [' ' '\t' '\012']
 let digit     = ['0'-'9']
 let hexdigit  = ['0'-'9' 'a'-'f' 'A'-'F']
 let atom_char_jbuild =
  [^ ';' '(' ')' '"' ' ' '\t' '\r' '\n' '\012']
 (* rule for jbuild files *)
 rule token = parse
  | newline
    { Lexing.new_line lexbuf; token lexbuf }
  | blank+ | comment
    { token lexbuf }
  | '('
    { Token.Lparen }
  | ')'
    { Rparen }
  | '"'
    { Buffer.clear escaped_buf;
      let start = Lexing.lexeme_start_p lexbuf in
      let s = quoted_string Old_syntax lexbuf in
      lexbuf.lex_start_p <- start;
      Quoted_string s
    }
  | "#|"
    { jbuild_block_comment lexbuf;
      token lexbuf
    }
  | "#;"
    { Sexp_comment }
  | eof
    { Eof }
  | ""
    { jbuild_atom "" (Lexing.lexeme_start_p lexbuf) lexbuf }
 and jbuild_atom acc start = parse
  | '#'+ '|'
    { lexbuf.lex_start_p <- start;
      error lexbuf "jbuild_atoms cannot contain #|"
    }
  | '|'+ '#'
    { lexbuf.lex_start_p <- start;
      error lexbuf "jbuild_atoms cannot contain |#"
    }
  | ('#'+ | '|'+ | (atom_char_jbuild # ['|' '#'])) as s
    { jbuild_atom (if acc = "" then s else acc ^ s) start lexbuf
    }
  | ""
    { if acc = "" then
        error lexbuf "Internal error in the S-expression parser, \
                      please report upstream.";
      lexbuf.lex_start_p <- start;
      Token.Atom (Atom.of_string acc)
    }
 and quoted_string mode = parse
  | '"'
    { Buffer.contents escaped_buf }
  | '\\'
    { match escape_sequence mode lexbuf with
      | Newline -> quoted_string_after_escaped_newline mode lexbuf
      | Other   -> quoted_string                       mode lexbuf
    }
  | newline as s
    { Lexing.new_line lexbuf;
      Buffer.add_string escaped_buf s;
      quoted_string mode lexbuf
    }
  | _ as c
    { Buffer.add_char escaped_buf c;
      quoted_string mode lexbuf
    }
  | eof
    { if mode <> In_block_comment then
        error lexbuf "unterminated quoted string";
      Buffer.contents escaped_buf
    }
 and quoted_string_after_escaped_newline mode = parse
  | [' ' '\t']*
    { quoted_string mode lexbuf }
 and jbuild_block_comment = parse
  | '"'
    { Buffer.clear escaped_buf;
      ignore (quoted_string In_block_comment lexbuf : string);
      jbuild_block_comment lexbuf
    }
  | "|#"
    { ()
    }
  | eof
    { error lexbuf "unterminated block comment"
    }
  | _
    { jbuild_block_comment lexbuf
    }
 and escape_sequence mode = parse
  | newline
    { Lexing.new_line lexbuf;
      Newline }
  | ['\\' '\'' '"' 'n' 't' 'b' 'r'] as c
    { let c =
        match c with
        | 'n' -> '\n'
        | 'r' -> '\r'
        | 'b' -> '\b'
        | 't' -> '\t'
        | _   -> c
      in
      Buffer.add_char escaped_buf c;
      Other
    }
  | (digit as c1) (digit as c2) (digit as c3)
    { let v = eval_decimal_escape c1 c2 c3 in
      if mode <> In_block_comment && v > 255 then
        error lexbuf "escape sequence in quoted string out of range"
          ~delta:(-1);
      Buffer.add_char escaped_buf (Char.chr v);
      Other
    }
  | digit* as s
    { if mode <> In_block_comment then
        error lexbuf "unterminated decimal escape sequence" ~delta:(-1);
      Buffer.add_char escaped_buf '\\';
      Buffer.add_string escaped_buf s;
      Other
    }
  | 'x' (hexdigit as c1) (hexdigit as c2)
    { let v = eval_hex_escape c1 c2 in
      Buffer.add_char escaped_buf (Char.chr v);
      Other
    }
  | 'x' hexdigit* as s
    { if mode <> In_block_comment then
        error lexbuf "unterminated hexadecimal escape sequence" ~delta:(-1);
      Buffer.add_char escaped_buf '\\';
      Buffer.add_string escaped_buf s;
      Other
    }
  | _ as c
    { if mode = New_syntax then
        error lexbuf "unknown escape sequence" ~delta:(-1);
      Buffer.add_char escaped_buf '\\';
      Buffer.add_char escaped_buf c;
      Other
    }
  | eof
    { if mode <> In_block_comment then
        error lexbuf "unterminated escape sequence" ~delta:(-1);
      Other
    }
--- a/src/usexp/lexer.ml
+++ b/src/usexp/lexer.ml
@ -0,0 +1,4 @@
 include Lexer0
 let token = Dune_lexer.token
 let jbuild_token = Jbuild_lexer.token
--- a/src/usexp/lexer0.ml
+++ b/src/usexp/lexer0.ml
@ -0,0 +1,68 @@
 module Token = struct
  type t =
    | Atom          of Atom.t
    | Quoted_string of string
    | Lparen
    | Rparen
    | Sexp_comment
    | Eof
 end
 type t = Lexing.lexbuf -> Token.t
 module Error = struct
  type t =
    { start   : Lexing.position
    ; stop    : Lexing.position
    ; message : string
    }
 end
 exception Error of Error.t
 let error ?(delta=0) lexbuf message =
  let start = Lexing.lexeme_start_p lexbuf in
  raise
    (Error { start = { start with pos_cnum = start.pos_cnum + delta }
           ; stop  = Lexing.lexeme_end_p   lexbuf
           ; message
           })
 let escaped_buf = Buffer.create 256
 (* The difference between the old and new syntax is that the old
   syntax allows backslash following by any characters other than 'n',
   'x', ... and interpret it as it. The new syntax is stricter in
   order to allow introducing new escape sequence in the future if
   needed. *)
 type escape_mode =
  | In_block_comment (* Inside #|...|# comments (old syntax) *)
  | Old_syntax
  | New_syntax
 type escape_sequence =
  | Newline
  | Other
 type block_string_line_kind =
  | With_escape_sequences
  | Raw
 let eval_decimal_char c = Char.code c - Char.code '0'
 let eval_decimal_escape c1 c2 c3 =
  (eval_decimal_char c1) * 100 +
  (eval_decimal_char c2) * 10  +
  (eval_decimal_char c3)
 let eval_hex_char c =
  match c with
  | '0'..'9' -> Char.code c - Char.code '0'
  | 'a'..'f' -> Char.code c - Char.code 'a' + 10
  | 'A'..'F' -> Char.code c - Char.code 'A' + 10
  | _ -> -1
 let eval_hex_escape c1 c2 =
  (eval_hex_char c1) * 16 +
  (eval_hex_char c2)
--- a/src/usexp/lexer0.mli
+++ b/src/usexp/lexer0.mli
@ -0,0 +1,44 @@
 module Token : sig
  type t =
    | Atom          of Atom.t
    | Quoted_string of string
    | Lparen
    | Rparen
    | Sexp_comment
    | Eof
 end
 type t = Lexing.lexbuf -> Token.t
 module Error : sig
  type t =
    { start   : Lexing.position
    ; stop    : Lexing.position
    ; message : string
    }
 end
 val error : ?delta:int -> Lexing.lexbuf -> string -> 'a
 val escaped_buf : Buffer.t
 exception Error of Error.t
 type escape_mode =
  | In_block_comment (* Inside #|...|# comments (old syntax) *)
  | Old_syntax
  | New_syntax
 type escape_sequence =
  | Newline
  | Other
 type block_string_line_kind =
  | With_escape_sequences
  | Raw
 val eval_decimal_char : char -> int
 val eval_decimal_escape : char -> char -> char -> int
 val eval_hex_escape : char -> char -> int