diff --git a/src/usexp/dune b/src/usexp/dune index a26d42f0..bd71f71a 100644 --- a/src/usexp/dune +++ b/src/usexp/dune @@ -3,4 +3,4 @@ (synopsis "[Internal] S-expression library") (public_name dune.usexp))) -(ocamllex (lexer)) +(ocamllex (dune_lexer jbuild_lexer)) diff --git a/src/usexp/lexer.mll b/src/usexp/dune_lexer.mll similarity index 59% rename from src/usexp/lexer.mll rename to src/usexp/dune_lexer.mll index 2c9098b0..3bf74724 100644 --- a/src/usexp/lexer.mll +++ b/src/usexp/dune_lexer.mll @@ -1,71 +1,5 @@ { -module Token = struct - type t = - | Atom of Atom.t - | Quoted_string of string - | Lparen - | Rparen - | Sexp_comment - | Eof -end - -type t = Lexing.lexbuf -> Token.t - -module Error = struct - type t = - { start : Lexing.position - ; stop : Lexing.position - ; message : string - } -end - -exception Error of Error.t - -let error ?(delta=0) lexbuf message = - let start = Lexing.lexeme_start_p lexbuf in - raise - (Error { start = { start with pos_cnum = start.pos_cnum + delta } - ; stop = Lexing.lexeme_end_p lexbuf - ; message - }) - -(* The difference between the old and new syntax is that the old - syntax allows backslash following by any characters other than 'n', - 'x', ... and interpret it as it. The new syntax is stricter in - order to allow introducing new escape sequence in the future if - needed. *) -type escape_mode = - | In_block_comment (* Inside #|...|# comments (old syntax) *) - | Old_syntax - | New_syntax - -let eval_decimal_char c = Char.code c - Char.code '0' - -let eval_decimal_escape c1 c2 c3 = - (eval_decimal_char c1) * 100 + - (eval_decimal_char c2) * 10 + - (eval_decimal_char c3) - -let eval_hex_char c = - match c with - | '0'..'9' -> Char.code c - Char.code '0' - | 'a'..'f' -> Char.code c - Char.code 'a' + 10 - | 'A'..'F' -> Char.code c - Char.code 'A' + 10 - | _ -> -1 - -let eval_hex_escape c1 c2 = - (eval_hex_char c1) * 16 + - (eval_hex_char c2) - -type escape_sequence = - | Newline - | Other - -let escaped_buf = Buffer.create 256 - -type block_string_line_kind = - | With_escape_sequences - | Raw + open Lexer0 } let comment = ';' [^ '\n' '\r']* @@ -74,160 +8,11 @@ let blank = [' ' '\t' '\012'] let digit = ['0'-'9'] let hexdigit = ['0'-'9' 'a'-'f' 'A'-'F'] -let atom_char_jbuild = - [^ ';' '(' ')' '"' ' ' '\t' '\r' '\n' '\012'] let atom_char_dune = [^ '%' ';' '(' ')' '"' '\000'-'\032' '\127'-'\255'] -(* rule for jbuild files *) -rule jbuild_token = parse - | newline - { Lexing.new_line lexbuf; jbuild_token lexbuf } - | blank+ | comment - { jbuild_token lexbuf } - | '(' - { Token.Lparen } - | ')' - { Rparen } - | '"' - { Buffer.clear escaped_buf; - let start = Lexing.lexeme_start_p lexbuf in - let s = quoted_string Old_syntax lexbuf in - lexbuf.lex_start_p <- start; - Quoted_string s - } - | "#|" - { jbuild_block_comment lexbuf; - jbuild_token lexbuf - } - | "#;" - { Sexp_comment } - | eof - { Eof } - | "" - { jbuild_atom "" (Lexing.lexeme_start_p lexbuf) lexbuf } - -and jbuild_atom acc start = parse - | '#'+ '|' - { lexbuf.lex_start_p <- start; - error lexbuf "jbuild_atoms cannot contain #|" - } - | '|'+ '#' - { lexbuf.lex_start_p <- start; - error lexbuf "jbuild_atoms cannot contain |#" - } - | ('#'+ | '|'+ | (atom_char_jbuild # ['|' '#'])) as s - { jbuild_atom (if acc = "" then s else acc ^ s) start lexbuf - } - | "" - { if acc = "" then - error lexbuf "Internal error in the S-expression parser, \ - please report upstream."; - lexbuf.lex_start_p <- start; - Token.Atom (Atom.of_string acc) - } - -and quoted_string mode = parse - | '"' - { Buffer.contents escaped_buf } - | '\\' - { match escape_sequence mode lexbuf with - | Newline -> quoted_string_after_escaped_newline mode lexbuf - | Other -> quoted_string mode lexbuf - } - | newline as s - { Lexing.new_line lexbuf; - Buffer.add_string escaped_buf s; - quoted_string mode lexbuf - } - | _ as c - { Buffer.add_char escaped_buf c; - quoted_string mode lexbuf - } - | eof - { if mode <> In_block_comment then - error lexbuf "unterminated quoted string"; - Buffer.contents escaped_buf - } - -and quoted_string_after_escaped_newline mode = parse - | [' ' '\t']* - { quoted_string mode lexbuf } - -and escape_sequence mode = parse - | newline - { Lexing.new_line lexbuf; - Newline } - | ['\\' '\'' '"' 'n' 't' 'b' 'r'] as c - { let c = - match c with - | 'n' -> '\n' - | 'r' -> '\r' - | 'b' -> '\b' - | 't' -> '\t' - | _ -> c - in - Buffer.add_char escaped_buf c; - Other - } - | (digit as c1) (digit as c2) (digit as c3) - { let v = eval_decimal_escape c1 c2 c3 in - if mode <> In_block_comment && v > 255 then - error lexbuf "escape sequence in quoted string out of range" - ~delta:(-1); - Buffer.add_char escaped_buf (Char.chr v); - Other - } - | digit* as s - { if mode <> In_block_comment then - error lexbuf "unterminated decimal escape sequence" ~delta:(-1); - Buffer.add_char escaped_buf '\\'; - Buffer.add_string escaped_buf s; - Other - } - | 'x' (hexdigit as c1) (hexdigit as c2) - { let v = eval_hex_escape c1 c2 in - Buffer.add_char escaped_buf (Char.chr v); - Other - } - | 'x' hexdigit* as s - { if mode <> In_block_comment then - error lexbuf "unterminated hexadecimal escape sequence" ~delta:(-1); - Buffer.add_char escaped_buf '\\'; - Buffer.add_string escaped_buf s; - Other - } - | _ as c - { if mode = New_syntax then - error lexbuf "unknown escape sequence" ~delta:(-1); - Buffer.add_char escaped_buf '\\'; - Buffer.add_char escaped_buf c; - Other - } - | eof - { if mode <> In_block_comment then - error lexbuf "unterminated escape sequence" ~delta:(-1); - Other - } - -and jbuild_block_comment = parse - | '"' - { Buffer.clear escaped_buf; - ignore (quoted_string In_block_comment lexbuf : string); - jbuild_block_comment lexbuf - } - | "|#" - { () - } - | eof - { error lexbuf "unterminated block comment" - } - | _ - { jbuild_block_comment lexbuf - } - (* rule for dune files *) -and token = parse +rule token = parse | newline { Lexing.new_line lexbuf; token lexbuf } | blank+ | comment @@ -316,3 +101,86 @@ and raw_block_string = parse | eof { Buffer.contents escaped_buf } + +and quoted_string mode = parse + | '"' + { Buffer.contents escaped_buf } + | '\\' + { match escape_sequence mode lexbuf with + | Newline -> quoted_string_after_escaped_newline mode lexbuf + | Other -> quoted_string mode lexbuf + } + | newline as s + { Lexing.new_line lexbuf; + Buffer.add_string escaped_buf s; + quoted_string mode lexbuf + } + | _ as c + { Buffer.add_char escaped_buf c; + quoted_string mode lexbuf + } + | eof + { if mode <> In_block_comment then + error lexbuf "unterminated quoted string"; + Buffer.contents escaped_buf + } + +and escape_sequence mode = parse + | newline + { Lexing.new_line lexbuf; + Newline } + | ['\\' '\'' '"' 'n' 't' 'b' 'r'] as c + { let c = + match c with + | 'n' -> '\n' + | 'r' -> '\r' + | 'b' -> '\b' + | 't' -> '\t' + | _ -> c + in + Buffer.add_char escaped_buf c; + Other + } + | (digit as c1) (digit as c2) (digit as c3) + { let v = eval_decimal_escape c1 c2 c3 in + if mode <> In_block_comment && v > 255 then + error lexbuf "escape sequence in quoted string out of range" + ~delta:(-1); + Buffer.add_char escaped_buf (Char.chr v); + Other + } + | digit* as s + { if mode <> In_block_comment then + error lexbuf "unterminated decimal escape sequence" ~delta:(-1); + Buffer.add_char escaped_buf '\\'; + Buffer.add_string escaped_buf s; + Other + } + | 'x' (hexdigit as c1) (hexdigit as c2) + { let v = eval_hex_escape c1 c2 in + Buffer.add_char escaped_buf (Char.chr v); + Other + } + | 'x' hexdigit* as s + { if mode <> In_block_comment then + error lexbuf "unterminated hexadecimal escape sequence" ~delta:(-1); + Buffer.add_char escaped_buf '\\'; + Buffer.add_string escaped_buf s; + Other + } + | _ as c + { if mode = New_syntax then + error lexbuf "unknown escape sequence" ~delta:(-1); + Buffer.add_char escaped_buf '\\'; + Buffer.add_char escaped_buf c; + Other + } + | eof + { if mode <> In_block_comment then + error lexbuf "unterminated escape sequence" ~delta:(-1); + Other + } + +and quoted_string_after_escaped_newline mode = parse + | [' ' '\t']* + { quoted_string mode lexbuf } diff --git a/src/usexp/jbuild_lexer.mll b/src/usexp/jbuild_lexer.mll new file mode 100644 index 00000000..51376bd9 --- /dev/null +++ b/src/usexp/jbuild_lexer.mll @@ -0,0 +1,159 @@ +{ + open Lexer0 +} + +let comment = ';' [^ '\n' '\r']* +let newline = '\r'? '\n' +let blank = [' ' '\t' '\012'] +let digit = ['0'-'9'] +let hexdigit = ['0'-'9' 'a'-'f' 'A'-'F'] + +let atom_char_jbuild = + [^ ';' '(' ')' '"' ' ' '\t' '\r' '\n' '\012'] + +(* rule for jbuild files *) +rule token = parse + | newline + { Lexing.new_line lexbuf; token lexbuf } + | blank+ | comment + { token lexbuf } + | '(' + { Token.Lparen } + | ')' + { Rparen } + | '"' + { Buffer.clear escaped_buf; + let start = Lexing.lexeme_start_p lexbuf in + let s = quoted_string Old_syntax lexbuf in + lexbuf.lex_start_p <- start; + Quoted_string s + } + | "#|" + { jbuild_block_comment lexbuf; + token lexbuf + } + | "#;" + { Sexp_comment } + | eof + { Eof } + | "" + { jbuild_atom "" (Lexing.lexeme_start_p lexbuf) lexbuf } + +and jbuild_atom acc start = parse + | '#'+ '|' + { lexbuf.lex_start_p <- start; + error lexbuf "jbuild_atoms cannot contain #|" + } + | '|'+ '#' + { lexbuf.lex_start_p <- start; + error lexbuf "jbuild_atoms cannot contain |#" + } + | ('#'+ | '|'+ | (atom_char_jbuild # ['|' '#'])) as s + { jbuild_atom (if acc = "" then s else acc ^ s) start lexbuf + } + | "" + { if acc = "" then + error lexbuf "Internal error in the S-expression parser, \ + please report upstream."; + lexbuf.lex_start_p <- start; + Token.Atom (Atom.of_string acc) + } + +and quoted_string mode = parse + | '"' + { Buffer.contents escaped_buf } + | '\\' + { match escape_sequence mode lexbuf with + | Newline -> quoted_string_after_escaped_newline mode lexbuf + | Other -> quoted_string mode lexbuf + } + | newline as s + { Lexing.new_line lexbuf; + Buffer.add_string escaped_buf s; + quoted_string mode lexbuf + } + | _ as c + { Buffer.add_char escaped_buf c; + quoted_string mode lexbuf + } + | eof + { if mode <> In_block_comment then + error lexbuf "unterminated quoted string"; + Buffer.contents escaped_buf + } + +and quoted_string_after_escaped_newline mode = parse + | [' ' '\t']* + { quoted_string mode lexbuf } + +and jbuild_block_comment = parse + | '"' + { Buffer.clear escaped_buf; + ignore (quoted_string In_block_comment lexbuf : string); + jbuild_block_comment lexbuf + } + | "|#" + { () + } + | eof + { error lexbuf "unterminated block comment" + } + | _ + { jbuild_block_comment lexbuf + } + +and escape_sequence mode = parse + | newline + { Lexing.new_line lexbuf; + Newline } + | ['\\' '\'' '"' 'n' 't' 'b' 'r'] as c + { let c = + match c with + | 'n' -> '\n' + | 'r' -> '\r' + | 'b' -> '\b' + | 't' -> '\t' + | _ -> c + in + Buffer.add_char escaped_buf c; + Other + } + | (digit as c1) (digit as c2) (digit as c3) + { let v = eval_decimal_escape c1 c2 c3 in + if mode <> In_block_comment && v > 255 then + error lexbuf "escape sequence in quoted string out of range" + ~delta:(-1); + Buffer.add_char escaped_buf (Char.chr v); + Other + } + | digit* as s + { if mode <> In_block_comment then + error lexbuf "unterminated decimal escape sequence" ~delta:(-1); + Buffer.add_char escaped_buf '\\'; + Buffer.add_string escaped_buf s; + Other + } + | 'x' (hexdigit as c1) (hexdigit as c2) + { let v = eval_hex_escape c1 c2 in + Buffer.add_char escaped_buf (Char.chr v); + Other + } + | 'x' hexdigit* as s + { if mode <> In_block_comment then + error lexbuf "unterminated hexadecimal escape sequence" ~delta:(-1); + Buffer.add_char escaped_buf '\\'; + Buffer.add_string escaped_buf s; + Other + } + | _ as c + { if mode = New_syntax then + error lexbuf "unknown escape sequence" ~delta:(-1); + Buffer.add_char escaped_buf '\\'; + Buffer.add_char escaped_buf c; + Other + } + | eof + { if mode <> In_block_comment then + error lexbuf "unterminated escape sequence" ~delta:(-1); + Other + } diff --git a/src/usexp/lexer.ml b/src/usexp/lexer.ml new file mode 100644 index 00000000..aff4e6d2 --- /dev/null +++ b/src/usexp/lexer.ml @@ -0,0 +1,4 @@ +include Lexer0 + +let token = Dune_lexer.token +let jbuild_token = Jbuild_lexer.token diff --git a/src/usexp/lexer0.ml b/src/usexp/lexer0.ml new file mode 100644 index 00000000..5ebc545a --- /dev/null +++ b/src/usexp/lexer0.ml @@ -0,0 +1,68 @@ + +module Token = struct + type t = + | Atom of Atom.t + | Quoted_string of string + | Lparen + | Rparen + | Sexp_comment + | Eof +end + +type t = Lexing.lexbuf -> Token.t + +module Error = struct + type t = + { start : Lexing.position + ; stop : Lexing.position + ; message : string + } +end + +exception Error of Error.t + +let error ?(delta=0) lexbuf message = + let start = Lexing.lexeme_start_p lexbuf in + raise + (Error { start = { start with pos_cnum = start.pos_cnum + delta } + ; stop = Lexing.lexeme_end_p lexbuf + ; message + }) + +let escaped_buf = Buffer.create 256 + +(* The difference between the old and new syntax is that the old + syntax allows backslash following by any characters other than 'n', + 'x', ... and interpret it as it. The new syntax is stricter in + order to allow introducing new escape sequence in the future if + needed. *) +type escape_mode = + | In_block_comment (* Inside #|...|# comments (old syntax) *) + | Old_syntax + | New_syntax + +type escape_sequence = + | Newline + | Other + +type block_string_line_kind = + | With_escape_sequences + | Raw + +let eval_decimal_char c = Char.code c - Char.code '0' + +let eval_decimal_escape c1 c2 c3 = + (eval_decimal_char c1) * 100 + + (eval_decimal_char c2) * 10 + + (eval_decimal_char c3) + +let eval_hex_char c = + match c with + | '0'..'9' -> Char.code c - Char.code '0' + | 'a'..'f' -> Char.code c - Char.code 'a' + 10 + | 'A'..'F' -> Char.code c - Char.code 'A' + 10 + | _ -> -1 + +let eval_hex_escape c1 c2 = + (eval_hex_char c1) * 16 + + (eval_hex_char c2) diff --git a/src/usexp/lexer0.mli b/src/usexp/lexer0.mli new file mode 100644 index 00000000..fc15c06a --- /dev/null +++ b/src/usexp/lexer0.mli @@ -0,0 +1,44 @@ +module Token : sig + type t = + | Atom of Atom.t + | Quoted_string of string + | Lparen + | Rparen + | Sexp_comment + | Eof +end + +type t = Lexing.lexbuf -> Token.t + +module Error : sig + type t = + { start : Lexing.position + ; stop : Lexing.position + ; message : string + } +end + +val error : ?delta:int -> Lexing.lexbuf -> string -> 'a + +val escaped_buf : Buffer.t + +exception Error of Error.t + +type escape_mode = + | In_block_comment (* Inside #|...|# comments (old syntax) *) + | Old_syntax + | New_syntax + +type escape_sequence = + | Newline + | Other + +type block_string_line_kind = + | With_escape_sequences + | Raw + +val eval_decimal_char : char -> int + +val eval_decimal_escape : char -> char -> char -> int + +val eval_hex_escape : char -> char -> int