Skip to content

Commit b7fafbb

Browse files
hyperpolymathclaude
andcommitted
fix(parser): split fused >> when closing nested applied generics (#131)
The lexer emits ">>" as a single GTGT token (the right-shift operator). In nested applied generics — `Option<Result<T, E>>` — the inner and outer closing '>' fuse into that one GTGT, and the LR grammar (which expects a single GT to close each type-argument list) could not proceed. This blocked the entire stdlib surface, not just string.affine (#128). Route Parse_driver's file/expr entry points through Menhir's incremental API and split a GTGT into two GT tokens *driven by the parser's own state* (`I.acceptable`): a split happens only when, in the current state, GT is acceptable but GTGT is not — i.e. we are closing a type-argument list, where the shift operator is ungrammatical. Expression `a >> b` is untouched (there GTGT is acceptable). ">>>" lexes as GTGT then GT and composes via one split plus the trailing GT. No grammar, lexer, or expression-semantics changes. Adds focused regression tests (nested >>, >>>, return-position, plus a shift-operator non-regression guard). Full suite green (218 tests). Closes #131 Refs #128 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent dab2f39 commit b7fafbb

2 files changed

Lines changed: 112 additions & 76 deletions

File tree

lib/parse_driver.ml

Lines changed: 83 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -6,40 +6,17 @@
66
(** Exception for parse errors *)
77
exception Parse_error of string * Span.t
88

9+
module I = Parser.MenhirInterpreter
10+
911
(** Buffered token stream that provides Menhir-compatible interface *)
1012
type token_buffer = {
1113
mutable current_token : Token.t;
1214
mutable current_span : Span.t;
1315
mutable next_token : unit -> Token.t * Span.t;
1416
}
1517

16-
(** Create a Menhir-compatible lexer function from our token stream *)
17-
let lexer_of_token_stream (next : unit -> Token.t * Span.t) : Lexing.lexbuf -> Parser.token =
18-
(* We need to track position for Menhir *)
19-
let buf = ref None in
20-
let get_next () =
21-
match !buf with
22-
| Some (tok, span) ->
23-
buf := None;
24-
(tok, span)
25-
| None -> next ()
26-
in
27-
fun lexbuf ->
28-
let (tok, span) = get_next () in
29-
(* Update lexbuf positions for Menhir *)
30-
lexbuf.Lexing.lex_start_p <- {
31-
Lexing.pos_fname = span.Span.file;
32-
pos_lnum = span.start_pos.line;
33-
pos_bol = span.start_pos.offset - span.start_pos.col + 1;
34-
pos_cnum = span.start_pos.offset;
35-
};
36-
lexbuf.Lexing.lex_curr_p <- {
37-
Lexing.pos_fname = span.Span.file;
38-
pos_lnum = span.end_pos.line;
39-
pos_bol = span.end_pos.offset - span.end_pos.col + 1;
40-
pos_cnum = span.end_pos.offset;
41-
};
42-
(* Convert our token type to Menhir's *)
18+
(** Convert our token type to Menhir's. *)
19+
let to_menhir_token (tok : Token.t) : Parser.token =
4320
match tok with
4421
| Token.INT n -> Parser.INT n
4522
| Token.FLOAT f -> Parser.FLOAT f
@@ -146,27 +123,82 @@ let lexer_of_token_stream (next : unit -> Token.t * Span.t) : Lexing.lexbuf -> P
146123
| Token.ROW_VAR s -> Parser.ROW_VAR s
147124
| Token.EOF -> Parser.EOF
148125

149-
(** Parse a program from a string *)
150-
let parse_string ~file content =
151-
let token_stream = Lexer.from_string ~file content in
152-
let lexbuf = Lexing.from_string content in
153-
lexbuf.Lexing.lex_curr_p <- { lexbuf.Lexing.lex_curr_p with pos_fname = file };
154-
let lexer = lexer_of_token_stream token_stream in
155-
try
156-
Parser.program lexer lexbuf
126+
(** Convert a Span position to a Menhir/Lexing position. *)
127+
let lexing_pos ~file (p : Span.pos) : Lexing.position =
128+
{ Lexing.pos_fname = file;
129+
pos_lnum = p.Span.line;
130+
pos_bol = p.offset - p.col + 1;
131+
pos_cnum = p.offset }
132+
133+
(** Create a Menhir-compatible lexer function from our token stream.
134+
Retained for any monolithic-API caller; the file/expr entry points
135+
below use the incremental driver instead. *)
136+
let lexer_of_token_stream (next : unit -> Token.t * Span.t) : Lexing.lexbuf -> Parser.token =
137+
let buf = ref None in
138+
let get_next () =
139+
match !buf with
140+
| Some (tok, span) -> buf := None; (tok, span)
141+
| None -> next ()
142+
in
143+
fun lexbuf ->
144+
let (tok, span) = get_next () in
145+
lexbuf.Lexing.lex_start_p <- lexing_pos ~file:span.Span.file span.start_pos;
146+
lexbuf.Lexing.lex_curr_p <- lexing_pos ~file:span.Span.file span.end_pos;
147+
to_menhir_token tok
148+
149+
(* The lexer emits ">>" as a single GTGT token (the right-shift operator).
150+
In nested applied generics — `Option<Result<T, E>>` — the inner and outer
151+
closing '>' fuse into that one GTGT, and the LR grammar, which expects a
152+
single GT to close each type-argument list, cannot proceed (issue #131).
153+
154+
Real LR compilers (rustc, Roslyn) solve this by splitting the token at
155+
parse time, *driven by the parser's own state* rather than a lexical
156+
guess: a GTGT is re-read as two GT tokens only when, in the current state,
157+
the parser would accept GT but not GTGT — i.e. we are closing a type-
158+
argument list, where the shift operator is not grammatical. Expression
159+
`a >> b` is untouched: there GTGT is acceptable, so no split happens.
160+
`>>>` lexes as GTGT then GT and is handled by one split plus the trailing
161+
GT; deeper nestings compose the same way. This requires the incremental
162+
API so we can interrogate the parser via [I.acceptable]. *)
163+
let drive (type a) ~file
164+
(start : Lexing.position -> a I.checkpoint)
165+
(next : unit -> Token.t * Span.t) : a =
166+
let pending : (Parser.token * Lexing.position * Lexing.position) option ref =
167+
ref None in
168+
let last_span = ref Span.dummy in
169+
let next_triple () =
170+
match !pending with
171+
| Some t -> pending := None; t
172+
| None ->
173+
let (tok, span) = next () in
174+
last_span := span;
175+
let sp = lexing_pos ~file:span.Span.file span.start_pos in
176+
let ep = lexing_pos ~file:span.Span.file span.end_pos in
177+
(to_menhir_token tok, sp, ep)
178+
in
179+
let rec run (cp : a I.checkpoint) =
180+
match cp with
181+
| I.InputNeeded _ ->
182+
let (ptok, sp, ep) = next_triple () in
183+
let triple =
184+
match ptok with
185+
| Parser.GTGT
186+
when (not (I.acceptable cp Parser.GTGT sp))
187+
&& I.acceptable cp Parser.GT sp ->
188+
let mid = { sp with Lexing.pos_cnum = sp.Lexing.pos_cnum + 1 } in
189+
pending := Some (Parser.GT, mid, ep);
190+
(Parser.GT, sp, mid)
191+
| _ -> (ptok, sp, ep)
192+
in
193+
run (I.offer cp triple)
194+
| I.Shifting _ | I.AboutToReduce _ -> run (I.resume cp)
195+
| I.HandlingError _ | I.Rejected -> raise Parser.Error
196+
| I.Accepted v -> v
197+
in
198+
try run (start (lexing_pos ~file { Span.line = 1; col = 1; offset = 0 }))
157199
with
158200
| Parser.Error ->
159-
let pos = lexbuf.Lexing.lex_curr_p in
160-
let span = Span.make
161-
~file
162-
~start_pos:{ Span.line = pos.pos_lnum;
163-
col = pos.pos_cnum - pos.pos_bol + 1;
164-
offset = pos.pos_cnum }
165-
~end_pos:{ Span.line = pos.pos_lnum;
166-
col = pos.pos_cnum - pos.pos_bol + 1;
167-
offset = pos.pos_cnum }
168-
in
169-
raise (Parse_error ("Syntax error", span))
201+
raise (Parse_error ("Syntax error", !last_span))
170202
| Parser_errors.Parse_action_error (msg, startpos, endpos) ->
171203
let span = Span.make
172204
~file
@@ -179,6 +211,10 @@ let parse_string ~file content =
179211
in
180212
raise (Parse_error (msg, span))
181213

214+
(** Parse a program from a string *)
215+
let parse_string ~file content =
216+
drive ~file Parser.Incremental.program (Lexer.from_string ~file content)
217+
182218
(** Parse a program from a file *)
183219
let parse_file filename =
184220
let chan = open_in_bin filename in
@@ -190,33 +226,4 @@ let parse_file filename =
190226

191227
(** Parse a single expression from a string *)
192228
let parse_expr ~file content =
193-
let token_stream = Lexer.from_string ~file content in
194-
let lexbuf = Lexing.from_string content in
195-
lexbuf.Lexing.lex_curr_p <- { lexbuf.Lexing.lex_curr_p with pos_fname = file };
196-
let lexer = lexer_of_token_stream token_stream in
197-
try
198-
Parser.expr_only lexer lexbuf
199-
with
200-
| Parser.Error ->
201-
let pos = lexbuf.Lexing.lex_curr_p in
202-
let span = Span.make
203-
~file
204-
~start_pos:{ Span.line = pos.pos_lnum;
205-
col = pos.pos_cnum - pos.pos_bol + 1;
206-
offset = pos.pos_cnum }
207-
~end_pos:{ Span.line = pos.pos_lnum;
208-
col = pos.pos_cnum - pos.pos_bol + 1;
209-
offset = pos.pos_cnum }
210-
in
211-
raise (Parse_error ("Syntax error", span))
212-
| Parser_errors.Parse_action_error (msg, startpos, endpos) ->
213-
let span = Span.make
214-
~file
215-
~start_pos:{ Span.line = startpos.Lexing.pos_lnum;
216-
col = startpos.pos_cnum - startpos.pos_bol + 1;
217-
offset = startpos.pos_cnum }
218-
~end_pos:{ Span.line = endpos.Lexing.pos_lnum;
219-
col = endpos.pos_cnum - endpos.pos_bol + 1;
220-
offset = endpos.pos_cnum }
221-
in
222-
raise (Parse_error (msg, span))
229+
drive ~file Parser.Incremental.expr_only (Lexer.from_string ~file content)

test/test_e2e.ml

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3267,12 +3267,41 @@ let test_tuple_type_still_works () =
32673267
(parse_check_passes
32683268
{|fn first(t: (Int, String)) -> Int { return 0; }|})
32693269

3270+
(* Issue #131: nested applied generics whose closing '>'s fuse into a single
3271+
">>" (or ">>>") token must still parse — the lexer emits GTGT but the
3272+
driver re-splits it when closing a type-argument list. *)
3273+
let test_angle_nested_gtgt () =
3274+
Alcotest.(check bool) "Option<Result<T, E>> (>> close) parses + typechecks" true
3275+
(parse_check_passes
3276+
{|fn f(o: Option<Result<Int, String>>) -> Int { return 0; }|})
3277+
3278+
let test_angle_nested_gtgtgt () =
3279+
Alcotest.(check bool) "Option<Option<Result<T, E>>> (>>> close) parses" true
3280+
(parse_check_passes
3281+
{|fn f(o: Option<Option<Result<Int, String>>>) -> Int { return 0; }|})
3282+
3283+
let test_angle_nested_return_pos () =
3284+
Alcotest.(check bool) "-> Result<Option<T>, E> nested in return position" true
3285+
(parse_check_passes
3286+
{|fn f() -> Result<Option<Int>, String> { return Ok(None); }|})
3287+
3288+
(* Non-regression: a real right-shift expression must still be one GTGT,
3289+
not split, since GTGT is grammatical there. *)
3290+
let test_shift_operator_not_split () =
3291+
Alcotest.(check bool) "a >> b right-shift expression unaffected by #131 fix" true
3292+
(parse_check_passes
3293+
{|fn sh(a: Int, b: Int) -> Int { return a >> b; }|})
3294+
32703295
let type_syntax_sugar_tests = [
32713296
Alcotest.test_case "fn() -> T (zero-arg fn type)" `Quick test_fn_type_zero_arg;
32723297
Alcotest.test_case "fn(A, B) -> T (multi-arg fn type)" `Quick test_fn_type_multi_arg;
32733298
Alcotest.test_case "Option<T> (angle brackets, type app)" `Quick test_angle_brackets_type_app;
32743299
Alcotest.test_case "Result<T, E> (angle brackets, 2 args)" `Quick test_angle_brackets_two_args;
32753300
Alcotest.test_case "fn f<T> (angle brackets, type params)" `Quick test_angle_brackets_type_params;
3301+
Alcotest.test_case "Option<Result<T,E>> (#131 >> close)" `Quick test_angle_nested_gtgt;
3302+
Alcotest.test_case "Option<Option<Result<T,E>>> (#131 >>>)" `Quick test_angle_nested_gtgtgt;
3303+
Alcotest.test_case "-> Result<Option<T>,E> (#131 nested)" `Quick test_angle_nested_return_pos;
3304+
Alcotest.test_case "a >> b shift unaffected (#131 guard)" `Quick test_shift_operator_not_split;
32763305
Alcotest.test_case "(A, B) -> C (multi-arg arrow)" `Quick test_multi_arg_arrow;
32773306
Alcotest.test_case "(A, B) without arrow remains tuple" `Quick test_tuple_type_still_works;
32783307
]

0 commit comments

Comments
 (0)