Skip to content

Commit b5f3f7c

Browse files
kddnewton authored and matzbot committed
[ruby/prism] Refine continuable? with algorithm in C
ruby/prism@c28810fe93
1 parent a2b9c8a commit b5f3f7c

9 files changed

Lines changed: 330 additions & 103 deletions

File tree

lib/prism/lex_compat.rb

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -43,10 +43,10 @@ class Result < Prism::Result
4343

4444
# Create a new lex compat result object with the given values.
4545
#--
46-
#: (Array[lex_compat_token] value, Array[Comment] comments, Array[MagicComment] magic_comments, Location? data_loc, Array[ParseError] errors, Array[ParseWarning] warnings, Source source) -> void
47-
def initialize(value, comments, magic_comments, data_loc, errors, warnings, source)
46+
#: (Array[lex_compat_token] value, Array[Comment] comments, Array[MagicComment] magic_comments, Location? data_loc, Array[ParseError] errors, Array[ParseWarning] warnings, bool continuable, Source source) -> void
47+
def initialize(value, comments, magic_comments, data_loc, errors, warnings, continuable, source)
4848
@value = value
49-
super(comments, magic_comments, data_loc, errors, warnings, source)
49+
super(comments, magic_comments, data_loc, errors, warnings, continuable, source)
5050
end
5151

5252
# Implement the hash pattern matching interface for Result.
@@ -825,7 +825,7 @@ def result
825825

826826
tokens = post_process_tokens(tokens, source, result.data_loc, bom, eof_token)
827827

828-
Result.new(tokens, result.comments, result.magic_comments, result.data_loc, result.errors, result.warnings, source)
828+
Result.new(tokens, result.comments, result.magic_comments, result.data_loc, result.errors, result.warnings, result.continuable?, source)
829829
end
830830

831831
private

lib/prism/parse_result.rb

Lines changed: 14 additions & 59 deletions
Original file line number | Diff line number | Diff line change
@@ -898,13 +898,14 @@ class Result
898898

899899
# Create a new result object with the given values.
900900
#--
901-
#: (Array[Comment] comments, Array[MagicComment] magic_comments, Location? data_loc, Array[ParseError] errors, Array[ParseWarning] warnings, Source source) -> void
902-
def initialize(comments, magic_comments, data_loc, errors, warnings, source)
901+
#: (Array[Comment] comments, Array[MagicComment] magic_comments, Location? data_loc, Array[ParseError] errors, Array[ParseWarning] warnings, bool continuable, Source source) -> void
902+
def initialize(comments, magic_comments, data_loc, errors, warnings, continuable, source)
903903
@comments = comments
904904
@magic_comments = magic_comments
905905
@data_loc = data_loc
906906
@errors = errors
907907
@warnings = warnings
908+
@continuable = continuable
908909
@source = source
909910
end
910911

@@ -961,54 +962,8 @@ def failure?
961962
#--
962963
#: () -> bool
963964
def continuable?
964-
return false if errors.empty?
965-
966-
offset = source.source.bytesize
967-
errors.all? { |error| CONTINUABLE.include?(error.type) || error.location.start_offset == offset }
968-
end
969-
970-
# The set of error types whose location the parser places at the opening
971-
# token of an unclosed construct rather than at the end of the source. These
972-
# errors always indicate incomplete input regardless of their byte position,
973-
# so they are checked by type rather than by location.
974-
#--
975-
#: Array[Symbol]
976-
CONTINUABLE = %i[
977-
begin_term
978-
begin_upcase_term
979-
block_param_pipe_term
980-
block_term_brace
981-
block_term_end
982-
case_missing_conditions
983-
case_term
984-
class_term
985-
conditional_term
986-
conditional_term_else
987-
def_term
988-
embdoc_term
989-
end_upcase_term
990-
for_term
991-
hash_term
992-
heredoc_term
993-
lambda_term_brace
994-
lambda_term_end
995-
list_i_lower_term
996-
list_i_upper_term
997-
list_w_lower_term
998-
list_w_upper_term
999-
module_term
1000-
regexp_term
1001-
rescue_term
1002-
string_interpolated_term
1003-
string_literal_eof
1004-
symbol_term_dynamic
1005-
symbol_term_interpolated
1006-
until_term
1007-
while_term
1008-
xstring_term
1009-
].freeze
1010-
1011-
private_constant :CONTINUABLE
965+
@continuable
966+
end
1012967

1013968
# Create a code units cache for the given encoding.
1014969
#--
@@ -1033,10 +988,10 @@ class ParseResult < Result
1033988

1034989
# Create a new parse result object with the given values.
1035990
#--
1036-
#: (ProgramNode value, Array[Comment] comments, Array[MagicComment] magic_comments, Location? data_loc, Array[ParseError] errors, Array[ParseWarning] warnings, Source source) -> void
1037-
def initialize(value, comments, magic_comments, data_loc, errors, warnings, source)
991+
#: (ProgramNode value, Array[Comment] comments, Array[MagicComment] magic_comments, Location? data_loc, Array[ParseError] errors, Array[ParseWarning] warnings, bool continuable, Source source) -> void
992+
def initialize(value, comments, magic_comments, data_loc, errors, warnings, continuable, source)
1038993
@value = value
1039-
super(comments, magic_comments, data_loc, errors, warnings, source)
994+
super(comments, magic_comments, data_loc, errors, warnings, continuable, source)
1040995
end
1041996

1042997
# Implement the hash pattern matching interface for ParseResult.
@@ -1077,10 +1032,10 @@ class LexResult < Result
10771032

10781033
# Create a new lex result object with the given values.
10791034
#--
1080-
#: (Array[[Token, Integer]] value, Array[Comment] comments, Array[MagicComment] magic_comments, Location? data_loc, Array[ParseError] errors, Array[ParseWarning] warnings, Source source) -> void
1081-
def initialize(value, comments, magic_comments, data_loc, errors, warnings, source)
1035+
#: (Array[[Token, Integer]] value, Array[Comment] comments, Array[MagicComment] magic_comments, Location? data_loc, Array[ParseError] errors, Array[ParseWarning] warnings, bool continuable, Source source) -> void
1036+
def initialize(value, comments, magic_comments, data_loc, errors, warnings, continuable, source)
10821037
@value = value
1083-
super(comments, magic_comments, data_loc, errors, warnings, source)
1038+
super(comments, magic_comments, data_loc, errors, warnings, continuable, source)
10841039
end
10851040

10861041
# Implement the hash pattern matching interface for LexResult.
@@ -1099,10 +1054,10 @@ class ParseLexResult < Result
10991054

11001055
# Create a new parse lex result object with the given values.
11011056
#--
1102-
#: ([ProgramNode, Array[[Token, Integer]]] value, Array[Comment] comments, Array[MagicComment] magic_comments, Location? data_loc, Array[ParseError] errors, Array[ParseWarning] warnings, Source source) -> void
1103-
def initialize(value, comments, magic_comments, data_loc, errors, warnings, source)
1057+
#: ([ProgramNode, Array[[Token, Integer]]] value, Array[Comment] comments, Array[MagicComment] magic_comments, Location? data_loc, Array[ParseError] errors, Array[ParseWarning] warnings, bool continuable, Source source) -> void
1058+
def initialize(value, comments, magic_comments, data_loc, errors, warnings, continuable, source)
11041059
@value = value
1105-
super(comments, magic_comments, data_loc, errors, warnings, source)
1060+
super(comments, magic_comments, data_loc, errors, warnings, continuable, source)
11061061
end
11071062

11081063
# Implement the hash pattern matching interface for ParseLexResult.

prism/extension.c

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -641,10 +641,11 @@ parse_result_create(VALUE class, const pm_parser_t *parser, VALUE value, rb_enco
641641
parser_data_loc(parser, source, freeze),
642642
parser_errors(parser, encoding, source, freeze),
643643
parser_warnings(parser, encoding, source, freeze),
644+
parser->continuable ? Qtrue : Qfalse,
644645
source
645646
};
646647

647-
return rb_class_new_instance_freeze(7, result_argv, class, freeze);
648+
return rb_class_new_instance_freeze(8, result_argv, class, freeze);
648649
}
649650

650651
/******************************************************************************/

prism/parser.h

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -895,6 +895,14 @@ struct pm_parser {
895895
/** Whether or not we're currently recovering from a syntax error. */
896896
bool recovering;
897897

898+
/**
899+
* Whether or not the source being parsed could become valid if more input
900+
* were appended. This is set to false when the parser encounters a token
901+
* that is definitively wrong (e.g., a stray `end` or `]`) as opposed to
902+
* merely incomplete.
903+
*/
904+
bool continuable;
905+
898906
/**
899907
* This is very specialized behavior for when you want to parse in a context
900908
* that does not respect encoding comments. Its main use case is translating

prism/prism.c

Lines changed: 166 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -22016,6 +22016,7 @@ pm_parser_init(pm_arena_t *arena, pm_parser_t *parser, const uint8_t *source, si
2201622016
.partial_script = false,
2201722017
.command_start = true,
2201822018
.recovering = false,
22019+
.continuable = true,
2201922020
.encoding_locked = false,
2202022021
.encoding_changed = false,
2202122022
.pattern_matching_newlines = false,
@@ -22292,12 +22293,176 @@ pm_parser_free(pm_parser_t *parser) {
2229222293
}
2229322294
}
2229422295

22296+
/**
22297+
* Returns true if the given diagnostic ID represents an error that cannot be
22298+
* fixed by appending more input. These are errors where the existing source
22299+
* contains definitively invalid syntax (as opposed to merely incomplete input).
22300+
*/
22301+
static bool
22302+
pm_parse_err_is_fatal(pm_diagnostic_id_t diag_id) {
22303+
switch (diag_id) {
22304+
case PM_ERR_ARRAY_EXPRESSION_AFTER_STAR:
22305+
case PM_ERR_BEGIN_UPCASE_BRACE:
22306+
case PM_ERR_CLASS_VARIABLE_BARE:
22307+
case PM_ERR_END_UPCASE_BRACE:
22308+
case PM_ERR_ESCAPE_INVALID_HEXADECIMAL:
22309+
case PM_ERR_ESCAPE_INVALID_UNICODE_LIST:
22310+
case PM_ERR_ESCAPE_INVALID_UNICODE_SHORT:
22311+
case PM_ERR_EXPRESSION_NOT_WRITABLE:
22312+
case PM_ERR_EXPRESSION_NOT_WRITABLE_SELF:
22313+
case PM_ERR_FLOAT_PARSE:
22314+
case PM_ERR_GLOBAL_VARIABLE_BARE:
22315+
case PM_ERR_HASH_KEY:
22316+
case PM_ERR_HEREDOC_IDENTIFIER:
22317+
case PM_ERR_INSTANCE_VARIABLE_BARE:
22318+
case PM_ERR_INVALID_BLOCK_EXIT:
22319+
case PM_ERR_INVALID_ENCODING_MAGIC_COMMENT:
22320+
case PM_ERR_INVALID_FLOAT_EXPONENT:
22321+
case PM_ERR_INVALID_NUMBER_BINARY:
22322+
case PM_ERR_INVALID_NUMBER_DECIMAL:
22323+
case PM_ERR_INVALID_NUMBER_HEXADECIMAL:
22324+
case PM_ERR_INVALID_NUMBER_OCTAL:
22325+
case PM_ERR_INVALID_NUMBER_UNDERSCORE_TRAILING:
22326+
case PM_ERR_NO_LOCAL_VARIABLE:
22327+
case PM_ERR_PARAMETER_ORDER:
22328+
case PM_ERR_STATEMENT_UNDEF:
22329+
case PM_ERR_VOID_EXPRESSION:
22330+
return true;
22331+
default:
22332+
return false;
22333+
}
22334+
}
22335+
22336+
/**
22337+
* Determine whether the source parsed by the given parser could become valid if
22338+
* more input were appended. This is used by tools like IRB to decide whether to
22339+
* prompt for continuation or to display an error.
22340+
*
22341+
* The parser starts with continuable=true. This function scans all errors to
22342+
* detect two categories of non-continuable errors:
22343+
*
22344+
* 1. Fatal errors: errors like invalid number literals or bare global variables
22345+
* that indicate definitively invalid syntax. These are only considered fatal
22346+
* if they occur before EOF (at EOF they could be from truncated input, e.g.
22347+
* `"\x` is an incomplete hex escape).
22348+
*
22349+
* 2. Stray tokens: unexpected_token_ignore and unexpected_token_close_context
22350+
* errors indicate tokens that don't belong. A stray token is a cascade
22351+
* effect (and does not prevent continuability) if:
22352+
*
22353+
* a. A non-stray, non-fatal error appeared earlier in the error list at a
22354+
* strictly earlier source position (the stray was caused by a preceding
22355+
* parse failure, e.g. a truncated heredoc), OR
22356+
* b. The stray token is at EOF, starts after position 0 (there is valid
22357+
* code before it), and either is a single byte (likely a truncated
22358+
* token like `\`) or there are non-stray errors elsewhere.
22359+
*
22360+
* Closing delimiters (`)`, `]`, `}`) at EOF are always genuinely stray —
22361+
* they are complete tokens and cannot become part of a longer valid
22362+
* construct by appending more input.
22363+
*
22364+
* c. The stray token is `=` at the start of a line, which could be the
22365+
* beginning of `=begin` (an embedded document). The remaining bytes
22366+
* after `=` may parse as an identifier, so the error is not at EOF,
22367+
* but the construct is genuinely incomplete.
22368+
*/
22369+
static void
22370+
pm_parse_continuable(pm_parser_t *parser) {
22371+
// If there are no errors then there is nothing to continue.
22372+
if (parser->error_list.size == 0) {
22373+
parser->continuable = false;
22374+
return;
22375+
}
22376+
22377+
if (!parser->continuable) return;
22378+
22379+
size_t source_length = (size_t) (parser->end - parser->start);
22380+
22381+
// First pass: check if there are any non-stray, non-fatal errors.
22382+
bool has_non_stray_error = false;
22383+
for (pm_diagnostic_t *error = (pm_diagnostic_t *) parser->error_list.head; error != NULL; error = (pm_diagnostic_t *) error->node.next) {
22384+
if (error->diag_id != PM_ERR_UNEXPECTED_TOKEN_IGNORE && error->diag_id != PM_ERR_UNEXPECTED_TOKEN_CLOSE_CONTEXT && !pm_parse_err_is_fatal(error->diag_id)) {
22385+
has_non_stray_error = true;
22386+
break;
22387+
}
22388+
}
22389+
22390+
// Second pass: check each error. We track the minimum source position
22391+
// among non-stray, non-fatal errors seen so far in list order, which
22392+
// lets us detect cascade stray tokens.
22393+
size_t non_stray_min_start = SIZE_MAX;
22394+
22395+
for (pm_diagnostic_t *error = (pm_diagnostic_t *) parser->error_list.head; error != NULL; error = (pm_diagnostic_t *) error->node.next) {
22396+
size_t error_start = (size_t) error->location.start;
22397+
size_t error_end = error_start + (size_t) error->location.length;
22398+
bool at_eof = error_end >= source_length;
22399+
22400+
// Fatal errors are non-continuable unless they occur at EOF.
22401+
if (pm_parse_err_is_fatal(error->diag_id) && !at_eof) {
22402+
parser->continuable = false;
22403+
return;
22404+
}
22405+
22406+
// Track non-stray, non-fatal error positions in list order.
22407+
if (error->diag_id != PM_ERR_UNEXPECTED_TOKEN_IGNORE &&
22408+
error->diag_id != PM_ERR_UNEXPECTED_TOKEN_CLOSE_CONTEXT) {
22409+
if (error_start < non_stray_min_start) non_stray_min_start = error_start;
22410+
continue;
22411+
}
22412+
22413+
// This is a stray token. Determine if it is a cascade effect
22414+
// of a preceding error or genuinely stray.
22415+
22416+
// Rule (a): a non-stray error was seen earlier in the list at a
22417+
// strictly earlier position — this stray is a cascade effect.
22418+
if (non_stray_min_start < error_start) continue;
22419+
22420+
// Rule (b): this stray is at EOF with valid code before it.
22421+
// Single-byte stray tokens at EOF (like `\` for line continuation)
22422+
// are likely truncated tokens. Multi-byte stray tokens (like the
22423+
// keyword `end`) need additional evidence that they are cascade
22424+
// effects (i.e. non-stray errors exist elsewhere).
22425+
if (at_eof && error_start > 0) {
22426+
// Exception: closing delimiters at EOF are genuinely stray.
22427+
if (error->location.length == 1) {
22428+
const uint8_t *byte = parser->start + error_start;
22429+
if (*byte == ')' || *byte == ']' || *byte == '}') {
22430+
parser->continuable = false;
22431+
return;
22432+
}
22433+
22434+
// Single-byte non-delimiter stray at EOF: cascade.
22435+
continue;
22436+
}
22437+
22438+
// Multi-byte stray at EOF: cascade only if there are
22439+
// non-stray errors (evidence of a preceding parse failure).
22440+
if (has_non_stray_error) continue;
22441+
}
22442+
22443+
// Rule (c): a stray `=` at the start of a line could be the
22444+
// beginning of an embedded document (`=begin`). The remaining
22445+
// bytes after `=` parse as an identifier, so the error is not
22446+
// at EOF, but the construct is genuinely incomplete.
22447+
if (error->location.length == 1) {
22448+
const uint8_t *byte = parser->start + error_start;
22449+
if (*byte == '=' && (error_start == 0 || *(byte - 1) == '\n')) continue;
22450+
}
22451+
22452+
// This stray token is genuinely non-continuable.
22453+
parser->continuable = false;
22454+
return;
22455+
}
22456+
}
22457+
2229522458
/**
2229622459
* Parse the Ruby source associated with the given parser and return the tree.
2229722460
*/
2229822461
PRISM_EXPORTED_FUNCTION pm_node_t *
2229922462
pm_parse(pm_parser_t *parser) {
22300-
return parse_program(parser);
22463+
pm_node_t *node = parse_program(parser);
22464+
pm_parse_continuable(parser);
22465+
return node;
2230122466
}
2230222467

2230322468
/**

0 commit comments

Comments (0)