From a70dbe1065f58a23a85e576d19e3434e6dd4bb64 Mon Sep 17 00:00:00 2001 From: Kevin Newton Date: Thu, 12 Mar 2026 13:27:40 -0400 Subject: [PATCH 1/7] Use Prism for mk_builtin_loader.rb --- .github/actions/setup/directories/action.yml | 1 + .github/workflows/wasm.yml | 1 + common.mk | 24 +- configure.ac | 6 + defs/gmake.mk | 4 + depend | 24 ++ prism/templates/src/node.c.erb | 4 + template/Makefile.in | 1 + tool/dump_ast.c | 64 +++ tool/mk_builtin_loader.rb | 390 +++++++++---------- tool/prereq.status | 1 + win32/Makefile.sub | 2 + 12 files changed, 310 insertions(+), 212 deletions(-) create mode 100644 tool/dump_ast.c diff --git a/.github/actions/setup/directories/action.yml b/.github/actions/setup/directories/action.yml index aeaa8d9783ba68..6b569d66ac7c8d 100644 --- a/.github/actions/setup/directories/action.yml +++ b/.github/actions/setup/directories/action.yml @@ -145,6 +145,7 @@ runs: run: | rm -f config.status .rbconfig.time \ Makefile GNUmakefile uncommon.mk enc.mk noarch-fake.rb + rm -f prism/.time prism/util/.time - if: steps.which.outputs.sudo shell: bash diff --git a/.github/workflows/wasm.yml b/.github/workflows/wasm.yml index 9eb7976a613832..e77336225a628c 100644 --- a/.github/workflows/wasm.yml +++ b/.github/workflows/wasm.yml @@ -124,6 +124,7 @@ jobs: ../src/configure \ --host wasm32-unknown-wasi \ --with-baseruby=$PWD/../baseruby/install/bin/ruby \ + --with-dump-ast=$PWD/../baseruby/dump_ast \ --with-static-linked-ext \ --with-ext=cgi/escape,continuation,coverage,date,digest/bubblebabble,digest,digest/md5,digest/rmd160,digest/sha1,digest/sha2,etc,fcntl,json,json/generator,json/parser,objspace,pathname,rbconfig/sizeof,ripper,stringio,strscan,monitor \ LDFLAGS=" \ diff --git a/common.mk b/common.mk index 374b722349c3f1..4b8791efe0d731 100644 --- a/common.mk +++ b/common.mk @@ -90,10 +90,8 @@ MAKE_ENC = -f $(ENC_MK) V="$(V)" UNICODE_HDR_DIR="$(UNICODE_HDR_DIR)" \ PRISM_BUILD_DIR = prism -PRISM_FILES = prism/api_node.$(OBJEXT) \ - 
prism/diagnostic.$(OBJEXT) \ +LIBPRISM_OBJS = prism/diagnostic.$(OBJEXT) \ prism/encoding.$(OBJEXT) \ - prism/extension.$(OBJEXT) \ prism/node.$(OBJEXT) \ prism/options.$(OBJEXT) \ prism/prettyprint.$(OBJEXT) \ @@ -112,9 +110,14 @@ PRISM_FILES = prism/api_node.$(OBJEXT) \ prism/util/pm_string.$(OBJEXT) \ prism/util/pm_strncasecmp.$(OBJEXT) \ prism/util/pm_strpbrk.$(OBJEXT) \ - prism/prism.$(OBJEXT) \ + prism/prism.$(OBJEXT) + +EXTPRISM_OBJS = prism/api_node.$(OBJEXT) \ + prism/extension.$(OBJEXT) \ prism_init.$(OBJEXT) +PRISM_OBJS = $(LIBPRISM_OBJS) $(EXTPRISM_OBJS) + COMMONOBJS = \ array.$(OBJEXT) \ ast.$(OBJEXT) \ @@ -192,7 +195,7 @@ COMMONOBJS = \ vm_sync.$(OBJEXT) \ vm_trace.$(OBJEXT) \ weakmap.$(OBJEXT) \ - $(PRISM_FILES) \ + $(PRISM_OBJS) \ $(YJIT_OBJ) \ $(ZJIT_OBJ) \ $(JIT_OBJ) \ @@ -203,7 +206,7 @@ COMMONOBJS = \ $(BUILTIN_TRANSOBJS) \ $(MISSING) -$(PRISM_FILES): $(PRISM_BUILD_DIR)/.time $(PRISM_BUILD_DIR)/util/.time +$(PRISM_OBJS): $(PRISM_BUILD_DIR)/.time $(PRISM_BUILD_DIR)/util/.time $(PRISM_BUILD_DIR)/.time $(PRISM_BUILD_DIR)/util/.time: $(Q) $(MAKEDIRS) $(@D) @@ -1292,7 +1295,8 @@ preludes: {$(VPATH)}miniprelude.c {$(srcdir)}.rb.rbinc: $(ECHO) making $@ - $(Q) $(BASERUBY) $(tooldir)/mk_builtin_loader.rb $(SRC_FILE) + -$(Q) $(MAKE) $(DUMP_AST) + $(Q) $(BASERUBY) $(tooldir)/mk_builtin_loader.rb $(DUMP_AST) $(SRC_FILE) $(BUILTIN_BINARY:yes=built)in_binary.rbbin: $(PREP) $(BUILTIN_RB_SRCS) $(srcdir)/template/builtin_binary.rbbin.tmpl $(Q) $(MINIRUBY) $(tooldir)/generic_erb.rb -o $@ \ @@ -1302,7 +1306,11 @@ $(BUILTIN_BINARY:yes=built)in_binary.rbbin: $(PREP) $(BUILTIN_RB_SRCS) $(srcdir) $(BUILTIN_BINARY:no=builtin)_binary.rbbin: $(Q) echo> $@ // empty $(@F) -$(BUILTIN_RB_INCS): $(top_srcdir)/tool/mk_builtin_loader.rb +$(BUILTIN_RB_INCS): $(tooldir)/mk_builtin_loader.rb + +dump_ast$(EXEEXT): $(tooldir)/dump_ast.c $(LIBPRISM_OBJS) + $(ECHO) compiling $@ + $(Q) $(CC) $(CFLAGS) $(OUTFLAG)$@ $(INCFLAGS) $(tooldir)/dump_ast.c $(LIBPRISM_OBJS) 
$(srcdir)/revision.h$(no_baseruby:no=~disabled~): $(REVISION_H) diff --git a/configure.ac b/configure.ac index 1b59942463bad6..1dd17e06528518 100644 --- a/configure.ac +++ b/configure.ac @@ -111,6 +111,12 @@ AS_IF([test "$HAVE_BASERUBY" = no], [ AC_SUBST(BASERUBY) AC_SUBST(HAVE_BASERUBY) +AC_ARG_WITH(dump-ast, + AS_HELP_STRING([--with-dump-ast=DUMP_AST], [use DUMP_AST as dump_ast; for cross-compiling with a host-built dump_ast]), + [DUMP_AST=$withval], + [DUMP_AST='./dump_ast$(EXEEXT)']) +AC_SUBST(DUMP_AST) + : ${GIT=git} HAVE_GIT=yes AC_ARG_WITH(git, diff --git a/defs/gmake.mk b/defs/gmake.mk index 718131e937a4ae..2131d24631f714 100644 --- a/defs/gmake.mk +++ b/defs/gmake.mk @@ -413,6 +413,10 @@ ifneq ($(DOT_WAIT),) up:: $(DOT_WAIT) after-update endif +ifneq ($(CC),false) +$(BUILTIN_RB_INCS): $(DUMP_AST) +endif + ifneq ($(filter update-bundled_gems refresh-gems,$(MAKECMDGOALS)),) update-gems: update-bundled_gems endif diff --git a/depend b/depend index f1e2433346826d..3d050525fb655a 100644 --- a/depend +++ b/depend @@ -3639,6 +3639,30 @@ dmydln.$(OBJEXT): {$(VPATH)}st.h dmydln.$(OBJEXT): {$(VPATH)}subst.h dmyenc.$(OBJEXT): {$(VPATH)}dmyenc.c dmyext.$(OBJEXT): {$(VPATH)}dmyext.c +dump_ast-dump_ast.$(OBJEXT): $(top_srcdir)/prism/ast.h +dump_ast-dump_ast.$(OBJEXT): $(top_srcdir)/prism/defines.h +dump_ast-dump_ast.$(OBJEXT): $(top_srcdir)/prism/diagnostic.h +dump_ast-dump_ast.$(OBJEXT): $(top_srcdir)/prism/encoding.h +dump_ast-dump_ast.$(OBJEXT): $(top_srcdir)/prism/node.h +dump_ast-dump_ast.$(OBJEXT): $(top_srcdir)/prism/options.h +dump_ast-dump_ast.$(OBJEXT): $(top_srcdir)/prism/parser.h +dump_ast-dump_ast.$(OBJEXT): $(top_srcdir)/prism/prettyprint.h +dump_ast-dump_ast.$(OBJEXT): $(top_srcdir)/prism/prism.h +dump_ast-dump_ast.$(OBJEXT): $(top_srcdir)/prism/regexp.h +dump_ast-dump_ast.$(OBJEXT): $(top_srcdir)/prism/static_literals.h +dump_ast-dump_ast.$(OBJEXT): $(top_srcdir)/prism/util/pm_arena.h +dump_ast-dump_ast.$(OBJEXT): $(top_srcdir)/prism/util/pm_buffer.h 
+dump_ast-dump_ast.$(OBJEXT): $(top_srcdir)/prism/util/pm_char.h +dump_ast-dump_ast.$(OBJEXT): $(top_srcdir)/prism/util/pm_constant_pool.h +dump_ast-dump_ast.$(OBJEXT): $(top_srcdir)/prism/util/pm_integer.h +dump_ast-dump_ast.$(OBJEXT): $(top_srcdir)/prism/util/pm_line_offset_list.h +dump_ast-dump_ast.$(OBJEXT): $(top_srcdir)/prism/util/pm_list.h +dump_ast-dump_ast.$(OBJEXT): $(top_srcdir)/prism/util/pm_memchr.h +dump_ast-dump_ast.$(OBJEXT): $(top_srcdir)/prism/util/pm_string.h +dump_ast-dump_ast.$(OBJEXT): $(top_srcdir)/prism/util/pm_strncasecmp.h +dump_ast-dump_ast.$(OBJEXT): $(top_srcdir)/prism/util/pm_strpbrk.h +dump_ast-dump_ast.$(OBJEXT): $(top_srcdir)/prism/version.h +dump_ast-dump_ast.$(OBJEXT): $(top_srcdir)/tool/dump_ast.c enc/ascii.$(OBJEXT): $(hdrdir)/ruby/ruby.h enc/ascii.$(OBJEXT): {$(VPATH)}assert.h enc/ascii.$(OBJEXT): {$(VPATH)}backward/2/assume.h diff --git a/prism/templates/src/node.c.erb b/prism/templates/src/node.c.erb index 5806742612066c..df59545129afba 100644 --- a/prism/templates/src/node.c.erb +++ b/prism/templates/src/node.c.erb @@ -173,7 +173,11 @@ pm_dump_json(pm_buffer_t *buffer, const pm_parser_t *parser, const pm_node_t *no // Dump the <%= field.name %> field pm_buffer_append_byte(buffer, ','); + <%- if field.is_a?(Prism::Template::Flags) -%> + pm_buffer_append_string(buffer, "\"flags\":", 8); + <%- else -%> pm_buffer_append_string(buffer, "\"<%= field.name %>\":", <%= field.name.bytesize + 3 %>); + <%- end -%> <%- case field -%> <%- when Prism::Template::NodeField -%> pm_dump_json(buffer, parser, (const pm_node_t *) cast-><%= field.name %>); diff --git a/template/Makefile.in b/template/Makefile.in index 8e93efc310cd97..3226daa7917b80 100644 --- a/template/Makefile.in +++ b/template/Makefile.in @@ -37,6 +37,7 @@ CONFIGURE = @CONFIGURE@ MKFILES = @MAKEFILES@ BASERUBY = @BASERUBY@ HAVE_BASERUBY = @HAVE_BASERUBY@ +DUMP_AST = @DUMP_AST@ TEST_RUNNABLE = @TEST_RUNNABLE@ CROSS_COMPILING = @CROSS_COMPILING@ DOXYGEN = @DOXYGEN@ diff --git 
a/tool/dump_ast.c b/tool/dump_ast.c new file mode 100644 index 00000000000000..593ecce8c44215 --- /dev/null +++ b/tool/dump_ast.c @@ -0,0 +1,64 @@ +#include <stdio.h> +#include <stdlib.h> + +/* + * When prism is compiled as part of CRuby, the xmalloc/xfree/etc. macros are + * redirected to ruby_xmalloc/ruby_xfree/etc. Since this is a standalone + * program that links against those same object files, we need to provide + * implementations of these functions. + */ +void *ruby_xmalloc(size_t size) { return malloc(size); } +void *ruby_xcalloc(size_t nelems, size_t elemsiz) { return calloc(nelems, elemsiz); } +void *ruby_xrealloc(void *ptr, size_t newsiz) { return realloc(ptr, newsiz); } +void ruby_xfree(void *ptr) { free(ptr); } + +#include "prism.h" + +int +main(int argc, const char *argv[]) { + if (argc != 2) { + fprintf(stderr, "Usage: %s <filepath>\n", argv[0]); + return EXIT_FAILURE; + } + + const char *filepath = argv[1]; + pm_string_t input; + + if (pm_string_mapped_init(&input, filepath) != PM_STRING_INIT_SUCCESS) { + fprintf(stderr, "unable to map file: %s\n", filepath); + return EXIT_FAILURE; + } + + pm_options_t options = { 0 }; + pm_options_line_set(&options, 1); + pm_options_filepath_set(&options, filepath); + + pm_arena_t arena = { 0 }; + pm_parser_t parser; + pm_parser_init(&arena, &parser, pm_string_source(&input), pm_string_length(&input), &options); + + pm_node_t *node = pm_parse(&parser); + int exit_status; + + if (parser.error_list.size > 0) { + fprintf(stderr, "error parsing %s\n", filepath); + for (const pm_diagnostic_t *diagnostic = (const pm_diagnostic_t *) parser.error_list.head; diagnostic != NULL; diagnostic = (const pm_diagnostic_t *) diagnostic->node.next) { + const pm_line_column_t line_column = pm_line_offset_list_line_column(&parser.line_offsets, diagnostic->location.start, parser.start_line); + fprintf(stderr, "%" PRIi32 ":%" PRIu32 ":%s\n", line_column.line, line_column.column, diagnostic->message); + } + exit_status = EXIT_FAILURE; + } else { + pm_buffer_t json = { 
0 }; + pm_dump_json(&json, &parser, node); + printf("%.*s\n", (int) pm_buffer_length(&json), pm_buffer_value(&json)); + pm_buffer_free(&json); + exit_status = EXIT_SUCCESS; + } + + pm_parser_free(&parser); + pm_arena_free(&arena); + pm_string_free(&input); + pm_options_free(&options); + + return exit_status; +} diff --git a/tool/mk_builtin_loader.rb b/tool/mk_builtin_loader.rb index 3ab36ec45c323d..2965c651e9834b 100644 --- a/tool/mk_builtin_loader.rb +++ b/tool/mk_builtin_loader.rb @@ -1,6 +1,7 @@ # Parse built-in script and make rbinc file -require 'ripper' +require 'json' +require 'open3' require 'stringio' require_relative 'ruby_vm/helpers/c_escape' @@ -24,231 +25,204 @@ def self.reset Warning.extend CompileWarning -def string_literal(lit, str = []) - while lit - case lit.first - when :string_concat, :string_embexpr, :string_content - _, *lit = lit - lit.each {|s| string_literal(s, str)} - return str - when :string_literal - _, lit = lit - when :@tstring_content - str << lit[1] - return str - else - raise "unexpected #{lit.first}" - end - end -end +# ruby mk_builtin_loader.rb path/to/dump_ast TARGET_FILE.rb +# #=> generate TARGET_FILE.rbinc +# +# dump_ast is a standalone C program (tool/dump_ast.c) that parses Ruby files +# with prism and dumps the AST as JSON. It must be compiled with CC before this +# script can run, which means rbinc generation is skipped during `make up` +# (where CC=false). The rbinc files are gitignored build artifacts, so they do +# not need to be present in srcdir after `make up` — they will be generated in +# the build directory during `make all` once dump_ast has been compiled. + +LOCALS_DB = {} # [method_name, first_line] = locals -# e.g. 
[:symbol_literal, [:symbol, [:@ident, "inline", [19, 21]]]] -def symbol_literal(lit) - symbol_literal, symbol_lit = lit - raise "#{lit.inspect} was not :symbol_literal" if symbol_literal != :symbol_literal - symbol, ident_lit = symbol_lit - raise "#{symbol_lit.inspect} was not :symbol" if symbol != :symbol - ident, symbol_name, = ident_lit - raise "#{ident.inspect} was not :@ident" if ident != :@ident - symbol_name +# Extract the contents of the given string node. +def extract_string_literal(node) + case node["type"] + when "StringNode" + node["unescaped"] + when "InterpolatedStringNode" + node["parts"].map { |part| extract_string_literal(part) }.join + else + raise "unexpected #{node["type"]}" + end end -def inline_text argc, arg1 - raise "argc (#{argc}) of inline! should be 1" unless argc == 1 - arg1 = string_literal(arg1) - raise "1st argument should be string literal" unless arg1 - arg1.join("").rstrip +# Retrieve the line number of the given node in the source. +def line_number(source, node) + source.b.byteslice(0, node["location"]["start"]).count("\n") + 1 end -def inline_attrs(args) - raise "args was empty" if args.empty? - args.each do |arg| - attr = symbol_literal(arg) - unless BUILTIN_ATTRS.include?(attr) - raise "attr (#{attr}) was not in: #{BUILTIN_ATTRS.join(', ')}" - end +def visit_call_node(source, node, name, locals, requires, bs, inlines) + # If this is a call to require or require relative with a single string node + # argument, then we will attempt to find the file that is being required and + # add it to the files that should be processed. + if %w[require require_relative].include?(node["name"]) && !node["arguments"].nil? 
&& (argument = node["arguments"]["arguments"][0])["type"] == "StringNode" + requires << argument["unescaped"] + return true end -end -def make_cfunc_name inlines, name, lineno - case name - when /\[\]/ - name = '_GETTER' - when /\[\]=/ - name = '_SETTER' + primitive_name = nil + + receiver = node["receiver"] + + if (!receiver.nil? && receiver["type"] == "ConstantReadNode" && receiver["name"] == "Primitive") || + (!receiver.nil? && receiver["type"] == "CallNode" && receiver["flags"].include?("VARIABLE_CALL") && receiver["name"] == "__builtin") + primitive_name = node["name"] + elsif node["name"].start_with?("__builtin_") + primitive_name = node["name"][10..-1] else - name = name.tr('!?', 'EP') + # If we get here, then this isn't a primitive function call and we can + # continue the visit. + return true end - base = "builtin_inline_#{name}_#{lineno}" - if inlines[base] - 1000.times{|i| - name = "#{base}_#{i}" - return name unless inlines[name] - } - raise "too many functions in same line..." - else - base + # The name of the C function that we will be calling for this call node. It + # may change later in this method depending on the type of primitive. + cfunction_name = primitive_name + + args = node["arguments"].nil? ? [] : node["arguments"]["arguments"] + argc = args.size + + if primitive_name.match?(/[\!\?]$/) + case (primitive_macro = primitive_name[0...-1]) + when "arg" + # This is a call to Primitive.arg!, which expects a single symbol argument + # detailing the name of the argument. + raise "unexpected argument number #{argc}" if argc != 1 + raise "symbol literal expected, got #{args[0]["type"]}" if args[0]["type"] != "SymbolNode" + return true + when "attr" + # This is a call to Primitive.attr!, which expects a list of known + # symbols. We will check that each of the arguments is a symbol and that + # the symbol is one of the known symbols. 
+ raise "args was empty" if argc == 0 + + args.each do |arg| + raise "#{arg["type"]} was not a SymbolNode" if arg["type"] != "SymbolNode" + raise "attr (#{arg["unescaped"]}) was not in: leaf, inline_block, use_block" unless BUILTIN_ATTRS.include?(arg["unescaped"]) + end + + return true + when "mandatory_only" + # This is a call to Primitive.mandatory_only?. This method does not + # require any further processing. + return true + when "cstmt", "cexpr", "cconst", "cinit" + # This is a call to Primitive.cstmt!, Primitive.cexpr!, Primitive.cconst!, + # or Primitive.cinit!. These methods expect a single string argument that + # is the C code that should be executed. We will extract the string, emit + # an inline function, and then continue the visit. + raise "argc (#{argc}) of inline! should be 1" if argc != 1 + + text = extract_string_literal(args[0]).rstrip + lineno = line_number(source, node) + + case primitive_macro + when "cstmt", "cexpr", "cconst" + cfunction_name = "builtin_inline_#{name}_#{lineno}" + primitive_name = "_bi#{lineno}" + + if primitive_macro == "cstmt" + inlines << [cfunction_name, lineno, text, locals, primitive_name] + else + inlines << [cfunction_name, lineno, "return #{text};", primitive_macro == "cexpr" ? locals : nil, primitive_name] + end + when "cinit" + inlines << [inlines.size, lineno, text, nil, nil] + return true + end + + argc -= 1 + else + # This is a call to Primitive that is not a known method, so it must be a + # regular C function. In this case we do not need any special processing. 
+ end end + + bs << [primitive_name, argc, cfunction_name] + return true end -def collect_locals tree - _type, name, (line, _cols) = tree - if locals = LOCALS_DB[[name, line]] - locals - else - if false # for debugging - pp LOCALS_DB - raise "not found: [#{name}, #{line}]" +def each_node(root, &blk) + return unless yield root + + root.each do |key, value| + next if key == "type" || key == "location" + + if value.is_a?(Hash) + each_node(value, &blk) if value.key?("type") + elsif value.is_a?(Array) && value[0].is_a?(Hash) + value.each { |node| each_node(node, &blk) } end end end -def collect_builtin base, tree, name, bs, inlines, locals = nil - while tree - recv = sep = mid = args = nil - case tree.first - when :def - locals = collect_locals(tree[1]) - tree = tree[3] - next - when :defs - locals = collect_locals(tree[3]) - tree = tree[5] - next - when :class - name = 'class' - tree = tree[3] - next - when :sclass, :module - name = 'class' - tree = tree[2] - next - when :method_add_arg - _method_add_arg, mid, (_arg_paren, args) = tree - case mid.first - when :call - _, recv, sep, mid = mid - when :fcall - _, mid = mid - else - mid = nil - end - # w/ trailing comma: [[:method_add_arg, ...]] - # w/o trailing comma: [:args_add_block, [[:method_add_arg, ...]], false] - if args && args.first == :args_add_block - args = args[1] - end - when :vcall - _, mid = tree - when :command # FCALL - _, mid, (_, args) = tree - when :call, :command_call # CALL - _, recv, sep, mid, (_, args) = tree +def visit_node(source, root, name, locals, requires, bs, inlines) + each_node(root) do |node| + case node["type"] + when "CallNode" + visit_call_node(source, node, name, locals, requires, bs, inlines) + when "DefNode" + lineno = line_number(source, node) + visit_node(source, node["body"], name, LOCALS_DB[[node["name"], lineno]], requires, bs, inlines) if node["body"] + false + when "ClassNode", "ModuleNode", "SingletonClassNode" + visit_node(source, node["body"], "class", nil, requires, bs, 
inlines) if node["body"] + false + else + true end + end +end - if mid - raise "unknown sexp: #{mid.inspect}" unless %i[@ident @const].include?(mid.first) - _, mid, (lineno,) = mid - if recv - func_name = nil - case recv.first - when :var_ref - _, recv = recv - if recv.first == :@const and recv[1] == "Primitive" - func_name = mid.to_s - end - when :vcall - _, recv = recv - if recv.first == :@ident and recv[1] == "__builtin" - func_name = mid.to_s - end - end - collect_builtin(base, recv, name, bs, inlines) unless func_name - else - func_name = mid[/\A__builtin_(.+)/, 1] - end - if func_name - cfunc_name = func_name - args.pop unless (args ||= []).last - argc = args.size - - if /(.+)[\!\?]\z/ =~ func_name - case $1 - when 'attr' - # Compile-time validation only. compile.c will parse them. - inline_attrs(args) - break - when 'cstmt' - text = inline_text argc, args.first - - func_name = "_bi#{lineno}" - cfunc_name = make_cfunc_name(inlines, name, lineno) - inlines[cfunc_name] = [lineno, text, locals, func_name] - argc -= 1 - when 'cexpr', 'cconst' - text = inline_text argc, args.first - code = "return #{text};" - - func_name = "_bi#{lineno}" - cfunc_name = make_cfunc_name(inlines, name, lineno) - - locals = [] if $1 == 'cconst' - inlines[cfunc_name] = [lineno, code, locals, func_name] - argc -= 1 - when 'cinit' - text = inline_text argc, args.first - func_name = nil # required - inlines[inlines.size] = [lineno, text, nil, nil] - argc -= 1 - when 'mandatory_only' - func_name = nil - when 'arg' - argc == 1 or raise "unexpected argument number #{argc}" - (arg = args.first)[0] == :symbol_literal or raise "symbol literal expected #{args}" - (arg = arg[1])[0] == :symbol or raise "symbol expected #{arg}" - (var = arg[1] and var = var[1]) or raise "argument name expected #{arg}" - func_name = nil - end - end +def collect_builtins(dump_ast, file) + stdout, stderr, status = Open3.capture3(dump_ast, file) + unless status.success? 
+ warn(stderr) + exit(1) + end - if bs[func_name] && - bs[func_name] != [argc, cfunc_name] - raise "same builtin function \"#{func_name}\", but different arity (was #{bs[func_name]} but #{argc})" - end + source = File.read(file) + root = JSON.parse(stdout) + visit_node(source, root, "top", nil, requires = [], builtins = [], inlines = []) - bs[func_name] = [argc, cfunc_name] if func_name - elsif /\Arequire(?:_relative)\z/ =~ mid and args.size == 1 and - (arg1 = args[0])[0] == :string_literal and - (arg1 = arg1[1])[0] == :string_content and - (arg1 = arg1[1])[0] == :@tstring_content and - sublib = arg1[1] - if File.exist?(f = File.join(@dir, sublib)+".rb") - puts "- #{@base}.rb requires #{sublib}" - if REQUIRED[sublib] - warn "!!! #{sublib} is required from #{REQUIRED[sublib]} already; ignored" - else - REQUIRED[sublib] = @base - (SUBLIBS[@base] ||= []) << sublib - end - ARGV.push(f) - end + requires.each do |sublib| + if File.exist?(f = File.join(@dir, sublib)+".rb") + puts "- #{@base}.rb requires #{sublib}" + if REQUIRED[sublib] + warn "!!! 
#{sublib} is required from #{REQUIRED[sublib]} already; ignored" + else + REQUIRED[sublib] = @base + (SUBLIBS[@base] ||= []) << sublib end - break unless tree = args + ARGV.push(f) end + end - tree.each do |t| - collect_builtin base, t, name, bs, inlines, locals if Array === t + processed_builtins = {} + builtins.each do |(primitive_name, argc, cfunction_name)| + if processed_builtins.key?(primitive_name) && processed_builtins[primitive_name] != [argc, cfunction_name] + raise "same builtin function \"#{primitive_name}\", but different arity (was #{processed_builtins[primitive_name]} but #{argc})" end - break + + processed_builtins[primitive_name] = [argc, cfunction_name] end -end -# ruby mk_builtin_loader.rb TARGET_FILE.rb -# #=> generate TARGET_FILE.rbinc -# + processed_inlines = {} + inlines.each do |(cfunction_name, lineno, text, locals, primitive_name)| + if processed_inlines.key?(cfunction_name) + found = 1000.times.find { |i| !processed_inlines.key?("#{cfunction_name}_#{i}") } + raise "too many functions in same line..." 
unless found + cfunction_name = "#{cfunction_name}_#{found}" + end -LOCALS_DB = {} # [method_name, first_line] = locals + processed_inlines[cfunction_name] = [lineno, text, locals, primitive_name] + end + + [processed_builtins, processed_inlines] +end def collect_iseq iseq_ary # iseq_ary.each_with_index{|e, i| p [i, e]} @@ -313,24 +287,24 @@ def generate_cexpr(ofile, lineno, line_file, body_lineno, text, locals, func_nam return lineno, f.string end -def mk_builtin_header file +def mk_builtin_header dump_ast, file @dir = File.dirname(file) base = File.basename(file, '.rb') @base = base ofile = "#{file}inc" - # bs = { func_name => argc } - code = File.read(file) begin verbose, $VERBOSE = $VERBOSE, true - collect_iseq RubyVM::InstructionSequence.compile(code, base).to_a + collect_iseq RubyVM::InstructionSequence.compile_file(file).to_a ensure $VERBOSE = verbose end if warnings = CompileWarning.reset raise "#{warnings} warnings in #{file}" end - collect_builtin(base, Ripper.sexp(code), 'top', bs = {}, inlines = {}) + + # bs = { func_name => argc } + bs, inlines = collect_builtins(dump_ast, file) StringIO.open do |f| if File::ALT_SEPARATOR @@ -423,7 +397,15 @@ def mk_builtin_header file end end +dump_ast = ARGV.shift +if !File.executable?(dump_ast) + # dump_ast may not be available during `make up` (CC=false). In that case, + # silently skip rbinc generation — the files will be generated during the + # actual build when CC is available and dump_ast has been compiled. 
+ exit +end + ARGV.each{|file| # feature.rb => load_feature.inc - mk_builtin_header file + mk_builtin_header dump_ast, file } diff --git a/tool/prereq.status b/tool/prereq.status index 6aca615e90ba4d..78b5c2228bf5a9 100644 --- a/tool/prereq.status +++ b/tool/prereq.status @@ -14,6 +14,7 @@ s,@CPPFLAGS@,,g s,@CXXFLAGS@,,g s,@DLDFLAGS@,,g s,@DTRACE_EXT@,dmyh,g +s,@DUMP_AST@,./dump_ast,g s,@EXEEXT@,,g s,@HAVE_BASERUBY@,yes,g s,@IFCHANGE@,tool/ifchange,g diff --git a/win32/Makefile.sub b/win32/Makefile.sub index 1115dd60c0d99f..d3c8475fdbaab3 100644 --- a/win32/Makefile.sub +++ b/win32/Makefile.sub @@ -558,6 +558,8 @@ ACTIONS_ENDGROUP = @:: ABI_VERSION_HDR = $(hdrdir)/ruby/internal/abi.h +DUMP_AST = dump_ast$(EXEEXT) + !include $(srcdir)/common.mk !ifdef SCRIPTPROGRAMS From c0e41097b0c815049e6290e0a3b212b829292bad Mon Sep 17 00:00:00 2001 From: Kevin Newton Date: Thu, 12 Mar 2026 11:38:40 -0400 Subject: [PATCH 2/7] Speed up memmem on Apple Apple's libc implementation of memmem is super slow (it is a forked version of freebsd's that never got vectorized). Instead, we should fall back to the rolling hash on Apple. In the attached benchmark, I'm seeing 1.07% slower to 30.34% slower, depending on the haystack. For reference, here are the various implementations I checked: * musl: https://git.musl-libc.org/cgit/musl/tree/src/string/memmem.c * freebsd: https://github.com/freebsd/freebsd-src/blob/main/lib/libc/string/memmem.c * apple: https://github.com/apple-oss-distributions/Libc/blob/main/string/FreeBSD/memmem.c You can see Apple just linearly searches through the string and calls memcmp each time, whereas the other two do a window'd rolling hash similar to the fallback Ruby already has. 
--- benchmark/string_memsearch.yml | 75 ++++++++++++++++++++++++++++++++++ re.c | 2 +- 2 files changed, 76 insertions(+), 1 deletion(-) create mode 100644 benchmark/string_memsearch.yml diff --git a/benchmark/string_memsearch.yml b/benchmark/string_memsearch.yml new file mode 100644 index 00000000000000..cde363289a5187 --- /dev/null +++ b/benchmark/string_memsearch.yml @@ -0,0 +1,75 @@ +prelude: | + # Haystacks of various sizes + small_hay = "a" * 256 + medium_hay = "a" * 4096 + large_hay = "a" * 65536 + + # Short needles (2-8 bytes) that exercise rb_memsearch_ss + needle_2 = "xy" + needle_4 = "xyzw" + needle_8 = "xyzwabcd" + + # Needle whose first byte is absent from the haystack (memchr fast-path) + # vs needle whose first byte is common (rolling hash comparison) + first_byte_absent = "x" + "a" * 3 + first_byte_common = "a" + "x" * 3 + + # Haystack with match at the end + hay_match_end = "a" * 4095 + "xy" + + # Haystack with match at the start + hay_match_start = "xy" + "a" * 4094 + + # Mixed content haystack (more realistic) + mixed_hay = (("abcdefghij" * 100) + "z") * 10 + +benchmark: + # === First byte absent from haystack (biggest win for rolling hash) === + index_first_byte_absent_small: | + small_hay.index(first_byte_absent) + index_first_byte_absent_medium: | + medium_hay.index(first_byte_absent) + index_first_byte_absent_large: | + large_hay.index(first_byte_absent) + + # === First byte common in haystack (stresses comparison loop) === + index_first_byte_common_small: | + small_hay.index(first_byte_common) + index_first_byte_common_medium: | + medium_hay.index(first_byte_common) + index_first_byte_common_large: | + large_hay.index(first_byte_common) + + # === Needle length variations (all absent) === + index_needle_2_absent: | + medium_hay.index(needle_2) + index_needle_4_absent: | + medium_hay.index(needle_4) + index_needle_8_absent: | + medium_hay.index(needle_8) + + # === Match at end of haystack === + index_match_at_end: | + 
hay_match_end.index(needle_2) + + # === Match at start of haystack === + index_match_at_start: | + hay_match_start.index(needle_2) + + # === include? (same code path) === + include_first_byte_absent: | + medium_hay.include?(first_byte_absent) + include_first_byte_common: | + medium_hay.include?(first_byte_common) + + # === byteindex === + byteindex_first_byte_absent: | + medium_hay.byteindex(first_byte_absent) + byteindex_first_byte_common: | + medium_hay.byteindex(first_byte_common) + + # === Mixed/realistic haystack === + index_mixed_absent: | + mixed_hay.index(needle_4) + index_mixed_present: | + mixed_hay.index("ijab") diff --git a/re.c b/re.c index 0e169694d4e536..9d50ae2d7e882a 100644 --- a/re.c +++ b/re.c @@ -106,7 +106,7 @@ rb_memcicmp(const void *x, const void *y, long len) return 0; } -#ifdef HAVE_MEMMEM +#if defined(HAVE_MEMMEM) && !defined(__APPLE__) static inline long rb_memsearch_ss(const unsigned char *xs, long m, const unsigned char *ys, long n) { From 30ec9c089e322a05f89bf5fc3830b6ce0b4ab45c Mon Sep 17 00:00:00 2001 From: Kevin Newton Date: Wed, 11 Mar 2026 07:47:16 -0400 Subject: [PATCH 3/7] Fix use-after-poison in compile.c and prism_compile.c Prevent GC from accidentally collecting --- compile.c | 1 + prism_compile.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/compile.c b/compile.c index ecc19f6b651d72..1cd1e8e492f9e1 100644 --- a/compile.c +++ b/compile.c @@ -5375,6 +5375,7 @@ compile_hash(rb_iseq_t *iseq, LINK_ANCHOR *const ret, const NODE *node, int meth } VALUE hash = rb_hash_new_with_size(RARRAY_LEN(ary) / 2); rb_hash_bulk_insert(RARRAY_LEN(ary), RARRAY_CONST_PTR(ary), hash); + RB_GC_GUARD(ary); hash = RB_OBJ_SET_FROZEN_SHAREABLE(rb_obj_hide(hash)); /* Emit optimized code */ diff --git a/prism_compile.c b/prism_compile.c index 6bc1da58d0bab0..3fa24029412308 100644 --- a/prism_compile.c +++ b/prism_compile.c @@ -863,6 +863,7 @@ pm_static_literal_value(rb_iseq_t *iseq, const pm_node_t *node, pm_scope_node_t VALUE value = 
rb_hash_new_with_size(elements->size); rb_hash_bulk_insert(RARRAY_LEN(array), RARRAY_CONST_PTR(array), value); + RB_GC_GUARD(array); value = rb_obj_hide(value); RB_OBJ_SET_FROZEN_SHAREABLE(value); @@ -1533,6 +1534,7 @@ pm_compile_hash_elements(rb_iseq_t *iseq, const pm_node_t *node, const pm_node_l VALUE hash = rb_hash_new_with_size(RARRAY_LEN(ary) / 2); rb_hash_bulk_insert(RARRAY_LEN(ary), RARRAY_CONST_PTR(ary), hash); + RB_GC_GUARD(ary); hash = rb_obj_hide(hash); RB_OBJ_SET_FROZEN_SHAREABLE(hash); From 9246b770cd2cfd93fb1849f8b413d159d1fd256b Mon Sep 17 00:00:00 2001 From: Luke Gruber Date: Fri, 13 Mar 2026 09:02:35 -0400 Subject: [PATCH 4/7] Bump timeout for TestGC#test_finalizer_not_run_with_vm_lock This test should be redesigned, but increasing the timeout should probably be good enough for now to not see CI failures related to it. I can see how this could time out after 10s. Failing test after 10s timeout: https://ci.rvm.jp/results/trunk_gcc10@ruby-sp2-noble-docker/6247393 According to the stack trace at time of abort, there was no deadlock and it was in the middle of a GC. Everything looks fine. This is assuming the fatal signal came from `EnvUtil.terminate`. 
--- test/ruby/test_gc.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/ruby/test_gc.rb b/test/ruby/test_gc.rb index 627b3227ee872a..dc26efada0a058 100644 --- a/test/ruby/test_gc.rb +++ b/test/ruby/test_gc.rb @@ -897,7 +897,7 @@ def test_old_to_young_reference end def test_finalizer_not_run_with_vm_lock - assert_ractor(<<~'RUBY') + assert_ractor(<<~'RUBY', timeout: 30) Thread.new do loop do Encoding.list.each do |enc| From 17747554a64559565518495ba7544c114b74869c Mon Sep 17 00:00:00 2001 From: Benoit Daloze Date: Tue, 10 Mar 2026 15:21:14 +0100 Subject: [PATCH 5/7] [ruby/prism] Make it possible to lazily deserialize DefNode in Loader.java * TRUFFLERUBY_METRICS_REPS=5 jt metrics time --experimental-options -e0 For parsing-core: before: 0.097 0.099 0.092 0.096 after: 0.061 0.063 0.066 0.059 * Remove extra trailing spaces by using `<%-#`. https://github.com/ruby/prism/commit/e08b47e26c --- prism/templates/src/serialize.c.erb | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/prism/templates/src/serialize.c.erb b/prism/templates/src/serialize.c.erb index 1f90a2160ea4d2..78e4f348932f45 100644 --- a/prism/templates/src/serialize.c.erb +++ b/prism/templates/src/serialize.c.erb @@ -50,8 +50,6 @@ static void pm_serialize_node(pm_parser_t *parser, pm_node_t *node, pm_buffer_t *buffer) { pm_buffer_append_byte(buffer, (uint8_t) PM_NODE_TYPE(node)); - size_t offset = buffer->length; - <%- if Prism::Template::INCLUDE_NODE_ID -%> pm_buffer_append_varuint(buffer, node->node_id); <%- end -%> @@ -126,7 +124,7 @@ pm_serialize_node(pm_parser_t *parser, pm_node_t *node, pm_buffer_t *buffer) { <%- end -%> <%- if node.needs_serialized_length? 
-%> // serialize length - uint32_t length = pm_sizet_to_u32(buffer->length - offset - sizeof(uint32_t)); + uint32_t length = pm_sizet_to_u32(buffer->length - length_offset); memcpy(buffer->value + length_offset, &length, sizeof(uint32_t)); <%- end -%> break; From fc16d959d01bae0694b863d4cfa7802130b875fb Mon Sep 17 00:00:00 2001 From: Kevin Newton Date: Fri, 13 Mar 2026 11:29:29 -0400 Subject: [PATCH 6/7] [ruby/prism] Expose parse options to Rust https://github.com/ruby/prism/commit/0f1500ce92 --- prism/util/pm_string.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prism/util/pm_string.h b/prism/util/pm_string.h index d8456ff2947eb8..76942180b6eecb 100644 --- a/prism/util/pm_string.h +++ b/prism/util/pm_string.h @@ -94,7 +94,7 @@ void pm_string_owned_init(pm_string_t *string, uint8_t *source, size_t length); * @param source The source of the string. * @param length The length of the string. */ -void pm_string_constant_init(pm_string_t *string, const char *source, size_t length); +PRISM_EXPORTED_FUNCTION void pm_string_constant_init(pm_string_t *string, const char *source, size_t length); /** * Represents the result of calling pm_string_mapped_init or From ee275b41215315236158827baaa3cc042865ce43 Mon Sep 17 00:00:00 2001 From: Kevin Newton Date: Fri, 13 Mar 2026 12:41:52 -0400 Subject: [PATCH 7/7] Git ignore dump_ast if you are doing an in-place build --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 7bf0420d87b744..0671971dbc0217 100644 --- a/.gitignore +++ b/.gitignore @@ -273,6 +273,7 @@ lcov*.info /prism/serialize.c /prism/token_type.c /prism/srcs.mk +/dump_ast # tool/update-NEWS-gemlist.rb /bundled_gems.json