From 46cc1f82eabffef0d3098f721c5e1036226095ed Mon Sep 17 00:00:00 2001
From: tompng
diff --git a/lib/rdoc/parser/prism_ruby.rb b/lib/rdoc/parser/prism_ruby.rb
index 56da6ac227..c18b37e8b3 100644
--- a/lib/rdoc/parser/prism_ruby.rb
+++ b/lib/rdoc/parser/prism_ruby.rb
@@ -1,7 +1,7 @@
# frozen_string_literal: true
require 'prism'
-require_relative 'ripper_state_lex'
+require_relative 'tokenizer'
# Unlike lib/rdoc/parser/ruby.rb, this file is not based on rtags and does not contain code from
# rtags.rb -
@@ -89,10 +89,13 @@ def record_location(container) # :nodoc:
# Scans this Ruby file for Ruby constructs
def scan
- @tokens = RDoc::Parser::RipperStateLex.parse(@content)
@lines = @content.lines
- result = Prism.parse(@content)
- @program_node = result.value
+ result = Prism.parse_lex(@content)
+ @prism_comments = result.comments
+ @program_node, unordered_tokens = result.value
+ # Heredoc tokens are not in start_offset order.
+ # Need to sort them to use bsearch for finding tokens by location.
+ @prism_tokens = unordered_tokens.map(&:first).sort_by { |t| t.location.start_offset }
@line_nodes = {}
prepare_line_nodes(@program_node)
prepare_comments(result.comments)
@@ -205,7 +208,7 @@ def parse_comment_tomdoc(container, comment, line_no, start_line)
meth.start_collecting_tokens(:ruby)
node = @line_nodes[line_no]
- tokens = node ? visible_tokens_from_location(node.location) : [file_line_comment_token(start_line)]
+ tokens = node ? visible_tokens_from_node(node) : [file_line_comment_token(start_line)]
tokens.each { |token| meth.token_stream << token }
container.add_method meth
@@ -273,7 +276,7 @@ def handle_meta_method_comment(comment, directives, node)
elsif line_no || node
method_name ||= call_node_name_arguments(node).first if is_call_node
if node
- tokens = visible_tokens_from_location(node.location)
+ tokens = visible_tokens_from_node(node)
line_no = node.location.start_line
else
tokens = [file_line_comment_token(line_no)]
@@ -368,30 +371,21 @@ def parse_comment_text_to_directives(comment_text, start_line) # :nodoc:
[comment, directives]
end
- def slice_tokens(start_pos, end_pos) # :nodoc:
- start_index = @tokens.bsearch_index { |t| ([t.line_no, t.char_no] <=> start_pos) >= 0 }
- end_index = @tokens.bsearch_index { |t| ([t.line_no, t.char_no] <=> end_pos) >= 0 }
- tokens = @tokens[start_index...end_index]
- tokens.pop if tokens.last&.kind == :on_nl
- tokens
- end
-
def file_line_comment_token(line_no) # :nodoc:
- position_comment = RDoc::Parser::RipperStateLex::Token.new(line_no - 1, 0, :on_comment)
- position_comment[:text] = "# File #{@top_level.relative_name}, line #{line_no}"
- position_comment
+ text = "# File #{@top_level.relative_name}, line #{line_no}"
+ RDoc::TokenStream::RipperStateLexCompatToken.new(:on_comment, text)
end
- # Returns tokens from the given location
+ # Returns tokens of the given node's location for syntax highlighting
- def visible_tokens_from_location(location)
+ def visible_tokens_from_node(node)
+ location = node.location
position_comment = file_line_comment_token(location.start_line)
- newline_token = RDoc::Parser::RipperStateLex::Token.new(0, 0, :on_nl, "\n")
- indent_token = RDoc::Parser::RipperStateLex::Token.new(location.start_line, 0, :on_sp, ' ' * location.start_character_column)
- tokens = slice_tokens(
- [location.start_line, location.start_character_column],
- [location.end_line, location.end_character_column]
- )
+ newline_token = RDoc::TokenStream::RipperStateLexCompatToken.new(:on_nl, "\n")
+ indent_token = RDoc::TokenStream::RipperStateLexCompatToken.new(:on_sp, ' ' * location.start_character_column)
+ tokens = RDoc::Parser::Tokenizer.partial_tokenize(@content, node, @prism_tokens, @prism_comments).map do |type, text|
+ RDoc::TokenStream::RipperStateLexCompatToken.new(type, text)
+ end
[position_comment, newline_token, indent_token, *tokens]
end
@@ -894,7 +888,7 @@ def visit_def_node(node)
end
name = node.name.to_s
params, block_params, calls_super = MethodSignatureVisitor.scan_signature(node)
- tokens = @scanner.visible_tokens_from_location(node.location)
+ tokens = @scanner.visible_tokens_from_node(node)
@scanner.add_method(
name,
diff --git a/lib/rdoc/parser/tokenizer.rb b/lib/rdoc/parser/tokenizer.rb
new file mode 100644
index 0000000000..cc8dd181e1
--- /dev/null
+++ b/lib/rdoc/parser/tokenizer.rb
@@ -0,0 +1,258 @@
+require 'prism'
+require 'set'
+
+# Tokenizes Ruby code into RDoc::Parser::RipperStateLex-style token types, with token squashing.
+# Token squashing is required by RDoc::TokenStream's syntax highlighting.
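+#
+# A minimal usage sketch (output shape only; the exact kinds assigned to
+# whitespace and trailing tokens come from the merging logic below):
+#
+#   RDoc::Parser::Tokenizer.tokenize("42 # answer\n")
+#   # => [[:on_int, "42"], [:on_sp, " "], [:on_comment, "# answer"], ...]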
+module RDoc::Parser::Tokenizer
+  # These constants and the token type map are for compatibility with RDoc::Parser::RipperStateLex.
+ OTHER = :other
+ SPACE = :on_sp
+ NEWLINE = :on_nl
+ KEYWORD = :on_kw
+ OP = :on_op
+ HEREDOC_BEG = :on_heredoc_beg
+ HEREDOC_CONTENT = :on_heredoc
+ HEREDOC_END = :on_heredoc_end
+ COMMENT = :on_comment
+ INTEGER = :on_int
+ FLOAT = :on_float
+ RATIONAL = :on_rational
+ IMAGINARY = :on_imaginary
+ SYMBOL = :on_symbol
+ REGEXP = :on_regexp
+ STRING = :on_tstring
+ WORDS = :on_dstring
+ DEF_METHOD_NAME = :on_ident
+ DSTRING = :on_dstring
+
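+  # Prism operator token types that are reported as the Ripper-style :on_op kind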
+ OP_TOKENS = %i[
+ AMPERSAND AMPERSAND_AMPERSAND
+ BANG BANG_EQUAL BANG_TILDE CARET COLON COLON_COLON
+ EQUAL EQUAL_EQUAL EQUAL_GREATER EQUAL_TILDE
+ GREATER GREATER_GREATER
+ LESS LESS_EQUAL LESS_EQUAL_GREATER LESS_LESS
+ MINUS MINUS_GREATER PERCENT PIPE PIPE_PIPE PLUS
+ QUESTION_MARK SLASH STAR STAR_STAR TILDE
+ UAMPERSAND UMINUS UPLUS USTAR USTAR_STAR
+ ].to_set
+
+ TOKEN_TYPE_MAP = {
+ IDENTIFIER: :on_ident,
+ METHOD_NAME: :on_ident,
+ INSTANCE_VARIABLE: :on_ivar,
+ CLASS_VARIABLE: :on_cvar,
+ GLOBAL_VARIABLE: :on_gvar,
+ BACK_REFERENCE: :on_backref,
+ NUMBERED_REFERENCE: :on_backref,
+ CONSTANT: :on_const,
+ LABEL: :on_label,
+ INTEGER: :on_int,
+ FLOAT: :on_float,
+ RATIONAL: :on_rational,
+ IMAGINARY: :on_imaginary,
+ }
+
+ class << self
+ def tokenize(code)
+ result = Prism.parse_lex(code)
+ program_node, unordered_tokens = result.value
+ prism_tokens = unordered_tokens.map(&:first).sort_by! { |token| token.location.start_offset }
+ partial_tokenize(code, program_node, prism_tokens, result.comments, 0, code.bytesize)
+ end
+
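+    # Tokenize just the slice of +whole_code+ covered by +node+ (or by the
+    # explicit byte offsets). Tokens squashed from the node and comment tokens
+    # take priority over the raw Prism tokens wherever their ranges overlap.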
+ def partial_tokenize(whole_code, node, prism_tokens, prism_comments, start_offset = nil, end_offset = nil)
+ start_offset ||= node.location.start_offset
+ end_offset ||= node.location.end_offset
+ visitor = SquashTokenVisitor.new
+ node.accept(visitor)
+ squashed_tokens = visitor.tokens
+ comment_tokens = comment_tokens(slice_by_location(prism_comments, start_offset, end_offset))
+ normal_tokens = normal_tokens(slice_by_location(prism_tokens, start_offset, end_offset))
+      prior_tokens = (squashed_tokens + comment_tokens).sort_by { |_, offset, _| offset }
+ unify_tokens(whole_code, prior_tokens, normal_tokens, start_offset, end_offset)
+ end
+
+ private
+
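+    # Binary-search the location-sorted +items+ (tokens or comments) and return
+    # those overlapping the byte range [start_offset, end_offset).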
+ def slice_by_location(items, start_offset, end_offset)
+ start_index = items.bsearch_index { |item| item.location.end_offset > start_offset } || items.size
+ end_index = items.bsearch_index { |item| item.location.start_offset >= end_offset } || items.size
+ items[start_index...end_index]
+ end
+
+ # Unify prior tokens and normal tokens into a token stream.
+ # Prior tokens have higher priority than normal tokens.
+ # Also adds missing text (spaces, newlines, etc.) as separate tokens
+ # so that the entire code is covered.
+ def unify_tokens(code, prior_tokens, normal_tokens, start_offset, end_offset)
+ tokens = []
+ offset = start_offset
+
+ # Add missing text such as spaces and newlines as a separate token
+ flush = -> next_offset {
+ return if offset == next_offset
+
+ code.byteslice(offset...next_offset).scan(/\n|\s+|[^\s]+/) do |text|
+ type =
+ if text == "\n"
+ NEWLINE
+ elsif /\A\s+\z/.match?(text)
+ SPACE
+ else
+ OTHER
+ end
+ tokens << [type, text]
+ end
+ }
+
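+      # Two-way merge of the two sorted streams: emit a normal token only when
+      # it ends before the next prior token begins; tokens already covered by
+      # an emitted token are skipped by the offset guard below.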
+ until prior_tokens.empty? && normal_tokens.empty?
+ ptok = prior_tokens.first
+ ntok = normal_tokens.first
+ if ntok && (!ptok || ntok[2] <= ptok[1])
+ token = normal_tokens.shift
+ else
+ token = prior_tokens.shift
+ end
+ type, start_pos, end_pos = token
+ next if start_pos < offset
+
+ flush.call(start_pos)
+ tokens << [type, code.byteslice(start_pos...end_pos)]
+ offset = end_pos
+ end
+ flush.call(end_offset)
+ tokens
+ end
+
+    # Extract each normal comment and embdoc comment (which consists of multiple tokens) as a single token
+ def comment_tokens(comments)
+ comments.map do |comment|
+ [COMMENT, comment.location.start_offset, comment.location.end_offset]
+ end
+ end
+
+ # Convert normal Prism tokens to [type, start_offset, end_offset]
+ def normal_tokens(tokens)
+      tokens.map do |token|
+ type =
+ if token.type.start_with?('KEYWORD_')
+ KEYWORD
+ elsif OP_TOKENS.include?(token.type.to_sym)
+ OP
+ else
+ TOKEN_TYPE_MAP[token.type] || OTHER
+ end
+ [type, token.location.start_offset, token.location.end_offset]
+ end
+ end
+ end
+
+  # Visitor to squash the several tokens that constitute a single node into one token
+ class SquashTokenVisitor < Prism::Visitor
+ attr_reader :tokens
+ def initialize
+ @tokens = []
+ end
+
+    # Squash UMINUS and its operand (integer, float, rational, or imaginary) token into a single token
+ def visit_integer_node(node)
+ push_location(node.location, INTEGER)
+ end
+
+ def visit_float_node(node)
+ push_location(node.location, FLOAT)
+ end
+
+ def visit_rational_node(node)
+ push_location(node.location, RATIONAL)
+ end
+
+ def visit_imaginary_node(node)
+ push_location(node.location, IMAGINARY)
+ end
+
+ def visit_symbol_node(node)
+ push_location(node.location, SYMBOL)
+ end
+ alias visit_interpolated_symbol_node visit_symbol_node
+
+ def visit_regular_expression_node(node)
+ push_location(node.location, REGEXP)
+ end
+ alias visit_match_last_line_node visit_regular_expression_node
+ alias visit_interpolated_regular_expression_node visit_regular_expression_node
+ alias visit_interpolated_match_last_line_node visit_regular_expression_node
+
+ def visit_string_node(node)
+      # The opening of a StringNode inside an InterpolatedStringNode might be nil
+ if node.opening&.start_with?('<<')
+ push_location(node.opening_loc, HEREDOC_BEG)
+ push_location(node.content_loc, HEREDOC_CONTENT)
+ push_location(node.closing_loc, HEREDOC_END)
+ else
+ push_location(node.location, STRING)
+ end
+ end
+ alias visit_x_string_node visit_string_node
+
+ def visit_array_node(node)
+      # The right-hand side of `a = 1, 2` is an array node without an opening
+ if node.opening&.start_with?('%')
+ # Percent array: squash entire node into a single token.
+ # We don't handle embedded expressions inside yet.
+ push_location(node.location, WORDS)
+ else
+ super
+ end
+ end
+
+ def push_location(location, type)
+ @tokens << [type, location.start_offset, location.end_offset]
+ end
+
+ def visit_def_node(node)
+      # For special colorizing of the method name in a def node
+ push_location(node.name_loc, DEF_METHOD_NAME)
+ super
+ end
+
+ def visit_interpolated_string_node(node)
+ # `"a" "b"` is an interpolated string node without opening
+ if node.opening&.start_with?('<<')
+ # Heredocs. Squash content into a single token.
+ # We don't tokenize embedded expressions inside, and don't handle nested heredocs yet.
+ push_location(node.opening_loc, HEREDOC_BEG)
+ unless node.parts.empty?
+ # Squash heredoc content into a single token
+ part_locations = node.parts.map(&:location)
+ @tokens << [
+ HEREDOC_CONTENT,
+ part_locations.map(&:start_offset).min,
+ part_locations.map(&:end_offset).max
+ ]
+ end
+ # incomplete heredoc might not have closing_loc
+ push_location(node.closing_loc, HEREDOC_END) if node.closing_loc
+ else
+ # Squash entire node into a single token
+ push_location(node.location, DSTRING)
+ end
+ end
+ alias visit_interpolated_x_string_node visit_interpolated_string_node
+ end
+end
diff --git a/lib/rdoc/token_stream.rb b/lib/rdoc/token_stream.rb
index 5a4ca82a67..07f83862c1 100644
--- a/lib/rdoc/token_stream.rb
+++ b/lib/rdoc/token_stream.rb
@@ -9,6 +9,10 @@
module RDoc::TokenStream
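+  # Struct mimicking the interface of RDoc::Parser::RipperStateLex::Token so
+  # that existing token stream consumers keep working with the Prism tokenizer.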
+ RipperStateLexCompatToken = Struct.new(:kind, :text, :line_no, :char_no, :state)
+
##
# Converts +token_stream+ to HTML wrapping various tokens with
# elements. Some tokens types are wrapped in spans
diff --git a/test/rdoc/parser/prism_ruby_test.rb b/test/rdoc/parser/prism_ruby_test.rb
index e393f70e3e..e12a9751bf 100644
--- a/test/rdoc/parser/prism_ruby_test.rb
+++ b/test/rdoc/parser/prism_ruby_test.rb
@@ -2065,6 +2065,24 @@ def test_read_directive_linear_performance
end
end
+ def test_code_object_token_stream
+ util_parser <<~RUBY
+ class Foo
+ def foo
+ 42
+ end
+
+ private def bar
+ :bar
+ end
+ end
+ RUBY
+
+ foo, bar = @top_level.classes.first.method_list
+ # Skip first two tokens: location comment and newline
+ assert_equal([' ', 'def', ' ', 'foo', "\n", ' ', '42', "\n", ' ', 'end'], foo.token_stream.drop(2).map(&:text))
+ assert_equal([' ', 'def', ' ', 'bar', "\n", ' ', ':bar', "\n", ' ', 'end'], bar.token_stream.drop(2).map(&:text))
+ end
def test_markup_first_comment
util_parser <<~RUBY
diff --git a/test/rdoc/parser/tokenizer_test.rb b/test/rdoc/parser/tokenizer_test.rb
new file mode 100644
index 0000000000..07cbcb1a8a
--- /dev/null
+++ b/test/rdoc/parser/tokenizer_test.rb
@@ -0,0 +1,141 @@
+# frozen_string_literal: true
+require_relative '../helper'
+require 'rdoc/parser/tokenizer'
+
+class RDocParserTokenizerTest < RDoc::TestCase
+ def test_partial_tokenize
+ code = <<~RUBY
+ class A
+ def m
+ # comment
+ 42
+ end
+ end
+ RUBY
+ parse_result = Prism.parse_lex(code)
+ program_node, unordered_tokens = parse_result.value
+ prism_tokens = unordered_tokens.map(&:first).sort_by! { |token| token.location.start_offset }
+ def_node = program_node.statements.body[0].body.body[0]
+ tokens = RDoc::Parser::Tokenizer.partial_tokenize(code, def_node, prism_tokens, parse_result.comments)
+ expected = ['def', ' ', 'm', "\n", ' ', '# comment', "\n", ' ', '42', "\n", ' ', 'end']
+ assert_equal(expected, tokens.map(&:last))
+ end
+
+ def test_comment
+ code = <<~RUBY
+ # comment1
+ class A
+ =begin
+ comment2
+ =end
+ def m
+ 42 # comment3
+ end
+ end
+ RUBY
+ tokens = RDoc::Parser::Tokenizer.tokenize(code)
+ assert_equal(code, tokens.map(&:last).join)
+ assert_include(tokens, [:on_comment, '# comment1'])
+ assert_include(tokens, [:on_comment, "=begin\ncomment2\n=end\n"])
+ assert_include(tokens, [:on_comment, '# comment3'])
+ end
+
+ def test_squash_uminus
+ code = <<~RUBY
+ def m
+ -42; -4.2; -42i; -42r
+ end
+ RUBY
+ tokens = RDoc::Parser::Tokenizer.tokenize(code)
+ assert_equal(code, tokens.map(&:last).join)
+ assert_include(tokens, [:on_int, '-42'])
+ assert_include(tokens, [:on_float, '-4.2'])
+ assert_include(tokens, [:on_imaginary, '-42i'])
+ assert_include(tokens, [:on_rational, '-42r'])
+ end
+
+ def test_squash_interpolated_node
+ code = <<~'RUBY'
+ def m
+ "string#{interpolation}example"
+ /regexp#{interpolation}example/
+ :"symbol#{interpolation}example"
+ end
+ RUBY
+ tokens = RDoc::Parser::Tokenizer.tokenize(code)
+ assert_equal(code, tokens.map(&:last).join)
+ assert_include(tokens, [:on_dstring, '"string#{interpolation}example"'])
+ assert_include(tokens, [:on_regexp, '/regexp#{interpolation}example/'])
+ assert_include(tokens, [:on_symbol, ':"symbol#{interpolation}example"'])
+ end
+
+ def test_squash_words
+ code = <<~RUBY
+ def m
+ a = 1, 2 # array without opening. %w[] squashing should not fail with this input
+ %w[one two three]
+ %W[one \#{two} three]
+ %i[one two three]
+ %I[one \#{two} three]
+ end
+ RUBY
+ tokens = RDoc::Parser::Tokenizer.tokenize(code)
+ assert_equal(code, tokens.map(&:last).join)
+ assert_include(tokens, [:on_dstring, '%w[one two three]'])
+ assert_include(tokens, [:on_dstring, '%W[one #{two} three]'])
+ assert_include(tokens, [:on_dstring, '%i[one two three]'])
+ assert_include(tokens, [:on_dstring, '%I[one #{two} three]'])
+ end
+
+ def test_multibyte
+ code = <<~RUBY
+ def f(s = '💎')
+ # comment 💎
+ puts '💎' + s
+ end
+ RUBY
+ tokens = RDoc::Parser::Tokenizer.tokenize(code)
+ assert_equal(code, tokens.map(&:last).join)
+ end
+
+ def test_string_concat_node
+ # concatenated string node has no opening
+ code = <<~'RUBY'
+ def f
+ %[hello] 'HELLO'\
+ "world"
+ end
+ RUBY
+ tokens = RDoc::Parser::Tokenizer.tokenize(code)
+ assert_equal(code, tokens.map(&:last).join)
+ end
+
+ def test_squash_heredoc
+ code = <<~'RUBY'
+ def f
+ str1 = <<~AA
+ single-line-heredoc
+ AA
+ str2 = <<~`BB` # comment
+ x-string-heredoc
+ BB
+ str3 = <<~CC.itself
+ multi-line
+ #{embed}
+ heredoc
+ CC
+ end
+ RUBY
+ tokens = RDoc::Parser::Tokenizer.tokenize(code)
+ assert_equal(code, tokens.map(&:last).join)
+ assert_include(tokens, [:on_heredoc_beg, '<<~AA'])
+ assert_include(tokens, [:on_heredoc_beg, '<<~`BB`'])
+ assert_include(tokens, [:on_heredoc_beg, '<<~CC'])
+ assert_include(tokens, [:on_heredoc_end, " AA\n"])
+ assert_include(tokens, [:on_heredoc_end, " BB\n"])
+ assert_include(tokens, [:on_heredoc_end, " CC\n"])
+ assert_include(tokens, [:on_heredoc, " single-line-heredoc\n"])
+ assert_include(tokens, [:on_heredoc, " x-string-heredoc\n"])
+ assert_include(tokens, [:on_heredoc, " multi-line\n \#{embed}\n heredoc\n"])
+ end
+end