38 changes: 24 additions & 14 deletions lib/rdoc/markup/to_html.rb
@@ -1,6 +1,7 @@
# frozen_string_literal: true
require 'cgi/escape'
require 'cgi/util' unless defined?(CGI::EscapeExt)
require 'rdoc/parser/ripper_state_lex'

##
# Outputs RDoc markup as HTML.
@@ -216,6 +217,23 @@ def accept_paragraph(paragraph)
@res << "</p>\n"
end

# Generates syntax-highlighted HTML for Ruby-like text.

def parsable_text_to_html(text)
if defined?(RDoc::Parser::PrismRuby) && RDoc::Parser::Ruby == RDoc::Parser::PrismRuby
tokens = RDoc::Parser::Tokenizer.tokenize(text).map do |type, text|
RDoc::TokenStream::RipperStateLexCompatToken.new(type, text)
end
else
# RipperStateLex.parse is assumed to fail on some inputs,
# though which inputs trigger the failure is unknown.
tokens = RDoc::Parser::RipperStateLex.parse(text) rescue return
end
result = RDoc::TokenStream.to_html tokens
result = result + "\n" unless "\n" == result[-1]
result
end

##
# Adds +verbatim+ to the output

@@ -224,20 +242,12 @@ def accept_verbatim(verbatim)

klass = nil

content = if verbatim.ruby? or parseable? text then
begin
tokens = RDoc::Parser::RipperStateLex.parse text
klass = ' class="ruby"'

result = RDoc::TokenStream.to_html tokens
result = result + "\n" unless "\n" == result[-1]
result
rescue
CGI.escapeHTML text
end
else
CGI.escapeHTML text
end
if verbatim.ruby? || parseable?(text)
content = parsable_text_to_html(text)
klass = ' class="ruby"' if content # RDoc::Parser::RipperStateLex.parse may fail
end

content ||= CGI.escapeHTML text

if @options.pipe then
@res << "\n<pre><code>#{CGI.escapeHTML text}\n</code></pre>\n"
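For context, a minimal usage sketch of the new helper (illustrative only, not part of this diff; it assumes this branch is installed): parsable_text_to_html returns highlighted HTML, or nil when the RipperStateLex path fails to parse, and accept_verbatim then falls back to plain escaping.

require 'rdoc'
require 'cgi/escape'

to_html = RDoc::Markup::ToHtml.new(RDoc::Options.new)
text    = "1 + 2 # comment\n"

content = to_html.parsable_text_to_html(text) # nil if tokenization fails
content ||= CGI.escapeHTML(text)              # same fallback accept_verbatim uses
puts content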
46 changes: 20 additions & 26 deletions lib/rdoc/parser/prism_ruby.rb
@@ -1,7 +1,7 @@
# frozen_string_literal: true

require 'prism'
require_relative 'ripper_state_lex'
require_relative 'tokenizer'

# Unlike lib/rdoc/parser/ruby.rb, this file is not based on rtags and does not contain code from
# rtags.rb -
@@ -89,10 +89,13 @@ def record_location(container) # :nodoc:
# Scans this Ruby file for Ruby constructs

def scan
@tokens = RDoc::Parser::RipperStateLex.parse(@content)
@lines = @content.lines
result = Prism.parse(@content)
@program_node = result.value
result = Prism.parse_lex(@content)
@prism_comments = result.comments
@program_node, unordered_tokens = result.value
# Heredoc tokens are not in start_offset order.
# Need to sort them to use bsearch for finding tokens by location.
@prism_tokens = unordered_tokens.map(&:first).sort_by { |t| t.location.start_offset }
@line_nodes = {}
prepare_line_nodes(@program_node)
prepare_comments(result.comments)
@@ -205,7 +208,7 @@ def parse_comment_tomdoc(container, comment, line_no, start_line)

meth.start_collecting_tokens(:ruby)
node = @line_nodes[line_no]
tokens = node ? visible_tokens_from_location(node.location) : [file_line_comment_token(start_line)]
tokens = node ? visible_tokens_from_node(node) : [file_line_comment_token(start_line)]
tokens.each { |token| meth.token_stream << token }

container.add_method meth
@@ -273,7 +276,7 @@ def handle_meta_method_comment(comment, directives, node)
elsif line_no || node
method_name ||= call_node_name_arguments(node).first if is_call_node
if node
tokens = visible_tokens_from_location(node.location)
tokens = visible_tokens_from_node(node)
line_no = node.location.start_line
else
tokens = [file_line_comment_token(line_no)]
@@ -368,30 +371,21 @@ def parse_comment_text_to_directives(comment_text, start_line) # :nodoc:
[comment, directives]
end

def slice_tokens(start_pos, end_pos) # :nodoc:
start_index = @tokens.bsearch_index { |t| ([t.line_no, t.char_no] <=> start_pos) >= 0 }
end_index = @tokens.bsearch_index { |t| ([t.line_no, t.char_no] <=> end_pos) >= 0 }
tokens = @tokens[start_index...end_index]
tokens.pop if tokens.last&.kind == :on_nl
tokens
end

def file_line_comment_token(line_no) # :nodoc:
position_comment = RDoc::Parser::RipperStateLex::Token.new(line_no - 1, 0, :on_comment)
position_comment[:text] = "# File #{@top_level.relative_name}, line #{line_no}"
position_comment
text = "# File #{@top_level.relative_name}, line #{line_no}"
RDoc::TokenStream::RipperStateLexCompatToken.new(:on_comment, text)
end

# Returns tokens from the given location
# Returns tokens of the given node's location for syntax highlighting

def visible_tokens_from_location(location)
def visible_tokens_from_node(node)
location = node.location
position_comment = file_line_comment_token(location.start_line)
newline_token = RDoc::Parser::RipperStateLex::Token.new(0, 0, :on_nl, "\n")
indent_token = RDoc::Parser::RipperStateLex::Token.new(location.start_line, 0, :on_sp, ' ' * location.start_character_column)
tokens = slice_tokens(
[location.start_line, location.start_character_column],
[location.end_line, location.end_character_column]
)
newline_token = RDoc::TokenStream::RipperStateLexCompatToken.new(:on_nl, "\n")
indent_token = RDoc::TokenStream::RipperStateLexCompatToken.new(:on_sp, ' ' * location.start_character_column)
tokens = RDoc::Parser::Tokenizer.partial_tokenize(@content, node, @prism_tokens, @prism_comments).map do |type, text|
RDoc::TokenStream::RipperStateLexCompatToken.new(type, text)
end
[position_comment, newline_token, indent_token, *tokens]
end

@@ -894,7 +888,7 @@ def visit_def_node(node)
end
name = node.name.to_s
params, block_params, calls_super = MethodSignatureVisitor.scan_signature(node)
tokens = @scanner.visible_tokens_from_location(node.location)
tokens = @scanner.visible_tokens_from_node(node)

@scanner.add_method(
name,
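For reference, a hedged sketch of the Prism.parse_lex call that scan now relies on (illustrative, not part of this diff). result.value is a pair of the AST and the raw token list; the tokens are sorted by start offset because heredoc tokens can appear out of order, which keeps the later bsearch-based slicing valid.

require 'prism'

result = Prism.parse_lex("a = 1\n")
program_node, token_pairs = result.value  # AST plus [Prism::Token, lexer state] pairs
tokens = token_pairs.map(&:first).sort_by { |t| t.location.start_offset }
tokens.each { |t| p [t.type, t.value] }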
243 changes: 243 additions & 0 deletions lib/rdoc/parser/tokenizer.rb
@@ -0,0 +1,243 @@
require 'prism'
require 'set'

# Tokenizes Ruby code into RDoc::Parser::RipperStateLex-style token types, with token squashing.
# Token squashing is required by RDoc::TokenStream's syntax highlighting.
module RDoc::Parser::Tokenizer
# These constants and the token type map are for compatibility with RDoc::Parser::RipperStateLex.
OTHER = :other
SPACE = :on_sp
NEWLINE = :on_nl
KEYWORD = :on_kw
OP = :on_op
HEREDOC_BEG = :on_heredoc_beg
HEREDOC_CONTENT = :on_heredoc
HEREDOC_END = :on_heredoc_end
COMMENT = :on_comment
INTEGER = :on_int
FLOAT = :on_float
RATIONAL = :on_rational
IMAGINARY = :on_imaginary
SYMBOL = :on_symbol
REGEXP = :on_regexp
STRING = :on_tstring
WORDS = :on_dstring
DEF_METHOD_NAME = :on_ident
DSTRING = :on_dstring

OP_TOKENS = %i[
AMPERSAND AMPERSAND_AMPERSAND
BANG BANG_EQUAL BANG_TILDE CARET COLON COLON_COLON
EQUAL EQUAL_EQUAL EQUAL_GREATER EQUAL_TILDE
GREATER GREATER_GREATER
LESS LESS_EQUAL LESS_EQUAL_GREATER LESS_LESS
MINUS MINUS_GREATER PERCENT PIPE PIPE_PIPE PLUS
QUESTION_MARK SLASH STAR STAR_STAR TILDE
UAMPERSAND UMINUS UPLUS USTAR USTAR_STAR
].to_set

TOKEN_TYPE_MAP = {
IDENTIFIER: :on_ident,
METHOD_NAME: :on_ident,
INSTANCE_VARIABLE: :on_ivar,
CLASS_VARIABLE: :on_cvar,
GLOBAL_VARIABLE: :on_gvar,
BACK_REFERENCE: :on_backref,
NUMBERED_REFERENCE: :on_backref,
CONSTANT: :on_const,
LABEL: :on_label,
INTEGER: :on_int,
FLOAT: :on_float,
RATIONAL: :on_rational,
IMAGINARY: :on_imaginary,
}

class << self
def tokenize(code)
result = Prism.parse_lex(code)
program_node, unordered_tokens = result.value
prism_tokens = unordered_tokens.map(&:first).sort_by! { |token| token.location.start_offset }
partial_tokenize(code, program_node, prism_tokens, result.comments, 0, code.bytesize)
end

def partial_tokenize(whole_code, node, prism_tokens, prism_comments, start_offset = nil, end_offset = nil)
start_offset ||= node.location.start_offset
end_offset ||= node.location.end_offset
visitor = SquashTokenVisitor.new
node.accept(visitor)
squashed_tokens = visitor.tokens
comment_tokens = comment_tokens(slice_by_location(prism_comments, start_offset, end_offset))
normal_tokens = normal_tokens(slice_by_location(prism_tokens, start_offset, end_offset))
prior_tokens = (squashed_tokens + comment_tokens).sort_by {|_, start_offset, _| start_offset }
unify_tokens(whole_code, prior_tokens, normal_tokens, start_offset, end_offset)
end

private

def slice_by_location(items, start_offset, end_offset)
start_index = items.bsearch_index { |item| item.location.end_offset > start_offset } || items.size
end_index = items.bsearch_index { |item| item.location.start_offset >= end_offset } || items.size
items[start_index...end_index]
end

# Unify prior tokens and normal tokens into a token stream.
# Prior tokens have higher priority than normal tokens.
# Also adds missing text (spaces, newlines, etc.) as separate tokens
# so that the entire code is covered.
def unify_tokens(code, prior_tokens, normal_tokens, start_offset, end_offset)
tokens = []
offset = start_offset

# Add missing text such as spaces and newlines as a separate token
flush = -> next_offset {
return if offset == next_offset

code.byteslice(offset...next_offset).scan(/\n|\s+|[^\s]+/) do |text|
type =
if text == "\n"
NEWLINE
elsif /\A\s+\z/.match?(text)
SPACE
else
OTHER
end
tokens << [type, text]
end
}

until prior_tokens.empty? && normal_tokens.empty?
ptok = prior_tokens.first
ntok = normal_tokens.first
if ntok && (!ptok || ntok[2] <= ptok[1])
token = normal_tokens.shift
else
token = prior_tokens.shift
end
type, start_pos, end_pos = token
next if start_pos < offset

flush.call(start_pos)
tokens << [type, code.byteslice(start_pos...end_pos)]
offset = end_pos
end
flush.call(end_offset)
tokens
end

# Extract each normal comment and embdoc comment (which consists of multiple tokens) as a single token
def comment_tokens(comments)
comments.map do |comment|
[COMMENT, comment.location.start_offset, comment.location.end_offset]
end
end

# Convert normal Prism tokens to [type, start_offset, end_offset]
def normal_tokens(tokens)
tokens.map do |token,|
type =
if token.type.start_with?('KEYWORD_')
KEYWORD
elsif OP_TOKENS.include?(token.type.to_sym)
OP
else
TOKEN_TYPE_MAP[token.type] || OTHER
end
[type, token.location.start_offset, token.location.end_offset]
end
end
end

# Visitor to squash the several tokens that make up a single node into one token
class SquashTokenVisitor < Prism::Visitor
attr_reader :tokens
def initialize
@tokens = []
end

# Squash UMINUS and its operand (integer, float, rational, imaginary) token into a single token
def visit_integer_node(node)
push_location(node.location, INTEGER)
end

def visit_float_node(node)
push_location(node.location, FLOAT)
end

def visit_rational_node(node)
push_location(node.location, RATIONAL)
end

def visit_imaginary_node(node)
push_location(node.location, IMAGINARY)
end

def visit_symbol_node(node)
push_location(node.location, SYMBOL)
end
alias visit_interpolated_symbol_node visit_symbol_node

def visit_regular_expression_node(node)
push_location(node.location, REGEXP)
end
alias visit_match_last_line_node visit_regular_expression_node
alias visit_interpolated_regular_expression_node visit_regular_expression_node
alias visit_interpolated_match_last_line_node visit_regular_expression_node

def visit_string_node(node)
# opening of StringNode inside InterpolatedStringNode might be nil
if node.opening&.start_with?('<<')
push_location(node.opening_loc, HEREDOC_BEG)
push_location(node.content_loc, HEREDOC_CONTENT)
push_location(node.closing_loc, HEREDOC_END)
else
push_location(node.location, STRING)
end
end
alias visit_x_string_node visit_string_node

def visit_array_node(node)
# Right hand side of `a = 1,2` is an array node without opening
if node.opening&.start_with?('%')
# Percent array: squash entire node into a single token.
# We don't handle embedded expressions inside yet.
push_location(node.location, WORDS)
else
super
end
end

def push_location(location, type)
@tokens << [type, location.start_offset, location.end_offset]
end

def visit_def_node(node)
# For special colorizing of method name in def node
push_location(node.name_loc, DEF_METHOD_NAME)
super
end

def visit_interpolated_string_node(node)
# `"a" "b"` is an interpolated string node without opening
if node.opening&.start_with?('<<')
# Heredocs. Squash content into a single token.
# We don't tokenize embedded expressions inside, and don't handle nested heredocs yet.
push_location(node.opening_loc, HEREDOC_BEG)
unless node.parts.empty?
# Squash heredoc content into a single token
part_locations = node.parts.map(&:location)
@tokens << [
HEREDOC_CONTENT,
part_locations.map(&:start_offset).min,
part_locations.map(&:end_offset).max
]
end
# incomplete heredoc might not have closing_loc
push_location(node.closing_loc, HEREDOC_END) if node.closing_loc
else
# Squash entire node into a single token
push_location(node.location, DSTRING)
end
end
alias visit_interpolated_x_string_node visit_interpolated_string_node
end
end
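A hedged usage sketch of the new module (not part of this diff; the require path assumes the file ships as lib/rdoc/parser/tokenizer.rb): tokenize returns [type, text] pairs whose types use the RipperStateLex-compatible names defined above and whose texts cover the entire input.

require 'rdoc'
require 'rdoc/parser'
require 'rdoc/parser/tokenizer' # load path assumed for this branch

RDoc::Parser::Tokenizer.tokenize("x = :sym # note\n").each do |type, text|
  p [type, text]
end
# Roughly: [:on_ident, "x"], [:on_sp, " "], [:on_op, "="], [:on_sp, " "],
#          [:on_symbol, ":sym"], [:on_sp, " "], [:on_comment, "# note"], ...

partial_tokenize applies the same conversion to a single node's byte range, which is how prism_ruby.rb builds the per-method token streams shown above.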