Skip to content

Commit 7039c4a

Browse files
authored
Merge pull request #21981 from asgerf/yeast/comments
Yeast/Unified: Extract comments
2 parents 746631d + 6000c18 commit 7039c4a

10 files changed

Lines changed: 208 additions & 20 deletions

File tree

shared/tree-sitter-extractor/src/extractor/mod.rs

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -333,6 +333,9 @@ pub fn extract(
333333
.run_from_tree(&tree, source)
334334
.unwrap_or_else(|e| panic!("Desugaring failed for {path_str}: {e}"));
335335
traverse_yeast(&ast, &mut visitor);
336+
// Comments and other `extra` nodes are not represented in the desugared
337+
// AST, so recover them directly from the original parse tree.
338+
traverse_extras(&tree, &mut visitor);
336339
} else {
337340
traverse(&tree, &mut visitor);
338341
}
@@ -365,6 +368,8 @@ struct Visitor<'a> {
365368
ast_node_parent_table_name: String,
366369
/// Language-specific name of the tokeninfo table
367370
tokeninfo_table_name: String,
371+
/// Language-specific name of the trivia tokeninfo table
372+
trivia_tokeninfo_table_name: String,
368373
/// A lookup table from type name to node types
369374
schema: &'a NodeTypeMap,
370375
/// A stack for gathering information from child nodes. Whenever a node is
@@ -395,11 +400,33 @@ impl<'a> Visitor<'a> {
395400
ast_node_location_table_name: format!("{language_prefix}_ast_node_location"),
396401
ast_node_parent_table_name: format!("{language_prefix}_ast_node_parent"),
397402
tokeninfo_table_name: format!("{language_prefix}_tokeninfo"),
403+
trivia_tokeninfo_table_name: format!("{language_prefix}_trivia_tokeninfo"),
398404
schema,
399405
stack: Vec::new(),
400406
}
401407
}
402408

409+
/// Emits a `TriviaToken` for the given `extra` node (e.g. a comment) from
410+
/// the original parse tree. Trivia tokens carry a location and their source
411+
/// text, but are not attached to a parent in the (possibly desugared) AST.
412+
fn emit_trivia_token(&mut self, node: &Node) {
413+
let id = self.trap_writer.fresh_id();
414+
let loc = location_for(self, self.file_label, node);
415+
let loc_label = location_label(self.trap_writer, loc);
416+
self.trap_writer.add_tuple(
417+
&self.ast_node_location_table_name,
418+
vec![trap::Arg::Label(id), trap::Arg::Label(loc_label)],
419+
);
420+
self.trap_writer.add_tuple(
421+
&self.trivia_tokeninfo_table_name,
422+
vec![
423+
trap::Arg::Label(id),
424+
trap::Arg::Int(node.kind_id() as usize),
425+
sliced_source_arg(self.source, node),
426+
],
427+
);
428+
}
429+
403430
fn record_parse_error(&mut self, loc: trap::Label, mesg: &diagnostics::DiagnosticMessage) {
404431
self.diagnostics_writer.write(mesg);
405432
let id = self.trap_writer.fresh_id();
@@ -835,6 +862,24 @@ fn traverse(tree: &Tree, visitor: &mut Visitor) {
835862
}
836863
}
837864

865+
/// Walks the original tree-sitter tree and emits a `TriviaToken` for every
866+
/// `extra` node (e.g. a comment). Used to preserve comments that would
867+
/// otherwise be lost after a desugaring pass rewrites the tree.
868+
fn traverse_extras(tree: &Tree, visitor: &mut Visitor) {
869+
emit_extras_in(visitor, tree.root_node());
870+
}
871+
872+
/// Emits a trivia token for every `extra` node strictly below `node`, in
/// pre-order (document order).
///
/// The subtree of an `extra` node is not searched any further, and `node`
/// itself is never emitted, only its descendants.
///
/// The walk is driven by a single `TreeCursor` rather than by recursion, so
/// that deeply nested parse trees cannot exhaust the call stack and no new
/// cursor has to be allocated per visited node.
fn emit_extras_in(visitor: &mut Visitor, node: Node<'_>) {
    let mut cursor = node.walk();
    // How far the cursor currently sits below `node`. The walk is finished
    // as soon as we would have to move sideways or upwards from depth 0.
    let mut depth = 0usize;
    // Whether the children of the node under the cursor still need visiting.
    let mut descend = true;
    loop {
        if descend && cursor.goto_first_child() {
            depth += 1;
        } else {
            // The current subtree is done: advance to the next sibling,
            // climbing towards `node` for as long as there is none.
            loop {
                if depth == 0 {
                    return;
                }
                if cursor.goto_next_sibling() {
                    break;
                }
                if !cursor.goto_parent() {
                    return;
                }
                depth -= 1;
            }
        }

        let current = cursor.node();
        descend = !current.is_extra();
        if !descend {
            visitor.emit_trivia_token(&current);
        }
    }
}
882+
838883
fn traverse_yeast(tree: &yeast::Ast, visitor: &mut Visitor) {
839884
use yeast::Cursor;
840885
let mut cursor = tree.walk();

shared/tree-sitter-extractor/src/generator/mod.rs

Lines changed: 37 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,12 @@ pub fn generate(
6868
let node_parent_table_name = format!("{}_ast_node_parent", &prefix);
6969
let token_name = format!("{}_token", &prefix);
7070
let tokeninfo_name = format!("{}_tokeninfo", &prefix);
71+
let trivia_token_name = format!("{}_trivia_token", &prefix);
72+
let trivia_tokeninfo_name = format!("{}_trivia_tokeninfo", &prefix);
7173
let reserved_word_name = format!("{}_reserved_word", &prefix);
74+
// When a desugaring is configured, comments and other `extra` nodes are
75+
// preserved from the original parse tree as `TriviaToken`s.
76+
let has_trivia_tokens = language.desugar.is_some();
7277
let effective_node_types: String = match language
7378
.desugar
7479
.as_ref()
@@ -85,28 +90,35 @@ pub fn generate(
8590
let nodes = node_types::read_node_types_str(&prefix, &effective_node_types)?;
8691
let (dbscheme_entries, mut ast_node_members, token_kinds) = convert_nodes(&nodes);
8792
ast_node_members.insert(&token_name);
93+
if has_trivia_tokens {
94+
ast_node_members.insert(&trivia_token_name);
95+
}
8896
writeln!(&mut dbscheme_writer, "/*- {} dbscheme -*/", language.name)?;
8997
dbscheme::write(&mut dbscheme_writer, &dbscheme_entries)?;
9098
let token_case = create_token_case(&token_name, token_kinds);
91-
dbscheme::write(
92-
&mut dbscheme_writer,
93-
&[
94-
dbscheme::Entry::Table(create_tokeninfo(&tokeninfo_name, &token_name)),
95-
dbscheme::Entry::Case(token_case),
96-
dbscheme::Entry::Union(dbscheme::Union {
97-
name: &ast_node_name,
98-
members: ast_node_members,
99-
}),
100-
dbscheme::Entry::Table(create_ast_node_location_table(
101-
&node_location_table_name,
102-
&ast_node_name,
103-
)),
104-
dbscheme::Entry::Table(create_ast_node_parent_table(
105-
&node_parent_table_name,
106-
&ast_node_name,
107-
)),
108-
],
109-
)?;
99+
let mut dbscheme_tail = vec![
100+
dbscheme::Entry::Table(create_tokeninfo(&tokeninfo_name, &token_name)),
101+
dbscheme::Entry::Case(token_case),
102+
];
103+
if has_trivia_tokens {
104+
dbscheme_tail.push(dbscheme::Entry::Table(create_tokeninfo(
105+
&trivia_tokeninfo_name,
106+
&trivia_token_name,
107+
)));
108+
}
109+
dbscheme_tail.push(dbscheme::Entry::Union(dbscheme::Union {
110+
name: &ast_node_name,
111+
members: ast_node_members,
112+
}));
113+
dbscheme_tail.push(dbscheme::Entry::Table(create_ast_node_location_table(
114+
&node_location_table_name,
115+
&ast_node_name,
116+
)));
117+
dbscheme_tail.push(dbscheme::Entry::Table(create_ast_node_parent_table(
118+
&node_parent_table_name,
119+
&ast_node_name,
120+
)));
121+
dbscheme::write(&mut dbscheme_writer, &dbscheme_tail)?;
110122

111123
let mut body = vec![
112124
ql::TopLevel::Class(ql_gen::create_ast_node_class(
@@ -116,6 +128,12 @@ pub fn generate(
116128
)),
117129
ql::TopLevel::Class(ql_gen::create_token_class(&token_name, &tokeninfo_name)),
118130
];
131+
if has_trivia_tokens {
132+
body.push(ql::TopLevel::Class(ql_gen::create_trivia_token_class(
133+
&trivia_token_name,
134+
&trivia_tokeninfo_name,
135+
)));
136+
}
119137
// Only emit the ReservedWord class when there are actually unnamed token
120138
// types in the schema (i.e., @{prefix}_reserved_word exists in the dbscheme).
121139
// When converting from a YEAST YAML schema that has no unnamed tokens, this

shared/tree-sitter-extractor/src/generator/ql_gen.rs

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,70 @@ pub fn create_token_class<'a>(token_type: &'a str, tokeninfo: &'a str) -> ql::Cl
199199
}
200200
}
201201

202+
/// Creates the `TriviaToken` class. Trivia tokens (e.g. comments) are
203+
/// `extra` nodes preserved from the original parse tree even when the tree has
204+
/// been rewritten by a desugaring pass. They are not part of the regular
205+
/// `Token` hierarchy because they do not appear in the (possibly desugared)
206+
/// output schema.
207+
pub fn create_trivia_token_class<'a>(
208+
trivia_token_type: &'a str,
209+
trivia_tokeninfo: &'a str,
210+
) -> ql::Class<'a> {
211+
let trivia_tokeninfo_arity = 3; // id, kind, value
212+
let get_value = ql::Predicate {
213+
qldoc: Some(String::from("Gets the source text of this trivia token.")),
214+
name: "getValue",
215+
overridden: false,
216+
is_private: false,
217+
is_final: true,
218+
return_type: Some(ql::Type::String),
219+
formal_parameters: vec![],
220+
body: create_get_field_expr_for_column_storage(
221+
"result",
222+
trivia_tokeninfo,
223+
1,
224+
trivia_tokeninfo_arity,
225+
),
226+
overlay: None,
227+
};
228+
let to_string = ql::Predicate {
229+
qldoc: Some(String::from(
230+
"Gets a string representation of this element.",
231+
)),
232+
name: "toString",
233+
overridden: true,
234+
is_private: false,
235+
is_final: true,
236+
return_type: Some(ql::Type::String),
237+
formal_parameters: vec![],
238+
body: ql::Expression::Equals(
239+
Box::new(ql::Expression::Var("result")),
240+
Box::new(ql::Expression::Dot(
241+
Box::new(ql::Expression::Var("this")),
242+
"getValue",
243+
vec![],
244+
)),
245+
),
246+
overlay: None,
247+
};
248+
ql::Class {
249+
qldoc: Some(String::from(
250+
"A trivia token, such as a comment, preserved from the original parse tree.",
251+
)),
252+
name: "TriviaToken",
253+
is_abstract: false,
254+
supertypes: vec![ql::Type::At(trivia_token_type), ql::Type::Normal("AstNode")]
255+
.into_iter()
256+
.collect(),
257+
characteristic_predicate: None,
258+
predicates: vec![
259+
get_value,
260+
to_string,
261+
create_get_a_primary_ql_class("TriviaToken", false),
262+
],
263+
}
264+
}
265+
202266
// Creates the `ReservedWord` class.
203267
pub fn create_reserved_word_class(db_name: &str) -> ql::Class<'_> {
204268
let class_name = "ReservedWord";

unified/ql/lib/codeql/unified/Ast.qll

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,18 @@ module Unified {
6161
override string getAPrimaryQlClass() { result = "Token" }
6262
}
6363

64+
/** A trivia token, such as a comment, preserved from the original parse tree. */
65+
class TriviaToken extends @unified_trivia_token, AstNode {
66+
/** Gets the source text of this trivia token. */
67+
final string getValue() { unified_trivia_tokeninfo(this, _, result) }
68+
69+
/** Gets a string representation of this element. */
70+
final override string toString() { result = this.getValue() }
71+
72+
/** Gets the name of the primary QL class for this element. */
73+
override string getAPrimaryQlClass() { result = "TriviaToken" }
74+
}
75+
6476
/** Gets the file containing the given `node`. */
6577
private @file getNodeFile(@unified_ast_node node) {
6678
exists(@location_default loc | unified_ast_node_location(node, loc) |
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
/** Provides classes for working with comments. */
2+
3+
private import unified
4+
5+
/**
6+
* A comment appearing in the source code.
7+
*/
8+
class Comment extends TriviaToken {
9+
// At the moment, comments are the only type of trivia token we extract
10+
/**
11+
* Gets the text inside this comment, not counting the delimiters.
12+
*/
13+
string getCommentText() {
14+
result = this.getValue().regexpCapture("//(.*)", 1)
15+
or
16+
result = this.getValue().regexpCapture("(?s)/\\*(.*)\\*/", 1)
17+
}
18+
}

unified/ql/lib/unified.dbscheme

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -334,7 +334,13 @@ case @unified_token.kind of
334334
;
335335

336336

337-
@unified_ast_node = @unified_apply_pattern | @unified_binary_expr | @unified_block_stmt | @unified_call_expr | @unified_expr_condition | @unified_expr_stmt | @unified_guard_if_stmt | @unified_if_stmt | @unified_lambda_expr | @unified_let_pattern_condition | @unified_member_access_expr | @unified_name_expr | @unified_parameter | @unified_sequence_condition | @unified_token | @unified_top_level | @unified_tuple_pattern | @unified_unary_expr | @unified_var_pattern | @unified_variable_declaration_stmt | @unified_variable_declarator
337+
unified_trivia_tokeninfo(
338+
unique int id: @unified_trivia_token,
339+
int kind: int ref,
340+
string value: string ref
341+
);
342+
343+
@unified_ast_node = @unified_apply_pattern | @unified_binary_expr | @unified_block_stmt | @unified_call_expr | @unified_expr_condition | @unified_expr_stmt | @unified_guard_if_stmt | @unified_if_stmt | @unified_lambda_expr | @unified_let_pattern_condition | @unified_member_access_expr | @unified_name_expr | @unified_parameter | @unified_sequence_condition | @unified_token | @unified_top_level | @unified_trivia_token | @unified_tuple_pattern | @unified_unary_expr | @unified_var_pattern | @unified_variable_declaration_stmt | @unified_variable_declarator
338344

339345
unified_ast_node_location(
340346
unique int node: @unified_ast_node ref,

unified/ql/lib/unified.qll

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
/**
2+
* Provides classes for working with the AST, as well as files and locations.
3+
*/
4+
5+
import codeql.Locations
6+
import codeql.files.FileSystem
7+
import codeql.unified.Ast::Unified
8+
import codeql.unified.Comments
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
| comments.swift:1:1:1:22 | // Hello this is swift | Hello this is swift |
2+
| comments.swift:3:1:6:3 | /*\n * This is a multi-line comment\n * It should be ignored by the parser\n */ | \n * This is a multi-line comment\n * It should be ignored by the parser\n |
3+
| comments.swift:9:5:9:36 | // This is a single-line comment | This is a single-line comment |
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
import unified
2+
3+
query predicate comments(Comment c, string text) { text = c.getCommentText() }
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
// Hello this is swift
2+
3+
/*
4+
* This is a multi-line comment
5+
* It should be ignored by the parser
6+
*/
7+
8+
func hello() {
9+
// This is a single-line comment
10+
print("Hello, world!")
11+
}

0 commit comments

Comments
 (0)