This guide walks through adding support for a new programming language to Semfora.
Semfora uses tree-sitter for parsing. Adding a new language requires:
- Adding the tree-sitter grammar crate
- Defining the `Lang` enum variant
- Creating AST node mappings (`LangGrammar`)
- (Optional) Creating a dedicated detector for special features
- Wiring up the dispatcher
Find the tree-sitter grammar for your language on crates.io. Most follow the naming pattern tree-sitter-{language}.
Cargo.toml:
[dependencies]
tree-sitter-ruby = "0.23" # Example for Ruby (0.23+ exposes the LANGUAGE constant used below)

Common grammars:
- tree-sitter-rust, tree-sitter-go, tree-sitter-python
- tree-sitter-typescript (includes TypeScript and TSX)
- tree-sitter-java, tree-sitter-kotlin-ng
- tree-sitter-c, tree-sitter-cpp
src/lang.rs:
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Lang {
// ... existing variants ...
Ruby, // Add new variant
}

impl Lang {
pub fn from_extension(ext: &str) -> Result<Self> {
match ext.to_lowercase().as_str() {
// ... existing mappings ...
"rb" | "rake" | "gemspec" => Ok(Self::Ruby),
_ => Err(McpDiffError::UnsupportedLanguage {
extension: ext.to_string(),
}),
}
}
}

impl Lang {
pub fn tree_sitter_language(&self) -> Language {
match self {
// ... existing mappings ...
Self::Ruby => tree_sitter_ruby::LANGUAGE.into(),
}
}
}

impl Lang {
pub fn family(&self) -> LangFamily {
match self {
// ... existing mappings ...
Self::Ruby => LangFamily::Ruby, // Or use existing family if similar
}
}
}

If creating a new family:
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum LangFamily {
// ... existing families ...
Ruby,
}

src/detectors/grammar.rs:
The LangGrammar struct defines AST node mappings for semantic extraction:
pub static RUBY_GRAMMAR: LangGrammar = LangGrammar {
name: "ruby",
// Symbol detection - what AST nodes represent functions/classes?
function_nodes: &["method", "singleton_method"],
class_nodes: &["class", "singleton_class"],
interface_nodes: &["module"], // Ruby modules
enum_nodes: &[],
// Control flow - what nodes create branches?
control_flow_nodes: &[
"if", "unless", "case", "when",
"while", "until", "for",
],
try_nodes: &["begin"], // Ruby's begin/rescue/end
// State changes
var_declaration_nodes: &["assignment"],
assignment_nodes: &["assignment", "operator_assignment"],
// Function calls
call_nodes: &["call", "method_call"],
await_nodes: &[], // Ruby doesn't have await
// Imports
import_nodes: &["call"], // require/require_relative are ordinary method calls in the Ruby grammar
// Field names for child access
name_field: "name",
value_field: "right",
type_field: "type",
body_field: "body",
params_field: "parameters",
condition_field: "condition",
// Visibility detection
is_exported: ruby_is_exported,
uppercase_is_export: false,
visibility_modifiers: &["public", "private", "protected"],
decorator_nodes: &[],
};

Create a function to determine if a symbol is exported/public:
/// Ruby: methods after `private` are private
pub fn ruby_is_exported(node: &Node, source: &str) -> bool {
// Ruby visibility is complex - check for preceding private/protected
// Simplified: assume public unless proven otherwise
true
}

Use the tree-sitter CLI to explore the grammar:
# Install tree-sitter CLI
cargo install tree-sitter-cli
# Parse a file and print the AST
tree-sitter parse example.rb
# Output shows node kinds:
# (program
# (method
# name: (identifier)
# parameters: (method_parameters)
# body: (...)
# )
# )

Or use the `--print-ast` flag in semfora-engine:
semfora-engine example.rb --print-ast

src/extract.rs:
pub fn extract(path: &Path, source: &str, tree: &Tree, lang: Lang) -> Result<SemanticSummary> {
match lang.family() {
// ... existing families ...
LangFamily::Ruby => {
generic::extract_with_grammar(path, source, tree, &grammar::RUBY_GRAMMAR)
}
}
}

For languages with complex features (frameworks, decorators, etc.), create a dedicated detector:
src/detectors/ruby.rs:
//! Ruby-specific semantic extraction
use crate::schema::SemanticSummary;
use tree_sitter::Tree;
use std::path::Path;
pub fn extract(path: &Path, source: &str, tree: &Tree) -> crate::error::Result<SemanticSummary> {
// Start with generic extraction
let mut summary = super::generic::extract_with_grammar(
path, source, tree, &super::grammar::RUBY_GRAMMAR
)?;
// Add Ruby-specific features
detect_rails_patterns(&mut summary, source, tree);
detect_rspec_tests(&mut summary, source, tree);
Ok(summary)
}
fn detect_rails_patterns(summary: &mut SemanticSummary, source: &str, tree: &Tree) {
// Detect Rails controllers, models, concerns, etc.
}
fn detect_rspec_tests(summary: &mut SemanticSummary, source: &str, tree: &Tree) {
// Detect RSpec describe/it blocks
}

src/detectors/mod.rs:
pub mod ruby;

src/extract.rs:
LangFamily::Ruby => detectors::ruby::extract(path, source, tree),

src/detectors/ruby.rs:
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_ruby_method_extraction() {
let source = r#"
class User
def initialize(name)
@name = name
end
def greet
puts "Hello, #{@name}!"
end
end
"#;
let path = Path::new("user.rb");
let mut parser = tree_sitter::Parser::new();
parser.set_language(&tree_sitter_ruby::LANGUAGE.into()).unwrap();
let tree = parser.parse(source, None).unwrap();
let summary = extract(path, source, &tree).unwrap();
assert_eq!(summary.symbols.len(), 3); // class + 2 methods
assert!(summary.symbols.iter().any(|s| s.name == "User"));
assert!(summary.symbols.iter().any(|s| s.name == "initialize"));
assert!(summary.symbols.iter().any(|s| s.name == "greet"));
}
}

README.md - Add to the supported languages table:
| **Ruby** | `.rb`, `.rake`, `.gemspec` | Ruby | Classes, modules, methods; Rails detection |

Here's a minimal but complete example:
tree-sitter-ruby = "0.23"

pub enum Lang {
// ...
Ruby,
}
impl Lang {
pub fn from_extension(ext: &str) -> Result<Self> {
match ext.to_lowercase().as_str() {
// ...
"rb" | "rake" | "gemspec" => Ok(Self::Ruby),
_ => Err(...)
}
}
pub fn tree_sitter_language(&self) -> Language {
match self {
// ...
Self::Ruby => tree_sitter_ruby::LANGUAGE.into(),
}
}
pub fn family(&self) -> LangFamily {
match self {
// ...
Self::Ruby => LangFamily::Ruby,
}
}
}
pub enum LangFamily {
// ...
Ruby,
}

pub fn ruby_is_exported(_node: &Node, _source: &str) -> bool {
true // Simplified
}
pub static RUBY_GRAMMAR: LangGrammar = LangGrammar {
name: "ruby",
function_nodes: &["method", "singleton_method"],
class_nodes: &["class", "singleton_class"],
interface_nodes: &["module"],
enum_nodes: &[],
control_flow_nodes: &["if", "unless", "case", "while", "until", "for"],
try_nodes: &["begin"],
var_declaration_nodes: &["assignment"],
assignment_nodes: &["assignment", "operator_assignment"],
call_nodes: &["call", "method_call"],
await_nodes: &[],
import_nodes: &["call"], // require/require_relative are method calls
name_field: "name",
value_field: "right",
type_field: "type",
body_field: "body",
params_field: "parameters",
condition_field: "condition",
is_exported: ruby_is_exported,
uppercase_is_export: false,
visibility_modifiers: &["public", "private", "protected"],
decorator_nodes: &[],
};

LangFamily::Ruby => {
generic::extract_with_grammar(path, source, tree, &grammar::RUBY_GRAMMAR)
}

- Start with generic extraction - The `LangGrammar` + generic extractor handles most cases
- Use tree-sitter playground - Many grammars have online playgrounds to explore AST structure
- Check existing implementations - Look at similar languages for patterns
- Test with real code - Use actual project files to validate extraction
- Iterate on visibility - Export detection varies widely between languages
- tree-sitter documentation
- Available tree-sitter grammars
- src/detectors/grammar.rs - All grammar definitions
- src/detectors/generic.rs - Generic extractor