diff --git a/src/cmds/system/grep_cmd.rs b/src/cmds/system/grep_cmd.rs index 3f7d9a327..7af5c74c6 100644 --- a/src/cmds/system/grep_cmd.rs +++ b/src/cmds/system/grep_cmd.rs @@ -8,6 +8,341 @@ use anyhow::{Context, Result}; use regex::Regex; use std::collections::HashMap; +/// Regex dialect requested by the user (matches grep's regex-mode flags). +/// +/// `BRE` is the POSIX default when no dialect flag is given, matching plain +/// `grep` behavior. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum GrepMode { + /// Basic Regular Expression (`-G`/`--basic-regexp` or no flag — grep default). + Bre, + /// Extended Regular Expression (`-E`/`--extended-regexp`). + Ere, + /// Perl-Compatible Regular Expression (`-P`/`--perl-regexp`). + Pcre, + /// Fixed string, no regex (`-F`/`--fixed-strings`). + Fixed, +} + +/// Detects the regex dialect from grep-style flags in `extra_args`. +/// +/// Matches GNU grep semantics: the LAST dialect flag wins (e.g. `-E -F` → Fixed). +/// Dialect chars are also detected inside a bundled short cluster +/// (`^-[A-Za-z]+$`, e.g. `-iE` → Ere, `-Fi` → Fixed); within a cluster the LAST +/// dialect char wins. Long forms (`--extended-regexp` etc.) are matched whole. +/// Scanning stops at a `--` token — operands after it are not flags (F2). +/// No dialect flag → Bre (POSIX/grep default). +fn detect_mode(extra_args: &[String]) -> GrepMode { + let mut mode = GrepMode::Bre; + for arg in extra_args { + if arg == "--" { + break; + } + match arg.as_str() { + "-F" | "--fixed-strings" => mode = GrepMode::Fixed, + "-E" | "--extended-regexp" => mode = GrepMode::Ere, + "-P" | "--perl-regexp" => mode = GrepMode::Pcre, + "-G" | "--basic-regexp" => mode = GrepMode::Bre, + other => { + // Bundled short cluster: single leading dash, letters only, + // no `=`. Scan its chars; last dialect char in the cluster wins. + let bytes = other.as_bytes(); + let is_cluster = bytes.len() >= 2 + && bytes[0] == b'-' + && bytes[1] != b'-' + && bytes[1..].iter().all(|b| b.is_ascii_alphabetic()); + if is_cluster { + for ch in other[1..].chars() { + match ch { + 'E' => mode = GrepMode::Ere, + 'G' => mode = GrepMode::Bre, + 'P' => mode = GrepMode::Pcre, + 'F' => mode = GrepMode::Fixed, + _ => {} + } + } + } + } + } + } + mode +} + +/// Returns true if a real `grep` binary is callable on this system. +/// +/// Uses PATH resolution (no subprocess spawn) so the BRE fast-path stays +/// well under the project's <10ms startup target on Windows, where +/// `Command::spawn` adds 10–30 ms per call. +fn has_grep_binary() -> bool { + crate::core::utils::resolve_binary("grep").is_ok() +} + +/// Translates a POSIX BRE pattern to ripgrep's regex syntax. +/// +/// BRE and rg-regex flip the meaning of `()`, `+`, `?`, `{`, `}`: +/// - in BRE the bare characters are LITERAL and `\(`, `\)`, `\+`, `\?`, +/// `\{`, `\}` are the metacharacters; +/// - in rg-regex it's the inverse — bare characters are metachars and `\(`, +/// `\)`, etc. are literals. +/// +/// `\|` (BRE alternation) is also non-standard; rg uses `|`. +/// +/// This is a best-effort translation used only as a Windows fallback when no +/// real `grep` binary is on PATH. It does not handle every POSIX BRE corner +/// case (e.g. literal `*` at the start of a pattern, character class +/// equivalence semantics). +fn bre_to_rg(pattern: &str) -> String { + // Backslash-aware single-pass tokenizer. We must distinguish: + // * `\(` → BRE metachar (group) → rg `(` + // * `\\(` → literal backslash + literal `(` → rg `\\` + `\(` = `\\\(` + // A naive `String::replace(r"\(", ...)` cannot tell these apart because it + // matches the `\(` inside `\\(`. We track backslash parity explicitly. + // + // BRE metachars that flip meaning vs rg: `( ) + ? { } |` (escaped form is + // the metachar in BRE, bare form is the metachar in rg). `.`/`*` keep the + // same meaning, so `\.`/`\*` pass through unchanged as rg literals. + // + // Also: a `*` at the very start of a BRE pattern is a literal (it has + // nothing to quantify) → rg `\*`. + let placeholder = |ch: char| -> Option { + match ch { + '|' => Some('\u{E001}'), + '(' => Some('\u{E002}'), + ')' => Some('\u{E003}'), + '+' => Some('\u{E004}'), + '?' => Some('\u{E005}'), + '{' => Some('\u{E006}'), + '}' => Some('\u{E007}'), + _ => None, + } + }; + + let chars: Vec = pattern.chars().collect(); + let mut buf = String::with_capacity(chars.len() + 8); + let mut i = 0; + let mut at_start = true; + while i < chars.len() { + let ch = chars[i]; + if ch == '\\' { + if let Some(&next) = chars.get(i + 1) { + if next == '\\' { + // Escaped backslash: literal `\` in BRE. Emit a literal + // backslash for rg (`\\`); the following char is processed + // on its own in the next iteration (so `\\(` → `\\` then + // bare `(` which Pass 2 escapes to `\(`, giving `\\\(`). + buf.push('\\'); + buf.push('\\'); + i += 2; + at_start = false; + continue; + } + if let Some(ph) = placeholder(next) { + // BRE metachar (`\(`, `\|`, `\+`, ...) → placeholder. + buf.push(ph); + i += 2; + at_start = false; + continue; + } + // Other escape (`\.`, `\*`, `\d`, ...) → pass through verbatim. + buf.push('\\'); + buf.push(next); + i += 2; + at_start = false; + continue; + } + // Trailing lone backslash → keep as-is. + buf.push('\\'); + i += 1; + at_start = false; + continue; + } + + // Leading bare `*` is a BRE literal. + if ch == '*' && at_start { + buf.push('\u{E008}'); // literal-star placeholder + } else { + buf.push(ch); + } + i += 1; + at_start = false; + } + + // Pass 2: escape bare rg metachars so they match as literals (BRE's "bare + // character is literal" semantics). Placeholders are private-use chars so + // they are untouched here. + let mut out = String::with_capacity(buf.len() + 8); + for ch in buf.chars() { + match ch { + '(' | ')' | '+' | '?' | '{' | '}' => { + out.push('\\'); + out.push(ch); + } + _ => out.push(ch), + } + } + + // Pass 3: placeholders → rg metachars (and literal-star → `\*`). + let mut final_out = String::with_capacity(out.len() + 4); + for ch in out.chars() { + if ch == '\u{E008}' { + final_out.push('\\'); + final_out.push('*'); + } else if let Some(m) = bre_placeholder_to_rg_opt(ch) { + final_out.push(m); + } else { + final_out.push(ch); + } + } + final_out +} + +/// Maps a private-use placeholder back to its rg metachar, or `None` if the +/// char is not a placeholder. +fn bre_placeholder_to_rg_opt(ph: char) -> Option { + match ph { + '\u{E001}' => Some('|'), + '\u{E002}' => Some('('), + '\u{E003}' => Some(')'), + '\u{E004}' => Some('+'), + '\u{E005}' => Some('?'), + '\u{E006}' => Some('{'), + '\u{E007}' => Some('}'), + _ => None, + } +} + +/// Builds the rg command for non-BRE modes (or the BRE Windows fallback). +/// +/// `extra_rg_flags` are mode-specific flags such as `--pcre2` or `-F`. +/// `strip_grep_only` removes flags that grep understands but rg does not. +fn build_rg_command( + pattern: &str, + path: &str, + file_type: Option<&str>, + extra_args: &[String], + extra_rg_flags: &[&str], +) -> std::process::Command { + let mut cmd = resolved_command("rg"); + // --no-ignore-vcs: match grep -r behavior (don't skip .gitignore'd files). + // Without this, rg returns 0 matches for files in .gitignore, causing + // false negatives that make AI agents draw wrong conclusions. + // Using --no-ignore-vcs (not --no-ignore) so .ignore/.rgignore are still respected. + // + // -H: always emit the filename, even for single-file invocations. + // -0: NUL-separate filename from `line:content`. NUL cannot appear in + // paths, so the parser stays unambiguous regardless of content with `:` + // or paths with embedded `:` (Windows drive letters) — issue #1436. + cmd.args(["-nH0", "--no-heading", "--no-ignore-vcs"]); + cmd.args(extra_rg_flags); + cmd.arg(pattern).arg(path); + + if let Some(ft) = file_type { + cmd.arg("--type").arg(ft); + } + + for arg in extra_args { + // Strip grep-only flags that rg either ignores, doesn't recognize, or + // overloads with a different meaning. + if matches!( + arg.as_str(), + "-r" | "--recursive" + | "-E" + | "--extended-regexp" + | "-G" + | "--basic-regexp" + | "-P" + | "--perl-regexp" + | "-F" + | "--fixed-strings" + ) { + continue; + } + // Dialect chars inside a bundled short cluster (e.g. `-iE`) must be + // stripped before forwarding: the dialect is already applied via + // `extra_rg_flags`, and rg would mis-read a forwarded `-E`/`-P` (which + // in rg take a value: --encoding / --pre) and fail with "missing value". + if let Some(sanitized) = strip_dialect_from_cluster(arg) { + if sanitized != "-" { + cmd.arg(sanitized); + } + // A bare "-" (all chars were dialect chars) is dropped entirely. + continue; + } + cmd.arg(arg); + } + cmd +} + +/// If `arg` is a bundled short cluster (`^-[A-Za-z]+$`), returns a copy with the +/// dialect chars (`E`/`G`/`P`/`F`) removed, preserving the other short flags +/// (e.g. `-iE` → `-i`, `-EF` → `-`). Returns `None` for non-cluster tokens so +/// the caller forwards them unchanged. +fn strip_dialect_from_cluster(arg: &str) -> Option { + let bytes = arg.as_bytes(); + let is_cluster = bytes.len() >= 2 + && bytes[0] == b'-' + && bytes[1] != b'-' + && bytes[1..].iter().all(|b| b.is_ascii_alphabetic()); + if !is_cluster { + return None; + } + if !arg[1..].chars().any(|c| matches!(c, 'E' | 'G' | 'P' | 'F')) { + // Cluster with no dialect char — forward verbatim. + return None; + } + let kept: String = arg[1..] + .chars() + .filter(|c| !matches!(c, 'E' | 'G' | 'P' | 'F')) + .collect(); + Some(format!("-{}", kept)) +} + +/// Maps an rg-style file type name to a grep `--include` glob. +/// +/// grep has no `--type` flag, so the type filter must be expressed as a glob. +/// Known common types are mapped explicitly; anything else falls back to +/// `*.` (so `rtk grep -t foo` searches `*.foo`). +fn grep_include_glob(file_type: &str) -> String { + let ext = match file_type { + "rust" => "rs", + "ts" => "ts", + "py" => "py", + "go" => "go", + "js" => "js", + other => other, + }; + format!("*.{}", ext) +} + +/// Builds the grep command for the BRE-via-real-grep path. +fn build_grep_command( + pattern: &str, + path: &str, + file_type: Option<&str>, + extra_args: &[String], +) -> std::process::Command { + let mut cmd = resolved_command("grep"); + // -r: recursive (grep is NOT recursive by default; rg is). + // -n: line numbers. + // -H: always emit the filename so the parser sees a uniform shape. + // -Z: NUL-separate filename from `line:content` (parity with rg's -0) + // so the parser handles content with `:` and paths with embedded `:` + // (issue #1436). + cmd.args(["-rnHZ"]); + // F6: honor the file-type filter. grep has no --type, so translate to an + // --include glob. Without this, `rtk grep -t rust foo` silently searched + // all files on the grep path. + if let Some(ft) = file_type { + cmd.arg(format!("--include={}", grep_include_glob(ft))); + } + cmd.args([pattern, path]); + for arg in extra_args { + // Pass user flags through verbatim — grep understands them. + cmd.arg(arg); + } + cmd +} + #[allow(clippy::too_many_arguments)] pub fn run( pattern: &str, @@ -25,39 +360,67 @@ pub fn run( eprintln!("grep: '{}' in {}", pattern, path); } - // Fix: convert BRE alternation \| → | for rg (which uses PCRE-style regex) - let rg_pattern = pattern.replace(r"\|", "|"); + // Mode-aware routing for issue #1436 (BRE→regex semantics): + // * BRE (default or -G): delegate to real `grep` for exact POSIX + // semantics. Slower than rg, but eliminates the regex-translation + // surface area (literal `()` vs grouping `\(\)`, `\|` alternation, + // `\+`/`\?`/`\{` interval, etc.). On Windows without grep on PATH, + // fall back to rg with a best-effort BRE→rg-regex translation. + // * ERE (-E): rg's regex dialect handles ERE-style alternation and + // grouping natively. Pass through, fast. + // * PCRE (-P): rg --pcre2. Exact PCRE semantics, fast. + // * Fixed (-F): rg -F. No regex parsing at all, fast. + let mode = detect_mode(extra_args); - let mut rg_cmd = resolved_command("rg"); - // --no-ignore-vcs: match grep -r behavior (don't skip .gitignore'd files). - // Without this, rg returns 0 matches for files in .gitignore, causing - // false negatives that make AI agents draw wrong conclusions. - // Using --no-ignore-vcs (not --no-ignore) so .ignore/.rgignore are still respected. - // -H: always emit the filename. - // -0: NUL-separate filename. Allows the parser to disambiguate filenames or - // content containing `:digits:` patterns (issue #1436). - rg_cmd.args(["-nH0", "--no-heading", "--no-ignore-vcs", &rg_pattern, path]); - - if let Some(ft) = file_type { - rg_cmd.arg("--type").arg(ft); - } - - for arg in extra_args { - // Fix: skip grep-ism -r flag (rg is recursive by default; rg -r means --replace) - if arg == "-r" || arg == "--recursive" { - continue; + let result = match mode { + GrepMode::Bre if has_grep_binary() => { + exec_capture(&mut build_grep_command(pattern, path, file_type, extra_args)) + .context("grep failed")? } - rg_cmd.arg(arg); - } - - let result = exec_capture(&mut rg_cmd) - .or_else(|_| { - let mut grep_cmd = resolved_command("grep"); - // When we fall back to grep, include all args, not just -rnHZ. - grep_cmd.args(["-rnHZ", pattern, path]).args(extra_args); - exec_capture(&mut grep_cmd) - }) - .context("grep/rg failed")?; + GrepMode::Bre => { + // Windows fallback: no grep on PATH. Translate the pattern and + // tell the user once per process so silent semantic drift is + // impossible without polluting stderr in scripted/loop usage. + static FALLBACK_NOTICE: std::sync::OnceLock<()> = std::sync::OnceLock::new(); + FALLBACK_NOTICE.get_or_init(|| { + eprintln!( + "rtk grep: real `grep` not on PATH; using ripgrep with best-effort \ + BRE translation. Some POSIX BRE corner cases may differ. Install \ + grep for exact semantics, or pass -E/-P/-F to use ripgrep directly." + ); + }); + let translated = bre_to_rg(pattern); + exec_capture(&mut build_rg_command( + &translated, + path, + file_type, + extra_args, + &[], + )) + .context("rg failed")? + } + // Non-BRE modes: prefer rg for speed, fall back to grep if rg is + // unavailable. grep already understands -F/-E/-P natively (passed + // through via extra_args by build_grep_command), so the fallback + // preserves the requested dialect. + non_bre_mode => { + let rg_extras: &[&str] = match non_bre_mode { + GrepMode::Pcre => &["--pcre2"], + GrepMode::Fixed => &["-F"], + _ => &[], + }; + exec_capture(&mut build_rg_command( + pattern, path, file_type, extra_args, rg_extras, + )) + .or_else(|rg_err| { + // Preserve the rg failure reason so users debugging + // "rg and grep both failed" can see what each tool said. + eprintln!("rtk grep: rg failed ({}), falling back to grep", rg_err); + exec_capture(&mut build_grep_command(pattern, path, file_type, extra_args)) + }) + .context("rg and grep both failed")? + } + }; // Passthrough output flags that produce output that is already small. if has_format_flag(extra_args) { @@ -306,24 +669,214 @@ mod tests { assert!(!cleaned.is_empty()); } - // Fix: BRE \| alternation is translated to PCRE | for rg + // --- issue #1436: regex-mode detection (Bug 2) --- + + #[test] + fn test_detect_mode_default_is_bre() { + assert_eq!(detect_mode(&[]), GrepMode::Bre); + assert_eq!(detect_mode(&["-i".into(), "-n".into()]), GrepMode::Bre); + } + + #[test] + fn test_detect_mode_fixed_strings() { + assert_eq!(detect_mode(&["-F".into()]), GrepMode::Fixed); + assert_eq!(detect_mode(&["--fixed-strings".into()]), GrepMode::Fixed); + } + + #[test] + fn test_detect_mode_extended() { + assert_eq!(detect_mode(&["-E".into()]), GrepMode::Ere); + assert_eq!(detect_mode(&["--extended-regexp".into()]), GrepMode::Ere); + } + + #[test] + fn test_detect_mode_perl() { + assert_eq!(detect_mode(&["-P".into()]), GrepMode::Pcre); + assert_eq!(detect_mode(&["--perl-regexp".into()]), GrepMode::Pcre); + } + + #[test] + fn test_detect_mode_explicit_basic() { + assert_eq!(detect_mode(&["-G".into()]), GrepMode::Bre); + assert_eq!(detect_mode(&["--basic-regexp".into()]), GrepMode::Bre); + } + + #[test] + fn test_detect_mode_last_flag_wins() { + // GNU grep uses the LAST dialect flag; rtk now matches that. + let args = vec!["-E".into(), "-F".into()]; + assert_eq!(detect_mode(&args), GrepMode::Fixed); + } + + #[test] + fn test_detect_mode_bundled_cluster() { + // Dialect char inside a bundled short cluster is detected. + assert_eq!(detect_mode(&["-iE".into()]), GrepMode::Ere); + assert_eq!(detect_mode(&["-Fi".into()]), GrepMode::Fixed); + assert_eq!(detect_mode(&["-inE".into()]), GrepMode::Ere); + } + + #[test] + fn test_detect_mode_cluster_last_dialect_char_wins() { + // Within a cluster, the last dialect char wins (E then F → Fixed). + assert_eq!(detect_mode(&["-iEF".into()]), GrepMode::Fixed); + } + + #[test] + fn test_detect_mode_stops_at_double_dash() { + // F2: a dialect-looking token after `--` is an operand, not a flag. + assert_eq!( + detect_mode(&["--".into(), "-E".into()]), + GrepMode::Bre + ); + } + + // --- issue #1436: BRE → rg-regex translation (Windows fallback) --- + + #[test] + fn test_bre_to_rg_alternation() { + // BRE `\|` is alternation; in rg it's literal — translate to `|`. + assert_eq!(bre_to_rg(r"foo\|bar"), "foo|bar"); + } + + #[test] + fn test_bre_to_rg_grouping() { + // BRE `\(...\)` is a group; rg uses `(...)`. + assert_eq!(bre_to_rg(r"\(foo\)\|bar"), "(foo)|bar"); + } + + #[test] + fn test_bre_to_rg_literal_parens() { + // The issue #1436 reproducer: in BRE `delete()` is a literal match + // for `delete()`. rg would treat `()` as an empty group matching + // every `delete`. Translation must escape the bare parens. + assert_eq!(bre_to_rg("delete()"), r"delete\(\)"); + } + + #[test] + fn test_bre_to_rg_mixed_literal_and_group() { + // `\(group\)` becomes `(group)`; bare `+`, `(`, `)` are BRE literals + // and become rg-literal `\+`, `\(`, `\)`. + assert_eq!(bre_to_rg(r"\(g\)+()"), r"(g)\+\(\)"); + } + + #[test] + fn test_bre_to_rg_quantifiers() { + // BRE `\+` and `\?` become rg `+` and `?`. Bare `+` and `?` become + // literals. + assert_eq!(bre_to_rg(r"a\+b\?c"), "a+b?c"); + assert_eq!(bre_to_rg("a+b?c"), r"a\+b\?c"); + } + + #[test] + fn test_bre_to_rg_intervals() { + // BRE `\{n,m\}` becomes rg `{n,m}`. Bare braces become literals. + assert_eq!(bre_to_rg(r"a\{2,5\}"), "a{2,5}"); + assert_eq!(bre_to_rg("{literal}"), r"\{literal\}"); + } + + #[test] + fn test_bre_to_rg_no_op_for_simple_pattern() { + // A pattern with no BRE-specific syntax round-trips unchanged. + assert_eq!(bre_to_rg("ClassRegistry::init"), "ClassRegistry::init"); + assert_eq!(bre_to_rg("foo.*bar"), "foo.*bar"); + assert_eq!(bre_to_rg(""), ""); + } + + // --- F5: escaped-literal hardening --- + + #[test] + fn test_bre_to_rg_escaped_backslash_then_paren() { + // `\\(` = literal backslash + literal `(` in BRE. rg needs the + // backslash escaped (`\\`) and the paren escaped to stay literal + // (`\(`), yielding `\\\(`. A naive replace would mis-read the `\(` + // inside `\\(` as a group. + assert_eq!(bre_to_rg(r"\\("), r"\\\("); + } + + #[test] + fn test_bre_to_rg_escaped_dot_passes_through() { + // `\.` is a literal dot in both BRE and rg → unchanged. + assert_eq!(bre_to_rg(r"a\.b"), r"a\.b"); + } + + #[test] + fn test_bre_to_rg_leading_star_is_literal() { + // A `*` at the start of a BRE pattern has nothing to quantify, so it + // is literal → rg `\*`. A later `*` keeps its quantifier meaning. + assert_eq!(bre_to_rg("*abc"), r"\*abc"); + assert_eq!(bre_to_rg("a*"), "a*"); + } + + // The original issue's exact reproducer pattern. + #[test] + fn test_bre_to_rg_issue_1436_reproducer() { + let pattern = r"private function delete\|try\|catch\|finally\|delete()"; + // Expected: alternation works (`\|` -> `|`), and `delete()`'s parens + // become literal (`\(\)`) so rg matches `delete()` not every `delete`. + assert_eq!( + bre_to_rg(pattern), + r"private function delete|try|catch|finally|delete\(\)" + ); + } + + // --- issue #1436: command builders strip mode flags from rg --- + + #[test] + fn test_build_rg_strips_grep_only_flags() { + // -E/-G/-P/-F and -r are mode/recursive flags rg doesn't want forwarded. + // We can't easily inspect Command's argv, so this is a smoke-build test. + let extra_args: Vec = vec![ + "-E".into(), + "-G".into(), + "-P".into(), + "-F".into(), + "-r".into(), + "--recursive".into(), + "-i".into(), // should pass through + ]; + // Just ensure the builder runs without panicking with this combination. + let _cmd = build_rg_command("pattern", "path", None, &extra_args, &[]); + } + + #[test] + fn test_build_grep_passes_args_verbatim() { + // grep understands all its own flags — the builder must not strip them. + let extra_args: Vec = vec!["-G".into(), "-i".into()]; + let _cmd = build_grep_command("pattern", "path", None, &extra_args); + } + + #[test] + fn test_strip_dialect_from_cluster() { + // Dialect chars removed, other short flags preserved. + assert_eq!(strip_dialect_from_cluster("-iE").as_deref(), Some("-i")); + assert_eq!(strip_dialect_from_cluster("-inE").as_deref(), Some("-in")); + // All-dialect cluster collapses to bare "-" (caller drops it). + assert_eq!(strip_dialect_from_cluster("-EF").as_deref(), Some("-")); + // Cluster without a dialect char → None (forward verbatim). + assert_eq!(strip_dialect_from_cluster("-in"), None); + // Non-cluster tokens → None. + assert_eq!(strip_dialect_from_cluster("-A"), None); // single, but no dialect char + assert_eq!(strip_dialect_from_cluster("--color"), None); + assert_eq!(strip_dialect_from_cluster("pattern"), None); + } + #[test] - fn test_bre_alternation_translated() { - let pattern = r"fn foo\|pub.*bar"; - let rg_pattern = pattern.replace(r"\|", "|"); - assert_eq!(rg_pattern, "fn foo|pub.*bar"); + fn test_build_grep_accepts_file_type() { + // F6: builder accepts a file_type without panicking. + let _cmd = build_grep_command("pattern", "path", Some("rust"), &[]); } - // Fix: -r flag (grep recursive) is stripped from extra_args (rg is recursive by default) #[test] - fn test_recursive_flag_stripped() { - let extra_args: Vec = vec!["-r".to_string(), "-i".to_string()]; - let filtered: Vec<&String> = extra_args - .iter() - .filter(|a| *a != "-r" && *a != "--recursive") - .collect(); - assert_eq!(filtered.len(), 1); - assert_eq!(filtered[0], "-i"); + fn test_grep_include_glob_mapping() { + // F6: known rg-style type names map to the right extension glob. + assert_eq!(grep_include_glob("rust"), "*.rs"); + assert_eq!(grep_include_glob("ts"), "*.ts"); + assert_eq!(grep_include_glob("py"), "*.py"); + assert_eq!(grep_include_glob("go"), "*.go"); + assert_eq!(grep_include_glob("js"), "*.js"); + // Unknown type → literal *. fallback. + assert_eq!(grep_include_glob("toml"), "*.toml"); } // --- truncation accuracy --- diff --git a/src/main.rs b/src/main.rs index 22e6cbca8..24cdb08bb 100644 --- a/src/main.rs +++ b/src/main.rs @@ -322,7 +322,23 @@ enum Commands { /// Show line numbers (always on, accepted for grep/rg compatibility) #[arg(short = 'n', long)] line_numbers: bool, - /// Extra ripgrep arguments (e.g., -i, -A 3, -w, --glob) + /// Extended regex (ERE) — alternation/grouping like grep -E + #[arg(short = 'E', long = "extended-regexp", conflicts_with_all = ["basic_regexp", "perl_regexp", "fixed_strings"])] + extended_regexp: bool, + /// Basic regex (BRE, POSIX default) — explicit grep -G + #[arg(short = 'G', long = "basic-regexp", conflicts_with_all = ["extended_regexp", "perl_regexp", "fixed_strings"])] + basic_regexp: bool, + /// Perl-compatible regex — like grep -P + #[arg(short = 'P', long = "perl-regexp", conflicts_with_all = ["extended_regexp", "basic_regexp", "fixed_strings"])] + perl_regexp: bool, + /// Fixed string match (no regex) — like grep -F + #[arg(short = 'F', long = "fixed-strings", conflicts_with_all = ["extended_regexp", "basic_regexp", "perl_regexp"])] + fixed_strings: bool, + /// Extra ripgrep/grep arguments (e.g., -i, -A 3, -w, --glob). + /// + /// Flags placed BEFORE the pattern are reordered to land here (see + /// `reorder_grep_args`). `-e PATTERN`/`-f FILE` are unsupported: the + /// pattern must be positional, so those forms fall back to raw grep. #[arg(trailing_var_arg = true, allow_hyphen_values = true)] extra_args: Vec, }, @@ -1388,11 +1404,105 @@ where } } +/// Hoists grep positional operands (pattern, then optional path) ahead of any +/// flags so clap parses them as the `pattern`/`path` positionals while the +/// remaining flags land in `extra_args` (which has `trailing_var_arg + +/// allow_hyphen_values`). +/// +/// Without this, clap rejects an UNDEFINED short flag (e.g. `-i`, `-w`, `-A`) +/// that appears BEFORE the `pattern` positional, forcing a raw passthrough with +/// 0% token savings (issue #1436, bug N1). `rtk grep -i apple f` would fall +/// back to raw; `rtk grep apple f -i` would filter correctly. This normalizes +/// the former into the latter. +/// +/// Limitations (documented-unsupported → returns input unchanged, falls back): +/// * `-e PATTERN` / `--regexp PATTERN` and `-f FILE` / `--file FILE` supply +/// the pattern via a flag, so positional reordering is unsafe. +fn reorder_grep_args(args: Vec) -> Vec { + // Value-consuming grep flags whose NEXT token is a value, not a positional. + // Only EXACT matches consume a following token; attached forms like `-A3` + // or `--context=2` are a single token and need no special handling. + const VALUE_FLAGS: &[&str] = &[ + "-A", + "-B", + "-C", + "-m", + "-d", + "--after-context", + "--before-context", + "--context", + "--max-count", + "--color", + "--colour", + ]; + + // Locate the grep subcommand: index `i` where args[i] == "grep" and every + // token in args[1..i] is a global flag (starts with '-'). If none, no-op. + let grep_idx = + args.iter().enumerate().skip(1).position(|(idx, tok)| { + tok == "grep" && args[1..idx].iter().all(|t| t.starts_with('-')) + }); + let Some(rel) = grep_idx else { + return args; + }; + let i = rel + 1; // position() skipped 1, so add it back to get the real index + + let operands = &args[i + 1..]; + + // Bail if the pattern comes from a flag (positional reorder would be wrong). + if operands + .iter() + .any(|t| matches!(t.as_str(), "-e" | "--regexp" | "-f" | "--file")) + { + return args; + } + + let mut positionals: Vec = Vec::new(); + let mut flags: Vec = Vec::new(); + + let mut it = operands.iter(); + while let Some(tok) = it.next() { + if tok == "--" { + // Everything after `--` is a positional operand, verbatim. + for rest in it.by_ref() { + positionals.push(rest.clone()); + } + break; + } else if tok.starts_with('-') { + flags.push(tok.clone()); + if VALUE_FLAGS.contains(&tok.as_str()) { + if let Some(val) = it.next() { + flags.push(val.clone()); + } + } + } else { + positionals.push(tok.clone()); + } + } + + // No pattern found → let clap error / fallback handle the no-pattern case. + if positionals.is_empty() { + return args; + } + + let mut out: Vec = args[..=i].to_vec(); + out.push(positionals[0].clone()); // pattern + if positionals.len() >= 2 { + out.push(positionals[1].clone()); // path + } + out.extend(flags); + if positionals.len() > 2 { + out.extend(positionals[2..].iter().cloned()); + } + out +} + fn run_cli() -> Result { // Fire-and-forget telemetry ping (1/day, non-blocking) core::telemetry::maybe_ping(); - let cli = match Cli::try_parse() { + let reordered = reorder_grep_args(std::env::args().collect()); + let cli = match Cli::try_parse_from(&reordered) { Ok(cli) => cli, Err(e) => { if matches!(e.kind(), ErrorKind::DisplayHelp | ErrorKind::DisplayVersion) { @@ -1788,17 +1898,42 @@ fn run_cli() -> Result { context_only, file_type, line_numbers: _, // no-op: line numbers always enabled in grep_cmd::run + extended_regexp, + basic_regexp, + perl_regexp, + fixed_strings, extra_args, - } => grep_cmd::run( - &pattern, - &path, - max_len, - max, - context_only, - file_type.as_deref(), - &extra_args, - cli.verbose, - )?, + } => { + // Clap rejects unknown short flags before the pattern positional, so + // dialect flags get promoted to first-class args. Re-inject the one + // the user picked into extra_args so detect_mode + build_grep_command + // still see it and route consistently. + let mut extra_args = extra_args; + let dialect_flag = if extended_regexp { + Some("-E") + } else if basic_regexp { + Some("-G") + } else if perl_regexp { + Some("-P") + } else if fixed_strings { + Some("-F") + } else { + None + }; + if let Some(flag) = dialect_flag { + extra_args.insert(0, flag.to_string()); + } + grep_cmd::run( + &pattern, + &path, + max_len, + max, + context_only, + file_type.as_deref(), + &extra_args, + cli.verbose, + )? + } Commands::Init { global, @@ -2518,6 +2653,91 @@ mod tests { use clap::Parser; use std::cell::Cell; + // --- issue #1436: reorder_grep_args (bug N1) --- + + fn rg(args: &[&str]) -> Vec { + reorder_grep_args(args.iter().map(|s| s.to_string()).collect()) + } + + #[test] + fn test_reorder_grep_flag_before_pattern() { + // `-i apple f` → pattern/path hoisted, flag trails. + assert_eq!( + rg(&["rtk", "grep", "-i", "apple", "f"]), + vec!["rtk", "grep", "apple", "f", "-i"] + ); + } + + #[test] + fn test_reorder_grep_flag_after_pattern_unchanged_effect() { + // Already-correct order: pattern first. No flags precede, so positionals + // stay first and the trailing flag stays trailing. + assert_eq!( + rg(&["rtk", "grep", "apple", "f", "-i"]), + vec!["rtk", "grep", "apple", "f", "-i"] + ); + } + + #[test] + fn test_reorder_grep_value_flag_preserved() { + // `-A 3 foo` → the `3` is -A's value, NOT the pattern. Pattern is `foo`. + assert_eq!( + rg(&["rtk", "grep", "-A", "3", "foo"]), + vec!["rtk", "grep", "foo", "-A", "3"] + ); + } + + #[test] + fn test_reorder_grep_bundled_cluster_before_pattern() { + // `-iE 'a|b' f` → pattern hoisted, `-iE` lands in flags. + assert_eq!( + rg(&["rtk", "grep", "-iE", "a|b", "f"]), + vec!["rtk", "grep", "a|b", "f", "-iE"] + ); + } + + #[test] + fn test_reorder_grep_bails_on_e_flag() { + // `-e foo` → pattern via flag, reorder unsafe → unchanged. + assert_eq!( + rg(&["rtk", "grep", "-e", "foo", "f"]), + vec!["rtk", "grep", "-e", "foo", "f"] + ); + } + + #[test] + fn test_reorder_grep_double_dash_separator() { + // After `--`, everything is a positional. `-pat` becomes the pattern. + assert_eq!( + rg(&["rtk", "grep", "-i", "--", "-pat", "f"]), + vec!["rtk", "grep", "-pat", "f", "-i"] + ); + } + + #[test] + fn test_reorder_grep_no_grep_unchanged() { + // Not a grep invocation → unchanged. + assert_eq!(rg(&["rtk", "git", "status"]), vec!["rtk", "git", "status"]); + } + + #[test] + fn test_reorder_grep_global_flags_before_subcommand() { + // Global flags (e.g. -v) may precede the grep subcommand. + assert_eq!( + rg(&["rtk", "-v", "grep", "-i", "apple", "f"]), + vec!["rtk", "-v", "grep", "apple", "f", "-i"] + ); + } + + #[test] + fn test_reorder_grep_attached_value_flag_not_consumed() { + // `-A3` is a single token; it must NOT consume the next token as value. + assert_eq!( + rg(&["rtk", "grep", "-A3", "foo"]), + vec!["rtk", "grep", "foo", "-A3"] + ); + } + #[test] fn test_git_commit_single_message() { let cli = Cli::try_parse_from(["rtk", "git", "commit", "-m", "fix: typo"]).unwrap();