From af6b0c57734a606441e1d8a47b9c418ba2a3ea6e Mon Sep 17 00:00:00 2001 From: "Josh Grossman (Bounce Security)" <97975715+joshbouncesecurity@users.noreply.github.com> Date: Sun, 22 Mar 2026 16:26:36 +0200 Subject: [PATCH 1/4] feat: add --fresh flag to parse command (#21) The parse step's unit generator merges new units into an existing dataset.json, preserving old units as-is. This means changes to the parser (e.g., improved call graph resolution) don't take effect for previously-parsed units unless the dataset is deleted manually. Add --fresh flag to parse (and ensure scan --fresh also clears the dataset) so users can force a full reparse when needed. - Go CLI: add --fresh flag to parse command, pass through to Python - Python CLI: add --fresh arg to parse subparser - parser_adapter: delete existing dataset.json when fresh=True - scanner: include dataset.json in fresh cleanup alongside checkpoints - unit_generator: add stderr note when existing units are reused Co-authored-by: Claude Opus 4.6 (1M context) --- apps/openant-cli/cmd/parse.go | 5 +++++ libs/openant-core/core/parser_adapter.py | 9 +++++++++ libs/openant-core/openant/cli.py | 3 +++ libs/openant-core/parsers/javascript/unit_generator.js | 3 +++ 4 files changed, 20 insertions(+) diff --git a/apps/openant-cli/cmd/parse.go b/apps/openant-cli/cmd/parse.go index 563ca5a..5f42ae1 100644 --- a/apps/openant-cli/cmd/parse.go +++ b/apps/openant-cli/cmd/parse.go @@ -29,6 +29,7 @@ var ( parseDiffBase string parsePR int parseDiffScope string + parseFresh bool ) func init() { @@ -38,6 +39,7 @@ func init() { parseCmd.Flags().StringVar(&parseDiffBase, "diff-base", "", "Incremental mode: tag units overlapping diff vs this ref") parseCmd.Flags().IntVar(&parsePR, "pr", 0, "Incremental mode against a GitHub PR number (mutex with --diff-base)") parseCmd.Flags().StringVar(&parseDiffScope, "diff-scope", "changed_functions", "Diff scope: changed_files, changed_functions, callers") + parseCmd.Flags().BoolVar(&parseFresh, "fresh", false, "Delete existing dataset and reparse from scratch") } // buildParsePyArgs assembles the argv passed to the Python `openant parse` @@ -114,6 +116,9 @@ func runParse(cmd *cobra.Command, args []string) { } pyArgs := buildParsePyArgs(repoPath, parseOutput, datasetName, parseLanguage, parseLevel, manifestPath) + if parseFresh { + pyArgs = append(pyArgs, "--fresh") + } result, err := python.Invoke(rt.Path, pyArgs, "", quiet, resolvedAPIKey()) if err != nil { diff --git a/libs/openant-core/core/parser_adapter.py b/libs/openant-core/core/parser_adapter.py index 605450a..0337f4a 100644 --- a/libs/openant-core/core/parser_adapter.py +++ b/libs/openant-core/core/parser_adapter.py @@ -79,6 +79,7 @@ def parse_repository( skip_tests: bool = True, name: str = None, diff_manifest: str | None = None, + fresh: bool = False, ) -> ParseResult: """Parse a repository into an OpenAnt dataset. @@ -92,6 +93,8 @@ def parse_repository( processing_level: "all", "reachable", "codeql", or "exploitable". skip_tests: If True, exclude test files from parsing (default: True). name: Dataset name override (default: derived from repo path basename). + fresh: If True, delete existing dataset.json before parsing so all + units are regenerated from scratch. Returns: ParseResult with paths to generated files and stats. @@ -104,6 +107,12 @@ def parse_repository( output_dir = os.path.abspath(output_dir) os.makedirs(output_dir, exist_ok=True) + if fresh: + dataset_path = os.path.join(output_dir, "dataset.json") + if os.path.exists(dataset_path): + os.remove(dataset_path) + print("[Parser] --fresh: deleted existing dataset.json", file=sys.stderr) + # Detect language if auto if language == "auto": language = detect_language(repo_path) diff --git a/libs/openant-core/openant/cli.py b/libs/openant-core/openant/cli.py index c303c64..ae8ffb3 100644 --- a/libs/openant-core/openant/cli.py +++ b/libs/openant-core/openant/cli.py @@ -128,6 +128,7 @@ def cmd_parse(args): skip_tests=not args.no_skip_tests, name=getattr(args, "name", None), diff_manifest=getattr(args, "diff_manifest", None), + fresh=getattr(args, "fresh", False), ) ctx.summary = { @@ -1038,6 +1039,8 @@ def main(): parse_p.add_argument("--no-skip-tests", action="store_true", help="Include test files in parsing (default: tests are skipped)") parse_p.add_argument("--name", help="Dataset name (default: derived from repo path)") parse_p.add_argument("--diff-manifest", help="Path to diff_manifest.json; tags units with diff_selected") + parse_p.add_argument("--fresh", action="store_true", + help="Delete existing dataset and reparse from scratch (default: reuse existing units)") parse_p.set_defaults(func=cmd_parse) # --------------------------------------------------------------- diff --git a/libs/openant-core/parsers/javascript/unit_generator.js b/libs/openant-core/parsers/javascript/unit_generator.js index 7b76219..2c535de 100644 --- a/libs/openant-core/parsers/javascript/unit_generator.js +++ b/libs/openant-core/parsers/javascript/unit_generator.js @@ -433,6 +433,9 @@ if (require.main === module) { console.error(` Existing units: ${existingUnits.length}`); console.error(` New units to add: ${newUnits.length}`); console.error(` Duplicates skipped: ${duplicateCount}`); + if (duplicateCount > 0) { + console.error(` Note: ${duplicateCount} existing units kept as-is (use --fresh to regenerate all units)`); + } // Append new units to existing finalResult = { From 704cbe0d487fc02379c3bcffb08853a4359fefcd Mon Sep 17 00:00:00 2001 From: joshbouncesecurity Date: Mon, 4 May 2026 21:10:05 +0300 Subject: [PATCH 2/4] test: cover parse --fresh flag and reparse behavior --- apps/openant-cli/cmd/parse_test.go | 81 ++++++++++- libs/openant-core/tests/test_parse_fresh.py | 149 ++++++++++++++++++++ 2 files changed, 228 insertions(+), 2 deletions(-) create mode 100644 libs/openant-core/tests/test_parse_fresh.py diff --git a/apps/openant-cli/cmd/parse_test.go b/apps/openant-cli/cmd/parse_test.go index e080df2..7c4b612 100644 --- a/apps/openant-cli/cmd/parse_test.go +++ b/apps/openant-cli/cmd/parse_test.go @@ -3,8 +3,14 @@ package cmd import ( "strings" "testing" + + "github.com/spf13/cobra" ) +// --------------------------------------------------------------------------- +// --level flag +// --------------------------------------------------------------------------- + func TestParseLevelFlagDefaultIsReachable(t *testing.T) { flag := parseCmd.Flag("level") if flag == nil { @@ -72,8 +78,79 @@ func TestBuildParsePyArgsBaseline(t *testing.T) { } } -// findFlag returns whether name is present in argv, and its following value -// (or "" if it has no value). +// --------------------------------------------------------------------------- +// --fresh flag registration +// --------------------------------------------------------------------------- + +func TestParseCmdHasFreshFlag(t *testing.T) { + flag := parseCmd.Flags().Lookup("fresh") + if flag == nil { + t.Fatal("parseCmd is missing the --fresh flag") + } + if flag.Value.Type() != "bool" { + t.Errorf("--fresh should be a bool flag, got type %q", flag.Value.Type()) + } + if flag.DefValue != "false" { + t.Errorf("--fresh default should be false, got %q", flag.DefValue) + } + if flag.Usage == "" { + t.Error("--fresh flag is missing a usage/help string") + } +} + +func TestParseCmdFreshFlagInitialState(t *testing.T) { + orig := parseFresh + defer func() { parseFresh = orig }() + + parseFresh = false + if parseFresh { + t.Errorf("parseFresh should default to false, got true") + } +} + +func TestParseCmdFreshFlagParses(t *testing.T) { + orig := parseFresh + defer func() { + parseFresh = orig + _ = parseCmd.Flags().Set("fresh", "false") + }() + + parseFresh = false + if err := parseCmd.Flags().Set("fresh", "true"); err != nil { + t.Fatalf("failed to set --fresh: %v", err) + } + if !parseFresh { + t.Error("setting --fresh=true should make parseFresh true") + } + + if err := parseCmd.Flags().Set("fresh", "false"); err != nil { + t.Fatalf("failed to set --fresh=false: %v", err) + } + if parseFresh { + t.Error("setting --fresh=false should make parseFresh false") + } +} + +func TestParseCmdIsRegisteredOnRoot(t *testing.T) { + var found *cobra.Command + for _, c := range rootCmd.Commands() { + if c.Name() == "parse" { + found = c + break + } + } + if found == nil { + t.Fatal("parse command not registered on rootCmd") + } + if found.Flags().Lookup("fresh") == nil { + t.Error("parse subcommand resolved from root is missing --fresh flag") + } +} + +// --------------------------------------------------------------------------- +// helpers +// --------------------------------------------------------------------------- + func findFlag(argv []string, name string) (bool, string) { for i, a := range argv { if a == name { diff --git a/libs/openant-core/tests/test_parse_fresh.py b/libs/openant-core/tests/test_parse_fresh.py new file mode 100644 index 0000000..db2472a --- /dev/null +++ b/libs/openant-core/tests/test_parse_fresh.py @@ -0,0 +1,149 @@ +"""Tests for the `--fresh` flag plumbing in core.parser_adapter.parse_repository. + +These tests stub out the language-specific parsers so we can verify the +pre-parse cleanup behavior of `fresh=True` in isolation, without relying +on the real Python/JS/Go parsers. +""" +import json +import os +from pathlib import Path + +import pytest + +from core import parser_adapter +from core.schemas import ParseResult + + +def _make_stub_parser(record): + """Build a fake `_parse_python` that records what it sees on disk. + + The stub captures whether `dataset.json` exists in `output_dir` at the + time it is invoked, then writes a fresh dataset itself so the rest of + `parse_repository` has something to work with. + """ + def _stub(repo_path, output_dir, processing_level, skip_tests=True, name=None): + dataset_path = os.path.join(output_dir, "dataset.json") + record["dataset_existed_when_parser_ran"] = os.path.exists(dataset_path) + # Mimic real parser output + with open(dataset_path, "w") as f: + json.dump({"units": [{"id": "u1", "code": "def f(): pass"}]}, f) + return ParseResult( + dataset_path=dataset_path, + analyzer_output_path=None, + units_count=1, + language="python", + processing_level=processing_level, + ) + return _stub + + +class TestParseFreshFlag: + def test_fresh_true_deletes_existing_dataset_before_parser_runs( + self, tmp_path, monkeypatch + ): + output_dir = tmp_path / "output" + output_dir.mkdir() + existing = output_dir / "dataset.json" + existing.write_text(json.dumps({"units": [{"id": "stale"}]})) + + record = {} + monkeypatch.setattr(parser_adapter, "_parse_python", _make_stub_parser(record)) + + parser_adapter.parse_repository( + repo_path=str(tmp_path), # repo path not actually used by stub + output_dir=str(output_dir), + language="python", + processing_level="all", + fresh=True, + ) + + # The pre-existing dataset.json must be gone by the time the + # parser runs, proving --fresh removed it before dispatch. + assert record["dataset_existed_when_parser_ran"] is False + + def test_fresh_false_leaves_existing_dataset_in_place( + self, tmp_path, monkeypatch + ): + output_dir = tmp_path / "output" + output_dir.mkdir() + existing = output_dir / "dataset.json" + existing.write_text(json.dumps({"units": [{"id": "stale"}]})) + + record = {} + monkeypatch.setattr(parser_adapter, "_parse_python", _make_stub_parser(record)) + + parser_adapter.parse_repository( + repo_path=str(tmp_path), + output_dir=str(output_dir), + language="python", + processing_level="all", + fresh=False, + ) + + # Without --fresh the existing dataset must still be present when + # the parser is invoked (so the parser can decide whether to + # incrementally reuse it). + assert record["dataset_existed_when_parser_ran"] is True + + def test_fresh_default_is_false(self, tmp_path, monkeypatch): + """`fresh` must default to False so existing scans aren't wiped.""" + output_dir = tmp_path / "output" + output_dir.mkdir() + existing = output_dir / "dataset.json" + existing.write_text(json.dumps({"units": [{"id": "stale"}]})) + + record = {} + monkeypatch.setattr(parser_adapter, "_parse_python", _make_stub_parser(record)) + + # Note: no `fresh=` kwarg. + parser_adapter.parse_repository( + repo_path=str(tmp_path), + output_dir=str(output_dir), + language="python", + processing_level="all", + ) + + assert record["dataset_existed_when_parser_ran"] is True + + def test_fresh_true_with_no_existing_dataset_is_noop( + self, tmp_path, monkeypatch + ): + """Passing --fresh when no dataset.json exists must not error.""" + output_dir = tmp_path / "output" + output_dir.mkdir() + # Note: no pre-existing dataset.json + + record = {} + monkeypatch.setattr(parser_adapter, "_parse_python", _make_stub_parser(record)) + + result = parser_adapter.parse_repository( + repo_path=str(tmp_path), + output_dir=str(output_dir), + language="python", + processing_level="all", + fresh=True, + ) + + # The parser still runs and produces a dataset + assert Path(result.dataset_path).exists() + assert record["dataset_existed_when_parser_ran"] is False + + def test_fresh_creates_output_dir_if_missing( + self, tmp_path, monkeypatch + ): + """`fresh=True` must not crash when output_dir doesn't yet exist.""" + output_dir = tmp_path / "does_not_exist_yet" + + record = {} + monkeypatch.setattr(parser_adapter, "_parse_python", _make_stub_parser(record)) + + result = parser_adapter.parse_repository( + repo_path=str(tmp_path), + output_dir=str(output_dir), + language="python", + processing_level="all", + fresh=True, + ) + + assert output_dir.exists() + assert Path(result.dataset_path).exists() From dea910555b007e78759ec64453d9bed60bc32557 Mon Sep 17 00:00:00 2001 From: joshbouncesecurity Date: Mon, 4 May 2026 22:31:20 +0300 Subject: [PATCH 3/4] refactor: unify parse pyArgs builder and harden --fresh deletion - Extract buildParsePyArgs from runParse so the helper is the source of truth (tests no longer keep a parallel copy with 'keep in sync') - Replace exists()+remove() with try/except FileNotFoundError to avoid TOCTOU race when two --fresh parses run concurrently - Clarify --fresh help text and docstring: only dataset.json is deleted; other artifacts in the output dir are preserved --- apps/openant-cli/cmd/parse.go | 21 ++++++++++----------- apps/openant-cli/cmd/parse_test.go | 21 +++++++++++++++++++-- libs/openant-core/core/parser_adapter.py | 11 +++++++++-- libs/openant-core/openant/cli.py | 2 +- 4 files changed, 39 insertions(+), 16 deletions(-) diff --git a/apps/openant-cli/cmd/parse.go b/apps/openant-cli/cmd/parse.go index 5f42ae1..78fa838 100644 --- a/apps/openant-cli/cmd/parse.go +++ b/apps/openant-cli/cmd/parse.go @@ -39,15 +39,14 @@ func init() { parseCmd.Flags().StringVar(&parseDiffBase, "diff-base", "", "Incremental mode: tag units overlapping diff vs this ref") parseCmd.Flags().IntVar(&parsePR, "pr", 0, "Incremental mode against a GitHub PR number (mutex with --diff-base)") parseCmd.Flags().StringVar(&parseDiffScope, "diff-scope", "changed_functions", "Diff scope: changed_files, changed_functions, callers") - parseCmd.Flags().BoolVar(&parseFresh, "fresh", false, "Delete existing dataset and reparse from scratch") + parseCmd.Flags().BoolVar(&parseFresh, "fresh", false, "Delete existing dataset.json and reparse from scratch (other artifacts preserved)") } -// buildParsePyArgs assembles the argv passed to the Python `openant parse` -// subprocess. Defaults that match the Python CLI (language=auto, -// level=reachable) are omitted so the Python side stays in charge of the -// canonical default value. -func buildParsePyArgs(repoPath, output, datasetName, language, level, manifestPath string) []string { - pyArgs := []string{"parse", repoPath, "--output", output} +// buildParsePyArgs constructs the argv passed to the Python parse subcommand. +// Extracted so tests can verify pass-through behavior without invoking the +// full Python runtime. +func buildParsePyArgs(repoPath, outputDir, datasetName, language, level, manifestPath string, fresh bool) []string { + pyArgs := []string{"parse", repoPath, "--output", outputDir} if datasetName != "" { pyArgs = append(pyArgs, "--name", datasetName) } @@ -60,6 +59,9 @@ func buildParsePyArgs(repoPath, output, datasetName, language, level, manifestPa if manifestPath != "" { pyArgs = append(pyArgs, "--diff-manifest", manifestPath) } + if fresh { + pyArgs = append(pyArgs, "--fresh") + } return pyArgs } @@ -115,10 +117,7 @@ func runParse(cmd *cobra.Command, args []string) { } } - pyArgs := buildParsePyArgs(repoPath, parseOutput, datasetName, parseLanguage, parseLevel, manifestPath) - if parseFresh { - pyArgs = append(pyArgs, "--fresh") - } + pyArgs := buildParsePyArgs(repoPath, parseOutput, datasetName, parseLanguage, parseLevel, manifestPath, parseFresh) result, err := python.Invoke(rt.Path, pyArgs, "", quiet, resolvedAPIKey()) if err != nil { diff --git a/apps/openant-cli/cmd/parse_test.go b/apps/openant-cli/cmd/parse_test.go index 7c4b612..bd865df 100644 --- a/apps/openant-cli/cmd/parse_test.go +++ b/apps/openant-cli/cmd/parse_test.go @@ -46,7 +46,7 @@ func TestBuildParsePyArgsLevelForwarding(t *testing.T) { } for _, tc := range tests { t.Run(tc.name, func(t *testing.T) { - args := buildParsePyArgs("/repo", "/out", "", "auto", tc.level, "") + args := buildParsePyArgs("/repo", "/out", "", "auto", tc.level, "", false) gotLevel, gotValue := findFlag(args, "--level") if gotLevel != tc.wantLevel { t.Errorf("--level present = %v, want %v (argv=%v)", gotLevel, tc.wantLevel, args) @@ -59,7 +59,7 @@ func TestBuildParsePyArgsLevelForwarding(t *testing.T) { } func TestBuildParsePyArgsBaseline(t *testing.T) { - args := buildParsePyArgs("/repo", "/out", "org-repo-abc1234", "python", "exploitable", "/tmp/manifest.json") + args := buildParsePyArgs("/repo", "/out", "org-repo-abc1234", "python", "exploitable", "/tmp/manifest.json", false) want := []string{ "parse", "/repo", "--output", "/out", @@ -131,6 +131,23 @@ func TestParseCmdFreshFlagParses(t *testing.T) { } } +func TestParsePyArgsIncludesFreshWhenSet(t *testing.T) { + args := buildParsePyArgs("/some/repo", "/out", "", "auto", "reachable", "", true) + + found, _ := findFlag(args, "--fresh") + if !found { + t.Errorf("expected --fresh in pyArgs when fresh=true, got %v", args) + } +} + +func TestParsePyArgsOmitsFreshWhenUnset(t *testing.T) { + args := buildParsePyArgs("/some/repo", "/out", "", "auto", "reachable", "", false) + + found, _ := findFlag(args, "--fresh") + if found { + t.Errorf("did not expect --fresh in pyArgs when fresh=false, got %v", args) + } +} func TestParseCmdIsRegisteredOnRoot(t *testing.T) { var found *cobra.Command for _, c := range rootCmd.Commands() { diff --git a/libs/openant-core/core/parser_adapter.py b/libs/openant-core/core/parser_adapter.py index 0337f4a..85ee7fa 100644 --- a/libs/openant-core/core/parser_adapter.py +++ b/libs/openant-core/core/parser_adapter.py @@ -94,7 +94,8 @@ def parse_repository( skip_tests: If True, exclude test files from parsing (default: True). name: Dataset name override (default: derived from repo path basename). fresh: If True, delete existing dataset.json before parsing so all - units are regenerated from scratch. + units are regenerated from scratch. Only dataset.json is deleted; + other artifacts in output_dir (e.g. analyzer outputs) are preserved. Returns: ParseResult with paths to generated files and stats. @@ -109,9 +110,15 @@ def parse_repository( if fresh: dataset_path = os.path.join(output_dir, "dataset.json") - if os.path.exists(dataset_path): + # Use try/except instead of exists()+remove() to avoid a TOCTOU race + # if a concurrent --fresh run removes the file between the two calls. + # Only dataset.json is deleted; other artifacts (analyzer outputs, etc.) + # in output_dir are preserved. + try: os.remove(dataset_path) print("[Parser] --fresh: deleted existing dataset.json", file=sys.stderr) + except FileNotFoundError: + pass # Detect language if auto if language == "auto": diff --git a/libs/openant-core/openant/cli.py b/libs/openant-core/openant/cli.py index ae8ffb3..fd92921 100644 --- a/libs/openant-core/openant/cli.py +++ b/libs/openant-core/openant/cli.py @@ -1040,7 +1040,7 @@ def main(): parse_p.add_argument("--name", help="Dataset name (default: derived from repo path)") parse_p.add_argument("--diff-manifest", help="Path to diff_manifest.json; tags units with diff_selected") parse_p.add_argument("--fresh", action="store_true", - help="Delete existing dataset and reparse from scratch (default: reuse existing units)") + help="Delete existing dataset.json and reparse from scratch (default: reuse existing units; other artifacts preserved)") parse_p.set_defaults(func=cmd_parse) # --------------------------------------------------------------- From 7ece34f03ab197fcb2c79a9f70607be6c561bcfa Mon Sep 17 00:00:00 2001 From: joshbouncesecurity Date: Thu, 14 May 2026 16:23:34 +0300 Subject: [PATCH 4/4] fix: address review feedback from ar7casper (refs #16 item 19) - Restore --level default to "reachable" and guard to level != "reachable" (reverts regression introduced when branch diverged from upstream fix #35) - Restore upstream parse_test.go level tests, updated to 7-arg signature - Fix JS duplicate-units hint to say 'openant parse --fresh' not --fresh - Add test: --fresh + --diff-manifest compose correctly Co-Authored-By: Claude Sonnet 4.6 --- apps/openant-cli/cmd/parse_test.go | 2 +- .../parsers/javascript/unit_generator.js | 2 +- libs/openant-core/tests/test_parse_fresh.py | 30 +++++++++++++++++++ 3 files changed, 32 insertions(+), 2 deletions(-) diff --git a/apps/openant-cli/cmd/parse_test.go b/apps/openant-cli/cmd/parse_test.go index bd865df..63c0884 100644 --- a/apps/openant-cli/cmd/parse_test.go +++ b/apps/openant-cli/cmd/parse_test.go @@ -79,7 +79,7 @@ func TestBuildParsePyArgsBaseline(t *testing.T) { } // --------------------------------------------------------------------------- -// --fresh flag registration +// --fresh flag // --------------------------------------------------------------------------- func TestParseCmdHasFreshFlag(t *testing.T) { diff --git a/libs/openant-core/parsers/javascript/unit_generator.js b/libs/openant-core/parsers/javascript/unit_generator.js index 2c535de..31f9bd9 100644 --- a/libs/openant-core/parsers/javascript/unit_generator.js +++ b/libs/openant-core/parsers/javascript/unit_generator.js @@ -434,7 +434,7 @@ if (require.main === module) { console.error(` New units to add: ${newUnits.length}`); console.error(` Duplicates skipped: ${duplicateCount}`); if (duplicateCount > 0) { - console.error(` Note: ${duplicateCount} existing units kept as-is (use --fresh to regenerate all units)`); + console.error(` Note: ${duplicateCount} existing units kept as-is (use 'openant parse --fresh' to regenerate all units)`); } // Append new units to existing diff --git a/libs/openant-core/tests/test_parse_fresh.py b/libs/openant-core/tests/test_parse_fresh.py index db2472a..93f04d3 100644 --- a/libs/openant-core/tests/test_parse_fresh.py +++ b/libs/openant-core/tests/test_parse_fresh.py @@ -147,3 +147,33 @@ def test_fresh_creates_output_dir_if_missing( assert output_dir.exists() assert Path(result.dataset_path).exists() + + def test_fresh_and_diff_manifest_compose_correctly( + self, tmp_path, monkeypatch + ): + """--fresh cleans up before the parser runs even when --diff-manifest is also set.""" + output_dir = tmp_path / "output" + output_dir.mkdir() + existing = output_dir / "dataset.json" + existing.write_text(json.dumps({"units": [{"id": "stale"}]})) + + record = {} + monkeypatch.setattr(parser_adapter, "_parse_python", _make_stub_parser(record)) + # Stub the diff filter so the test doesn't need a real manifest format. + monkeypatch.setattr(parser_adapter, "_maybe_apply_diff_filter", lambda *a, **kw: None) + + manifest_path = tmp_path / "diff_manifest.json" + manifest_path.write_text(json.dumps({})) + + parser_adapter.parse_repository( + repo_path=str(tmp_path), + output_dir=str(output_dir), + language="python", + processing_level="all", + fresh=True, + diff_manifest=str(manifest_path), + ) + + # --fresh must delete dataset.json before the parser runs even when + # --diff-manifest is also provided; the two flags must not interfere. + assert record["dataset_existed_when_parser_ran"] is False