From e74b459670a0fba6b5bbc683903f86dd53dbc14c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 14 May 2026 13:47:57 +0300 Subject: [PATCH 1/4] build(deps): bump picomatch from 4.0.3 to 4.0.4 in /libs/openant-core/parsers/javascript (#19) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- libs/openant-core/parsers/javascript/package-lock.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libs/openant-core/parsers/javascript/package-lock.json b/libs/openant-core/parsers/javascript/package-lock.json index 8a6409e..555e537 100644 --- a/libs/openant-core/parsers/javascript/package-lock.json +++ b/libs/openant-core/parsers/javascript/package-lock.json @@ -126,9 +126,9 @@ "license": "MIT" }, "node_modules/picomatch": { - "version": "4.0.3", - "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-4.0.3.tgz", - "integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==", + "version": "4.0.4", + "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-4.0.4.tgz", + "integrity": "sha512-QP88BAKvMam/3NxH6vj2o21R6MjxZUAd6nlwAS/pnGvN9IVLocLHxGYIzFhg6fUQ+5th6P4dv4eW9jX3DSIj7A==", "license": "MIT", "engines": { "node": ">=12" From b04078f6c56a12a2bc9917b55d0206773188b7f3 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 14 May 2026 13:51:03 +0300 Subject: [PATCH 2/4] build(deps): bump minimatch from 10.2.2 to 10.2.4 in /libs/openant-core/parsers/javascript (#3) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- libs/openant-core/parsers/javascript/package-lock.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libs/openant-core/parsers/javascript/package-lock.json b/libs/openant-core/parsers/javascript/package-lock.json index 555e537..1094d5d 100644 --- a/libs/openant-core/parsers/javascript/package-lock.json +++ b/libs/openant-core/parsers/javascript/package-lock.json @@ -99,9 +99,9 @@ } }, "node_modules/minimatch": { - "version": "10.2.2", - "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-10.2.2.tgz", - "integrity": "sha512-+G4CpNBxa5MprY+04MbgOw1v7So6n5JY166pFi9KfYwT78fxScCeSNQSNzp6dpPSW2rONOps6Ocam1wFhCgoVw==", + "version": "10.2.4", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-10.2.4.tgz", + "integrity": "sha512-oRjTw/97aTBN0RHbYCdtF1MQfvusSIBQM0IZEgzl6426+8jSC0nF1a/GmnVLpfB9yyr6g6FTqWqiZVbxrtaCIg==", "license": "BlueOak-1.0.0", "dependencies": { "brace-expansion": "^5.0.2" From 25709b3a0e82dc3bb4641dd166e5d63d2ef64814 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 14 May 2026 15:21:02 +0300 Subject: [PATCH 3/4] build(deps): bump brace-expansion from 5.0.3 to 5.0.6 in /libs/openant-core/parsers/javascript (#20) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: ar7casper --- libs/openant-core/parsers/javascript/package-lock.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libs/openant-core/parsers/javascript/package-lock.json b/libs/openant-core/parsers/javascript/package-lock.json index 1094d5d..95234d0 100644 --- a/libs/openant-core/parsers/javascript/package-lock.json +++ b/libs/openant-core/parsers/javascript/package-lock.json @@ -47,9 +47,9 @@ } }, "node_modules/brace-expansion": { - "version": "5.0.3", - "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-5.0.3.tgz", - "integrity": "sha512-fy6KJm2RawA5RcHkLa1z/ScpBeA762UF9KmZQxwIbDtRJrgLzM10depAiEQ+CXYcoiqW1/m96OAAoke2nE9EeA==", + "version": "5.0.6", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-5.0.6.tgz", + "integrity": "sha512-kLpxurY4Z4r9sgMsyG0Z9uzsBlgiU/EFKhj/h91/8yHu0edo7XuixOIH3VcJ8kkxs6/jPzoI6U9Vj3WqbMQ94g==", "license": "MIT", "dependencies": { "balanced-match": "^4.0.2" From b456bbee558554b9a7559455f93066f5d1f5e4f6 Mon Sep 17 00:00:00 2001 From: "Josh Grossman (Bounce Security)" <97975715+joshbouncesecurity@users.noreply.github.com> Date: Sun, 17 May 2026 11:38:05 +0300 Subject: [PATCH 4/4] feat: add --fresh flag to parse for forced reparse (#38) --- apps/openant-cli/cmd/parse.go | 18 +- apps/openant-cli/cmd/parse_test.go | 102 +++++++++- libs/openant-core/core/parser_adapter.py | 16 ++ libs/openant-core/openant/cli.py | 3 + .../parsers/javascript/unit_generator.js | 3 + libs/openant-core/tests/test_parse_fresh.py | 179 ++++++++++++++++++ 6 files changed, 310 insertions(+), 11 deletions(-) create mode 100644 libs/openant-core/tests/test_parse_fresh.py diff --git a/apps/openant-cli/cmd/parse.go b/apps/openant-cli/cmd/parse.go index 563ca5a..78fa838 100644 --- a/apps/openant-cli/cmd/parse.go +++ b/apps/openant-cli/cmd/parse.go @@ -29,6 +29,7 @@ var ( parseDiffBase string parsePR int parseDiffScope string + parseFresh bool ) func init() { @@ -38,14 +39,14 @@ func init() { parseCmd.Flags().StringVar(&parseDiffBase, "diff-base", "", "Incremental mode: tag units overlapping diff vs this ref") parseCmd.Flags().IntVar(&parsePR, "pr", 0, "Incremental mode against a GitHub PR number (mutex with --diff-base)") parseCmd.Flags().StringVar(&parseDiffScope, "diff-scope", "changed_functions", "Diff scope: changed_files, changed_functions, callers") + parseCmd.Flags().BoolVar(&parseFresh, "fresh", false, "Delete existing dataset.json and reparse from scratch (other artifacts preserved)") } -// buildParsePyArgs assembles the argv passed to the Python `openant parse` -// subprocess. Defaults that match the Python CLI (language=auto, -// level=reachable) are omitted so the Python side stays in charge of the -// canonical default value. -func buildParsePyArgs(repoPath, output, datasetName, language, level, manifestPath string) []string { - pyArgs := []string{"parse", repoPath, "--output", output} +// buildParsePyArgs constructs the argv passed to the Python parse subcommand. +// Extracted so tests can verify pass-through behavior without invoking the +// full Python runtime. +func buildParsePyArgs(repoPath, outputDir, datasetName, language, level, manifestPath string, fresh bool) []string { + pyArgs := []string{"parse", repoPath, "--output", outputDir} if datasetName != "" { pyArgs = append(pyArgs, "--name", datasetName) } @@ -58,6 +59,9 @@ func buildParsePyArgs(repoPath, output, datasetName, language, level, manifestPa if manifestPath != "" { pyArgs = append(pyArgs, "--diff-manifest", manifestPath) } + if fresh { + pyArgs = append(pyArgs, "--fresh") + } return pyArgs } @@ -113,7 +117,7 @@ func runParse(cmd *cobra.Command, args []string) { } } - pyArgs := buildParsePyArgs(repoPath, parseOutput, datasetName, parseLanguage, parseLevel, manifestPath) + pyArgs := buildParsePyArgs(repoPath, parseOutput, datasetName, parseLanguage, parseLevel, manifestPath, parseFresh) result, err := python.Invoke(rt.Path, pyArgs, "", quiet, resolvedAPIKey()) if err != nil { diff --git a/apps/openant-cli/cmd/parse_test.go b/apps/openant-cli/cmd/parse_test.go index e080df2..63c0884 100644 --- a/apps/openant-cli/cmd/parse_test.go +++ b/apps/openant-cli/cmd/parse_test.go @@ -3,8 +3,14 @@ package cmd import ( "strings" "testing" + + "github.com/spf13/cobra" ) +// --------------------------------------------------------------------------- +// --level flag +// --------------------------------------------------------------------------- + func TestParseLevelFlagDefaultIsReachable(t *testing.T) { flag := parseCmd.Flag("level") if flag == nil { @@ -40,7 +46,7 @@ func TestBuildParsePyArgsLevelForwarding(t *testing.T) { } for _, tc := range tests { t.Run(tc.name, func(t *testing.T) { - args := buildParsePyArgs("/repo", "/out", "", "auto", tc.level, "") + args := buildParsePyArgs("/repo", "/out", "", "auto", tc.level, "", false) gotLevel, gotValue := findFlag(args, "--level") if gotLevel != tc.wantLevel { t.Errorf("--level present = %v, want %v (argv=%v)", gotLevel, tc.wantLevel, args) @@ -53,7 +59,7 @@ func TestBuildParsePyArgsLevelForwarding(t *testing.T) { } func TestBuildParsePyArgsBaseline(t *testing.T) { - args := buildParsePyArgs("/repo", "/out", "org-repo-abc1234", "python", "exploitable", "/tmp/manifest.json") + args := buildParsePyArgs("/repo", "/out", "org-repo-abc1234", "python", "exploitable", "/tmp/manifest.json", false) want := []string{ "parse", "/repo", "--output", "/out", @@ -72,8 +78,96 @@ func TestBuildParsePyArgsBaseline(t *testing.T) { } } -// findFlag returns whether name is present in argv, and its following value -// (or "" if it has no value). +// --------------------------------------------------------------------------- +// --fresh flag +// --------------------------------------------------------------------------- + +func TestParseCmdHasFreshFlag(t *testing.T) { + flag := parseCmd.Flags().Lookup("fresh") + if flag == nil { + t.Fatal("parseCmd is missing the --fresh flag") + } + if flag.Value.Type() != "bool" { + t.Errorf("--fresh should be a bool flag, got type %q", flag.Value.Type()) + } + if flag.DefValue != "false" { + t.Errorf("--fresh default should be false, got %q", flag.DefValue) + } + if flag.Usage == "" { + t.Error("--fresh flag is missing a usage/help string") + } +} + +func TestParseCmdFreshFlagInitialState(t *testing.T) { + orig := parseFresh + defer func() { parseFresh = orig }() + + parseFresh = false + if parseFresh { + t.Errorf("parseFresh should default to false, got true") + } +} + +func TestParseCmdFreshFlagParses(t *testing.T) { + orig := parseFresh + defer func() { + parseFresh = orig + _ = parseCmd.Flags().Set("fresh", "false") + }() + + parseFresh = false + if err := parseCmd.Flags().Set("fresh", "true"); err != nil { + t.Fatalf("failed to set --fresh: %v", err) + } + if !parseFresh { + t.Error("setting --fresh=true should make parseFresh true") + } + + if err := parseCmd.Flags().Set("fresh", "false"); err != nil { + t.Fatalf("failed to set --fresh=false: %v", err) + } + if parseFresh { + t.Error("setting --fresh=false should make parseFresh false") + } +} + +func TestParsePyArgsIncludesFreshWhenSet(t *testing.T) { + args := buildParsePyArgs("/some/repo", "/out", "", "auto", "reachable", "", true) + + found, _ := findFlag(args, "--fresh") + if !found { + t.Errorf("expected --fresh in pyArgs when fresh=true, got %v", args) + } +} + +func TestParsePyArgsOmitsFreshWhenUnset(t *testing.T) { + args := buildParsePyArgs("/some/repo", "/out", "", "auto", "reachable", "", false) + + found, _ := findFlag(args, "--fresh") + if found { + t.Errorf("did not expect --fresh in pyArgs when fresh=false, got %v", args) + } +} +func TestParseCmdIsRegisteredOnRoot(t *testing.T) { + var found *cobra.Command + for _, c := range rootCmd.Commands() { + if c.Name() == "parse" { + found = c + break + } + } + if found == nil { + t.Fatal("parse command not registered on rootCmd") + } + if found.Flags().Lookup("fresh") == nil { + t.Error("parse subcommand resolved from root is missing --fresh flag") + } +} + +// --------------------------------------------------------------------------- +// helpers +// --------------------------------------------------------------------------- + func findFlag(argv []string, name string) (bool, string) { for i, a := range argv { if a == name { diff --git a/libs/openant-core/core/parser_adapter.py b/libs/openant-core/core/parser_adapter.py index 605450a..85ee7fa 100644 --- a/libs/openant-core/core/parser_adapter.py +++ b/libs/openant-core/core/parser_adapter.py @@ -79,6 +79,7 @@ def parse_repository( skip_tests: bool = True, name: str = None, diff_manifest: str | None = None, + fresh: bool = False, ) -> ParseResult: """Parse a repository into an OpenAnt dataset. @@ -92,6 +93,9 @@ def parse_repository( processing_level: "all", "reachable", "codeql", or "exploitable". skip_tests: If True, exclude test files from parsing (default: True). name: Dataset name override (default: derived from repo path basename). + fresh: If True, delete existing dataset.json before parsing so all + units are regenerated from scratch. Only dataset.json is deleted; + other artifacts in output_dir (e.g. analyzer outputs) are preserved. Returns: ParseResult with paths to generated files and stats. @@ -104,6 +108,18 @@ def parse_repository( output_dir = os.path.abspath(output_dir) os.makedirs(output_dir, exist_ok=True) + if fresh: + dataset_path = os.path.join(output_dir, "dataset.json") + # Use try/except instead of exists()+remove() to avoid a TOCTOU race + # if a concurrent --fresh run removes the file between the two calls. + # Only dataset.json is deleted; other artifacts (analyzer outputs, etc.) + # in output_dir are preserved. + try: + os.remove(dataset_path) + print("[Parser] --fresh: deleted existing dataset.json", file=sys.stderr) + except FileNotFoundError: + pass + # Detect language if auto if language == "auto": language = detect_language(repo_path) diff --git a/libs/openant-core/openant/cli.py b/libs/openant-core/openant/cli.py index c303c64..fd92921 100644 --- a/libs/openant-core/openant/cli.py +++ b/libs/openant-core/openant/cli.py @@ -128,6 +128,7 @@ def cmd_parse(args): skip_tests=not args.no_skip_tests, name=getattr(args, "name", None), diff_manifest=getattr(args, "diff_manifest", None), + fresh=getattr(args, "fresh", False), ) ctx.summary = { @@ -1038,6 +1039,8 @@ def main(): parse_p.add_argument("--no-skip-tests", action="store_true", help="Include test files in parsing (default: tests are skipped)") parse_p.add_argument("--name", help="Dataset name (default: derived from repo path)") parse_p.add_argument("--diff-manifest", help="Path to diff_manifest.json; tags units with diff_selected") + parse_p.add_argument("--fresh", action="store_true", + help="Delete existing dataset.json and reparse from scratch (default: reuse existing units; other artifacts preserved)") parse_p.set_defaults(func=cmd_parse) # --------------------------------------------------------------- diff --git a/libs/openant-core/parsers/javascript/unit_generator.js b/libs/openant-core/parsers/javascript/unit_generator.js index 7b76219..31f9bd9 100644 --- a/libs/openant-core/parsers/javascript/unit_generator.js +++ b/libs/openant-core/parsers/javascript/unit_generator.js @@ -433,6 +433,9 @@ if (require.main === module) { console.error(` Existing units: ${existingUnits.length}`); console.error(` New units to add: ${newUnits.length}`); console.error(` Duplicates skipped: ${duplicateCount}`); + if (duplicateCount > 0) { + console.error(` Note: ${duplicateCount} existing units kept as-is (use 'openant parse --fresh' to regenerate all units)`); + } // Append new units to existing finalResult = { diff --git a/libs/openant-core/tests/test_parse_fresh.py b/libs/openant-core/tests/test_parse_fresh.py new file mode 100644 index 0000000..93f04d3 --- /dev/null +++ b/libs/openant-core/tests/test_parse_fresh.py @@ -0,0 +1,179 @@ +"""Tests for the `--fresh` flag plumbing in core.parser_adapter.parse_repository. + +These tests stub out the language-specific parsers so we can verify the +pre-parse cleanup behavior of `fresh=True` in isolation, without relying +on the real Python/JS/Go parsers. +""" +import json +import os +from pathlib import Path + +import pytest + +from core import parser_adapter +from core.schemas import ParseResult + + +def _make_stub_parser(record): + """Build a fake `_parse_python` that records what it sees on disk. + + The stub captures whether `dataset.json` exists in `output_dir` at the + time it is invoked, then writes a fresh dataset itself so the rest of + `parse_repository` has something to work with. + """ + def _stub(repo_path, output_dir, processing_level, skip_tests=True, name=None): + dataset_path = os.path.join(output_dir, "dataset.json") + record["dataset_existed_when_parser_ran"] = os.path.exists(dataset_path) + # Mimic real parser output + with open(dataset_path, "w") as f: + json.dump({"units": [{"id": "u1", "code": "def f(): pass"}]}, f) + return ParseResult( + dataset_path=dataset_path, + analyzer_output_path=None, + units_count=1, + language="python", + processing_level=processing_level, + ) + return _stub + + +class TestParseFreshFlag: + def test_fresh_true_deletes_existing_dataset_before_parser_runs( + self, tmp_path, monkeypatch + ): + output_dir = tmp_path / "output" + output_dir.mkdir() + existing = output_dir / "dataset.json" + existing.write_text(json.dumps({"units": [{"id": "stale"}]})) + + record = {} + monkeypatch.setattr(parser_adapter, "_parse_python", _make_stub_parser(record)) + + parser_adapter.parse_repository( + repo_path=str(tmp_path), # repo path not actually used by stub + output_dir=str(output_dir), + language="python", + processing_level="all", + fresh=True, + ) + + # The pre-existing dataset.json must be gone by the time the + # parser runs, proving --fresh removed it before dispatch. + assert record["dataset_existed_when_parser_ran"] is False + + def test_fresh_false_leaves_existing_dataset_in_place( + self, tmp_path, monkeypatch + ): + output_dir = tmp_path / "output" + output_dir.mkdir() + existing = output_dir / "dataset.json" + existing.write_text(json.dumps({"units": [{"id": "stale"}]})) + + record = {} + monkeypatch.setattr(parser_adapter, "_parse_python", _make_stub_parser(record)) + + parser_adapter.parse_repository( + repo_path=str(tmp_path), + output_dir=str(output_dir), + language="python", + processing_level="all", + fresh=False, + ) + + # Without --fresh the existing dataset must still be present when + # the parser is invoked (so the parser can decide whether to + # incrementally reuse it). + assert record["dataset_existed_when_parser_ran"] is True + + def test_fresh_default_is_false(self, tmp_path, monkeypatch): + """`fresh` must default to False so existing scans aren't wiped.""" + output_dir = tmp_path / "output" + output_dir.mkdir() + existing = output_dir / "dataset.json" + existing.write_text(json.dumps({"units": [{"id": "stale"}]})) + + record = {} + monkeypatch.setattr(parser_adapter, "_parse_python", _make_stub_parser(record)) + + # Note: no `fresh=` kwarg. + parser_adapter.parse_repository( + repo_path=str(tmp_path), + output_dir=str(output_dir), + language="python", + processing_level="all", + ) + + assert record["dataset_existed_when_parser_ran"] is True + + def test_fresh_true_with_no_existing_dataset_is_noop( + self, tmp_path, monkeypatch + ): + """Passing --fresh when no dataset.json exists must not error.""" + output_dir = tmp_path / "output" + output_dir.mkdir() + # Note: no pre-existing dataset.json + + record = {} + monkeypatch.setattr(parser_adapter, "_parse_python", _make_stub_parser(record)) + + result = parser_adapter.parse_repository( + repo_path=str(tmp_path), + output_dir=str(output_dir), + language="python", + processing_level="all", + fresh=True, + ) + + # The parser still runs and produces a dataset + assert Path(result.dataset_path).exists() + assert record["dataset_existed_when_parser_ran"] is False + + def test_fresh_creates_output_dir_if_missing( + self, tmp_path, monkeypatch + ): + """`fresh=True` must not crash when output_dir doesn't yet exist.""" + output_dir = tmp_path / "does_not_exist_yet" + + record = {} + monkeypatch.setattr(parser_adapter, "_parse_python", _make_stub_parser(record)) + + result = parser_adapter.parse_repository( + repo_path=str(tmp_path), + output_dir=str(output_dir), + language="python", + processing_level="all", + fresh=True, + ) + + assert output_dir.exists() + assert Path(result.dataset_path).exists() + + def test_fresh_and_diff_manifest_compose_correctly( + self, tmp_path, monkeypatch + ): + """--fresh cleans up before the parser runs even when --diff-manifest is also set.""" + output_dir = tmp_path / "output" + output_dir.mkdir() + existing = output_dir / "dataset.json" + existing.write_text(json.dumps({"units": [{"id": "stale"}]})) + + record = {} + monkeypatch.setattr(parser_adapter, "_parse_python", _make_stub_parser(record)) + # Stub the diff filter so the test doesn't need a real manifest format. + monkeypatch.setattr(parser_adapter, "_maybe_apply_diff_filter", lambda *a, **kw: None) + + manifest_path = tmp_path / "diff_manifest.json" + manifest_path.write_text(json.dumps({})) + + parser_adapter.parse_repository( + repo_path=str(tmp_path), + output_dir=str(output_dir), + language="python", + processing_level="all", + fresh=True, + diff_manifest=str(manifest_path), + ) + + # --fresh must delete dataset.json before the parser runs even when + # --diff-manifest is also provided; the two flags must not interfere. + assert record["dataset_existed_when_parser_ran"] is False