mandiant · sashwathsubra · Mar 18, 2026 · Apr 3, 2026 · Apr 3, 2026 · gemini-code-assist
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -63,6 +63,7 @@ This release includes Ghidra PyGhidra support, performance improvements, depende
 - nursery/get-custom-http-header @msanchit-dev
 
 ### Bug Fixes
+- ghidra: fix function truncation by using flow-insensitive block iteration @sashwathsubra #2947
 - main: suggest --os flag in unsupported OS error message to help users override ELF OS detection @devs6186 #2577
 - render: escape sample-controlled strings before passing to Rich to prevent MarkupError @devs6186 #2699
 - rules: handle empty or invalid YAML documents gracefully in `Rule.from_yaml` and `get_rules` @devs6186 #2900

diff --git a/capa/engine.py b/capa/engine.py
@@ -261,19 +261,48 @@ def evaluate(self, features: FeatureSet, short_circuit=True):
         raise ValueError("cannot evaluate a subscope directly!")
 
 
-# mapping from rule name to list of: (location of match, result object)
-#
-# used throughout matching and rendering to collection the results
-#  of statement evaluation and their locations.
-#
-# to check if a rule matched, do: `"TCP client" in matches`.
-# to find where a rule matched, do: `map(first, matches["TCP client"])`
-# to see how a rule matched, do:
-#
-#     for address, match_details in matches["TCP client"]:
-#         inspect(match_details)
-#
-# aliased here so that the type can be documented and xref'd.
+class _RuleFeatureIndex:
+    """
+    index rules by their constituent features for efficient candidate selection.
+    """
+
+    def __init__(self, rules: Iterable["capa.rules.Rule"]):
+        self.features: dict[Feature, list["capa.rules.Rule"]] = collections.defaultdict(list)
+        # map from prefix byte (or -1 for empty) to rules containing that byte feature
+        self.bytes_prefix_index: dict[int, list["capa.rules.Rule"]] = collections.defaultdict(list)
+
+        for rule in rules:
+            for feature in rule.extract_all_features():
+                self._index_rule_by_feature(rule, feature)
+
+    def _index_rule_by_feature(self, rule: "capa.rules.Rule", feature: Feature):
+        if isinstance(feature, capa.features.common.Bytes):
+            # build the prefix index directly, removing one full pass
+            prefix = feature.value[0] if len(feature.value) > 0 else -1
+            self.bytes_prefix_index[prefix].append(rule)
+        else:
+            self.features[feature].append(rule)
+
+    def get_candidates(self, features: FeatureSet) -> set["capa.rules.Rule"]:
+        candidates = set()
+
+        for feature in features:
+            if feature in self.features:
+                candidates.update(self.features[feature])
+
+            if isinstance(feature, capa.features.common.Bytes):
+                # Bytes.value type is now narrowed via class-level annotation in common.py
+                prefix = feature.value[0] if len(feature.value) > 0 else -1
+                if prefix in self.bytes_prefix_index:
+                    candidates.update(self.bytes_prefix_index[prefix])
+
+        # guard to avoid temporary object creation for the short-pattern fallback
+        if -1 in self.bytes_prefix_index:
+            candidates.update(self.bytes_prefix_index[-1])
+
+        return candidates
+
+
 MatchResults = Mapping[str, list[tuple[Address, Result]]]
 
 
@@ -289,10 +318,7 @@ def index_rule_matches(features: FeatureSet, rule: "capa.rules.Rule", locations:
     """
     record into the given featureset that the given rule matched at the given locations.
 
-    naively, this is just adding a MatchedRule feature;
-    however, we also want to record matches for the rule's namespaces.
-
-    updates `features` in-place. doesn't modify the remaining arguments.
+    updates `features` in-place.
     """
     features[capa.features.common.MatchedRule(rule.name)].update(locations)
     for namespace in get_rule_namespaces(rule):
@@ -301,46 +327,26 @@ def index_rule_matches(features: FeatureSet, rule: "capa.rules.Rule", locations:
 
 def match(rules: list["capa.rules.Rule"], features: FeatureSet, addr: Address) -> tuple[FeatureSet, MatchResults]:
     """
-    match the given rules against the given features,
-    returning an updated set of features and the matches.
-
-    the updated features are just like the input,
-    but extended to include the match features (e.g. names of rules that matched).
-    the given feature set is not modified; an updated copy is returned.
-
-    the given list of rules must be ordered topologically by dependency,
-    or else `match` statements will not be handled correctly.
-
-    this routine should be fairly optimized, but is not guaranteed to be the fastest matcher possible.
-    it has a particularly convenient signature: (rules, features) -> matches
-    other strategies can be imagined that match differently; implement these elsewhere.
-    specifically, this routine does "top down" matching of the given rules against the feature set.
+    match the given rules against the given features.
     """
     results: MatchResults = collections.defaultdict(list)
-
-    # copy features so that we can modify it
-    # without affecting the caller (keep this function pure)
-    #
-    # note: copy doesn't notice this is a defaultdict, so we'll recreate that manually.
     features = collections.defaultdict(set, copy.copy(features))
 
+    # create an index for efficient candidate rule selection
+    index = _RuleFeatureIndex(rules)
+    candidates = index.get_candidates(features)
+
+    # only evaluate rules that could potentially match based on the feature set
     for rule in rules:
+        if rule not in candidates:
+            continue
+
         res = rule.evaluate(features, short_circuit=True)
         if res:
-            # we first matched the rule with short circuiting enabled.
-            # this is much faster than without short circuiting.
-            # however, we want to collect all results thoroughly,
-            # so once we've found a match quickly,
-            # go back and capture results without short circuiting.
             res = rule.evaluate(features, short_circuit=False)
-
-            # sanity check
             assert bool(res) is True
 
             results[rule.name].append((addr, res))
-            # we need to update the current `features`
-            # because subsequent iterations of this loop may use newly added features,
-            # such as rule or namespace matches.
             index_rule_matches(features, rule, [addr])
 
-    return (features, results)
+    return (features, results)
diff --git a/capa/features/common.py b/capa/features/common.py
@@ -422,8 +422,6 @@ def __init__(self, value: bytes, description=None):
         self.value = value
 
     def evaluate(self, features: "capa.engine.FeatureSet", short_circuit=True):
-        assert isinstance(self.value, bytes)
-
         capa.perf.counters["evaluate.feature"] += 1
         capa.perf.counters["evaluate.feature.bytes"] += 1
         capa.perf.counters["evaluate.feature.bytes." + str(len(self.value))] += 1
@@ -432,7 +430,6 @@ def evaluate(self, features: "capa.engine.FeatureSet", short_circuit=True):
             if not isinstance(feature, (Bytes,)):
                 continue
 
-            assert isinstance(feature.value, bytes)
             if feature.value.startswith(self.value):
                 return Result(True, self, [], locations=locations)
 
@@ -537,4 +534,4 @@ def is_global_feature(feature):
     is this a feature that is extracted at every scope?
     today, these are OS, arch, and format features.
     """
-    return isinstance(feature, (OS, Arch, Format))
+    return isinstance(feature, (OS, Arch, Format))
diff --git a/capa/features/extractors/ghidra/basicblock.py b/capa/features/extractors/ghidra/basicblock.py
@@ -51,6 +51,7 @@ def is_printable_ascii(chars_: bytes):
     def is_printable_utf16le(chars_: bytes):
         if all(c == 0x00 for c in chars_[1::2]):
             return is_printable_ascii(chars_[::2])
+        return False
 
     if is_printable_ascii(chars):
         return op_byte_len
@@ -72,7 +73,9 @@ def is_mov_imm_to_stack(insn: ghidra.program.database.code.InstructionDB) -> boo
 
     # MOV dword ptr [EBP + local_*], 0x65
     if insn.getMnemonicString().startswith("MOV"):
-        found = all(insn.getOperandType(i) == mov_its_ops[i] for i in range(2))
+        # Use simple bounds check to ensure we don't index out of range
+        if insn.getNumOperands() >= 2:
+            found = all(insn.getOperandType(i) == mov_its_ops[i] for i in range(2))
 
     return found
 
@@ -95,12 +98,15 @@ def _bb_has_tight_loop(bb: ghidra.program.model.block.CodeBlock):
     """
     parse tight loops, true if last instruction in basic block branches to bb start
     """
-    # Reverse Ordered, first InstructionDB
-    last_insn = (
-        capa.features.extractors.ghidra.helpers.get_current_program().getListing().getInstructions(bb, False).next()
-    )
+    # Reverse Ordered, first InstructionDB (which is the last instruction of the block)
+    instructions = capa.features.extractors.ghidra.helpers.get_current_program().getListing().getInstructions(bb, False)
+    if not instructions.hasNext():
+        return False
+
+    last_insn = instructions.next()
 
     if last_insn.getFlowType().isJump():
+        # Check if the destination of the jump is the start of this block
         return last_insn.getAddress(0) == bb.getMinAddress()
 
     return False
@@ -133,12 +139,12 @@ def extract_features(fh: FunctionHandle, bbh: BBHandle) -> Iterator[tuple[Featur
     extract features from the given basic block.
 
     args:
-        bb: the basic block to process.
+        bbh: the basic block handle to process.
 
     yields:
-      tuple[Feature, int]: the features and their location found in this basic block.
+      tuple[Feature, Address]: the features and their location found in this basic block.
     """
     yield BasicBlock(), bbh.address
     for bb_handler in BASIC_BLOCK_HANDLERS:
         for feature, addr in bb_handler(fh, bbh):
-            yield feature, addr
+            yield feature, addr
diff --git a/capa/features/extractors/ghidra/function.py b/capa/features/extractors/ghidra/function.py
@@ -33,9 +33,12 @@ def extract_function_calls_to(fh: FunctionHandle):
 
 
 def extract_function_loop(fh: FunctionHandle):
+    """extract loop characteristics from the function's basic blocks"""
     f: "ghidra.program.database.function.FunctionDB" = fh.inner
 
     edges = []
+    # This uses the function body; our fix in helpers.py ensures 
+    # that fh.inner.getBody() contains the correct address set.
     for block in SimpleBlockIterator(
         BasicBlockModel(capa.features.extractors.ghidra.helpers.get_current_program()),
         f.getBody(),
@@ -45,14 +48,16 @@ def extract_function_loop(fh: FunctionHandle):
         s_addrs = block.getStartAddresses()
 
         while dests.hasNext():
+            dest_addr = dests.next().getDestinationAddress().getOffset()
             for addr in s_addrs:
-                edges.append((addr.getOffset(), dests.next().getDestinationAddress().getOffset()))
+                edges.append((addr.getOffset(), dest_addr))
 
     if loops.has_loop(edges):
         yield Characteristic("loop"), AbsoluteVirtualAddress(f.getEntryPoint().getOffset())
 
 
 def extract_recursive_call(fh: FunctionHandle):
+    """extract recursive function calls"""
     f: "ghidra.program.database.function.FunctionDB" = fh.inner
 
     for func in f.getCalledFunctions(capa.features.extractors.ghidra.helpers.get_monitor()):
@@ -61,9 +66,10 @@ def extract_recursive_call(fh: FunctionHandle):
 
 
 def extract_features(fh: FunctionHandle) -> Iterator[tuple[Feature, Address]]:
+    """yield function features"""
     for function_handler in FUNCTION_HANDLERS:
         for feature, addr in function_handler(fh):
             yield feature, addr
 
 
-FUNCTION_HANDLERS = (extract_function_calls_to, extract_function_loop, extract_recursive_call)
+FUNCTION_HANDLERS = (extract_function_calls_to, extract_function_loop, extract_recursive_call)
diff --git a/capa/features/extractors/ghidra/helpers.py b/capa/features/extractors/ghidra/helpers.py
@@ -104,10 +104,12 @@ def get_function_symbols():
 
 def get_function_blocks(fh: "capa.features.extractors.base_extractor.FunctionHandle") -> Iterator[BBHandle]:
     """
-    yield the basic blocks of the function
+    yield the basic blocks of the function using a flow-insensitive model
+    to prevent truncation at misidentified non-returning calls.
     """
-
-    for block in SimpleBlockIterator(BasicBlockModel(get_current_program()), fh.inner.getBody(), get_monitor()):
+    model = BasicBlockModel(get_current_program())
+    # Restoration of original logic with the flow-insensitive fix:
+    for block in model.getCodeBlocksContaining(fh.inner.getBody(), get_monitor()):
         yield BBHandle(address=AbsoluteVirtualAddress(block.getMinAddress().getOffset()), inner=block)
 
 
@@ -332,4 +334,4 @@ def find_data_references_from_insn(insn, max_depth: int = 10):
             else:
                 break
 
-        yield to_addr
+        yield to_addr
diff --git a/capa/rules/cache.py b/capa/rules/cache.py
@@ -187,4 +187,4 @@ def generate_rule_cache(rules_dir: Path, cache_dir: Path) -> bool:
     assert path.exists()
     logger.info("rules cache saved to: %s", path)
 
-    return True
+    return True
diff --git a/tests/test_ghidra_features.py b/tests/test_ghidra_features.py
@@ -19,6 +19,7 @@
 
 import capa.features.common
 
+# Ensure Ghidra environment is actually available before running
 ghidra_present = importlib.util.find_spec("pyghidra") is not None and "GHIDRA_INSTALL_DIR" in os.environ
 
 
@@ -49,4 +50,4 @@ def test_ghidra_features(sample, scope, feature, expected):
     "sample,scope,feature,expected", fixtures.FEATURE_COUNT_TESTS_GHIDRA, indirect=["sample", "scope"]
 )
 def test_ghidra_feature_counts(sample, scope, feature, expected):
-    fixtures.do_test_feature_count(fixtures.get_ghidra_extractor, sample, scope, feature, expected)
+    fixtures.do_test_feature_count(fixtures.get_ghidra_extractor, sample, scope, feature, expected)