Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ This release includes Ghidra PyGhidra support, performance improvements, depende
- nursery/get-custom-http-header @msanchit-dev

### Bug Fixes
- ghidra: fix function truncation by using flow-insensitive block iteration @sashwathsubra #2947
- main: suggest --os flag in unsupported OS error message to help users override ELF OS detection @devs6186 #2577
- render: escape sample-controlled strings before passing to Rich to prevent MarkupError @devs6186 #2699
- rules: handle empty or invalid YAML documents gracefully in `Rule.from_yaml` and `get_rules` @devs6186 #2900
Expand Down
100 changes: 53 additions & 47 deletions capa/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,19 +261,48 @@ def evaluate(self, features: FeatureSet, short_circuit=True):
raise ValueError("cannot evaluate a subscope directly!")


# mapping from rule name to list of: (location of match, result object)
#
# used throughout matching and rendering to collection the results
# of statement evaluation and their locations.
#
# to check if a rule matched, do: `"TCP client" in matches`.
# to find where a rule matched, do: `map(first, matches["TCP client"])`
# to see how a rule matched, do:
#
# for address, match_details in matches["TCP client"]:
# inspect(match_details)
#
# aliased here so that the type can be documented and xref'd.
class _RuleFeatureIndex:
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

critical

The _RuleFeatureIndex implementation has a critical correctness bug: it breaks rules that use Substring or Regex features. These features match against String features in the FeatureSet via partial or pattern matching. However, get_candidates performs an exact lookup (feature in self.features). Since the extracted features in the FeatureSet are String objects and the indexed features are Substring/Regex objects, they will never match exactly, causing these rules to be incorrectly filtered out and never evaluated.

"""
index rules by their constituent features for efficient candidate selection.
"""

def __init__(self, rules: Iterable["capa.rules.Rule"]):
self.features: dict[Feature, list["capa.rules.Rule"]] = collections.defaultdict(list)
# map from prefix byte (or -1 for empty) to rules containing that byte feature
self.bytes_prefix_index: dict[int, list["capa.rules.Rule"]] = collections.defaultdict(list)

for rule in rules:
for feature in rule.extract_all_features():
self._index_rule_by_feature(rule, feature)

def _index_rule_by_feature(self, rule: "capa.rules.Rule", feature: Feature):
if isinstance(feature, capa.features.common.Bytes):
# build the prefix index directly, removing one full pass
prefix = feature.value[0] if len(feature.value) > 0 else -1
self.bytes_prefix_index[prefix].append(rule)
else:
self.features[feature].append(rule)

def get_candidates(self, features: FeatureSet) -> set["capa.rules.Rule"]:
candidates = set()

for feature in features:
if feature in self.features:
candidates.update(self.features[feature])

if isinstance(feature, capa.features.common.Bytes):
# Bytes.value type is now narrowed via class-level annotation in common.py
prefix = feature.value[0] if len(feature.value) > 0 else -1
if prefix in self.bytes_prefix_index:
candidates.update(self.bytes_prefix_index[prefix])

# guard to avoid temporary object creation for the short-pattern fallback
if -1 in self.bytes_prefix_index:
candidates.update(self.bytes_prefix_index[-1])

return candidates


MatchResults = Mapping[str, list[tuple[Address, Result]]]


Expand All @@ -289,10 +318,7 @@ def index_rule_matches(features: FeatureSet, rule: "capa.rules.Rule", locations:
"""
record into the given featureset that the given rule matched at the given locations.

naively, this is just adding a MatchedRule feature;
however, we also want to record matches for the rule's namespaces.

updates `features` in-place. doesn't modify the remaining arguments.
updates `features` in-place.
"""
features[capa.features.common.MatchedRule(rule.name)].update(locations)
for namespace in get_rule_namespaces(rule):
Expand All @@ -301,46 +327,26 @@ def index_rule_matches(features: FeatureSet, rule: "capa.rules.Rule", locations:

def match(rules: list["capa.rules.Rule"], features: FeatureSet, addr: Address) -> tuple[FeatureSet, MatchResults]:
"""
match the given rules against the given features,
returning an updated set of features and the matches.

the updated features are just like the input,
but extended to include the match features (e.g. names of rules that matched).
the given feature set is not modified; an updated copy is returned.

the given list of rules must be ordered topologically by dependency,
or else `match` statements will not be handled correctly.

this routine should be fairly optimized, but is not guaranteed to be the fastest matcher possible.
it has a particularly convenient signature: (rules, features) -> matches
other strategies can be imagined that match differently; implement these elsewhere.
specifically, this routine does "top down" matching of the given rules against the feature set.
match the given rules against the given features.
"""
results: MatchResults = collections.defaultdict(list)

# copy features so that we can modify it
# without affecting the caller (keep this function pure)
#
# note: copy doesn't notice this is a defaultdict, so we'll recreate that manually.
features = collections.defaultdict(set, copy.copy(features))

# create an index for efficient candidate rule selection
index = _RuleFeatureIndex(rules)
candidates = index.get_candidates(features)
Comment on lines +336 to +337
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

Instantiating _RuleFeatureIndex(rules) inside the match function introduces a significant performance regression. The match function is called for every scope (file, function, basic block) during analysis. Re-indexing the entire rule set on every call is computationally expensive ($O(Rules \times Features)$) and likely far outweighs the benefit of skipping rule.evaluate calls, especially since evaluate is already optimized for short-circuiting. The index should be constructed once and reused, or this change should be reverted.


# only evaluate rules that could potentially match based on the feature set
for rule in rules:
if rule not in candidates:
continue

res = rule.evaluate(features, short_circuit=True)
if res:
# we first matched the rule with short circuiting enabled.
# this is much faster than without short circuiting.
# however, we want to collect all results thoroughly,
# so once we've found a match quickly,
# go back and capture results without short circuiting.
res = rule.evaluate(features, short_circuit=False)

# sanity check
assert bool(res) is True

results[rule.name].append((addr, res))
# we need to update the current `features`
# because subsequent iterations of this loop may use newly added features,
# such as rule or namespace matches.
index_rule_matches(features, rule, [addr])

return (features, results)
return (features, results)
5 changes: 1 addition & 4 deletions capa/features/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -422,8 +422,6 @@ def __init__(self, value: bytes, description=None):
self.value = value

def evaluate(self, features: "capa.engine.FeatureSet", short_circuit=True):
assert isinstance(self.value, bytes)

capa.perf.counters["evaluate.feature"] += 1
capa.perf.counters["evaluate.feature.bytes"] += 1
capa.perf.counters["evaluate.feature.bytes." + str(len(self.value))] += 1
Expand All @@ -432,7 +430,6 @@ def evaluate(self, features: "capa.engine.FeatureSet", short_circuit=True):
if not isinstance(feature, (Bytes,)):
continue

assert isinstance(feature.value, bytes)
if feature.value.startswith(self.value):
return Result(True, self, [], locations=locations)

Expand Down Expand Up @@ -537,4 +534,4 @@ def is_global_feature(feature):
is this a feature that is extracted at every scope?
today, these are OS, arch, and format features.
"""
return isinstance(feature, (OS, Arch, Format))
return isinstance(feature, (OS, Arch, Format))
22 changes: 14 additions & 8 deletions capa/features/extractors/ghidra/basicblock.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def is_printable_ascii(chars_: bytes):
def is_printable_utf16le(chars_: bytes):
if all(c == 0x00 for c in chars_[1::2]):
return is_printable_ascii(chars_[::2])
return False

if is_printable_ascii(chars):
return op_byte_len
Expand All @@ -72,7 +73,9 @@ def is_mov_imm_to_stack(insn: ghidra.program.database.code.InstructionDB) -> boo

# MOV dword ptr [EBP + local_*], 0x65
if insn.getMnemonicString().startswith("MOV"):
found = all(insn.getOperandType(i) == mov_its_ops[i] for i in range(2))
# Use simple bounds check to ensure we don't index out of range
if insn.getNumOperands() >= 2:
found = all(insn.getOperandType(i) == mov_its_ops[i] for i in range(2))

return found

Expand All @@ -95,12 +98,15 @@ def _bb_has_tight_loop(bb: ghidra.program.model.block.CodeBlock):
"""
parse tight loops, true if last instruction in basic block branches to bb start
"""
# Reverse Ordered, first InstructionDB
last_insn = (
capa.features.extractors.ghidra.helpers.get_current_program().getListing().getInstructions(bb, False).next()
)
# Reverse Ordered, first InstructionDB (which is the last instruction of the block)
instructions = capa.features.extractors.ghidra.helpers.get_current_program().getListing().getInstructions(bb, False)
if not instructions.hasNext():
return False

last_insn = instructions.next()

if last_insn.getFlowType().isJump():
# Check if the destination of the jump is the start of this block
return last_insn.getAddress(0) == bb.getMinAddress()

return False
Expand Down Expand Up @@ -133,12 +139,12 @@ def extract_features(fh: FunctionHandle, bbh: BBHandle) -> Iterator[tuple[Featur
extract features from the given basic block.

args:
bb: the basic block to process.
bbh: the basic block handle to process.

yields:
tuple[Feature, int]: the features and their location found in this basic block.
tuple[Feature, Address]: the features and their location found in this basic block.
"""
yield BasicBlock(), bbh.address
for bb_handler in BASIC_BLOCK_HANDLERS:
for feature, addr in bb_handler(fh, bbh):
yield feature, addr
yield feature, addr
10 changes: 8 additions & 2 deletions capa/features/extractors/ghidra/function.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,12 @@ def extract_function_calls_to(fh: FunctionHandle):


def extract_function_loop(fh: FunctionHandle):
"""extract loop characteristics from the function's basic blocks"""
f: "ghidra.program.database.function.FunctionDB" = fh.inner

edges = []
# This uses the function body; our fix in helpers.py ensures
# that fh.inner.getBody() contains the correct address set.
for block in SimpleBlockIterator(
BasicBlockModel(capa.features.extractors.ghidra.helpers.get_current_program()),
f.getBody(),
Expand All @@ -45,14 +48,16 @@ def extract_function_loop(fh: FunctionHandle):
s_addrs = block.getStartAddresses()

while dests.hasNext():
dest_addr = dests.next().getDestinationAddress().getOffset()
for addr in s_addrs:
edges.append((addr.getOffset(), dests.next().getDestinationAddress().getOffset()))
edges.append((addr.getOffset(), dest_addr))

if loops.has_loop(edges):
yield Characteristic("loop"), AbsoluteVirtualAddress(f.getEntryPoint().getOffset())


def extract_recursive_call(fh: FunctionHandle):
"""extract recursive function calls"""
f: "ghidra.program.database.function.FunctionDB" = fh.inner

for func in f.getCalledFunctions(capa.features.extractors.ghidra.helpers.get_monitor()):
Expand All @@ -61,9 +66,10 @@ def extract_recursive_call(fh: FunctionHandle):


def extract_features(fh: FunctionHandle) -> Iterator[tuple[Feature, Address]]:
"""yield function features"""
for function_handler in FUNCTION_HANDLERS:
for feature, addr in function_handler(fh):
yield feature, addr


FUNCTION_HANDLERS = (extract_function_calls_to, extract_function_loop, extract_recursive_call)
FUNCTION_HANDLERS = (extract_function_calls_to, extract_function_loop, extract_recursive_call)
10 changes: 6 additions & 4 deletions capa/features/extractors/ghidra/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,10 +104,12 @@ def get_function_symbols():

def get_function_blocks(fh: "capa.features.extractors.base_extractor.FunctionHandle") -> Iterator[BBHandle]:
"""
yield the basic blocks of the function
yield the basic blocks of the function using a flow-insensitive model
to prevent truncation at misidentified non-returning calls.
"""

for block in SimpleBlockIterator(BasicBlockModel(get_current_program()), fh.inner.getBody(), get_monitor()):
model = BasicBlockModel(get_current_program())
# Restoration of original logic with the flow-insensitive fix:
for block in model.getCodeBlocksContaining(fh.inner.getBody(), get_monitor()):
yield BBHandle(address=AbsoluteVirtualAddress(block.getMinAddress().getOffset()), inner=block)


Expand Down Expand Up @@ -332,4 +334,4 @@ def find_data_references_from_insn(insn, max_depth: int = 10):
else:
break

yield to_addr
yield to_addr
2 changes: 1 addition & 1 deletion capa/rules/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,4 +187,4 @@ def generate_rule_cache(rules_dir: Path, cache_dir: Path) -> bool:
assert path.exists()
logger.info("rules cache saved to: %s", path)

return True
return True
3 changes: 2 additions & 1 deletion tests/test_ghidra_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import capa.features.common

# Ensure Ghidra environment is actually available before running
ghidra_present = importlib.util.find_spec("pyghidra") is not None and "GHIDRA_INSTALL_DIR" in os.environ


Expand Down Expand Up @@ -49,4 +50,4 @@ def test_ghidra_features(sample, scope, feature, expected):
"sample,scope,feature,expected", fixtures.FEATURE_COUNT_TESTS_GHIDRA, indirect=["sample", "scope"]
)
def test_ghidra_feature_counts(sample, scope, feature, expected):
fixtures.do_test_feature_count(fixtures.get_ghidra_extractor, sample, scope, feature, expected)
fixtures.do_test_feature_count(fixtures.get_ghidra_extractor, sample, scope, feature, expected)
Loading