Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 43 additions & 59 deletions dojo/finding/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@
do_false_positive_history,
get_current_user,
get_object_or_none,
mass_model_updater,
to_str_typed,
)

Expand Down Expand Up @@ -578,20 +577,6 @@ def finding_post_delete(sender, instance, **kwargs):
logger.debug("finding post_delete, sender: %s instance: %s", to_str_typed(sender), to_str_typed(instance))


def reset_duplicate_before_delete(dupe):
    """Clear the duplicate linkage on a single finding so the original it
    points to can be deleted without dangling references."""
    dupe.duplicate = False
    dupe.duplicate_finding = None


def reset_duplicates_before_delete(qs):
    """Bulk-clear the duplicate flag and duplicate_finding link for every
    finding in *qs*, batching the writes via ``mass_model_updater``."""
    changed_fields = ["duplicate", "duplicate_finding"]
    mass_model_updater(Finding, qs, reset_duplicate_before_delete, fields=changed_fields)


def set_new_original(finding, new_original):
    """Re-point *finding* at *new_original*, but only when it is actually
    marked as a duplicate; non-duplicates are left untouched."""
    if not finding.duplicate:
        return
    finding.duplicate_finding = new_original


# can't use model to id here due to the queryset
# @dojo_async_task
# @app.task
Expand All @@ -617,64 +602,58 @@ def reconfigure_duplicate_cluster(original, cluster_outside):
new_original.save_no_options()
new_original.found_by.set(original.found_by.all())

# if the cluster is size 1, there's only the new original left
# Re-point remaining duplicates to the new original in a single query
if new_original and len(cluster_outside) > 1:
# for find in cluster_outside:
# if find != new_original:
# find.duplicate_finding = new_original
# find.save_no_options()

mass_model_updater(Finding, cluster_outside, lambda f: set_new_original(f, new_original), fields=["duplicate_finding"])
cluster_outside.exclude(id=new_original.id).update(duplicate_finding=new_original)


def prepare_duplicates_for_delete(test=None, engagement=None):
logger.debug("prepare duplicates for delete, test: %s, engagement: %s", test.id if test else None, engagement.id if engagement else None)
if test is None and engagement is None:
logger.warning("nothing to prepare as test and engagement are None")
return

# should not be needed in normal healthy instances.
# but in that case it's a cheap count query and we might as well run it to be safe
fix_loop_duplicates()

# get all originals in the test/engagement
originals = Finding.objects.filter(original_finding__isnull=False)
# Build scope filter
scope_filter = {}
if engagement:
originals = originals.filter(test__engagement=engagement)
scope_filter["test__engagement"] = engagement
if test:
originals = originals.filter(test=test)
scope_filter["test"] = test

# use distinct to flatten the join result
originals = originals.distinct()

if len(originals) == 0:
logger.debug("no originals found, so no duplicates to prepare for deletion of original")
scope_finding_ids = set(
Finding.objects.filter(**scope_filter).values_list("id", flat=True),
)
if not scope_finding_ids:
logger.debug("no findings in scope, nothing to prepare")
return

# remove the link to the original from the duplicates inside the cluster so they can be safely deleted by the django framework
total = len(originals)
# logger.debug('originals: %s', [original.id for original in originals])
for i, original in enumerate(originals):
logger.debug("%d/%d: preparing duplicate cluster for deletion of original: %d", i + 1, total, original.id)
cluster_inside = original.original_finding.all()
if engagement:
cluster_inside = cluster_inside.filter(test__engagement=engagement)

if test:
cluster_inside = cluster_inside.filter(test=test)

if len(cluster_inside) > 0:
reset_duplicates_before_delete(cluster_inside)

# reconfigure duplicates outside test/engagement
cluster_outside = original.original_finding.all()
if engagement:
cluster_outside = cluster_outside.exclude(test__engagement=engagement)

if test:
cluster_outside = cluster_outside.exclude(test=test)

if len(cluster_outside) > 0:
reconfigure_duplicate_cluster(original, cluster_outside)

logger.debug("done preparing duplicate cluster for deletion of original: %d", original.id)
# Bulk-reset inside-scope duplicates: single UPDATE instead of per-original mass_model_updater.
# Clears the duplicate_finding FK so Django's Collector won't trip over dangling references
# when deleting findings in this scope.
inside_reset_count = Finding.objects.filter(
duplicate=True,
duplicate_finding_id__in=scope_finding_ids,
id__in=scope_finding_ids,
).update(duplicate_finding=None, duplicate=False)
logger.debug("bulk-reset %d inside-scope duplicates", inside_reset_count)

# Reconfigure outside-scope duplicates: still per-original because each cluster
# needs a new original chosen, status copied, and found_by updated.
# Pre-filter to only originals that have at least one duplicate outside scope,
# avoiding a per-original .exists() check.
originals_with_outside_dupes = Finding.objects.filter(
id__in=scope_finding_ids,
original_finding__in=Finding.objects.exclude(id__in=scope_finding_ids),
).distinct().prefetch_related("original_finding")

for original in originals_with_outside_dupes:
# Inside-scope duplicates were already unlinked by the bulk UPDATE above,
# so original_finding.all() now only contains outside-scope duplicates.
reconfigure_duplicate_cluster(original, original.original_finding.all())


@receiver(pre_delete, sender=Test)
Expand Down Expand Up @@ -709,9 +688,10 @@ def fix_loop_duplicates():
loop_count = loop_qs.count()

if loop_count > 0:
deduplicationLogger.info(f"Identified {loop_count} Findings with Loops")
deduplicationLogger.warning("fix_loop_duplicates: found %d findings with duplicate loops", loop_count)
# Stream IDs only in descending order to avoid loading full Finding rows
for find_id in loop_qs.order_by("-id").values_list("id", flat=True).iterator(chunk_size=1000):
deduplicationLogger.warning("fix_loop_duplicates: fixing loop for finding %d", find_id)
removeLoop(find_id, 50)

new_originals = Finding.objects.filter(duplicate_finding__isnull=True, duplicate=True)
Expand All @@ -726,6 +706,10 @@ def fix_loop_duplicates():


def removeLoop(finding_id, counter):
# NOTE: This function is recursive and does per-finding DB queries without prefetching.
# It could be optimized to load the duplicate graph as ID pairs in memory and process
# in bulk, but loops are rare (only from past bugs or high parallel load) so the
# current implementation is acceptable.
# get latest status
finding = Finding.objects.get(id=finding_id)
real_original = finding.duplicate_finding
Expand Down
Loading
Loading