From 62ab8a15417608ba49348effb2a6f617f7c43ec1 Mon Sep 17 00:00:00 2001
From: JohT <7671054+JohT@users.noreply.github.com>
Date: Mon, 15 Dec 2025 11:05:55 +0100
Subject: [PATCH] Add csv query reports to anomaly detection

---
 .../anomalyDetectionPython.sh                 | 21 ++++++++++----
 .../AnomalyDetectionTopAnomalies.cypher       | 29 +++++++++++++++++++
 .../tunedAnomalyDetectionExplained.py         |  3 ++
 3 files changed, 47 insertions(+), 6 deletions(-)
 create mode 100644 domains/anomaly-detection/labels/AnomalyDetectionTopAnomalies.cypher

diff --git a/domains/anomaly-detection/anomalyDetectionPython.sh b/domains/anomaly-detection/anomalyDetectionPython.sh
index f6f661814..67e0465a3 100755
--- a/domains/anomaly-detection/anomalyDetectionPython.sh
+++ b/domains/anomaly-detection/anomalyDetectionPython.sh
@@ -162,14 +162,23 @@ anomaly_detection_labels() {
     
     local language
     language=$( extractQueryParameter "projection_language" "${@}" )
-    
+
     echo "anomalyDetectionPython: $(date +'%Y-%m-%dT%H:%M:%S%z') Labelling ${language} ${nodeLabel} anomalies..."
+    
+    # Within the absolute (full) report directory for anomaly detection, create a subdirectory for every detailed type (Java_Package, Java_Type, ...)
+    local detail_report_directory="${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}"
+    mkdir -p "${detail_report_directory}"
+
     execute_cypher_summarized "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeRemoveLabels.cypher" "${@}"
-    execute_cypher_summarized "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeAuthority.cypher" "${@}"
-    execute_cypher_summarized "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeBottleneck.cypher" "${@}"
-    execute_cypher_summarized "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeHub.cypher" "${@}"
-    execute_cypher_summarized "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeBridge.cypher" "${@}"
-    execute_cypher_summarized "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeOutlier.cypher" "${@}"
+    execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeAuthority.cypher" "${@}" > "${detail_report_directory}/AnomalyArchetypeTopAuthority.csv"
+    execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeBottleneck.cypher" "${@}" > "${detail_report_directory}/AnomalyArchetypeTopBottleneck.csv"
+    execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeHub.cypher" "${@}" > "${detail_report_directory}/AnomalyArchetypeTopHub.csv"
+    # The following two label types require Python scripts to run first.
+    execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeBridge.cypher" "${@}" > "${detail_report_directory}/AnomalyArchetypeTopBridge.csv"
+    execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeOutlier.cypher" "${@}" > "${detail_report_directory}/AnomalyArchetypeTopOutlier.csv"
+    # Output the top anomalies together with their overall rank and their rank per archetype
+    execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionTopAnomalies.cypher" "${@}" > "${detail_report_directory}/TopAnomalies.csv"
+
 }
 
 # Run the anomaly detection pipeline.
diff --git a/domains/anomaly-detection/labels/AnomalyDetectionTopAnomalies.cypher b/domains/anomaly-detection/labels/AnomalyDetectionTopAnomalies.cypher
new file mode 100644
index 000000000..a213256ea
--- /dev/null
+++ b/domains/anomaly-detection/labels/AnomalyDetectionTopAnomalies.cypher
@@ -0,0 +1,29 @@
+// List top anomalies: selects at most 50 nodes labelled $projection_node_label that have anomalyLabel = 1 and a positive anomalyScore, taking the highest scores first.
+
+   MATCH (codeUnit)
+   WHERE $projection_node_label IN labels(codeUnit)
+     AND codeUnit.anomalyScore > 0
+     AND codeUnit.anomalyLabel = 1
+   ORDER BY codeUnit.anomalyScore DESC
+   LIMIT 50 // Applied to the selected nodes before the optional matches below
+OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit) // artifact is null when no such artifact contains the node
+    WITH *, artifact.name AS artifactName
+OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit) // presumably the Typescript counterpart of the artifact lookup above
+    WITH *, last(split(projectRoot.absoluteFileName, '/')) AS projectName // Last path segment of the project root directory
+    WITH *, coalesce(artifactName, projectName)            AS projectName // Artifact name takes precedence over the directory name
+  RETURN projectName
+        ,codeUnit.name                                     AS shortCodeUnitName
+        ,coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName // First non-null of these properties
+        ,codeUnit.anomalyRank                              AS anomalyRank
+        ,codeUnit.anomalyScore                             AS anomalyScore
+        ,coalesce(codeUnit.anomalyAuthorityRank, 0)        AS authorityRank // Archetype ranks are reported as 0 when the property is missing
+        ,coalesce(codeUnit.anomalyBottleneckRank, 0)       AS bottleneckRank
+        ,coalesce(codeUnit.anomalyBridgeRank, 0)           AS bridgeRank
+        ,coalesce(codeUnit.anomalyHubRank, 0)              AS hubRank
+        ,coalesce(codeUnit.anomalyOutlierRank, 0)          AS outlierRank
+        ,codeUnit.anomalyTopFeature1                       AS topFeature1
+        ,codeUnit.anomalyTopFeature2                       AS topFeature2
+        ,codeUnit.anomalyTopFeature3                       AS topFeature3
+        ,codeUnit.anomalyTopFeatureSHAPValue1              AS topFeature1Score // The feature "scores" are the stored SHAP value properties
+        ,codeUnit.anomalyTopFeatureSHAPValue2              AS topFeature2Score
+        ,codeUnit.anomalyTopFeatureSHAPValue3              AS topFeature3Score
diff --git a/domains/anomaly-detection/tunedAnomalyDetectionExplained.py b/domains/anomaly-detection/tunedAnomalyDetectionExplained.py
index 83daea96c..fda74f9a8 100755
--- a/domains/anomaly-detection/tunedAnomalyDetectionExplained.py
+++ b/domains/anomaly-detection/tunedAnomalyDetectionExplained.py
@@ -532,6 +532,7 @@ def add_anomaly_detection_results_to_features(
     anomaly_detection_results: AnomalyDetectionResults,
     anomaly_label_column: str = 'anomalyLabel',
     anomaly_score_column: str = 'anomalyScore',
+    anomaly_rank_column: str = 'anomalyRank'
 ) -> pd.DataFrame:
     """
     Adds anomaly detection results to the feature and returns the updated dataframe.
@@ -549,6 +550,7 @@ def add_anomaly_detection_results_to_features(
     # Add anomaly labels and scores to the feature matrix
     features[anomaly_label_column] = anomaly_detection_results.anomaly_labels
     features[anomaly_score_column] = anomaly_detection_results.anomaly_scores
+    features[anomaly_rank_column] = features[anomaly_score_column].rank(method='dense', ascending=False).astype(int)
     return features
 
 
@@ -1250,6 +1252,7 @@ def output_top_shap_explained_global_features_as_markdown_table(
     'nodeElementId': features["nodeElementId"],
     'anomalyLabel': features['anomalyLabel'].astype(int),
     'anomalyScore': features['anomalyScore'],
+    'anomalyRank': features['anomalyRank'],
     'anomalyTopFeature1': features['anomalyTopFeature_1'],
     'anomalyTopFeature2': features['anomalyTopFeature_2'],
     'anomalyTopFeature3': features['anomalyTopFeature_3'],