From 62ab8a15417608ba49348effb2a6f617f7c43ec1 Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Mon, 15 Dec 2025 11:05:55 +0100 Subject: [PATCH] Add csv query reports to anomaly detection --- .../anomalyDetectionPython.sh | 21 ++++++++++---- .../AnomalyDetectionTopAnomalies.cypher | 29 +++++++++++++++++++ .../tunedAnomalyDetectionExplained.py | 3 ++ 3 files changed, 47 insertions(+), 6 deletions(-) create mode 100644 domains/anomaly-detection/labels/AnomalyDetectionTopAnomalies.cypher diff --git a/domains/anomaly-detection/anomalyDetectionPython.sh b/domains/anomaly-detection/anomalyDetectionPython.sh index f6f661814..67e0465a3 100755 --- a/domains/anomaly-detection/anomalyDetectionPython.sh +++ b/domains/anomaly-detection/anomalyDetectionPython.sh @@ -162,14 +162,23 @@ anomaly_detection_labels() { local language language=$( extractQueryParameter "projection_language" "${@}" ) - + echo "anomalyDetectionPython: $(date +'%Y-%m-%dT%H:%M:%S%z') Labelling ${language} ${nodeLabel} anomalies..." + + # Within the absolute (full) report directory for anomaly detection, create a sub directory for every detailed type (Java_Package, Java_Type,...) 
+ local detail_report_directory="${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}" + mkdir -p "${detail_report_directory}" + execute_cypher_summarized "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeRemoveLabels.cypher" "${@}" - execute_cypher_summarized "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeAuthority.cypher" "${@}" - execute_cypher_summarized "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeBottleneck.cypher" "${@}" - execute_cypher_summarized "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeHub.cypher" "${@}" - execute_cypher_summarized "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeBridge.cypher" "${@}" - execute_cypher_summarized "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeOutlier.cypher" "${@}" + execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeAuthority.cypher" "${@}" > "${detail_report_directory}/AnomalyArchetypeTopAuthority.csv" + execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeBottleneck.cypher" "${@}" > "${detail_report_directory}/AnomalyArchetypeTopBottleneck.csv" + execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeHub.cypher" "${@}" > "${detail_report_directory}/AnomalyArchetypeTopHub.csv" + # The following two label types require Python scripts to run first. + execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeBridge.cypher" "${@}" > "${detail_report_directory}/AnomalyArchetypeTopBridge.csv" + execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeOutlier.cypher" "${@}" > "${detail_report_directory}/AnomalyArchetypeTopOutlier.csv" + # Output the top anomalies and their archetype + rank + execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionTopAnomalies.cypher" "${@}" > "${detail_report_directory}/TopAnomalies.csv" + } # Run the anomaly detection pipeline. 
diff --git a/domains/anomaly-detection/labels/AnomalyDetectionTopAnomalies.cypher b/domains/anomaly-detection/labels/AnomalyDetectionTopAnomalies.cypher new file mode 100644 index 000000000..a213256ea --- /dev/null +++ b/domains/anomaly-detection/labels/AnomalyDetectionTopAnomalies.cypher @@ -0,0 +1,29 @@ +// List the top 50 anomalies (highest anomaly score first) with their archetype ranks and top 3 SHAP features + + MATCH (codeUnit) + WHERE $projection_node_label IN labels(codeUnit) + AND codeUnit.anomalyScore > 0 + AND codeUnit.anomalyLabel = 1 + ORDER BY codeUnit.anomalyScore DESC + LIMIT 50 +OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit) + WITH *, artifact.name AS artifactName +OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit) + WITH *, last(split(projectRoot.absoluteFileName, '/')) AS projectName + WITH *, coalesce(artifactName, projectName) AS projectName + RETURN projectName + ,codeUnit.name AS shortCodeUnitName + ,coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName + ,codeUnit.anomalyRank AS anomalyRank + ,codeUnit.anomalyScore AS anomalyScore + ,coalesce(codeUnit.anomalyAuthorityRank, 0) AS authorityRank + ,coalesce(codeUnit.anomalyBottleneckRank, 0) AS bottleneckRank + ,coalesce(codeUnit.anomalyBridgeRank, 0) AS bridgeRank + ,coalesce(codeUnit.anomalyHubRank, 0) AS hubRank + ,coalesce(codeUnit.anomalyOutlierRank, 0) AS outlierRank + ,codeUnit.anomalyTopFeature1 AS topFeature1 + ,codeUnit.anomalyTopFeature2 AS topFeature2 + ,codeUnit.anomalyTopFeature3 AS topFeature3 + ,codeUnit.anomalyTopFeatureSHAPValue1 AS topFeature1Score + ,codeUnit.anomalyTopFeatureSHAPValue2 AS topFeature2Score + ,codeUnit.anomalyTopFeatureSHAPValue3 AS topFeature3Score diff --git a/domains/anomaly-detection/tunedAnomalyDetectionExplained.py b/domains/anomaly-detection/tunedAnomalyDetectionExplained.py index 83daea96c..fda74f9a8 100755 --- a/domains/anomaly-detection/tunedAnomalyDetectionExplained.py +++ 
b/domains/anomaly-detection/tunedAnomalyDetectionExplained.py @@ -532,6 +532,7 @@ def add_anomaly_detection_results_to_features( anomaly_detection_results: AnomalyDetectionResults, anomaly_label_column: str = 'anomalyLabel', anomaly_score_column: str = 'anomalyScore', + anomaly_rank_column: str = 'anomalyRank' ) -> pd.DataFrame: """ Adds anomaly detection results to the feature and returns the updated dataframe. @@ -549,6 +550,7 @@ def add_anomaly_detection_results_to_features( # Add anomaly labels and scores to the feature matrix features[anomaly_label_column] = anomaly_detection_results.anomaly_labels features[anomaly_score_column] = anomaly_detection_results.anomaly_scores + features[anomaly_rank_column] = features[anomaly_score_column].rank(method='dense', ascending=False).astype(int) return features @@ -1250,6 +1252,7 @@ def output_top_shap_explained_global_features_as_markdown_table( 'nodeElementId': features["nodeElementId"], 'anomalyLabel': features['anomalyLabel'].astype(int), 'anomalyScore': features['anomalyScore'], + 'anomalyRank': features['anomalyRank'], 'anomalyTopFeature1': features['anomalyTopFeature_1'], 'anomalyTopFeature2': features['anomalyTopFeature_2'], 'anomalyTopFeature3': features['anomalyTopFeature_3'],