-
Notifications
You must be signed in to change notification settings - Fork 97
Add SDA Workflow #277
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: feature/workflows
Are you sure you want to change the base?
Add SDA Workflow #277
Changes from all commits
9b30024
93c1fe5
c4ef107
c9fa196
5de04ed
60143bf
fab742a
3c846d0
19027fe
790ad27
c954fd5
e10af40
39af6e5
840a76f
a9f882a
f64a661
88ca99f
c3cf46c
bc197ad
96605d2
00ca219
e0b24f7
7ecc0be
9a3dacb
0d08618
360f3ef
4850ef3
856bd9a
0bdd2da
e429c8a
bcac8e9
1f15d22
a35cce5
634a960
8696869
b5735ec
90e6528
5ca3c46
4a3ec64
9f6979b
af79eb4
ea636c6
3a17d79
eb805d0
af35601
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,5 +1,6 @@ | ||
| { | ||
| "timeout": 120, | ||
| "memory": 128, | ||
| "languages": ["python"] | ||
| "languages": ["python"], | ||
| "modules": [] | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,5 +1,5 @@ | ||
| def buckets_count(): | ||
| return (0, 0) | ||
|
|
||
| def generate_input(data_dir, size, input_buckets, output_buckets, upload_func): | ||
| def generate_input(data_dir, size, benchmarks_bucket,input_buckets, output_buckets, upload_func, nosql_func): | ||
| return dict() |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,7 +1,6 @@ | ||
| def handler(elem): | ||
| name = elem["name"] | ||
| fn, ln = name.split(" ") | ||
| name = " ".join([ln, fn]) | ||
| elem["name_rev"] = name | ||
|
|
||
| full_name:str = elem["name"] | ||
| names = full_name.split(" ") | ||
| names.reverse() | ||
| elem["name_rev"] = " ".join(names) | ||
| return elem |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| requests |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,2 @@ | ||
| cfg/ | ||
| dev/ |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,7 @@ | ||
| { | ||
| "timeout": 500, | ||
| "memory": 128, | ||
| "languages": ["python"], | ||
| "modules": ["storage"], | ||
| "container-image": "logru/sda-no-db:latest" | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is good, but it would be great if we had a clear path to reproduce these images - @LoloGruber, can you please point me to this Dockerfile? On the main branch, we now have C++ support that gives us a way to add "dependencies" images, and I can integrate that.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I added code to container.py to build an image by injecting AWS bootstrap code into a user-defined base image. This is the related Dockerfile for AWS
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I meant the Dockerfile that builds SDA :-) I don't see it in the Dockerfile you linked.
Collaborator
| There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. |
||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,70 @@ | ||
| { | ||
| "root": "clearDB", | ||
| "states": { | ||
| "clearDB":{ | ||
| "type": "task", | ||
| "func_name": "clearDB", | ||
| "next": "split" | ||
| }, | ||
| "split": { | ||
| "type": "task", | ||
| "func_name": "split", | ||
| "next": "filter-map" | ||
| }, | ||
| "filter-map":{ | ||
| "type": "map", | ||
| "root": "filter", | ||
| "array": "filter_workloads", | ||
| "next": "prepare-neighbors", | ||
| "states": { | ||
| "filter": { | ||
| "type": "task", | ||
| "func_name": "filter" | ||
| } | ||
| } | ||
| }, | ||
| "prepare-neighbors": { | ||
| "type": "task", | ||
| "func_name": "pre_neighbors", | ||
| "next": "neighbors-map" | ||
| }, | ||
| "neighbors-map":{ | ||
| "type": "map", | ||
| "root": "neighbors", | ||
| "array": "neighbors_workloads", | ||
| "next": "components", | ||
| "states": { | ||
| "neighbors": { | ||
| "type": "task", | ||
| "func_name": "neighbors" | ||
| } | ||
| } | ||
| }, | ||
| "components": { | ||
| "type": "task", | ||
| "func_name": "components", | ||
| "next": "cluster-analyze-map" | ||
| }, | ||
| "cluster-analyze-map":{ | ||
| "type": "map", | ||
| "root": "clustering", | ||
| "array": "cluster_workloads", | ||
| "next": "merge-results", | ||
| "states": { | ||
| "clustering": { | ||
| "type": "task", | ||
| "func_name": "clustering", | ||
| "next": "analysis" | ||
| }, | ||
| "analysis": { | ||
| "type": "task", | ||
| "func_name": "analysis" | ||
| } | ||
| } | ||
| }, | ||
| "merge-results": { | ||
| "type": "task", | ||
| "func_name": "merge" | ||
| } | ||
| } | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,135 @@ | ||
| import json | ||
| import os | ||
|
|
||
| class MemgraphConfig: | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is a comment for me - we need to integrate it internally, just like we did with the config for Redis. |
||
| def __init__(self, host:str, port:int, username:str, password:str): | ||
| self.host = host | ||
| self.port = port | ||
| self.username = username | ||
| self.password = password | ||
| if self.host == "localhost": | ||
| raise ValueError("Memgraph database for SDA workflow has to be reachable from the internet. Set the following environment variables via the command line or the SeBS .env file: \n\tMEMGRAPH_HOST\n\tMEMGRAPH_PORT\n\tMEMGRAPH_USER\n\tMEMGRAPH_PASSWORD") | ||
|
|
||
| @staticmethod | ||
| def from_env(): | ||
| with open(".env", "r") as f: | ||
| for line in f: | ||
| key, value = line.strip().split("=", 1) | ||
| os.environ[key] = value | ||
| return MemgraphConfig( | ||
| host=os.getenv("MEMGRAPH_HOST", "localhost"), | ||
| port=int(os.getenv("MEMGRAPH_PORT", 7687)), | ||
| username=os.getenv("MEMGRAPH_USER", ""), | ||
| password=os.getenv("MEMGRAPH_PASSWORD", "") | ||
| ) | ||
|
|
||
| class SDAConfig: | ||
| def __init__(self, memgraph_config: MemgraphConfig, splits:int=0, required_area:float=0.0, | ||
| max_edge_distance:float=0.0, max_neighbours: int = 5, clustering_distance: float = 500.0,merge_workers:int=2, visualize_edges:bool= True): | ||
| self.memgraph_config = memgraph_config | ||
| self.splits = splits | ||
| self.required_area = required_area | ||
| self.max_edge_distance = max_edge_distance | ||
| self.max_neighbours = max_neighbours | ||
| self.clustering_distance = clustering_distance | ||
| self.merge_workers = merge_workers | ||
| self.visualize_edges = visualize_edges | ||
|
|
||
| def get(self): | ||
| return { | ||
| "binary-filters": [ | ||
| { | ||
| "name": "InsidePolygonFilter" | ||
| } | ||
| ], | ||
| "centrality-measures": [ | ||
| { | ||
| "name": "DegreeCentrality" | ||
| }, | ||
| { | ||
| "name": "MeanLocalSignificance" | ||
| }, | ||
| { | ||
| "name": "SmallerNeighboursRatio" | ||
| } | ||
| ], | ||
| "contraction-predicates": [ | ||
| { | ||
| "distance": self.clustering_distance, | ||
| "name": "DistanceBiPredicate" | ||
| } | ||
| ], | ||
| "maxDistanceMeters": self.max_edge_distance, | ||
| "maxNeighbours": self.max_neighbours, | ||
| "memgraph-host": self.memgraph_config.host, | ||
| "memgraph-port": self.memgraph_config.port, | ||
| "memgraph-user": self.memgraph_config.username, | ||
| "memgraph-password": self.memgraph_config.password, | ||
| "merge-workers": self.merge_workers, | ||
| "neighbouring-predicates": [], | ||
| "splits": self.splits, | ||
| "unary-filters": [ | ||
| { | ||
| "name": "ApproxAreaFilter", | ||
| "requiredArea": self.required_area | ||
| } | ||
| ], | ||
| "visualize-edges": self.visualize_edges | ||
| } | ||
|
|
||
| @staticmethod | ||
| def from_benchmark_size(size:str): | ||
| memgraph_config:MemgraphConfig = MemgraphConfig.from_env() | ||
| configs = { | ||
| "test": SDAConfig(memgraph_config, splits=0, required_area=5000.0, max_edge_distance=3000.0, max_neighbours=5, clustering_distance=500.0, merge_workers=1, visualize_edges=False), | ||
| "small": SDAConfig(memgraph_config, splits=1, required_area=500.0, max_edge_distance=1000.0, max_neighbours=5, clustering_distance=500.0, merge_workers=2, visualize_edges=True), | ||
| "large": SDAConfig(memgraph_config, splits=2, required_area=500.0, max_edge_distance=500.0, max_neighbours=5, clustering_distance=200.0, merge_workers=2, visualize_edges=True), | ||
| } | ||
| return configs[size] | ||
|
|
||
| def get_config_file_name(size): | ||
| return f"sda-config-{size}.json" | ||
|
|
||
| def create_config_file(size): | ||
| config = SDAConfig.from_benchmark_size(size) | ||
| cfg_dir = os.path.join(os.path.dirname(__file__), "cfg") | ||
| os.makedirs(cfg_dir, exist_ok=True) | ||
| config_file_path = os.path.join(cfg_dir, get_config_file_name(size)) | ||
| with open(config_file_path, "w") as f: | ||
| json.dump(config.get(), f, indent=4) | ||
| return config_file_path | ||
|
|
||
| def get_input_file(size): | ||
| input_files = { | ||
| "test" : "Corvara_IT.tiff", | ||
| "small": "Corvara_IT.tiff", | ||
| "large": "Wuerzburg_DE.tiff", | ||
| } | ||
| return input_files[size] | ||
|
|
||
| def buckets_count(): | ||
| return (1, 5) | ||
|
|
||
| def upload_all_data(upload_func,data_dir): | ||
| sizes=["test", "small", "large"] | ||
| for size in sizes: | ||
| input_file = get_input_file(size) | ||
| config_path = create_config_file(size) | ||
| upload_func(0, input_file, os.path.join(data_dir, input_file)) | ||
| upload_func(0, get_config_file_name(size), config_path) | ||
|
|
||
|
|
||
|
|
||
| def generate_input(data_dir, size, benchmarks_bucket,input_buckets, output_buckets, upload_func, nosql_func): | ||
| upload_all_data(upload_func,data_dir) | ||
| return { | ||
| "config_file": get_config_file_name(size), | ||
| "input_file": get_input_file(size), | ||
| "input_bucket": input_buckets[0], | ||
| "split_output_bucket": output_buckets[0], | ||
| "filter_output_bucket": output_buckets[1], | ||
| "cluster_output_bucket": output_buckets[2], | ||
| "analysis_output_bucket": output_buckets[3], | ||
| "final_output_bucket": output_buckets[4], | ||
| "benchmark_bucket": benchmarks_bucket | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,40 @@ | ||
| import os | ||
| import uuid | ||
| from pathlib import Path | ||
| from . import storage | ||
|
|
||
| storage_client = storage.storage.get_instance() | ||
|
|
||
| SHP_SUFFIX = [".shp", ".shx", ".dbf", ".prj"] | ||
|
|
||
| def download_file(benchmark_bucket, path_in_bucket, dest_dir): | ||
| path = Path(dest_dir) / Path(path_in_bucket).name | ||
| storage_client.download(benchmark_bucket, path_in_bucket, path) | ||
| return path | ||
|
|
||
| def download_file_bucket(benchmark_bucket, bucket, basename, dest_dir): | ||
| return download_file(benchmark_bucket, bucket + '/' + basename, dest_dir) | ||
|
|
||
| def download_shp_file(benchmark_bucket, bucket ,shp_file, dest_dir): | ||
| files = [Path(shp_file).with_suffix(suffix) for suffix in filter(lambda x: x is not ".shp",SHP_SUFFIX)] | ||
| for f in files: | ||
| download_file_bucket(benchmark_bucket, bucket, f.name, dest_dir) | ||
| return download_file_bucket(benchmark_bucket, bucket, shp_file, dest_dir) | ||
|
|
||
| def load_config(event,directory): | ||
| return download_file_bucket(event["benchmark_bucket"], event["input_bucket"], event["config_file"], directory) | ||
|
|
||
| def upload_shp_file(benchmark_bucket, bucket, shp_basename): | ||
| shp_dir = Path(shp_basename).parent | ||
| for f in shp_dir.iterdir(): | ||
| if Path(shp_basename).stem == Path(f).stem and any(f.name.endswith(suffix) for suffix in SHP_SUFFIX): | ||
| full_path = shp_dir / f.name | ||
| storage_client.upload(benchmark_bucket, bucket + '/' + f.name, full_path,False) | ||
|
|
||
| def download_directory(benchmark_bucket, bucket, dest_dir): | ||
| storage_client.download_directory(benchmark_bucket, bucket, dest_dir) | ||
|
|
||
| def create_tmp_dir(): | ||
| tmp_dir = os.path.join("/tmp",str(uuid.uuid4())) | ||
| os.makedirs(tmp_dir, exist_ok=True) | ||
| return tmp_dir |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,22 @@ | ||
| import subprocess | ||
| from .SDAHelper import * | ||
|
|
||
| def handler(event): | ||
| benchmark_bucket = event["benchmark_bucket"] | ||
| cluster_output_bucket = event["cluster_output_bucket"] | ||
| TMP_DIR = create_tmp_dir() | ||
| analysis_input_file = download_shp_file(benchmark_bucket,cluster_output_bucket,event["cluster_output_file"],TMP_DIR) | ||
| config_file = load_config(event, TMP_DIR) | ||
| OUTPUT_STEM = "Analysis_"+Path(analysis_input_file).stem | ||
| command = ["SettlementDelineationAnalysis", "-i", str(analysis_input_file), "-c", str(config_file), "--outputStem", OUTPUT_STEM] | ||
| result = subprocess.run(command,capture_output=True,text=True, cwd=TMP_DIR) | ||
| if result.returncode != 0: | ||
| event["stdout"] = result.stdout | ||
| event["stderr"] = result.stderr | ||
| return event | ||
| event.pop("cluster_output_file", None) | ||
| event["analysis_output_files"]= [] | ||
| for file in Path(TMP_DIR).glob(f"{OUTPUT_STEM}*.shp"): | ||
| upload_shp_file(benchmark_bucket, event["analysis_output_bucket"],file) | ||
| event["analysis_output_files"].append(file.name) | ||
| return event |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,11 @@ | ||
| from .SDAHelper import * | ||
| import subprocess | ||
|
|
||
| def handler(event): | ||
| TMP_DIR = create_tmp_dir() | ||
| config = load_config(event, TMP_DIR) | ||
| result = subprocess.run([f"AfricapolisClearDatabase", "-c", str(config)],capture_output=True,text=True) | ||
| if result.returncode != 0: | ||
| event["stdout"] = result.stdout | ||
| event["stderr"] = result.stderr | ||
| return event |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,28 @@ | ||
| import subprocess | ||
| from .SDAHelper import * | ||
|
|
||
| def handler(event): | ||
|
|
||
| benchmark_bucket = event["benchmark_bucket"] | ||
| filter_output_bucket = event["filter_output_bucket"] | ||
| TMP_DIR = create_tmp_dir() | ||
| input_files = [download_shp_file(benchmark_bucket, filter_output_bucket ,shp_file, TMP_DIR ) for shp_file in event["cluster_input_files"]] | ||
| config = load_config(event, TMP_DIR) | ||
| components = event["cluster_components"] | ||
| OUTPUT_STEM = "Cluster"+str(components[0]) | ||
| # Store workflow data in /tmp due to read only filesystem restriction | ||
| command = [f"SettlementDelineationContraction", "-i"] | ||
| command.extend([str(file) for file in input_files]) | ||
| command.extend(["-c", str(config),"--outputStem",OUTPUT_STEM,"--components"]) | ||
| command.extend([str(comp) for comp in components]) | ||
| result = subprocess.run(command,capture_output=True,text=True, cwd=TMP_DIR) | ||
| if result.returncode != 0: | ||
| event["stdout"] = result.stdout | ||
| event["stderr"] = result.stderr | ||
| return event | ||
| output_file = Path(TMP_DIR).glob(f"{OUTPUT_STEM}*.shp").__next__() | ||
| upload_shp_file(benchmark_bucket, event["cluster_output_bucket"],output_file) | ||
| event.pop("cluster_input_files", None) | ||
| event.pop("cluster_components", None) | ||
| event["cluster_output_file"] = output_file.name | ||
| return {"payload":event,"request_id":event.get("request-id","0")} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not sure why this change is necessary - need to check.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I ran into issue trying to run this on AWS since the task name shadows the function name