62 changes: 47 additions & 15 deletions .github/workflows/ci.yml
@@ -1,31 +1,63 @@
name: test
name: ci

on:
push:
branches:
- '**'
on: [push, pull_request]


env:
X_PYTHON_VERSION: "3.11"

jobs:
build:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os:
- ubuntu-latest
- windows-latest
# This job checks if the build succeeds
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4

- name: Set up Python ${{ env.X_PYTHON_VERSION }}
uses: actions/setup-python@v4
with:
python-version: ${{ env.X_PYTHON_VERSION }}

- name: Build the package
run: pip install .

test:
# This job runs the unittests
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4

- name: Set up Python
- name: Set up Python ${{ env.X_PYTHON_VERSION }}
uses: actions/setup-python@v4
with:
python-version: "3.10"
architecture: x64
python-version: ${{ env.X_PYTHON_VERSION }}

- name: Install Python dependencies
run: pip install -r requirements.txt
run: |
python -m pip install --upgrade pip
pip install .[dev]

- name: Run Python Tests
run: python -m unittest discover

static-analysis:
# This job runs static code analysis, namely pycodestyle and mypy
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: ${{ env.X_PYTHON_VERSION }}
- name: Install Python dependencies
run: |
python -m pip install --upgrade pip
pip install .[dev]
- name: Check typing with MyPy
run: |
mypy semantic_matcher test
- name: Check code style with PyCodestyle
run: |
pycodestyle --count --max-line-length 120 semantic_matcher test
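All three jobs can be reproduced locally with the same commands the workflow runs: pip install .[dev] once, then python -m unittest discover for the tests, and mypy semantic_matcher test plus pycodestyle --count --max-line-length 120 semantic_matcher test for the static analysis.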
2 changes: 1 addition & 1 deletion config.ini.default
@@ -2,7 +2,7 @@
endpoint=http://127.0.0.1
LISTEN_ADDRESS=127.0.0.1
port=8000
equivalence_table_file=./resources/equivalence_table.json
match_graph_file=./resources/example_graph.json

[RESOLVER]
endpoint=http://semantic_id_resolver
10 changes: 10 additions & 0 deletions pyproject.toml
@@ -16,6 +16,16 @@ dependencies = [
"pydantic>=1.10",
"uvicorn>=0.21.1",
"requests>=2.31.0",
"networkx>=3.4.2",
]

[project.optional-dependencies]
dev = [
"mypy",
"pycodestyle",
"coverage",
"types-networkx", # Type hints for networkx
"types-requests", # Type hints for requests
]

[tool.setuptools]
32 changes: 0 additions & 32 deletions resources/equivalence_table.json

This file was deleted.

26 changes: 26 additions & 0 deletions resources/example_graph.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
[
{
"base_semantic_id": "A",
"match_semantic_id": "B",
"score": 0.8,
"path": []
},
{
"base_semantic_id": "B",
"match_semantic_id": "C",
"score": 0.7,
"path": []
},
{
"base_semantic_id": "B",
"match_semantic_id": "D",
"score": 0.6,
"path": []
},
{
"base_semantic_id": "C",
"match_semantic_id": "D",
"score": 0.9,
"path": []
}
]
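(For intuition: with the multiplicative score propagation introduced in semantic_matcher/algorithm.py below, a query for "A" reaches B directly with 0.8, C via B with 0.8 × 0.7 = 0.56, and D via B and C with 0.56 × 0.9 = 0.504, while the path over the direct B -> D edge only reaches 0.8 × 0.6 = 0.48 and is pruned by the default 0.5 threshold.)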
147 changes: 147 additions & 0 deletions semantic_matcher/algorithm.py
@@ -0,0 +1,147 @@
import json
from typing import List, Tuple
import heapq

import networkx as nx
from pydantic import BaseModel


class SemanticMatchGraph(nx.DiGraph):
def __init__(self):
super().__init__()

def add_semantic_match(self,
base_semantic_id: str,
match_semantic_id: str,
score: float):
self.add_edge(
u_of_edge=base_semantic_id,
v_of_edge=match_semantic_id,
weight=score,
)

def get_all_matches(self) -> List["SemanticMatch"]:
matches: List["SemanticMatch"] = []

# Iterate over all edges in the graph
for base, match, data in self.edges(data=True):
score = data.get("weight", 0.0) # Get weight, default to 0.0 if missing
matches.append(SemanticMatch(
base_semantic_id=base,
match_semantic_id=match,
score=score,
path=[] # Direct match, no intermediate nodes
))

return matches

def to_file(self, filename: str):
with open(filename, "w") as file:
matches = [match.model_dump() for match in self.get_all_matches()]
json.dump(matches, file, indent=4)

@classmethod
def from_file(cls, filename: str) -> "SemanticMatchGraph":
with open(filename, "r") as file:
matches_data = json.load(file)
graph = cls()
for match_data in matches_data:
graph.add_semantic_match(
base_semantic_id=match_data["base_semantic_id"],
match_semantic_id=match_data["match_semantic_id"],
score=match_data["score"]
)
return graph


class SemanticMatch(BaseModel):
base_semantic_id: str
match_semantic_id: str
score: float
path: List[str] # The path of `semantic_id`s that the algorithm took

def __str__(self) -> str:
return f"{' -> '.join(self.path + [self.match_semantic_id])} = {self.score}"

def __hash__(self):
return hash((
self.base_semantic_id,
self.match_semantic_id,
self.score,
tuple(self.path),
))


def find_semantic_matches(
graph: SemanticMatchGraph,
semantic_id: str,
min_score: float = 0.5
) -> List[SemanticMatch]:
"""
Find semantic matches for a given node with a minimum score threshold.

Args:
graph (SemanticMatchGraph): The directed graph with weighted edges.
semantic_id (str): The starting semantic_id.
min_score (float): The minimum similarity score to consider.
This threshold prunes low-scoring paths and ensures that the search terminates even on large graphs.

Returns:
List[SemanticMatch]:
A list of SemanticMatch objects, sorted by score with the highest score first.
"""
if semantic_id not in graph:
return []

# We need to make sure that all possible paths starting from the given semantic_id are explored.
# To achieve this, we use a priority queue. A simple FIFO list of matches to explore would also work,
# but with a priority queue we end up with an already sorted result, with the highest-scoring match at
# the beginning of the list. The canonical implementation of this abstract data structure is a max-heap.
# Python's standard library only provides a min-heap (heapq), however, so we negate the score values to
# get max-heap behavior. A priority queue ensures that elements with the highest priority are processed
# first, regardless of when they were added.
# We initialize the priority queue:
pq: List[Tuple[float, str, List[str]]] = [(-1.0, semantic_id, [])] # (neg_score, node, path)
# The queue is structured as follows:
# - `neg_score`: The negative score of the match
# - `node`: The `match_semantic_id` of the match
# - `path`: The path between the `semantic_id` and the `match_semantic_id`

# Prepare the result list
results: List[SemanticMatch] = []

# Run the priority queue until all possible paths have been explored.
# In each iteration:
# - We pop the top element of the queue, as it is the highest-scoring semantic match left to explore
# - If the match has a score greater than or equal to the given `min_score`, we add it to the results
# - We push all connected `semantic_id`s onto the priority queue to be processed next
while pq:
# Get the highest-score match from the queue
neg_score, node, path = heapq.heappop(pq)
score = -neg_score # Convert back to positive

# Store result if above threshold (except the start node)
if node != semantic_id and score >= min_score:
results.append(SemanticMatch(
base_semantic_id=semantic_id,
match_semantic_id=node,
score=score,
path=path
))

# Traverse to the directly connected neighboring `semantic_id`s
for neighbor, edge_data in graph[node].items():
edge_weight = edge_data["weight"]
assert isinstance(edge_weight, float)
new_score: float = score * edge_weight # Multiplicative propagation

# Prevent trivial cycles by never revisiting the start node
if neighbor == semantic_id:
continue

# We add the newly found `semantic_id`s to the queue to be explored next in order of their score
if new_score >= min_score:
heapq.heappush(pq, (-new_score, neighbor, path + [node])) # Push updated path

return results
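A minimal end-to-end sketch of the new API, using only names introduced in this PR (SemanticMatchGraph.from_file and find_semantic_matches) together with the shipped resources/example_graph.json; the expected scores follow from the multiplicative propagation above and are shown rounded, since the raw floats carry the usual binary rounding noise:

from semantic_matcher.algorithm import SemanticMatchGraph, find_semantic_matches

# Load the match graph shipped with this PR
graph = SemanticMatchGraph.from_file("resources/example_graph.json")

# Query all matches for "A" with the default min_score of 0.5
for match in find_semantic_matches(graph, "A"):
    print(match)

# Expected output (scores rounded):
#   A -> B = 0.8
#   A -> B -> C = 0.56
#   A -> B -> C -> D = 0.504

Because results are popped from the heap in score order, the list is already sorted and no post-hoc sorting is needed.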
38 changes: 0 additions & 38 deletions semantic_matcher/examples/simple_example_equivalence_table.py

This file was deleted.
