From f7c4a6c8ec296a3057dec2d8b2ad64f99cc31af2 Mon Sep 17 00:00:00 2001 From: s-heppner Date: Thu, 6 Mar 2025 23:47:09 +0100 Subject: [PATCH 1/2] Refactor the model and matching algorithm Currently, we can only effectively use the `SemanticMatch` and `EquivalenceTable` classes with the server. This structure was not very effective for testing. We refactor away from using the model and instead add a new `algorithm` module. Semantic matches are now stored in a `networkx.DiGraph`, a directed graph. The algorithm is performed on the graph with effective graph operations, making the whole process more efficient as well. Furthermore, we clean up the service to use it in a more pythonic way, eliminating the need for the `service_model` module. Note, that this is a major refactor and solves multiple problems: Fixes #1 Fixes #5 --- .github/workflows/ci.yml | 62 ++- config.ini.default | 2 +- pyproject.toml | 8 + resources/equivalence_table.json | 32 -- resources/example_graph.json | 26 ++ semantic_matcher/algorithm.py | 145 +++++++ semantic_matcher/examples/__init__.py | 0 .../simple_example_equivalence_table.py | 38 -- semantic_matcher/model.py | 69 ---- semantic_matcher/service.py | 122 +++--- semantic_matcher/service_model.py | 25 -- semantic_matcher/visualization.py | 33 ++ test/test_algorithm.py | 356 ++++++++++++++++++ test/test_resources/example_graph.json | 20 + test/test_semantic_matcher.py | 246 +++--------- test_resources/config.ini | 8 - test_resources/equivalence_table.json | 42 --- 17 files changed, 771 insertions(+), 463 deletions(-) delete mode 100644 resources/equivalence_table.json create mode 100644 resources/example_graph.json create mode 100644 semantic_matcher/algorithm.py delete mode 100644 semantic_matcher/examples/__init__.py delete mode 100644 semantic_matcher/examples/simple_example_equivalence_table.py delete mode 100644 semantic_matcher/model.py delete mode 100644 semantic_matcher/service_model.py create mode 100644 
semantic_matcher/visualization.py create mode 100644 test/test_algorithm.py create mode 100644 test/test_resources/example_graph.json delete mode 100644 test_resources/config.ini delete mode 100644 test_resources/equivalence_table.json diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f0ec9a6..63600fd 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,31 +1,63 @@ -name: test +name: ci -on: - push: - branches: - - '**' +on: [push, pull_request] + + +env: + X_PYTHON_VERSION: "3.11" jobs: build: - runs-on: ${{ matrix.os }} - strategy: - matrix: - os: - - ubuntu-latest - - windows-latest + # This job checks if the build succeeds + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python ${{ env.X_PYTHON_VERSION }} + uses: actions/setup-python@v4 + with: + python-version: ${{ env.X_PYTHON_VERSION }} + - name: Build the package + run: pip install . + + test: + # This job runs the unittests + runs-on: ubuntu-latest steps: - name: Checkout code uses: actions/checkout@v4 - - name: Set up Python + - name: Set up Python ${{ env.X_PYTHON_VERSION }} uses: actions/setup-python@v4 with: - python-version: "3.10" - architecture: x64 + python-version: ${{ env.X_PYTHON_VERSION }} - name: Install Python dependencies - run: pip install -r requirements.txt + run: | + python -m pip install --upgrade pip + pip install .[dev] - name: Run Python Tests run: python -m unittest discover + + static-analysis: + # This job runs static code analysis, namely pycodestyle and mypy + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: ${{ env.X_PYTHON_VERSION }} + - name: Install Python dependencies + run: | + python -m pip install --upgrade pip + pip install .[dev] + - name: Check typing with MyPy + run: | + mypy semantic_matcher test + - name: Check code style with PyCodestyle + run: | + pycodestyle --count 
--max-line-length 120 semantic_matcher test diff --git a/config.ini.default b/config.ini.default index 10c47b2..e75e636 100644 --- a/config.ini.default +++ b/config.ini.default @@ -2,7 +2,7 @@ endpoint=http://127.0.0.1 LISTEN_ADDRESS=127.0.0.1 port=8000 -equivalence_table_file=./resources/equivalence_table.json +match_graph_file=./resources/example_graph.json [RESOLVER] endpoint=http://semantic_id_resolver diff --git a/pyproject.toml b/pyproject.toml index a179c6c..329857e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,14 @@ dependencies = [ "pydantic>=1.10", "uvicorn>=0.21.1", "requests>=2.31.0", + "networkx>=3.4.2", +] + +[project.optional-dependencies] +dev = [ + "mypy", + "pycodestyle", + "coverage", ] [tool.setuptools] diff --git a/resources/equivalence_table.json b/resources/equivalence_table.json deleted file mode 100644 index fe95165..0000000 --- a/resources/equivalence_table.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "matches": { - "s-heppner.com/semanticID/one": [ - { - "base_semantic_id": "s-heppner.com/semanticID/one", - "match_semantic_id": "s-heppner.com/semanticID/1", - "score": 1.0, - "meta_information": { - "matchSource": "Defined by Sebastian Heppner" - } - }, - { - "base_semantic_id": "s-heppner.com/semanticID/one", - "match_semantic_id": "s-heppner.com/semanticID/two", - "score": 0.8, - "meta_information": { - "matchSource": "Defined by Sebastian Heppner" - } - } - ], - "s-heppner.com/semanticID/two": [ - { - "base_semantic_id": "s-heppner.com/semanticID/two", - "match_semantic_id": "s-heppner.com/semanticID/2", - "score": 1.0, - "meta_information": { - "matchSource": "Defined by Sebastian Heppner" - } - } - ] - } -} \ No newline at end of file diff --git a/resources/example_graph.json b/resources/example_graph.json new file mode 100644 index 0000000..c24de3c --- /dev/null +++ b/resources/example_graph.json @@ -0,0 +1,26 @@ +[ + { + "base_semantic_id": "A", + "match_semantic_id": "B", + "score": 0.8, + "path": [] + }, + { + 
"base_semantic_id": "B", + "match_semantic_id": "C", + "score": 0.7, + "path": [] + }, + { + "base_semantic_id": "B", + "match_semantic_id": "D", + "score": 0.6, + "path": [] + }, + { + "base_semantic_id": "C", + "match_semantic_id": "D", + "score": 0.9, + "path": [] + } +] \ No newline at end of file diff --git a/semantic_matcher/algorithm.py b/semantic_matcher/algorithm.py new file mode 100644 index 0000000..887db67 --- /dev/null +++ b/semantic_matcher/algorithm.py @@ -0,0 +1,145 @@ +import json +from typing import List, Tuple +import heapq + +import networkx as nx +from pydantic import BaseModel + + +class SemanticMatchGraph(nx.DiGraph): + def __init__(self): + super().__init__() + + def add_semantic_match(self, + base_semantic_id: str, + match_semantic_id: str, + score: float): + self.add_edge( + u_of_edge=base_semantic_id, + v_of_edge=match_semantic_id, + weight=score, + ) + + def get_all_matches(self) -> List["SemanticMatch"]: + matches: List["SemanticMatch"] = [] + + # Iterate over all edges in the graph + for base, match, data in self.edges(data=True): + score = data.get("weight", 0.0) # Get weight, default to 0.0 if missing + matches.append(SemanticMatch( + base_semantic_id=base, + match_semantic_id=match, + score=score, + path=[] # Direct match, no intermediate nodes + )) + + return matches + + def to_file(self, filename: str): + with open(filename, "w") as file: + matches = [match.model_dump() for match in self.get_all_matches()] + json.dump(matches, file, indent=4) + + @classmethod + def from_file(cls, filename: str) -> "SemanticMatchGraph": + with open(filename, "r") as file: + matches_data = json.load(file) + graph = SemanticMatchGraph() + for match_data in matches_data: + graph.add_semantic_match( + base_semantic_id=match_data["base_semantic_id"], + match_semantic_id=match_data["match_semantic_id"], + score=match_data["score"] + ) + return graph + + +class SemanticMatch(BaseModel): + base_semantic_id: str + match_semantic_id: str + score: float + 
path: List[str] # The path of `semantic_id`s that the algorithm took + + def __str__(self) -> str: + return f"{' -> '.join(self.path + [self.match_semantic_id])} = {self.score}" + + def __hash__(self): + return hash(( + self.base_semantic_id, + self.match_semantic_id, + self.score, + tuple(self.path), + )) + + +def find_semantic_matches( + graph: SemanticMatchGraph, + semantic_id: str, + min_score: float = 0.5 +) -> List[SemanticMatch]: + """ + Find semantic matches for a given node with a minimum score threshold. + + Args: + graph (nx.DiGraph): The directed graph with weighted edges. + semantic_id (str): The starting semantic_id. + min_score (float): The minimum similarity score to consider. + This value is necessary to ensure that the search also terminates on sufficiently large graphs. + + Returns: + List[SemanticMatch]: + A list of `SemanticMatch` objects, sorted by their score with the highest score first. + """ + if semantic_id not in graph: + return [] + + # We need to make sure that all possible paths starting from the given semantic_id are explored. + # To achieve this, we use the concept of a "priority queue". While we could use a simple FIFO list of matches to + # explore, this way we actually end up with an already sorted result with the highest match at the beginning of the + # list. As a possible implementation of this abstract data structure, we choose to use a "max-heap". + # However, there is no efficient implementation of a max-heap in Python, so rather we use the built-in "min-heap" + # and negate the score values. A priority queue ensures that elements with the highest priority are processed first, + # regardless of when they were added. 
+ # We initialize the priority queue: + pq: List[Tuple[float, str, List[str]]] = [(-1.0, semantic_id, [])] # (neg_score, node, path) + # The queue is structured as follows: + # - `neg_score`: The negative score of the match + # - `node`: The `match_semantic_id` of the match + # - `path`: The path between the `semantic_id` and the `match_semantic_id` + + # Prepare the result list + results: List[SemanticMatch] = [] + + # Run the priority queue until all possible paths have been explored + # This means in each iteration: + # - We pop the top element of the queue as it's the next highest semantic match we want to explore + # - If the match has a score higher or equal to the given `min_score`, we add it to the results + # - We add all connected `semantic_id`s to the priority queue to be treated next + # - We go to the next element of the queue + while pq: + # Get the highest-score match from the queue + neg_score, node, path = heapq.heappop(pq) + score = -neg_score # Convert back to positive + + # Store result if above threshold (except the start node) + if node != semantic_id and score >= min_score: + results.append(SemanticMatch( + base_semantic_id=semantic_id, + match_semantic_id=node, + score=score, + path=path + )) + + # Traverse to the neighboring and therefore connected `semantic_id`s + for neighbor, edge_data in graph[node].items(): + new_score: float = score * edge_data["weight"] # Multiplicative propagation + + # Prevent loops by ensuring we do not revisit the start node after the first iteration + if neighbor == semantic_id: + continue # Avoid re-exploring the start node + + # We add the newly found `semantic_id`s to the queue to be explored next in order of their score + if new_score >= min_score: + heapq.heappush(pq, (-new_score, neighbor, path + [node])) # Push updated path + + return results diff --git a/semantic_matcher/examples/__init__.py b/semantic_matcher/examples/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git 
a/semantic_matcher/examples/simple_example_equivalence_table.py b/semantic_matcher/examples/simple_example_equivalence_table.py deleted file mode 100644 index f2ce282..0000000 --- a/semantic_matcher/examples/simple_example_equivalence_table.py +++ /dev/null @@ -1,38 +0,0 @@ -from semantic_matcher.model import SemanticMatch, EquivalenceTable - - -def return_simple_example_equivalence_table() -> EquivalenceTable: - """ - Returns a simple equivalence table with three semantic matches - """ - table = EquivalenceTable(matches={}) - table.add_semantic_match( - SemanticMatch( - base_semantic_id="s-heppner.com/semanticID/one", - match_semantic_id="s-heppner.com/semanticID/1", - score=1., - meta_information={"matchSource": "Defined by Sebastian Heppner"} - ) - ) - table.add_semantic_match( - SemanticMatch( - base_semantic_id="s-heppner.com/semanticID/two", - match_semantic_id="s-heppner.com/semanticID/2", - score=1., - meta_information={"matchSource": "Defined by Sebastian Heppner"} - ) - ) - table.add_semantic_match( - SemanticMatch( - base_semantic_id="s-heppner.com/semanticID/one", - match_semantic_id="s-heppner.com/semanticID/two", - score=0.8, - meta_information={"matchSource": "Defined by Sebastian Heppner"} - ) - ) - return table - - -if __name__ == '__main__': - e = return_simple_example_equivalence_table() - e.to_file("example_equivalence_table.json") diff --git a/semantic_matcher/model.py b/semantic_matcher/model.py deleted file mode 100644 index c95701a..0000000 --- a/semantic_matcher/model.py +++ /dev/null @@ -1,69 +0,0 @@ -from typing import Dict, List - -from pydantic import BaseModel - - -class SemanticMatch(BaseModel): - """ - A semantic match, mapping two semanticIDs with a matching score. 
Can be imagined as a weighted graph with - `base_semantic_id` ---`score`---> `match_semantic_id` - - Todo: Think about static and TTL, but that is optimization - Todo: Maybe we want to have the matching method as debug information - """ - base_semantic_id: str - match_semantic_id: str - score: float - meta_information: Dict - - -class EquivalenceTable(BaseModel): - matches: Dict[str, List[SemanticMatch]] - - def add_semantic_match(self, match: SemanticMatch) -> None: - if self.matches.get(match.base_semantic_id) is not None: - if match not in self.matches[match.base_semantic_id]: - self.matches[match.base_semantic_id].append(match) - else: - self.matches[match.base_semantic_id] = [match] - - def remove_semantic_match(self, match: SemanticMatch) -> None: - if self.matches.get(match.base_semantic_id) is not None: - self.matches.get(match.base_semantic_id).remove(match) - if len(self.matches.get(match.base_semantic_id)) == 0: - self.matches.pop(match.base_semantic_id) - - def remove_all_semantic_matches(self): - self.matches.clear() - - def get_local_matches(self, semantic_id: str, score_limit: float) -> List[SemanticMatch]: - equivalence_table_result = self.matches.get(semantic_id) - if equivalence_table_result is None: - return [] - matching_result = [] - for match in equivalence_table_result: - if match.score > score_limit: - matching_result.append(match) - rec_result = self.get_local_matches(match.match_semantic_id, score_limit/match.score) - for rec_match in rec_result: - rec_match.base_semantic_id = match.base_semantic_id - rec_match.score *= match.score - if "path" not in rec_match.meta_information: - rec_match.meta_information["path"] = [] - rec_match.meta_information["path"].insert(0, match.match_semantic_id) - if rec_result is not None: - matching_result += rec_result - return matching_result - - def get_all_matches(self) -> List[SemanticMatch]: - return self.matches - - def to_file(self, filename: str) -> None: - with open(filename, "w") as file: - 
file.write(self.model_dump_json(indent=4)) - - @classmethod - def from_file(cls, filename: str) -> "EquivalenceTable": - with open(filename, "r") as file: - return EquivalenceTable.model_validate_json(file.read()) - diff --git a/semantic_matcher/service.py b/semantic_matcher/service.py index 0b752fe..d72927b 100644 --- a/semantic_matcher/service.py +++ b/semantic_matcher/service.py @@ -1,9 +1,25 @@ -from typing import List +from typing import Optional, List, Set +from pydantic import BaseModel import requests -from fastapi import APIRouter +from fastapi import APIRouter, Response -from semantic_matcher import model, service_model +from semantic_matcher import algorithm + + +class MatchRequest(BaseModel): + """ + Request body for the :func:`service.SemanticMatchingService.get_matches` + + :ivar semantic_id: The semantic ID that we want to find matches for + :ivar score_limit: The minimum semantic similarity score to look for. The comparison is inclusive (>=) + :ivar local_only: If `True`, only check at the local service and do not request other services + :ivar already_checked_locations: Optional Set of already checked semantic matching services to avoid looping + """ + semantic_id: str + score_limit: float + local_only: bool = True + already_checked_locations: Optional[Set[str]] = None class SemanticMatchingService: @@ -27,7 +43,7 @@ class SemanticMatchingService: def __init__( self, endpoint: str, - equivalences: model.EquivalenceTable + graph: algorithm.SemanticMatchGraph ): """ Initializer of :class:`~.SemanticMatchingService` @@ -46,6 +62,7 @@ def __init__( self.router.add_api_route( "/get_matches", self.get_matches, + response_model=List[algorithm.SemanticMatch], methods=["GET"] ) self.router.add_api_route( @@ -53,82 +70,96 @@ def __init__( self.post_matches, methods=["POST"] ) - self.router.add_api_route( - "/clear", - self.remove_all_matches, - methods=["POST"] - ) self.endpoint: str = endpoint - self.equivalence_table: model.EquivalenceTable = 
equivalences + self.graph: algorithm.SemanticMatchGraph = graph def get_all_matches(self): """ Returns all matches stored in the equivalence table- """ - matches = self.equivalence_table.get_all_matches() + matches = self.graph.get_all_matches() return matches - def remove_all_matches(self): - self.equivalence_table.remove_all_semantic_matches() def get_matches( self, - request_body: service_model.MatchRequest - ) -> service_model.MatchesList: + request_body: MatchRequest + ) -> List[algorithm.SemanticMatch]: """ A query to match two SubmodelElements semantically. Returns a matching score """ # Try first local matching - matches: List[model.SemanticMatch] = self.equivalence_table.get_local_matches( + matches: List[algorithm.SemanticMatch] = algorithm.find_semantic_matches( + graph=self.graph, semantic_id=request_body.semantic_id, - score_limit=request_body.score_limit + min_score=request_body.score_limit ) # If the request asks us to only locally look, we're done already if request_body.local_only: - return service_model.MatchesList(matches=matches) + return matches # Now look for remote matches: - additional_remote_matches: List[model.SemanticMatch] = [] + additional_remote_matches: List[algorithm.SemanticMatch] = [] for match in matches: + # If the `match_semantic_id` has the same namespace as the `base_semantic_id` there is no sense in looking + # further, since the semantic_id Resolver would return this Semantic Matching Service. if match.base_semantic_id.split("/")[0] == match.match_semantic_id.split("/")[0]: - #match_id is local - continue + continue # Todo: We definitely need to check for namespace, this just takes "https:" + + # We need to make sure we do not go to the same Semantic Matching Service twice. 
+ # For that we update the already_checked_locations with the current endpoint: + already_checked_locations: Set[str] = {self.endpoint} + if request_body.already_checked_locations: + already_checked_locations.update(request_body.already_checked_locations) + remote_matching_service = self._get_matcher_from_semantic_id(match.match_semantic_id) - if remote_matching_service is None: + # If we could not find the remote_matching_service, or we already checked it, we continue + if remote_matching_service is None or remote_matching_service in already_checked_locations: continue - remote_matching_request = service_model.MatchRequest( + # Todo: There is an edge case where this would not find all matches: + # Imagine we have a situation where A -> B -> C -> D, but A and C are on SMS1 and B and D are + # on SMS2. This would not find the match C, since we already checked SMS1. + # I guess this is fine for the moment though. + + # This makes it possible to create the match request: + remote_matching_request = MatchRequest( semantic_id=match.match_semantic_id, - # This is a simple "Ungleichung" - # Unified score is multiplied: score(A->B) * score(B->C) - # This score should be larger or equal than the requested score_limit: - # score(A->B) * score(B->C) >= score_limit - # score(A->B) is well known, as it is the `match.score` - # => score(B->C) >= (score_limit/score(A->B)) + # This is a simple inequality: + # Unified score is multiplied: score(A->B) * score(B->C) + # This score should be larger than or equal to the requested score_limit: + # score(A->B) * score(B->C) >= score_limit + # score(A->B) is well known, as it is the `match.score` + # => score(B->C) >= (score_limit/score(A->B)) score_limit=float(request_body.score_limit/match.score), # If we already request a remote score, it does not make sense to choose `local_only` local_only=False, name=request_body.name, - definition=request_body.definition + definition=request_body.definition, + 
already_checked_locations=already_checked_locations ) url = f"{remote_matching_service}/get_matches" new_matches_response = requests.get(url, json=remote_matching_request.model_dump_json()) - match_response = service_model.MatchesList.model_validate_json(new_matches_response.text) - additional_remote_matches.extend(match_response.matches) + response_matches = [algorithm.SemanticMatch(**match) for match in new_matches_response.json()] + additional_remote_matches.extend(response_matches) # Finally, put all matches together and return matches.extend(additional_remote_matches) - res = service_model.MatchesList(matches=matches) - return res + return matches def post_matches( self, - request_body: service_model.MatchesList - ) -> None: - for match in request_body.matches: - self.equivalence_table.add_semantic_match(match) - # Todo: Figure out how to properly return 200 + request_body: List[algorithm.SemanticMatch] + ) -> Response: + for match in request_body: + self.graph.add_semantic_match( + base_semantic_id=match.base_semantic_id, + match_semantic_id=match.match_semantic_id, + score=match.score, + ) + return Response(status_code=200) - def _get_matcher_from_semantic_id(self, semantic_id: str) -> str: + @staticmethod + def _get_matcher_from_semantic_id(semantic_id: str) -> Optional[str]: """ Finds the suiting `SemanticMatchingService` for the given `semantic_id`. @@ -162,19 +193,18 @@ def _get_matcher_from_semantic_id(self, semantic_id: str) -> str: os.path.abspath(os.path.join(os.path.dirname(__file__), "../config.ini")), ]) - # Read in equivalence table - # Note, this construct takes the path in the config.ini relative to the - # location of the config.ini - EQUIVALENCES = model.EquivalenceTable.from_file( + # Read in `SemanticMatchGraph`. 
+ # Note, this construct takes the path in the config.ini relative to the location of the config.ini + match_graph = algorithm.SemanticMatchGraph.from_file( filename=os.path.abspath(os.path.join( os.path.dirname(__file__), "..", - config["SERVICE"]["equivalence_table_file"] + config["SERVICE"]["match_graph_file"] )) ) SEMANTIC_MATCHING_SERVICE = SemanticMatchingService( endpoint=config["SERVICE"]["endpoint"], - equivalences=EQUIVALENCES + graph=match_graph, ) APP = FastAPI() APP.include_router( diff --git a/semantic_matcher/service_model.py b/semantic_matcher/service_model.py deleted file mode 100644 index 64d2a1a..0000000 --- a/semantic_matcher/service_model.py +++ /dev/null @@ -1,25 +0,0 @@ -from typing import Optional, List - -from pydantic import BaseModel - -from semantic_matcher import model - - -class MatchRequest(BaseModel): - """ - Request body for the :func:`service.SemanticMatchingService.get_match` - - :ivar semantic_id: The semantic ID that we want to find matches for - :ivar local_only: If `True`, only check at the local service and do not request other services - :ivar name: Optional name of the resolved semantic ID for NLP matching - :ivar definition: Optional definition of the resolved semantic ID for NLP matching - """ - semantic_id: str - score_limit: float - local_only: bool = True - name: Optional[str] = None - definition: Optional[str] = None - - -class MatchesList(BaseModel): - matches: List[model.SemanticMatch] diff --git a/semantic_matcher/visualization.py b/semantic_matcher/visualization.py new file mode 100644 index 0000000..c2aac6d --- /dev/null +++ b/semantic_matcher/visualization.py @@ -0,0 +1,33 @@ +import matplotlib.pyplot as plt +import networkx as nx + +from semantic_matcher.algorithm import SemanticMatchGraph + +# Todo: This is WIP + + +def save_graph_as_figure(g: SemanticMatchGraph, filename: str) -> None: + """ + A simple visualization of a `SemanticMatchGraph` saved as picture + """ + # Draw the graph + plt.figure() + pos = 
nx.spring_layout(g) # Positions for nodes + nx.draw(g, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=3000, font_size=10) + + # Add edge labels + edge_labels = {(u, v): f"{d['weight']:.2f}" for u, v, d in g.edges(data=True)} + nx.draw_networkx_edge_labels(g, pos, edge_labels=edge_labels) + + plt.savefig(filename) + + +if __name__ == "__main__": + graph_complex = SemanticMatchGraph() + graph_complex.add_edge("A", "B", weight=0.9, source="dataset1") + graph_complex.add_edge("A", "C", weight=0.8, source="dataset2") + graph_complex.add_edge("B", "D", weight=0.7, source="dataset3") + graph_complex.add_edge("C", "D", weight=0.6, source="dataset4") + graph_complex.add_edge("D", "E", weight=0.5, source="dataset5") + + save_graph_as_figure(graph_complex, "temp.png") diff --git a/test/test_algorithm.py b/test/test_algorithm.py new file mode 100644 index 0000000..da9f911 --- /dev/null +++ b/test/test_algorithm.py @@ -0,0 +1,356 @@ +import unittest +from typing import List +import os +import json + +from semantic_matcher import algorithm + + +class TestSemanticMatchGraph(unittest.TestCase): + TEST_FILE = "test_graph.json" + + def setUp(self): + """Set up a test graph before each test.""" + self.graph = algorithm.SemanticMatchGraph() + self.graph.add_semantic_match("A", "B", 0.8) + self.graph.add_semantic_match("B", "C", 0.6) + self.graph.add_semantic_match("C", "D", 0.9) + + def tearDown(self): + """Remove the test file after each test.""" + if os.path.exists(self.TEST_FILE): + os.remove(self.TEST_FILE) + + def test_get_all_matches_basic(self): + """Test that all direct semantic matches are returned correctly.""" + matches = self.graph.get_all_matches() + + expected_matches = [ + algorithm.SemanticMatch(base_semantic_id="A", match_semantic_id="B", score=0.8, path=[]), + algorithm.SemanticMatch(base_semantic_id="B", match_semantic_id="C", score=0.6, path=[]), + algorithm.SemanticMatch(base_semantic_id="C", match_semantic_id="D", score=0.9, path=[]), 
+ ] + + self.assertEqual(len(matches), 3, "Incorrect number of matches retrieved.") + self.assertCountEqual(expected_matches, matches, "Matches do not match expected results.") + + def test_get_all_matches_empty_graph(self): + """Test that an empty graph returns an empty list.""" + empty_graph = algorithm.SemanticMatchGraph() + matches = empty_graph.get_all_matches() + self.assertEqual([], matches,"Empty graph should return an empty list.") + + def test_get_all_matches_duplicate_edges(self): + """Test handling of duplicate edges with different scores.""" + self.graph.add_semantic_match("A", "B", 0.9) # Overwriting edge + matches = self.graph.get_all_matches() + + expected_matches = [ + algorithm.SemanticMatch(base_semantic_id="A", match_semantic_id="B", score=0.9, path=[]), # Overwritten edge + algorithm.SemanticMatch(base_semantic_id="B", match_semantic_id="C", score=0.6, path=[]), + algorithm.SemanticMatch(base_semantic_id="C", match_semantic_id="D", score=0.9, path=[]), + ] + + self.assertEqual(len(matches), 3, "Duplicate edge handling failed.") + self.assertCountEqual(expected_matches, matches,"Matches do not match expected results.") + + def test_get_all_matches_varying_weights(self): + """Test that matches with different weights are retrieved correctly.""" + self.graph.add_semantic_match("D", "E", 0.3) + self.graph.add_semantic_match("E", "F", 1.0) + matches = self.graph.get_all_matches() + + expected_matches = [ + algorithm.SemanticMatch(base_semantic_id="A", match_semantic_id="B", score=0.8, path=[]), + algorithm.SemanticMatch(base_semantic_id="B", match_semantic_id="C", score=0.6, path=[]), + algorithm.SemanticMatch(base_semantic_id="C", match_semantic_id="D", score=0.9, path=[]), + algorithm.SemanticMatch(base_semantic_id="D", match_semantic_id="E", score=0.3, path=[]), + algorithm.SemanticMatch(base_semantic_id="E", match_semantic_id="F", score=1.0, path=[]), + ] + + self.assertEqual(len(matches), 5, "Incorrect number of matches retrieved.") + 
self.assertCountEqual(expected_matches, matches, "Matches do not match expected results.") + + def test_to_file(self): + """Test that the graph is correctly saved to a file.""" + self.graph.to_file(self.TEST_FILE) + + # Check if file exists + self.assertTrue(os.path.exists(self.TEST_FILE), "File was not created.") + + # Load file content and verify JSON structure + with open(self.TEST_FILE, "r") as file: + data = json.load(file) + + self.assertIsInstance(data, list, "File content should be a list of matches.") + self.assertEqual(len(data), 3, "Incorrect number of matches stored in file.") + + expected_data = [ + {"base_semantic_id": "A", "match_semantic_id": "B", "score": 0.8, "path": []}, + {"base_semantic_id": "B", "match_semantic_id": "C", "score": 0.6, "path": []}, + {"base_semantic_id": "C", "match_semantic_id": "D", "score": 0.9, "path": []}, + ] + + self.assertEqual(data, expected_data, "File content does not match expected data.") + + def test_from_file(self): + """Test that a graph can be correctly loaded from a file.""" + self.graph.to_file(self.TEST_FILE) + loaded_graph = algorithm.SemanticMatchGraph.from_file(self.TEST_FILE) + + # Check if the loaded graph has the same edges and weights + self.assertEqual(len(loaded_graph.edges()), 3, "Loaded graph has incorrect number of edges.") + + for u, v, data in self.graph.edges(data=True): + self.assertTrue(loaded_graph.has_edge(u, v), f"Edge {u} -> {v} is missing in loaded graph.") + self.assertEqual(loaded_graph[u][v]["weight"], data["weight"], f"Edge weight mismatch for {u} -> {v}") + + def test_empty_graph(self): + """Test saving and loading an empty graph.""" + empty_graph = algorithm.SemanticMatchGraph() + empty_graph.to_file(self.TEST_FILE) + loaded_graph = algorithm.SemanticMatchGraph.from_file(self.TEST_FILE) + + self.assertEqual(len(loaded_graph.edges()), 0, "Loaded graph should be empty.") + + +class TestSemanticMatch(unittest.TestCase): + def test_str_representation(self): + """Test that __str__ 
correctly formats the path and score.""" + match = algorithm.SemanticMatch( + base_semantic_id="A", + match_semantic_id="B", + score=0.8, + path=["A"] + ) + + expected_str = "A -> B = 0.8" + self.assertEqual(expected_str, str(match), "__str__ method output is incorrect") + + def test_str_representation_longer_path(self): + """Test __str__ output with a longer path.""" + match = algorithm.SemanticMatch( + base_semantic_id="A", + match_semantic_id="D", + score=0.6, + path=["A", "B", "C"] + ) + + expected_str = "A -> B -> C -> D = 0.6" + self.assertEqual(expected_str, str(match), "__str__ method output is incorrect for longer paths") + + def test_str_representation_no_path(self): + """Test __str__ output when there's no path (direct match).""" + match = algorithm.SemanticMatch( + base_semantic_id="X", + match_semantic_id="Y", + score=1.0, + path=[] + ) + + expected_str = "Y = 1.0" # No path, just the match + self.assertEqual(expected_str, str(match), "__str__ method output is incorrect for empty path") + + def test_hash_consistency(self): + """Test that identical SemanticMatch instances have the same hash.""" + match1 = algorithm.SemanticMatch( + base_semantic_id="A", + match_semantic_id="B", + score=0.8, + path=["A"] + ) + match2 = algorithm.SemanticMatch( + base_semantic_id="A", + match_semantic_id="B", + score=0.8, + path=["A"] + ) + + self.assertEqual(hash(match1), hash(match2), "Hashes of identical objects should be the same") + + def test_hash_uniqueness(self): + """Test that different SemanticMatch instances have different hashes.""" + match1 = algorithm.SemanticMatch( + base_semantic_id="A", + match_semantic_id="B", + score=0.8, + path=["A"] + ) + match2 = algorithm.SemanticMatch( + base_semantic_id="A", + match_semantic_id="C", + score=0.8, + path=["A"] + ) + + self.assertNotEqual(hash(match1), hash(match2), "Hashes of different objects should be different") + + def test_hash_set_usage(self): + """Test that SemanticMatch instances can be used in a set 
(ensuring uniqueness).""" + match1 = algorithm.SemanticMatch( + base_semantic_id="A", + match_semantic_id="B", + score=0.8, + path=["A"] + ) + match2 = algorithm.SemanticMatch( + base_semantic_id="A", + match_semantic_id="B", + score=0.8, + path=["A"] + ) + match3 = algorithm.SemanticMatch( + base_semantic_id="A", + match_semantic_id="C", + score=0.9, + path=["A"] + ) + + match_set = {match1, match2, match3} + self.assertEqual(len(match_set), 2, "Set should contain unique elements based on hash") + + def test_hash_dict_usage(self): + """Test that SemanticMatch instances can be used as dictionary keys.""" + match1 = algorithm.SemanticMatch( + base_semantic_id="A", + match_semantic_id="B", + score=0.8, + path=["A"] + ) + match2 = algorithm.SemanticMatch( + base_semantic_id="A", + match_semantic_id="B", + score=0.8, + path=["A"] + ) + + match_dict = {match1: "First Entry", match2: "Second Entry"} + + self.assertEqual(len(match_dict), 1, "Identical objects should overwrite each other in a dictionary") + self.assertEqual(match_dict[match1], "Second Entry", "Latest value should be stored in dictionary") + + +class TestFindSemanticMatches(unittest.TestCase): + def setUp(self): + """Set up test graphs for various cases.""" + self.graph = algorithm.SemanticMatchGraph() + + # Populate the graph + self.graph.add_edge("A", "B", weight=0.8) + self.graph.add_edge("B", "C", weight=0.7) + self.graph.add_edge("C", "D", weight=0.9) + self.graph.add_edge("B", "D", weight=0.6) + self.graph.add_edge("D", "E", weight=0.5) + + def test_basic_functionality(self): + """Test basic propagation of semantic matches.""" + matches: List[algorithm.SemanticMatch] = algorithm.find_semantic_matches(self.graph, "A", min_score=0) + str_matches: List[str] = [str(i) for i in matches] + expected = [ + f"A -> B = 0.8", + f"A -> B -> C = {0.8*0.7}", # 0.56 + f"A -> B -> C -> D = {0.8*0.7*0.9}", # 0.504 + f"A -> B -> D = {0.8*0.6}", # 0.48 + f"A -> B -> C -> D -> E = {0.8 * 0.7 * 0.9 * 0.5}", # 0.252 + f"A 
-> B -> D -> E = {0.8 * 0.6 * 0.5}", # 0.24 + ] + self.assertEqual(expected, str_matches) + + def test_loop_prevention(self): + """Ensure that loops do not cause infinite recursion.""" + self.graph.add_edge("D", "A", weight=0.4) # Creates a loop + + matches: List[algorithm.SemanticMatch] = algorithm.find_semantic_matches( + self.graph, + semantic_id="A", + min_score=0.1 + ) + + # We would expect the algorithm to have only traversed the graph exactly once! + self.assertEqual(6, len(matches)) + # For simplifying the analysis, we remove everything but the `semantic_id`s of the found matches + matched_semantic_ids: List[str] = [i.match_semantic_id for i in matches] + self.assertIn("D", matched_semantic_ids) + self.assertNotIn("A", matched_semantic_ids) # "A" should not be revisited + + def test_minimum_threshold(self): + """Ensure that results below the minimum score are excluded.""" + matches = algorithm.find_semantic_matches(self.graph, "A", min_score=0.6) + str_matches: List[str] = [str(i) for i in matches] + expected: List[str] = [ + "A -> B = 0.8" + ] + self.assertEqual(expected, str_matches) + + def test_not_in_graph(self): + """Test that matches that are not in the graph are not found""" + matches = algorithm.find_semantic_matches(self.graph, semantic_id="X", min_score=0) + self.assertEqual(0, len(matches)) + + def test_disconnected_graph(self): + """Test behavior when the graph has disconnected components.""" + graph_disconnected = algorithm.SemanticMatchGraph() + graph_disconnected.add_edge("A", "B", weight=0.8) + graph_disconnected.add_edge("X", "Y", weight=0.9) + + matches = algorithm.find_semantic_matches(graph_disconnected, "A", min_score=0.1) + str_matches: List[str] = [str(i) for i in matches] + expected: List[str] = [ + "A -> B = 0.8" + ] + + self.assertEqual(expected, str_matches) + + def test_empty_graph(self): + """Test behavior when the graph is empty.""" + graph_empty = algorithm.SemanticMatchGraph() + matches = 
algorithm.find_semantic_matches(graph_empty, "A", min_score=0.1) + + self.assertEqual(0, len(matches)) + + def test_single_node_no_edges(self): + """Test behavior when the graph has only one node and no edges.""" + graph_single = algorithm.SemanticMatchGraph() + graph_single.add_node("A") + + matches = algorithm.find_semantic_matches(graph_single, "A", min_score=0.1) + + self.assertEqual(0, len(matches)) + + def test_edge_case_weights(self): + """Test behavior with edge weights close to zero and one.""" + graph_edge_cases = algorithm.SemanticMatchGraph() + graph_edge_cases.add_edge("A", "B", weight=1.0, source="perfect match") + graph_edge_cases.add_edge("B", "C", weight=0.01, source="weak match") + + matches = algorithm.find_semantic_matches(graph_edge_cases, "A", min_score=0.01) + matches_str: List[str] = [str(i) for i in matches] + expected: List[str] = [ + f"A -> B = {1.0}", + f"A -> B -> C = {1.0*0.01}", + ] + self.assertEqual(expected, matches_str) + + def test_complex_graph(self): + """Test behavior in a complex graph with multiple branches.""" + graph_complex = algorithm.SemanticMatchGraph() + graph_complex.add_edge("A", "B", weight=0.9, source="dataset1") + graph_complex.add_edge("A", "C", weight=0.8, source="dataset2") + graph_complex.add_edge("B", "D", weight=0.7, source="dataset3") + graph_complex.add_edge("C", "D", weight=0.6, source="dataset4") + graph_complex.add_edge("D", "E", weight=0.5, source="dataset5") + + matches = algorithm.find_semantic_matches(graph_complex, "A", min_score=0.3) + matches_str: List[str] = [str(i) for i in matches] + expected: List[str] = [ + f"A -> B = {0.9}", # 0.9 + f"A -> C = {0.8}", # 0.8 + f"A -> B -> D = {0.9*0.7}", # 0.63 + f"A -> C -> D = {0.8*0.6}", # 0.48 + f"A -> B -> D -> E = {0.9*0.7*0.5}", # 0.315 + # f"A -> B -> C -> E = {0.8*0.6*0.5}", # 0.24 => out + ] + self.assertEqual(expected, matches_str) + +if __name__ == "__main__": + unittest.main() diff --git a/test/test_resources/example_graph.json 
b/test/test_resources/example_graph.json new file mode 100644 index 0000000..395217b --- /dev/null +++ b/test/test_resources/example_graph.json @@ -0,0 +1,20 @@ +[ + { + "base_semantic_id": "https://s-heppner.com/semantic_id/one", + "match_semantic_id": "https://s-heppner.com/semantic_id/uno", + "score": 0.9, + "path": [] + }, + { + "base_semantic_id": "https://s-heppner.com/semantic_id/one", + "match_semantic_id": "https://remote.com/semantic_id/deux", + "score": 0.7, + "path": [] + }, + { + "base_semantic_id": "https://s-heppner.com/semantic_id/uno", + "match_semantic_id": "https://s-heppner.com/semantic_id/trois", + "score": 0.6, + "path": [] + } +] \ No newline at end of file diff --git a/test/test_semantic_matcher.py b/test/test_semantic_matcher.py index f7f8719..9584e1f 100644 --- a/test/test_semantic_matcher.py +++ b/test/test_semantic_matcher.py @@ -1,5 +1,4 @@ import os -import configparser import multiprocessing import requests @@ -8,8 +7,7 @@ from fastapi import FastAPI import uvicorn -from semantic_matcher import model -from semantic_matcher.model import SemanticMatch +from semantic_matcher import algorithm from semantic_matcher.service import SemanticMatchingService from contextlib import contextmanager @@ -20,28 +18,21 @@ def run_server(): - # Load test configuration - config = configparser.ConfigParser() - config.read([ - os.path.abspath(os.path.join(os.path.dirname(__file__), "../test_resources/config.ini")), - ]) - # Read in equivalence table - EQUIVALENCES = model.EquivalenceTable.from_file( + match_graph = algorithm.SemanticMatchGraph.from_file( filename=os.path.abspath(os.path.join( os.path.dirname(__file__), - "..", - config["SERVICE"]["equivalence_table_file"] + "test_resources/example_graph.json" )) ) # Initialise SemanticMatchingService semantic_matching_service = SemanticMatchingService( - endpoint=config["SERVICE"]["endpoint"], - equivalences=EQUIVALENCES + endpoint="localhost", + graph=match_graph ) - # Mock resolver + # Mock semantic_id 
Resolver def mock_get_matcher(self, semantic_id): return "http://remote-service:8000" @@ -57,12 +48,11 @@ def __init__(self, content: str, status_code: int = 200): def mock_requests_get(url, json): if url == "http://remote-service:8000/get_matches": - match_one = SemanticMatch( + match_one = algorithm.SemanticMatch( base_semantic_id="s-heppner.com/semanticID/three", match_semantic_id="remote-service.com/semanticID/tres", score=1.0, - meta_information={"matchSource": "Defined by Moritz Sommer", - "path": ["remote-service.com/semanticID/trois"]} + path=[] ) matches_data = { "matches": [match_one.model_dump()] @@ -77,7 +67,7 @@ def mock_requests_get(url, json): # Run server app = FastAPI() app.include_router(semantic_matching_service.router) - uvicorn.run(app, host=config["SERVICE"]["ENDPOINT"], port=int(config["SERVICE"]["PORT"]), log_level="error") + uvicorn.run(app, host="localhost", port=8000, log_level="error") @contextmanager @@ -94,139 +84,87 @@ def run_server_context(): os.kill(server_process.pid, signal.SIGKILL) server_process.join() - +# @unittest.skip("These tests need to be adapted") class TestSemanticMatchingService(unittest.TestCase): - def test_get_all_matches(self): with run_server_context(): response = requests.get("http://localhost:8000/all_matches") - expected_matches = { - 's-heppner.com/semanticID/one': [ - { - 'base_semantic_id': 's-heppner.com/semanticID/one', - 'match_semantic_id': 's-heppner.com/semanticID/1', - 'score': 1.0, - 'meta_information': {'matchSource': 'Defined by Sebastian Heppner'} - }, - { - 'base_semantic_id': 's-heppner.com/semanticID/one', - 'match_semantic_id': 's-heppner.com/semanticID/two', - 'score': 0.8, - 'meta_information': {'matchSource': 'Defined by Sebastian Heppner'} - } - ], - 's-heppner.com/semanticID/two': [ - { - 'base_semantic_id': 's-heppner.com/semanticID/two', - 'match_semantic_id': 's-heppner.com/semanticID/2', - 'score': 1.0, - 'meta_information': {'matchSource': 'Defined by Sebastian Heppner'} - } - ], - 
's-heppner.com/semanticID/three': [ - { - 'base_semantic_id': 's-heppner.com/semanticID/three', - 'match_semantic_id': 'remote-service.com/semanticID/trois', - 'score': 1.0, - 'meta_information': {'matchSource': 'Defined by Moritz Sommer'} - } - ] - } + expected_matches = [ + {'base_semantic_id': 'https://s-heppner.com/semantic_id/one', + 'match_semantic_id': 'https://s-heppner.com/semantic_id/uno', + 'path': [], + 'score': 0.9}, + {'base_semantic_id': 'https://s-heppner.com/semantic_id/one', + 'match_semantic_id': 'https://remote.com/semantic_id/deux', + 'path': [], + 'score': 0.7}, + {'base_semantic_id': 'https://s-heppner.com/semantic_id/uno', + 'match_semantic_id': 'https://s-heppner.com/semantic_id/trois', + 'path': [], + 'score': 0.6} + ] actual_matches = response.json() self.assertEqual(expected_matches, actual_matches) def test_post_matches(self): with run_server_context(): new_match = { - "base_semantic_id": "s-heppner.com/semanticID/new", - "match_semantic_id": "s-heppner.com/semanticID/3", + "base_semantic_id": "https://s-heppner.com/semantic_id/new", + "match_semantic_id": "https://s-heppner.com/semantic_id/nouveaux", "score": 0.95, - "meta_information": {"matchSource": "Defined by UnitTest"} - } - matches_list = { - "matches": [new_match] + "path": [], } - requests.post( + response = requests.post( "http://localhost:8000/post_matches", - json=matches_list - ) - response = requests.get("http://localhost:8000/all_matches") - actual_matches = response.json() - self.assertIn("s-heppner.com/semanticID/new", actual_matches) - self.assertEqual( - actual_matches["s-heppner.com/semanticID/new"][0]["match_semantic_id"], - "s-heppner.com/semanticID/3" - ) - - self.assertEqual( - actual_matches["s-heppner.com/semanticID/new"][0]["score"], - 0.95 - ) - - self.assertEqual( - actual_matches["s-heppner.com/semanticID/new"][0]["meta_information"]["matchSource"], - "Defined by UnitTest" + json=[new_match] ) + self.assertEqual(200, response.status_code) + # Todo: Make 
sure this does not become a problem in other tests def test_get_matches_local_only(self): with run_server_context(): match_request = { - "semantic_id": "s-heppner.com/semanticID/one", + "semantic_id": "https://s-heppner.com/semantic_id/one", "score_limit": 0.5, "local_only": True } response = requests.get("http://localhost:8000/get_matches", json=match_request) - expected_matches = { - "matches": [ - { - "base_semantic_id": "s-heppner.com/semanticID/one", - "match_semantic_id": "s-heppner.com/semanticID/1", - "score": 1.0, - "meta_information": {"matchSource": "Defined by Sebastian Heppner"} - }, - { - "base_semantic_id": "s-heppner.com/semanticID/one", - "match_semantic_id": "s-heppner.com/semanticID/two", - "score": 0.8, - "meta_information": {"matchSource": "Defined by Sebastian Heppner"} - }, - { - "base_semantic_id": "s-heppner.com/semanticID/one", - "match_semantic_id": "s-heppner.com/semanticID/2", - "score": 0.8, - "meta_information": {"matchSource": "Defined by Sebastian Heppner", - "path": ["s-heppner.com/semanticID/two"]} - } - ] - } + expected_matches = [ + {'base_semantic_id': 'https://s-heppner.com/semantic_id/one', + 'match_semantic_id': 'https://s-heppner.com/semantic_id/uno', + 'path': ['https://s-heppner.com/semantic_id/one'], + 'score': 0.9}, + {'base_semantic_id': 'https://s-heppner.com/semantic_id/one', + 'match_semantic_id': 'https://remote.com/semantic_id/deux', + 'path': ['https://s-heppner.com/semantic_id/one'], + 'score': 0.7}, + {'base_semantic_id': 'https://s-heppner.com/semantic_id/one', + 'match_semantic_id': 'https://s-heppner.com/semantic_id/trois', + 'path': ['https://s-heppner.com/semantic_id/one', + 'https://s-heppner.com/semantic_id/uno'], + 'score': 0.54} + ] actual_matches = response.json() self.assertEqual(expected_matches, actual_matches) def test_get_matches_local_and_remote(self): with run_server_context(): match_request = { - "semantic_id": "s-heppner.com/semanticID/three", + "semantic_id": 
"https://s-heppner.com/semantic_id/one", "score_limit": 0.7, "local_only": False } response = requests.get("http://localhost:8000/get_matches", json=match_request) - expected_matches = { - "matches": [ - { - "base_semantic_id": "s-heppner.com/semanticID/three", - "match_semantic_id": "remote-service.com/semanticID/trois", - "score": 1.0, - "meta_information": {"matchSource": "Defined by Moritz Sommer"} - }, - { - "base_semantic_id": "s-heppner.com/semanticID/three", - "match_semantic_id": "remote-service.com/semanticID/tres", - "score": 1.0, - "meta_information": {"matchSource": "Defined by Moritz Sommer", - "path": ["remote-service.com/semanticID/trois"]} - }, - ] - } + expected_matches = [ + {'base_semantic_id': 'https://s-heppner.com/semantic_id/one', + 'match_semantic_id': 'https://s-heppner.com/semantic_id/uno', + 'path': ['https://s-heppner.com/semantic_id/one'], + 'score': 0.9}, + {'base_semantic_id': 'https://s-heppner.com/semantic_id/one', + 'match_semantic_id': 'https://remote.com/semantic_id/deux', + 'path': ['https://s-heppner.com/semantic_id/one'], + 'score': 0.7} + ] actual_matches = response.json() self.assertEqual(expected_matches, actual_matches) @@ -238,74 +176,8 @@ def test_get_matches_no_matches(self): "local_only": True } response = requests.get("http://localhost:8000/get_matches", json=match_request) - expected_matches = {"matches": []} - actual_matches = response.json() - self.assertEqual(expected_matches, actual_matches) - - def test_get_matches_with_low_score_limit(self): - with run_server_context(): - match_request = { - "semantic_id": "s-heppner.com/semanticID/one", - "score_limit": 0.9, - "local_only": True - } - response = requests.get("http://localhost:8000/get_matches", json=match_request) - expected_matches = { - "matches": [ - { - "base_semantic_id": "s-heppner.com/semanticID/one", - "match_semantic_id": "s-heppner.com/semanticID/1", - "score": 1.0, - "meta_information": {"matchSource": "Defined by Sebastian Heppner"} - } - ] - } 
actual_matches = response.json() - self.assertEqual(expected_matches, actual_matches) - - def test_get_matches_with_nlp_parameters(self): - with run_server_context(): - match_request = { - "semantic_id": "s-heppner.com/semanticID/one", - "score_limit": 0.5, - "local_only": True, - "name": "Example Name", - "definition": "Example Definition" - } - response = requests.get("http://localhost:8000/get_matches", json=match_request) - expected_matches = { - "matches": [ - { - "base_semantic_id": "s-heppner.com/semanticID/one", - "match_semantic_id": "s-heppner.com/semanticID/1", - "score": 1.0, - "meta_information": {"matchSource": "Defined by Sebastian Heppner"} - }, - { - "base_semantic_id": "s-heppner.com/semanticID/one", - "match_semantic_id": "s-heppner.com/semanticID/two", - "score": 0.8, - "meta_information": {"matchSource": "Defined by Sebastian Heppner"} - }, - { - "base_semantic_id": "s-heppner.com/semanticID/one", - "match_semantic_id": "s-heppner.com/semanticID/2", - "score": 0.8, - "meta_information": {"matchSource": "Defined by Sebastian Heppner", - "path": ["s-heppner.com/semanticID/two"]} - } - ] - } - actual_matches = response.json() - self.assertEqual(expected_matches, actual_matches) - - def test_remove_all_matches(self): - with run_server_context(): - requests.post("http://localhost:8000/clear") - response = requests.get("http://localhost:8000/all_matches") - expected_matches = {} - actual_matches = response.json() - self.assertEqual(expected_matches, actual_matches) + self.assertEqual([], actual_matches) if __name__ == '__main__': diff --git a/test_resources/config.ini b/test_resources/config.ini deleted file mode 100644 index ae230aa..0000000 --- a/test_resources/config.ini +++ /dev/null @@ -1,8 +0,0 @@ -[SERVICE] -endpoint=127.0.0.1 -port=8000 -equivalence_table_file=./test_resources/equivalence_table.json - -[RESOLVER] -endpoint=http://semantic_id_resolver -port=8125 diff --git a/test_resources/equivalence_table.json 
b/test_resources/equivalence_table.json deleted file mode 100644 index 34c8ac6..0000000 --- a/test_resources/equivalence_table.json +++ /dev/null @@ -1,42 +0,0 @@ -{ - "matches": { - "s-heppner.com/semanticID/one": [ - { - "base_semantic_id": "s-heppner.com/semanticID/one", - "match_semantic_id": "s-heppner.com/semanticID/1", - "score": 1.0, - "meta_information": { - "matchSource": "Defined by Sebastian Heppner" - } - }, - { - "base_semantic_id": "s-heppner.com/semanticID/one", - "match_semantic_id": "s-heppner.com/semanticID/two", - "score": 0.8, - "meta_information": { - "matchSource": "Defined by Sebastian Heppner" - } - } - ], - "s-heppner.com/semanticID/two": [ - { - "base_semantic_id": "s-heppner.com/semanticID/two", - "match_semantic_id": "s-heppner.com/semanticID/2", - "score": 1.0, - "meta_information": { - "matchSource": "Defined by Sebastian Heppner" - } - } - ], - "s-heppner.com/semanticID/three": [ - { - "base_semantic_id": "s-heppner.com/semanticID/three", - "match_semantic_id": "remote-service.com/semanticID/trois", - "score": 1.0, - "meta_information": { - "matchSource": "Defined by Moritz Sommer" - } - } - ] - } -} \ No newline at end of file From f1e225bb82817efcd10ad79b08520b21c18038db Mon Sep 17 00:00:00 2001 From: s-heppner Date: Fri, 7 Mar 2025 23:44:43 +0100 Subject: [PATCH 2/2] Fix static analysis --- pyproject.toml | 2 ++ semantic_matcher/algorithm.py | 4 +++- semantic_matcher/py.typed | 0 semantic_matcher/service.py | 3 --- semantic_matcher/visualization.py | 2 +- test/test_algorithm.py | 7 ++++--- test/test_semantic_matcher.py | 2 +- 7 files changed, 11 insertions(+), 9 deletions(-) create mode 100644 semantic_matcher/py.typed diff --git a/pyproject.toml b/pyproject.toml index 329857e..a56c928 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,6 +24,8 @@ dev = [ "mypy", "pycodestyle", "coverage", + "types-networkx", # Type hints for networkx + "types-requests", # Type hints for requests ] [tool.setuptools] diff --git 
a/semantic_matcher/algorithm.py b/semantic_matcher/algorithm.py index 887db67..a64060e 100644 --- a/semantic_matcher/algorithm.py +++ b/semantic_matcher/algorithm.py @@ -132,7 +132,9 @@ def find_semantic_matches( # Traverse to the neighboring and therefore connected `semantic_id`s for neighbor, edge_data in graph[node].items(): - new_score: float = score * edge_data["weight"] # Multiplicative propagation + edge_weight = edge_data["weight"] + assert isinstance(edge_weight, float) + new_score: float = score * edge_weight # Multiplicative propagation # Prevent loops by ensuring we do not revisit the start node after the first iteration if neighbor == semantic_id: diff --git a/semantic_matcher/py.typed b/semantic_matcher/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/semantic_matcher/service.py b/semantic_matcher/service.py index d72927b..4635ce3 100644 --- a/semantic_matcher/service.py +++ b/semantic_matcher/service.py @@ -80,7 +80,6 @@ def get_all_matches(self): matches = self.graph.get_all_matches() return matches - def get_matches( self, request_body: MatchRequest @@ -134,8 +133,6 @@ def get_matches( score_limit=float(request_body.score_limit/match.score), # If we already request a remote score, it does not make sense to choose `local_only` local_only=False, - name=request_body.name, - definition=request_body.definition, already_checked_locations=already_checked_locations ) url = f"{remote_matching_service}/get_matches" diff --git a/semantic_matcher/visualization.py b/semantic_matcher/visualization.py index c2aac6d..2c0d862 100644 --- a/semantic_matcher/visualization.py +++ b/semantic_matcher/visualization.py @@ -1,4 +1,4 @@ -import matplotlib.pyplot as plt +import matplotlib.pyplot as plt # type: ignore import networkx as nx from semantic_matcher.algorithm import SemanticMatchGraph diff --git a/test/test_algorithm.py b/test/test_algorithm.py index da9f911..6dd1ef7 100644 --- a/test/test_algorithm.py +++ b/test/test_algorithm.py @@ -38,7 +38,7 @@ 
def test_get_all_matches_empty_graph(self): """Test that an empty graph returns an empty list.""" empty_graph = algorithm.SemanticMatchGraph() matches = empty_graph.get_all_matches() - self.assertEqual([], matches,"Empty graph should return an empty list.") + self.assertEqual([], matches, "Empty graph should return an empty list.") def test_get_all_matches_duplicate_edges(self): """Test handling of duplicate edges with different scores.""" @@ -46,13 +46,13 @@ def test_get_all_matches_duplicate_edges(self): matches = self.graph.get_all_matches() expected_matches = [ - algorithm.SemanticMatch(base_semantic_id="A", match_semantic_id="B", score=0.9, path=[]), # Overwritten edge + algorithm.SemanticMatch(base_semantic_id="A", match_semantic_id="B", score=0.9, path=[]), algorithm.SemanticMatch(base_semantic_id="B", match_semantic_id="C", score=0.6, path=[]), algorithm.SemanticMatch(base_semantic_id="C", match_semantic_id="D", score=0.9, path=[]), ] self.assertEqual(len(matches), 3, "Duplicate edge handling failed.") - self.assertCountEqual(expected_matches, matches,"Matches do not match expected results.") + self.assertCountEqual(expected_matches, matches, "Matches do not match expected results.") def test_get_all_matches_varying_weights(self): """Test that matches with different weights are retrieved correctly.""" @@ -352,5 +352,6 @@ def test_complex_graph(self): ] self.assertEqual(expected, matches_str) + if __name__ == "__main__": unittest.main() diff --git a/test/test_semantic_matcher.py b/test/test_semantic_matcher.py index 9584e1f..2aa3ae6 100644 --- a/test/test_semantic_matcher.py +++ b/test/test_semantic_matcher.py @@ -84,7 +84,7 @@ def run_server_context(): os.kill(server_process.pid, signal.SIGKILL) server_process.join() -# @unittest.skip("These tests need to be adapted") + class TestSemanticMatchingService(unittest.TestCase): def test_get_all_matches(self): with run_server_context():