-
Notifications
You must be signed in to change notification settings - Fork 1
feat(sayt): Add SAYTBuilder that constructs runtime artefacts for later use #71
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Tom-Owen-ONS
merged 11 commits into
main
from
SA-694-load-vector-store-db-from-parquet-on-runtime-v2
Jun 19, 2026
Merged
Changes from all commits
Commits
Show all changes
11 commits
Select commit
Hold shift + click to select a range
d5ecf20
feat(sayt): Add SAYTBuilder that constructs runtime artifacts for lat…
Tom-Owen-ONS adc7500
chore: remove unused ruff noqa
Tom-Owen-ONS f5c859c
refactor(sayt): remove SAYTConfig
Tom-Owen-ONS 9aa864e
feat(sayt): export full config in get_config() method
Tom-Owen-ONS 5a2f260
test(sayt): ensure test coverage of new sayt code
Tom-Owen-ONS 70cf728
chore(sayt): rename sayt modules to remove sayt_ prefix
Tom-Owen-ONS dce6a33
refactor(sayt): use built-in artifact persistence only for now
Tom-Owen-ONS 33c825d
chore: remove tuple sorting
Tom-Owen-ONS 1d09299
refactor: build to temp directory first, then move on success
Tom-Owen-ONS 9a74c8b
refactor: use PrivateAttrs for derived fields in CleanCorpus
Tom-Owen-ONS aa1c2ae
refactor: use a shared base class for duplicate init and from_csv
Tom-Owen-ONS File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,94 @@ | ||
| """Build and reload a persisted SAYT artifact with the built-in retrievers.""" | ||
|
|
||
| # pylint: disable=duplicate-code | ||
|
|
||
| # %% | ||
| import json | ||
| from pathlib import Path | ||
| from tempfile import TemporaryDirectory | ||
|
|
||
| from industrial_classification_utils.sayt import ( | ||
| NgramRetrieverSpec, | ||
| PrefixRetrieverSpec, | ||
| SAYTBuilder, | ||
| SAYTSuggester, | ||
| SemanticRetrieverSpec, | ||
| ) | ||
|
|
||
| # %% | ||
| ############# toy example to verify SAYT artifact build/load works ############# | ||
small_corpus = [
    ("Car wash", "Car Wash"),
    ("Car wash", "CAR WASH (duplicate)"),
    ("Car waxing", "Car Waxing"),
    ("Waxing car", "Car Waxing"),
    ("Carpentry services", "Carpentry services"),
    ("Dog grooming", "Dog grooming"),
    ("Cat grooming", "Cat grooming"),
    ("USed car sales", "Used car sales"),
    ("Car rental", "Car rental"),
    ("Car repair", "Car repair"),
    ("Car servicing", "Car servicing"),
]

retrievers = [
    PrefixRetrieverSpec(),
    NgramRetrieverSpec(max_df=0.8),
    SemanticRetrieverSpec(),
]

# The TemporaryDirectory object is held at module level (rather than in a
# ``with`` block) so the directory stays alive from one notebook cell to the next.
# pylint: disable-next=consider-using-with
temp_dir = TemporaryDirectory(prefix="sayt_artifact_demo_")
artifact_dir = Path(temp_dir.name).joinpath("car_services_sayt")
print("artifact will be written to:", artifact_dir)

# %%
# Semantic artifact builds may take longer the first time if the model cache
# needs to be created locally.
builder = SAYTBuilder(
    small_corpus,
    retrievers=retrievers,
    min_chars=3,
    max_suggestions=5,
)
artifact_path = builder.build_artifact(artifact_dir, overwrite=True)

print("artifact saved to:", artifact_path)
print("artifact files:")
# List every regular file in the artifact, relative to the artifact root.
for entry in sorted(artifact_path.rglob("*")):
    if not entry.is_file():
        continue
    print("-", entry.relative_to(artifact_path))

# %%
# Pretty-print the manifest that was written alongside the artifact files.
manifest_file = artifact_path / "manifest.json"
manifest = json.loads(manifest_file.read_text(encoding="utf-8"))
print(json.dumps(manifest, indent=2))

# %%
# Build one suggester directly from the corpus and reload another from disk;
# both are expected to return the same suggestions for every query below.
live_suggester = SAYTSuggester(
    small_corpus,
    retrievers=retrievers,
    min_chars=3,
    max_suggestions=5,
)
loaded_suggester = SAYTSuggester.from_artifact(artifact_path)

for query in ["car", "cars", "waxi", "grom", "wash", "duplicate", "auto"]:
    live_suggestions = live_suggester.suggest(query, 5)
    loaded_suggestions = loaded_suggester.suggest(query, 5)

    print("searching for:", query)
    print("live", "->", live_suggestions)
    print("loaded", "->", loaded_suggestions)
    print("loaded_scores", "->", loaded_suggester.suggest_with_scores(query, 5))
    if live_suggestions != loaded_suggestions:
        raise RuntimeError("Loaded suggester results did not match live build")
    print()

# %%
# Run `temp_dir.cleanup()` when you are finished exploring the saved files.
print("artifact ready for inspection:", artifact_path)

# %%
temp_dir.cleanup()
|
|
||
| # %% |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
|
ivyONS marked this conversation as resolved.
|
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,59 @@ | ||
| """Shared bootstrap helpers for corpus-bound SAYT classes.""" | ||
|
|
||
| import os | ||
| from collections.abc import Iterable, Sequence | ||
|
|
||
| from .core import CleanCorpus, validate_max_suggestions, validate_min_chars | ||
| from .retriever_specs import RetrieverSpec, default_retriever_specs | ||
| from .storage import load_corpus_from_csv | ||
|
|
||
|
|
||
| class BaseCorpusBound: # pylint: disable=too-few-public-methods | ||
| """Shared corpus/retriever bootstrap for SAYT runtime classes.""" | ||
|
|
||
| _corpus: CleanCorpus | ||
| _min_chars: int | ||
| _max_suggestions: int | ||
| _retriever_specs: tuple[RetrieverSpec, ...] | ||
|
|
||
| def __init__( | ||
| self, | ||
| corpus: Iterable[tuple[object, object]] | Iterable[str], | ||
| *, | ||
| retrievers: Sequence[RetrieverSpec] | None = None, | ||
| min_chars: int = 4, | ||
| max_suggestions: int = 10, | ||
| ) -> None: | ||
| """Validate and store the shared corpus-bound SAYT configuration.""" | ||
| self._corpus = CleanCorpus.model_validate(corpus) | ||
| self._min_chars = validate_min_chars(min_chars) | ||
| self._max_suggestions = validate_max_suggestions(max_suggestions) | ||
| self._retriever_specs = tuple( | ||
| default_retriever_specs() if retrievers is None else retrievers | ||
| ) | ||
|
|
||
| @classmethod | ||
| def from_csv[ # pylint: disable=too-many-arguments # noqa: PLR0913 | ||
| CorpusBoundT: "BaseCorpusBound" | ||
| ]( | ||
| cls: type[CorpusBoundT], | ||
| file_path: str | os.PathLike, | ||
| *, | ||
| search_text_col: str = "title", | ||
| display_text_col: str | None = None, | ||
| retrievers: Sequence[RetrieverSpec] | None = None, | ||
| min_chars: int = 4, | ||
| max_suggestions: int = 10, | ||
| ) -> CorpusBoundT: | ||
| """Build a corpus-bound SAYT object from CSV input.""" | ||
| corpus_rows = load_corpus_from_csv( | ||
| file_path, | ||
| search_text_col=search_text_col, | ||
| display_text_col=display_text_col, | ||
| ) | ||
| return cls( | ||
| corpus_rows, | ||
| retrievers=retrievers, | ||
| min_chars=min_chars, | ||
| max_suggestions=max_suggestions, | ||
| ) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,85 @@ | ||
| """Offline artifact builder for persisted SAYT runtime assets.""" | ||
|
|
||
| import os | ||
| import shutil | ||
| import tempfile | ||
| from pathlib import Path | ||
| from uuid import uuid4 | ||
|
|
||
| from ._base import BaseCorpusBound | ||
| from .storage import ( | ||
| build_artifact_manifest, | ||
| build_retriever_artifact, | ||
| write_artifact_corpus, | ||
| write_artifact_manifest, | ||
| ) | ||
|
|
||
|
|
||
def _remove_path(path: Path) -> None:
    """Delete ``path`` if present, whether it is a file, directory or symlink.

    A real directory is removed recursively. Anything else, including a
    symbolic link (dangling or pointing at a directory), is unlinked itself,
    so the link target is never touched. A path that does not exist is
    silently ignored.

    Args:
        path: Filesystem location to remove.
    """
    # ``is_dir`` follows symlinks, and ``shutil.rmtree`` refuses to operate on
    # a symlink, so only real directories may take the recursive branch.
    if path.is_dir() and not path.is_symlink():
        shutil.rmtree(path)
        return
    # Covers regular files, dangling links and links to directories; a path
    # that is already gone is not an error.
    path.unlink(missing_ok=True)
|
|
||
|
|
||
class SAYTBuilder(BaseCorpusBound):
    """Build a persisted SAYT artifact for later runtime loading.

    The artifact is written to a staging directory next to the target and is
    only renamed into place once every file has been written, so a failed
    build does not leave a partially written artifact at the target path.
    """

    def build_artifact(
        self,
        output_dir: str | os.PathLike,
        *,
        overwrite: bool = False,
    ) -> Path:
        """Persist the current SAYT configuration and dense stores to disk.

        Args:
            output_dir: Path the finished artifact should occupy. Missing
                parent directories are created.
            overwrite: When true, an existing path at ``output_dir`` is
                replaced by the new build; when false it is an error.

        Returns:
            ``output_dir`` as a ``Path``.

        Raises:
            FileExistsError: If ``output_dir`` exists and ``overwrite`` is
                false.
        """
        artifact_dir = Path(output_dir)
        if artifact_dir.exists() and not overwrite:
            raise FileExistsError("Artifact directory already exists")

        artifact_dir.parent.mkdir(parents=True, exist_ok=True)
        # Staged inside the target's parent directory; presumably so the
        # rename below does not cross filesystems.
        # NOTE(review): tempfile.mkdtemp creates the directory accessible only
        # to the creating user and the rename keeps that mode - confirm this
        # is acceptable if another account loads the artifact.
        staged_dir = Path(
            tempfile.mkdtemp(
                prefix=f".{artifact_dir.name}.tmp-",
                dir=artifact_dir.parent,
            )
        )

        try:
            self._write_staged_artifact(staged_dir)
            self._promote_staged_artifact(staged_dir, artifact_dir)
        except BaseException:
            # BaseException (not just Exception) so that an interrupted build,
            # e.g. Ctrl-C during a long retriever build, does not leave the
            # hidden staging directory behind. The error is always re-raised.
            _remove_path(staged_dir)
            raise

        return artifact_dir

    def _write_staged_artifact(self, staged_dir: Path) -> None:
        """Write the corpus, every retriever artifact and the manifest."""
        manifest = build_artifact_manifest(
            corpus=self._corpus,
            min_chars=self._min_chars,
            max_suggestions=self._max_suggestions,
            retriever_specs=self._retriever_specs,
        )

        write_artifact_corpus(self._corpus, artifact_dir=staged_dir)
        for stored_retriever in manifest.retrievers:
            build_retriever_artifact(
                corpus=self._corpus,
                min_chars=self._min_chars,
                stored_retriever=stored_retriever,
                artifact_dir=staged_dir,
            )

        # The manifest is written last, after all retriever artifacts.
        write_artifact_manifest(manifest, artifact_dir=staged_dir)

    @staticmethod
    def _promote_staged_artifact(staged_dir: Path, artifact_dir: Path) -> None:
        """Rename the staged build into place, restoring the old one on failure."""
        if not artifact_dir.exists():
            staged_dir.rename(artifact_dir)
            return

        # Move the existing artifact aside first so it can be put back if the
        # staged directory cannot be renamed into place.
        backup_dir = artifact_dir.parent / f".{artifact_dir.name}.bak-{uuid4().hex}"
        artifact_dir.rename(backup_dir)
        try:
            staged_dir.rename(artifact_dir)
        except BaseException:
            # Roll back on interrupts too, so the previous artifact is not
            # left stranded under its hidden backup name.
            backup_dir.rename(artifact_dir)
            raise
        _remove_path(backup_dir)
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.