Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
7eb6e32
CU-869bhm1zy: Gather data from plugins regarding the components they …
mart-r Dec 18, 2025
78d6057
CU-869bhm1zy: Add missing plugin registry
mart-r Dec 18, 2025
e83d5d0
CU-869bhm1zy: Separate version calculations to avoid circular imports
mart-r Dec 18, 2025
24fedcd
CU-869bhm1zy: Add a few tests for plugin loader
mart-r Dec 18, 2025
2998f85
CU-869bhm1zy: Add missing init file for test package
mart-r Dec 19, 2025
3a17523
CU-869bhm1zy: Add initial model card output for plugin and pipe descr…
mart-r Dec 19, 2025
5d24751
CU-869bhm1zy: Separate saving of model card to its own method
mart-r Dec 19, 2025
7e207dd
CU-869bhm1zy: Add tests for required plugins and pipeline descriptions
mart-r Dec 19, 2025
aeeb2b3
CU-869bhm1zy: Remove some commented code
mart-r Dec 19, 2025
38f92d3
CU-869bhm1zy: Add provider of each component to pipe description
mart-r Dec 19, 2025
b518a36
CU-869bhm1zy: Add a few integration tests for model card / pipe descr…
mart-r Dec 19, 2025
c32c679
CU-869bhm1zy: Add another integration tests regarding required plugins
mart-r Dec 19, 2025
4acd473
CU-869bhm1zy: Add plugin author and URL to model packs
mart-r Dec 19, 2025
09272d3
CU-869bhm1zy: Remove random usage of typing based collections for gen…
mart-r Dec 19, 2025
3b1c5e0
CU-869bhm1zy: Update CAT to raise an exception when a model is loaded…
mart-r Dec 19, 2025
a5056a0
CU-869bhm1zy: Add a few tests for loading model pack with unavailable…
mart-r Dec 19, 2025
62c0419
CU-869bhm1zy: Improving robustness of entry point metadata by using d…
mart-r Dec 19, 2025
87723c9
CU-869bhm1zy: Add a few simple loader tests
mart-r Dec 20, 2025
1f348ba
CU-869bhm1zy: Make better attempts at providing plugin URL
mart-r Dec 21, 2025
26bd96f
CU-869bhm1zy: Make better attempts at providing plugin author details
mart-r Dec 21, 2025
48e22ab
CU-869bhm1zy: Add fallback for checking plugin dependence with improp…
mart-r Dec 21, 2025
846125a
CU-869bhm1zy: Add a simple test for latest changes
mart-r Dec 21, 2025
67b7fce
CU-869bhm1zy: Move getter of component provider to plugin package
mart-r Dec 21, 2025
7d8feed
CU-869bhm1zy: Remove unnecessary argument from method
mart-r Dec 21, 2025
d3c3c64
CU-869bhm1zy: Add late registration of components for plugins upon…
mart-r Dec 21, 2025
5050dec
CU-869bhm1zy: Add missing module
mart-r Dec 21, 2025
5d95537
CU-869bhm1zy: Fix typing issue for 3.11 and 3.12
mart-r Dec 21, 2025
a6c3fdd
CU-869bhm1zy: Fix linting issue
mart-r Dec 21, 2025
7ef7e45
CU-869bhm1zy: Fix addon tests
mart-r Dec 21, 2025
9e75390
CU-869bhm1zy: Fix addon tests issue
mart-r Dec 21, 2025
d07e38a
CU-869bhm1zy: Hopefully final fix for tests
mart-r Dec 22, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 1 addition & 9 deletions medcat-v2/medcat/__init__.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,9 @@
from importlib.metadata import version as __version_method
from importlib.metadata import PackageNotFoundError as __PackageNotFoundError

from medcat.version import __version__
from medcat.utils.check_for_updates import (
check_for_updates as __check_for_updates)

from medcat.plugins import load_plugins as __load_plugins

try:
__version__ = __version_method("medcat")
except __PackageNotFoundError:
__version__ = "0.0.0-dev"


# NOTE: this will not always actually do the check
# it will only (by default) check once a week
__check_for_updates("medcat", __version__)
Expand Down
146 changes: 125 additions & 21 deletions medcat-v2/medcat/cat.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,21 @@
from medcat.tokenizing.tokens import MutableDocument, MutableEntity
from medcat.tokenizing.tokenizers import SaveableTokenizer, TOKENIZER_PREFIX
from medcat.data.entities import Entity, Entities, OnlyCUIEntities
from medcat.data.model_card import ModelCard
from medcat.data.model_card import ModelCard, PipelineDescription
from medcat.data.model_card import RequiredPluginDescription
from medcat.components.types import AbstractCoreComponent, HashableComponent
from medcat.components.types import CoreComponent
from medcat.components.addons.addons import AddonComponent
from medcat.utils.legacy.identifier import is_legacy_model_pack
from medcat.utils.defaults import avoid_legacy_conversion
from medcat.utils.defaults import doing_legacy_conversion_message
from medcat.utils.defaults import LegacyConversionDisabledError
from medcat.utils.usage_monitoring import UsageMonitor, _NoDelUM
from medcat.utils.import_utils import MissingDependenciesError
from medcat.plugins.registry import plugin_registry, find_provider
import importlib.util
from medcat.utils.exceptions import MissingPluginError, MissingPluginInfo



logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -655,6 +661,11 @@ def trainer(self):
self._trainer = Trainer(self.cdb, self.__call__, self._pipeline)
return self._trainer

def save_model_card(self, model_card_path: str) -> None:
    """Write this model's card to a file.

    The card is obtained in its string form via
    `get_model_card(as_dict=False)` and written out unchanged.

    Args:
        model_card_path (str): The file path to write the model card to.
            The file is opened in write mode, so existing content
            is replaced.
    """
    model_card: str = self.get_model_card(as_dict=False)
    # NOTE: explicit encoding so that the written file does not depend
    #       on the platform's default (locale) encoding
    with open(model_card_path, 'w', encoding='utf-8') as f:
        f.write(model_card)

def save_model_pack(
self, target_folder: str, pack_name: str = DEFAULT_PACK_NAME,
serialiser_type: Union[str, AvailableSerialisers] = 'dill',
Expand Down Expand Up @@ -705,10 +716,7 @@ def save_model_pack(
self.config.general.nlp.modelname = internals_path
# serialise
serialise(serialiser_type, self, model_pack_path)
model_card: str = self.get_model_card(as_dict=False)
model_card_path = os.path.join(model_pack_path, "model_card.json")
with open(model_card_path, 'w') as f:
f.write(model_card)
self.save_model_card(os.path.join(model_pack_path, "model_card.json"))
# components
components_folder = os.path.join(
model_pack_path, COMPONENTS_FOLDER)
Expand Down Expand Up @@ -777,6 +785,33 @@ def attempt_unpack(cls, zip_path: str) -> str:
shutil.unpack_archive(zip_path, extract_dir=model_pack_path)
return model_pack_path

@classmethod
def _get_missing_plugins(cls, model_pack_path: str) -> list[MissingPluginInfo]:
    """Find the required plugins of a model pack that cannot be found.

    The requirements are read from the "Required Plugins" entry of the
    model card loaded off disk. A model card without that entry is
    treated as requiring no plugins.

    A plugin counts as missing when no import spec can be found for its
    name. That includes names for which the lookup itself fails (e.g. a
    dotted name whose parent package is not installed).

    If anything is missing, a warning is logged. No exception is raised
    for missing plugins here; that decision is left to the caller.

    Args:
        model_pack_path (str): The model pack path, as accepted by
            `load_model_card_off_disk`.

    Returns:
        list[MissingPluginInfo]: One entry per missing plugin; empty if
            every required plugin could be found.
    """
    model_card = cls.load_model_card_off_disk(model_pack_path, as_dict=True)
    required_plugins: list[
        RequiredPluginDescription] = model_card.get("Required Plugins", [])
    missing_plugins: list[MissingPluginInfo] = []

    for plugin_info in required_plugins:
        plugin_name = plugin_info["name"]
        # Check if the plugin module can be imported
        try:
            is_available = importlib.util.find_spec(plugin_name) is not None
        except (ImportError, ValueError):
            # find_spec raises rather than returning None in some cases
            # (e.g. ModuleNotFoundError for a dotted name with a missing
            # parent package, ValueError for an empty name); the plugin
            # is unavailable either way
            is_available = False
        if is_available:
            continue
        # Cast to str for safety (and rebuild the 2-tuples)
        provided = [(str(p[0]), str(p[1])) for p in plugin_info["provides"]]
        missing_plugins.append(MissingPluginInfo(
            name=plugin_name,
            provides=provided,
            author=plugin_info.get("author"),
            url=plugin_info.get("url"),
        ))

    if missing_plugins:
        logger.warning(
            "Missing required plugins for this model pack. "
            "Attempting to load anyway, but it may fail. "
            "Missing: %s", [p['name'] for p in missing_plugins]
        )
    return missing_plugins

@classmethod
def load_model_pack(cls, model_pack_path: str,
config_dict: Optional[dict] = None,
Expand All @@ -796,6 +831,7 @@ def load_model_pack(cls, model_pack_path: str,

Raises:
ValueError: If the saved data does not represent a model pack.
MissingPluginError: If required plugins are missing for this model pack.

Returns:
CAT: The loaded model pack.
Expand All @@ -812,22 +848,32 @@ def load_model_pack(cls, model_pack_path: str,
return Converter(model_pack_path, None).convert()
elif is_legacy and avoid_legacy:
raise LegacyConversionDisabledError("CAT")
# NOTE: ignoring addons since they will be loaded later / separately
cat = deserialise(model_pack_path, model_load_path=model_pack_path,
ignore_folders_prefix={
AddonComponent.NAME_PREFIX,
# NOTE: will be loaded manually
AbstractCoreComponent.NAME_PREFIX,
# tokenizer stuff internals are loaded separately
# if appropriate
TOKENIZER_PREFIX,
# components will be loaded semi-manually
# within the creation of pipe
COMPONENTS_FOLDER,
# ignore hidden files/folders
'.'},
config_dict=config_dict,
addon_config_dict=addon_config_dict)

# Load model card to check for required plugins
missing_plugins = cls._get_missing_plugins(model_pack_path)

try:
# NOTE: ignoring addons since they will be loaded later / separately
cat = deserialise(model_pack_path, model_load_path=model_pack_path,
ignore_folders_prefix={
AddonComponent.NAME_PREFIX,
# NOTE: will be loaded manually
AbstractCoreComponent.NAME_PREFIX,
# tokenizer stuff internals are loaded separately
# if appropriate
TOKENIZER_PREFIX,
# components will be loaded semi-manually
# within the creation of pipe
COMPONENTS_FOLDER,
# ignore hidden files/folders
'.'},
config_dict=config_dict,
addon_config_dict=addon_config_dict)
except ImportError as e:
if missing_plugins:
raise MissingPluginError(missing_plugins) from e
raise

# NOTE: deserialising of components that need serialised
# will be dealt with upon pipeline creation automatically
if not isinstance(cat, CAT):
Expand Down Expand Up @@ -924,13 +970,22 @@ def get_model_card(self, as_dict: bool = False) -> Union[str, ModelCard]:
else:
met_cat_model_cards = []
cdb_info = self.cdb.get_basic_info()

# Pipeline Description
pipeline_description = self.describe_pipeline()

# Required Plugins
required_plugins = self.get_required_plugins()

model_card: ModelCard = {
'Model ID': self.config.meta.hash,
'Last Modified On': self.config.meta.last_saved.isoformat(),
'History (from least to most recent)': self.config.meta.history,
'Description': self.config.meta.description,
'Source Ontology': self.config.meta.ontology,
'Location': self.config.meta.location,
'Pipeline Description': pipeline_description,
'Required Plugins': required_plugins,
'MetaCAT models': met_cat_model_cards,
'Basic CDB Stats': cdb_info,
'Performance': {}, # TODO
Expand All @@ -943,6 +998,55 @@ def get_model_card(self, as_dict: bool = False) -> Union[str, ModelCard]:
return model_card
return json.dumps(model_card, indent=2, sort_keys=False)


def describe_pipeline(self) -> PipelineDescription:
    """Describe the components that make up the current pipeline.

    Every component yielded by the pipeline's `iter_all_components` is
    listed with its name and its provider (as reported by
    `find_provider`).

    Returns:
        PipelineDescription: The core components keyed by the name of
            their component type (under "core") and the remaining
            components in iteration order (under "addons").
    """
    description: PipelineDescription = {"core": {}, "addons": []}
    core_section = description["core"]
    addon_section = description["addons"]

    for comp in self._pipeline.iter_all_components():
        comp_provider = find_provider(comp)

        if not comp.is_core():
            # anything that is not a core component is listed as an addon
            addon_section.append({
                "name": comp.name,
                "provider": comp_provider,
            })
            continue

        type_name = cast(CoreComponent, comp).get_type().name
        core_section[type_name] = {
            "name": comp.name,
            "provider": comp_provider,
        }
    return description

def get_required_plugins(self) -> list[RequiredPluginDescription]:
    """List the plugins that the current pipeline relies upon.

    The pipeline description is grouped by component provider.
    Components provided by "medcat" itself are ignored, and providers
    for which the plugin registry has no info are left out.

    Returns:
        list[RequiredPluginDescription]: For each remaining provider,
            its name, the (component type, component name) pairs it
            provides ("addon" is used as the type for addons), and the
            author and URL taken from the plugin info.
    """
    description = self.describe_pipeline()
    typed_comps = [*description["core"].items()]
    typed_comps += [("addon", addon_descr)
                    for addon_descr in description["addons"]]

    # group the (type, name) pairs by the provider of the component
    provided_by: dict[str, list[tuple[str, str]]] = {}
    for kind, descr in typed_comps:
        source = descr["provider"]
        if source != "medcat":
            provided_by.setdefault(source, []).append((kind, descr["name"]))

    # map each provider to its plugin info
    required: list[RequiredPluginDescription] = []
    for provider_name, provided in provided_by.items():
        info = plugin_registry.get_plugin_info(provider_name)
        if info is not None:
            required.append({
                "name": provider_name,
                "provides": provided,
                "author": info.author,
                "url": info.url,
            })
    return required

@overload
@classmethod
def load_model_card_off_disk(cls, model_pack_path: str,
Expand Down
4 changes: 4 additions & 0 deletions medcat-v2/medcat/components/addons/addons.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,3 +106,7 @@ def create_addon(
"""
return get_addon_creator(addon_name)(
cnf, tokenizer, cdb, vocab, model_load_path)


def get_registered_addons() -> list[tuple[str, str]]:
    """List the components known to the addon registry.

    Returns:
        list[tuple[str, str]]: The result of the addon registry's
            `list_components` call, passed through unchanged.
    """
    return _ADDON_REGISTRY.list_components()
2 changes: 1 addition & 1 deletion medcat-v2/medcat/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

from pydantic import BaseModel, Field, ValidationError, ConfigDict

from medcat import __version__ as medcat_version
from medcat.version import __version__ as medcat_version
from medcat.utils.defaults import workers
from medcat.utils.envsnapshot import Environment, get_environment_info
from medcat.utils.iterutils import callback_iterator
Expand Down
17 changes: 17 additions & 0 deletions medcat-v2/medcat/data/model_card.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,21 @@
}
)

class ComponentDescription(TypedDict):
    """Description of a single pipeline component within a model card."""
    # the name of the component
    name: str
    # the provider of the component (e.g. "medcat" or a plugin)
    provider: str

class PipelineDescription(TypedDict):
    """Description of a model's pipeline, split into core and addons."""
    # the core components, keyed by a string identifying each one
    core: dict[str, ComponentDescription]
    # the addon components
    addons: list[ComponentDescription]


class RequiredPluginDescription(TypedDict):
    """Description of a plugin that a model pack requires."""
    # the name of the plugin
    name: str
    # pairs of strings describing each component the plugin provides
    provides: list[tuple[str, str]]
    # the plugin author, or None if not available
    author: str | None
    # the plugin URL, or None if not available
    url: str | None


ModelCard = TypedDict(
"ModelCard", {
Expand All @@ -23,6 +38,8 @@
'Description': str,
'Source Ontology': list[str],
'Location': str,
'Pipeline Description': PipelineDescription,
'Required Plugins': list[RequiredPluginDescription],
'MetaCAT models': list[dict],
'Basic CDB Stats': CDBInfo,
'Performance': dict[str, Any],
Expand Down
12 changes: 6 additions & 6 deletions medcat-v2/medcat/model_creation/preprocess_snomed.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import re
import hashlib
import pandas as pd
from typing import Dict, List, Optional, Tuple
from typing import Optional
from dataclasses import dataclass, field
from enum import Enum, auto

Expand Down Expand Up @@ -189,12 +189,12 @@ class SupportedExtension(Enum):

@dataclass
class BundleDescriptor:
extensions: List[SupportedExtension]
ignores: Dict[RefSetFileType, List[SupportedExtension]] = field(
extensions: list[SupportedExtension]
ignores: dict[RefSetFileType, list[SupportedExtension]] = field(
default_factory=dict)

def has_invalid(self, ext: SupportedExtension,
file_types: Tuple[RefSetFileType]) -> bool:
file_types: tuple[RefSetFileType]) -> bool:
for ft in file_types:
if ft not in self.ignores:
continue
Expand All @@ -217,8 +217,8 @@ class SupportedBundles(Enum):
)


def match_partials_with_folders(exp_names: List[Tuple[str, Optional[str]]],
folder_names: List[str],
def match_partials_with_folders(exp_names: list[tuple[str, Optional[str]]],
folder_names: list[str],
_group_nr1: int = 1, _group_nr2: int = 2
) -> bool:
if len(exp_names) > len(folder_names):
Expand Down
Loading
Loading