From 23647b628a76fc4ba14d11a40576b3fe45057a2a Mon Sep 17 00:00:00 2001 From: mart-r Date: Sun, 21 Dec 2025 23:25:11 +0000 Subject: [PATCH 1/3] CU-869bj8g9k: Fix hardcoded requirement for spacy model download. So far, if there was no Internet access, the fallback spacy model download failure would raise an exception and stall the entire process. Most models should come with their own spacy model anyway (if that's what they use). So the fallback model shouldn't be needed most of the time. So this PR allows the subprocess for spacy model download to fail if there's a network issue. This should make it easier to use the library in scenarios where this method is called. This normally happens if/when a model is created from scratch and no on-disk model is provided. But it can also affect converting models from v1 to v2 format. --- medcat-v2/medcat/tokenizing/spacy_impl/utils.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/medcat-v2/medcat/tokenizing/spacy_impl/utils.py b/medcat-v2/medcat/tokenizing/spacy_impl/utils.py index 8989abd6e..506dbe02e 100644 --- a/medcat-v2/medcat/tokenizing/spacy_impl/utils.py +++ b/medcat-v2/medcat/tokenizing/spacy_impl/utils.py @@ -35,4 +35,17 @@ def ensure_spacy_model(model_name: str) -> None: cmd = f"{sys.executable} -m spacy download {model_name}" logger.info("Installing the spacy model %s using the CLI command " "'%s'", model_name, cmd) - subprocess.run(cmd.split(" "), check=True) + try: + subprocess.run( + cmd.split(" "), check=True, capture_output=True, text=True) + except subprocess.CalledProcessError as err: + if ("requests.exceptions.ConnectionError" in err.stderr and + "Failed to resolve" in err.stderr): + logger.warning( + "Unable to ensure the existing of spacy model '%s'. " + "Internet seems to be unavailable. If the model " + "does not provide its own implementation (it should), " + "subsequent usage may prove problematic. 
Underlying error:\n" + "%s", model_name, err.stderr, exc_info=err) + else: + raise err From 05cad54ed8d03174ff89d06da8f725ecc0bbee05 Mon Sep 17 00:00:00 2001 From: mart-r Date: Mon, 22 Dec 2025 00:08:02 +0000 Subject: [PATCH 2/3] CU-869bj8g9k: Fix underlying conversion issues that were forcing the downloading of models instead of using the one off disk --- .../medcat/utils/legacy/conversion_all.py | 11 ++++++++++- .../medcat/utils/legacy/convert_config.py | 19 ++++++++++++++++--- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/medcat-v2/medcat/utils/legacy/conversion_all.py b/medcat-v2/medcat/utils/legacy/conversion_all.py index 7d7455279..094848be8 100644 --- a/medcat-v2/medcat/utils/legacy/conversion_all.py +++ b/medcat-v2/medcat/utils/legacy/conversion_all.py @@ -7,9 +7,11 @@ from medcat.components.types import CoreComponentType from medcat.storage.serialisers import AvailableSerialisers from medcat.components.linking.no_action_linker import NoActionLinker +from medcat.utils.config_utils import temp_changed_config from medcat.utils.legacy.convert_cdb import get_cdb_from_old from medcat.utils.legacy.convert_config import get_config_from_old +from medcat.utils.legacy.convert_config import fix_spacy_model_name from medcat.utils.legacy.convert_vocab import get_vocab_from_old from medcat.utils.legacy.helpers import fix_subnames @@ -79,7 +81,14 @@ def convert(self) -> CAT: config = get_config_from_old(cnf_path) else: config = cdb.config - cat = CAT(cdb, vocab, config) + with temp_changed_config( + config.general.nlp, "modelname", + os.path.join(self.old_model_folder, + config.general.nlp.modelname)): + cat = CAT(cdb, vocab, config) + # NOTE: its probably easier if we change the spacy model name + # afterwards + fix_spacy_model_name(config, cat.pipe.tokenizer) fix_subnames(cat) # MetaCATs meta_cat_folders = [ diff --git a/medcat-v2/medcat/utils/legacy/convert_config.py b/medcat-v2/medcat/utils/legacy/convert_config.py index 3ed0f060a..b1853984c 100644 
--- a/medcat-v2/medcat/utils/legacy/convert_config.py +++ b/medcat-v2/medcat/utils/legacy/convert_config.py @@ -8,6 +8,7 @@ from medcat.utils.legacy.helpers import fix_old_style_cnf from medcat.config.config import SerialisableBaseModel +from medcat.tokenizing.tokenizers import BaseTokenizer logger = logging.getLogger(__name__) @@ -126,8 +127,8 @@ def _relocate(cnf: Config, old_data: dict) -> Config: orig_val = cast(Any, orig_val) target_model = cast(BaseModel, target_model) fname = new_path.split(".")[-1] - logger.info("Relocating from %s to %s (%s)", orig_path, new_path, - type(orig_val).__name__) + logger.info("Relocating from %s to %s (%s) [%s]", orig_path, new_path, + type(orig_val).__name__, orig_val) _safe_setattr(target_model, fname, orig_val) return cnf @@ -167,13 +168,25 @@ def get_config_from_nested_dict(old_data: dict) -> Config: # but we now default to regex cnf.general.nlp.provider = 'spacy' cnf = _make_changes(cnf, old_data) + return cnf + + +def fix_spacy_model_name( + cnf: Config, + tokenizer: BaseTokenizer | None = None) -> None: if cnf.general.nlp.modelname in ('spacy_model', 'en_core_sci_md', 'en_core_sci_lg'): logger.info("Fixing spacy model. 
" "Moving from '%s' to 'en_core_web_md'!", cnf.general.nlp.modelname) cnf.general.nlp.modelname = 'en_core_web_md' - return cnf + # NOTE: the tokenizer uses an internally cached name that we need to + # fix here as well so that the name of the subsequently saved + # files is more descriptive than just 'spacy_model' + if tokenizer: + from medcat.tokenizing.spacy_impl.tokenizers import SpacyTokenizer + cast(SpacyTokenizer, + tokenizer)._spacy_model_name = cnf.general.nlp.modelname def get_config_from_old(path: str) -> Config: From b3e58a5246e7e24328e92706061565e5469f0752 Mon Sep 17 00:00:00 2001 From: mart-r Date: Mon, 22 Dec 2025 00:37:45 +0000 Subject: [PATCH 3/3] CU-869bhm1zy: Allow spacy model fix at CDB conversion time (if config is being loaded) --- medcat-v2/medcat/utils/legacy/conversion_all.py | 3 ++- medcat-v2/medcat/utils/legacy/convert_cdb.py | 17 ++++++++++++++--- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/medcat-v2/medcat/utils/legacy/conversion_all.py b/medcat-v2/medcat/utils/legacy/conversion_all.py index 094848be8..e0e0125b4 100644 --- a/medcat-v2/medcat/utils/legacy/conversion_all.py +++ b/medcat-v2/medcat/utils/legacy/conversion_all.py @@ -69,7 +69,8 @@ def convert(self) -> CAT: CAT: The model pack. 
""" cdb = get_cdb_from_old( - os.path.join(self.old_model_folder, self.cdb_name)) + os.path.join(self.old_model_folder, self.cdb_name), + fix_spacy_model_name=False) vocab_path = os.path.join(self.old_model_folder, self.vocab_name) if os.path.exists(vocab_path): vocab = get_vocab_from_old(vocab_path) diff --git a/medcat-v2/medcat/utils/legacy/convert_cdb.py b/medcat-v2/medcat/utils/legacy/convert_cdb.py index 119533599..8a57cd7ab 100644 --- a/medcat-v2/medcat/utils/legacy/convert_cdb.py +++ b/medcat-v2/medcat/utils/legacy/convert_cdb.py @@ -5,6 +5,8 @@ from medcat.config import Config from medcat.cdb.concepts import get_new_cui_info, get_new_name_info, TypeInfo from medcat.utils.legacy.convert_config import get_config_from_nested_dict +from medcat.utils.legacy.convert_config import ( + fix_spacy_model_name as apply_spacy_model_fix) logger = logging.getLogger(__name__) @@ -209,11 +211,14 @@ def update_names(cdb: CDB, data: dict): setattr(cdb, name_to, data[name_from]) -def convert_data(all_data: dict) -> CDB: +def convert_data(all_data: dict, fix_spacy_model_name: bool = True) -> CDB: """Convert the raw v1 data into a CDB. Args: all_data (dict): The raw v1 data off disk. + fix_spacy_model_name (bool): Whether to fix the spacy model name. + Older models may have unsuported spacy model names. So these + may sometimes need to be fixed. Defaults to True. Returns: CDB: The v2 CDB. @@ -226,17 +231,23 @@ def convert_data(all_data: dict) -> CDB: if 'config' in all_data: logger.info("Loading old style CDB with config included.") cdb.config = get_config_from_nested_dict(all_data['config']) + if fix_spacy_model_name: + apply_spacy_model_fix(cdb.config) return cdb -def get_cdb_from_old(old_path: str) -> CDB: +def get_cdb_from_old(old_path: str, + fix_spacy_model_name: bool = True) -> CDB: """Get the v2 CDB from a v1 CDB path. Args: old_path (str): The v1 CDB path. + fix_spacy_model_name (bool): Whether to fix the spacy model name. 
+ Older models may have unsupported spacy model names. So these + may sometimes need to be fixed. Defaults to True. Returns: CDB: The v2 CDB. """ data = load_old_raw_data(old_path) - return convert_data(data) + return convert_data(data, fix_spacy_model_name)