diff --git a/medcat-v2/medcat/tokenizing/spacy_impl/utils.py b/medcat-v2/medcat/tokenizing/spacy_impl/utils.py index 8989abd6e..506dbe02e 100644 --- a/medcat-v2/medcat/tokenizing/spacy_impl/utils.py +++ b/medcat-v2/medcat/tokenizing/spacy_impl/utils.py @@ -35,4 +35,17 @@ def ensure_spacy_model(model_name: str) -> None: cmd = f"{sys.executable} -m spacy download {model_name}" logger.info("Installing the spacy model %s using the CLI command " "'%s'", model_name, cmd) - subprocess.run(cmd.split(" "), check=True) + try: + subprocess.run( + cmd.split(" "), check=True, capture_output=True, text=True) + except subprocess.CalledProcessError as err: + if ("requests.exceptions.ConnectionError" in err.stderr and + "Failed to resolve" in err.stderr): + logger.warning( + "Unable to ensure the existing of spacy model '%s'. " + "Internet seems to be unavailable. If the model " + "does not provide its own implementation (it should), " + "subsequent usage may prove problematic. Underlying error:\n" + "%s", model_name, err.stderr, exc_info=err) + else: + raise err diff --git a/medcat-v2/medcat/utils/legacy/conversion_all.py b/medcat-v2/medcat/utils/legacy/conversion_all.py index 7d7455279..e0e0125b4 100644 --- a/medcat-v2/medcat/utils/legacy/conversion_all.py +++ b/medcat-v2/medcat/utils/legacy/conversion_all.py @@ -7,9 +7,11 @@ from medcat.components.types import CoreComponentType from medcat.storage.serialisers import AvailableSerialisers from medcat.components.linking.no_action_linker import NoActionLinker +from medcat.utils.config_utils import temp_changed_config from medcat.utils.legacy.convert_cdb import get_cdb_from_old from medcat.utils.legacy.convert_config import get_config_from_old +from medcat.utils.legacy.convert_config import fix_spacy_model_name from medcat.utils.legacy.convert_vocab import get_vocab_from_old from medcat.utils.legacy.helpers import fix_subnames @@ -67,7 +69,8 @@ def convert(self) -> CAT: CAT: The model pack. 
""" cdb = get_cdb_from_old( - os.path.join(self.old_model_folder, self.cdb_name)) + os.path.join(self.old_model_folder, self.cdb_name), + fix_spacy_model_name=False) vocab_path = os.path.join(self.old_model_folder, self.vocab_name) if os.path.exists(vocab_path): vocab = get_vocab_from_old(vocab_path) @@ -79,7 +82,14 @@ def convert(self) -> CAT: config = get_config_from_old(cnf_path) else: config = cdb.config - cat = CAT(cdb, vocab, config) + with temp_changed_config( + config.general.nlp, "modelname", + os.path.join(self.old_model_folder, + config.general.nlp.modelname)): + cat = CAT(cdb, vocab, config) + # NOTE: it's probably easier if we change the spacy model name + # afterwards + fix_spacy_model_name(config, cat.pipe.tokenizer) fix_subnames(cat) # MetaCATs meta_cat_folders = [ diff --git a/medcat-v2/medcat/utils/legacy/convert_cdb.py b/medcat-v2/medcat/utils/legacy/convert_cdb.py index 119533599..8a57cd7ab 100644 --- a/medcat-v2/medcat/utils/legacy/convert_cdb.py +++ b/medcat-v2/medcat/utils/legacy/convert_cdb.py @@ -5,6 +5,8 @@ from medcat.config import Config from medcat.cdb.concepts import get_new_cui_info, get_new_name_info, TypeInfo from medcat.utils.legacy.convert_config import get_config_from_nested_dict +from medcat.utils.legacy.convert_config import ( + fix_spacy_model_name as apply_spacy_model_fix) logger = logging.getLogger(__name__) @@ -209,11 +211,14 @@ def update_names(cdb: CDB, data: dict): setattr(cdb, name_to, data[name_from]) -def convert_data(all_data: dict) -> CDB: +def convert_data(all_data: dict, fix_spacy_model_name: bool = True) -> CDB: """Convert the raw v1 data into a CDB. Args: all_data (dict): The raw v1 data off disk. + fix_spacy_model_name (bool): Whether to fix the spacy model name. + Older models may have unsupported spacy model names. So these + may sometimes need to be fixed. Defaults to True. Returns: CDB: The v2 CDB. 
@@ -226,17 +231,23 @@ def convert_data(all_data: dict) -> CDB: if 'config' in all_data: logger.info("Loading old style CDB with config included.") cdb.config = get_config_from_nested_dict(all_data['config']) + if fix_spacy_model_name: + apply_spacy_model_fix(cdb.config) return cdb -def get_cdb_from_old(old_path: str) -> CDB: +def get_cdb_from_old(old_path: str, + fix_spacy_model_name: bool = True) -> CDB: """Get the v2 CDB from a v1 CDB path. Args: old_path (str): The v1 CDB path. + fix_spacy_model_name (bool): Whether to fix the spacy model name. + Older models may have unsupported spacy model names. So these + may sometimes need to be fixed. Defaults to True. Returns: CDB: The v2 CDB. """ data = load_old_raw_data(old_path) - return convert_data(data) + return convert_data(data, fix_spacy_model_name) diff --git a/medcat-v2/medcat/utils/legacy/convert_config.py b/medcat-v2/medcat/utils/legacy/convert_config.py index 3ed0f060a..b1853984c 100644 --- a/medcat-v2/medcat/utils/legacy/convert_config.py +++ b/medcat-v2/medcat/utils/legacy/convert_config.py @@ -8,6 +8,7 @@ from medcat.utils.legacy.helpers import fix_old_style_cnf from medcat.config.config import SerialisableBaseModel +from medcat.tokenizing.tokenizers import BaseTokenizer logger = logging.getLogger(__name__) @@ -126,8 +127,8 @@ def _relocate(cnf: Config, old_data: dict) -> Config: orig_val = cast(Any, orig_val) target_model = cast(BaseModel, target_model) fname = new_path.split(".")[-1] - logger.info("Relocating from %s to %s (%s)", orig_path, new_path, - type(orig_val).__name__) + logger.info("Relocating from %s to %s (%s) [%s]", orig_path, new_path, + type(orig_val).__name__, orig_val) _safe_setattr(target_model, fname, orig_val) return cnf @@ -167,13 +168,25 @@ def get_config_from_nested_dict(old_data: dict) -> Config: # but we now default to regex cnf.general.nlp.provider = 'spacy' cnf = _make_changes(cnf, old_data) + return cnf + + +def fix_spacy_model_name( + cnf: Config, + tokenizer: BaseTokenizer | 
None = None) -> None: if cnf.general.nlp.modelname in ('spacy_model', 'en_core_sci_md', 'en_core_sci_lg'): logger.info("Fixing spacy model. " "Moving from '%s' to 'en_core_web_md'!", cnf.general.nlp.modelname) cnf.general.nlp.modelname = 'en_core_web_md' - return cnf + # NOTE: the tokenizer uses an internally cached name that we need to + # fix here as well so that the name of the subsequently saved + # files is more descriptive than just 'spacy_model' + if tokenizer: + from medcat.tokenizing.spacy_impl.tokenizers import SpacyTokenizer + cast(SpacyTokenizer, + tokenizer)._spacy_model_name = cnf.general.nlp.modelname def get_config_from_old(path: str) -> Config: