Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion medcat-v2/medcat/tokenizing/spacy_impl/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,17 @@ def ensure_spacy_model(model_name: str) -> None:
cmd = f"{sys.executable} -m spacy download {model_name}"
logger.info("Installing the spacy model %s using the CLI command "
"'%s'", model_name, cmd)
subprocess.run(cmd.split(" "), check=True)
try:
subprocess.run(
cmd.split(" "), check=True, capture_output=True, text=True)
except subprocess.CalledProcessError as err:
if ("requests.exceptions.ConnectionError" in err.stderr and
"Failed to resolve" in err.stderr):
logger.warning(
"Unable to ensure the existing of spacy model '%s'. "
"Internet seems to be unavailable. If the model "
"does not provide its own implementation (it should), "
"subsequent usage may prove problematic. Underlying error:\n"
"%s", model_name, err.stderr, exc_info=err)
else:
raise err
14 changes: 12 additions & 2 deletions medcat-v2/medcat/utils/legacy/conversion_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,11 @@
from medcat.components.types import CoreComponentType
from medcat.storage.serialisers import AvailableSerialisers
from medcat.components.linking.no_action_linker import NoActionLinker
from medcat.utils.config_utils import temp_changed_config

from medcat.utils.legacy.convert_cdb import get_cdb_from_old
from medcat.utils.legacy.convert_config import get_config_from_old
from medcat.utils.legacy.convert_config import fix_spacy_model_name
from medcat.utils.legacy.convert_vocab import get_vocab_from_old
from medcat.utils.legacy.helpers import fix_subnames

Expand Down Expand Up @@ -67,7 +69,8 @@ def convert(self) -> CAT:
CAT: The model pack.
"""
cdb = get_cdb_from_old(
os.path.join(self.old_model_folder, self.cdb_name))
os.path.join(self.old_model_folder, self.cdb_name),
fix_spacy_model_name=False)
vocab_path = os.path.join(self.old_model_folder, self.vocab_name)
if os.path.exists(vocab_path):
vocab = get_vocab_from_old(vocab_path)
Expand All @@ -79,7 +82,14 @@ def convert(self) -> CAT:
config = get_config_from_old(cnf_path)
else:
config = cdb.config
cat = CAT(cdb, vocab, config)
with temp_changed_config(
config.general.nlp, "modelname",
os.path.join(self.old_model_folder,
config.general.nlp.modelname)):
cat = CAT(cdb, vocab, config)
# NOTE: it's probably easier if we change the spacy model name
# afterwards
fix_spacy_model_name(config, cat.pipe.tokenizer)
fix_subnames(cat)
# MetaCATs
meta_cat_folders = [
Expand Down
17 changes: 14 additions & 3 deletions medcat-v2/medcat/utils/legacy/convert_cdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
from medcat.config import Config
from medcat.cdb.concepts import get_new_cui_info, get_new_name_info, TypeInfo
from medcat.utils.legacy.convert_config import get_config_from_nested_dict
from medcat.utils.legacy.convert_config import (
fix_spacy_model_name as apply_spacy_model_fix)


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -209,11 +211,14 @@ def update_names(cdb: CDB, data: dict):
setattr(cdb, name_to, data[name_from])


def convert_data(all_data: dict) -> CDB:
def convert_data(all_data: dict, fix_spacy_model_name: bool = True) -> CDB:
"""Convert the raw v1 data into a CDB.

Args:
all_data (dict): The raw v1 data off disk.
fix_spacy_model_name (bool): Whether to fix the spacy model name.
Older models may have unsupported spacy model names. So these
may sometimes need to be fixed. Defaults to True.

Returns:
CDB: The v2 CDB.
Expand All @@ -226,17 +231,23 @@ def convert_data(all_data: dict) -> CDB:
if 'config' in all_data:
logger.info("Loading old style CDB with config included.")
cdb.config = get_config_from_nested_dict(all_data['config'])
if fix_spacy_model_name:
apply_spacy_model_fix(cdb.config)
return cdb


def get_cdb_from_old(old_path: str) -> CDB:
def get_cdb_from_old(old_path: str,
fix_spacy_model_name: bool = True) -> CDB:
"""Get the v2 CDB from a v1 CDB path.

Args:
old_path (str): The v1 CDB path.
fix_spacy_model_name (bool): Whether to fix the spacy model name.
Older models may have unsupported spacy model names. So these
may sometimes need to be fixed. Defaults to True.

Returns:
CDB: The v2 CDB.
"""
data = load_old_raw_data(old_path)
return convert_data(data)
return convert_data(data, fix_spacy_model_name)
19 changes: 16 additions & 3 deletions medcat-v2/medcat/utils/legacy/convert_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from medcat.utils.legacy.helpers import fix_old_style_cnf
from medcat.config.config import SerialisableBaseModel
from medcat.tokenizing.tokenizers import BaseTokenizer


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -126,8 +127,8 @@ def _relocate(cnf: Config, old_data: dict) -> Config:
orig_val = cast(Any, orig_val)
target_model = cast(BaseModel, target_model)
fname = new_path.split(".")[-1]
logger.info("Relocating from %s to %s (%s)", orig_path, new_path,
type(orig_val).__name__)
logger.info("Relocating from %s to %s (%s) [%s]", orig_path, new_path,
type(orig_val).__name__, orig_val)
_safe_setattr(target_model, fname, orig_val)
return cnf

Expand Down Expand Up @@ -167,13 +168,25 @@ def get_config_from_nested_dict(old_data: dict) -> Config:
# but we now default to regex
cnf.general.nlp.provider = 'spacy'
cnf = _make_changes(cnf, old_data)
return cnf


def fix_spacy_model_name(
cnf: Config,
tokenizer: BaseTokenizer | None = None) -> None:
if cnf.general.nlp.modelname in ('spacy_model', 'en_core_sci_md',
'en_core_sci_lg'):
logger.info("Fixing spacy model. "
"Moving from '%s' to 'en_core_web_md'!",
cnf.general.nlp.modelname)
cnf.general.nlp.modelname = 'en_core_web_md'
return cnf
# NOTE: the tokenizer uses an internally cached name that we need to
# fix here as well so that the name of the subsequently saved
# files is more descriptive than just 'spacy_model'
if tokenizer:
from medcat.tokenizing.spacy_impl.tokenizers import SpacyTokenizer
cast(SpacyTokenizer,
tokenizer)._spacy_model_name = cnf.general.nlp.modelname


def get_config_from_old(path: str) -> Config:
Expand Down
Loading