# Source code for zensols.mednlp.resource

"""MedCAT wrapper.

"""
__author__ = 'Paul Landes'

from typing import Tuple, Iterable, Dict, Any, Set
from dataclasses import dataclass, field, InitVar
import logging
import warnings
from pathlib import Path
import re
from frozendict import frozendict
import pandas as pd
import spacy.util
from medcat.config import Config, MixingConfig
from medcat.vocab import Vocab
from medcat.cdb import CDB
from medcat.cat import CAT
from medcat.meta_cat import MetaCAT
from zensols.util import APIError
from zensols.install import Resource, Installer
from zensols.persist import persisted, PersistedWork

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


@dataclass
class MedCatResource(object):
    """A factory class that creates MedCAT resources.

    The :obj:`installer` is invoked lazily (once) the first time one of the
    :obj:`tuis`, :obj:`groups` or :obj:`cat` properties is accessed.

    """
    _MODEL_REGEX = re.compile(r'^([^@]+) @ .+$')
    """A regular expression for a spaCy model dependency (http syntax)."""

    installer: Installer = field()
    """Installs and provides paths to the model files."""

    vocab_resource: Resource = field()
    """The path to the ``vocab.dat`` file."""

    cdb_resource: Resource = field()
    """The ``cdb-medmen-v1.dat`` file."""

    mc_status_resource: Resource = field()
    """The ``mc_status`` directory."""

    umls_tuis: Resource = field()
    """The UMLS TUIs (types) mapping resource that maps from TUIs to
    descriptions.

    :see: `Semantic Types <https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/documentation/SemanticTypesAndGroups.html>`_

    """
    umls_groups: Resource = field()
    """Like :obj:`umls_tuis` but groups TUIs in groups."""

    filter_tuis: Set[str] = field(default=None)
    """Types used to filter linked CUIs (i.e. ``{'T047', 'T048'}``)."""

    filter_groups: Set[str] = field(default=None)
    """Just like :obj:`filter_tuis` but each element is treated as a group
    used to generate a list of CUIs from those mapped from ``name`` to
    ``tui`` in :obj:`groups`.

    """
    spacy_enable_components: Set[str] = field(
        default_factory=lambda: set('sentencizer parser'.split()))
    """By default, MedCAT disables several pipeline components.  Some of
    these are needed for sentence chunking and other downstream tasks.
    Otherwise sentence indexing won't work because sentence boundaries are
    missing.

    :see: `MedCAT Config <https://github.com/CogStack/MedCAT/blob/master/medcat/config.py>`_

    """
    cat_config: Dict[str, Dict[str, Any]] = field(default=None)
    """If provided, set the CDB configuration.  Keys are ``general``,
    ``preprocessing`` and all other attributes documented in the `MedCAT
    Config
    <https://github.com/CogStack/MedCAT/blob/master/medcat/config.py>`_

    """
    cache_global: InitVar[bool] = field(default=True)
    """Whether or not to globally cache resources, which saves load time."""

    requirements_dir: Path = field(default=None)
    """The directory with the pip requirements files."""

    auto_install_models: Tuple[str, ...] = field(default=())
    """A list of spaCy models that will be installed if not already."""

    def __post_init__(self, cache_global: bool):
        """Create the caches backing :obj:`tuis` and :obj:`cat`.

        :param cache_global: passed to each :class:`.PersistedWork`

        """
        self._tuis = PersistedWork('_tuis', self, cache_global=cache_global)
        self._cat = PersistedWork('_cat', self, cache_global=cache_global)
        self._installed = False

    @staticmethod
    def _filter_medcat_logger():
        """Add a filter to the ``medcat.cdb`` logger that drops the "exported
        by an unknown version" message.

        """
        class NoCdbExportFilter(logging.Filter):
            def filter(self, record):
                s = 'The CDB was exported by an unknown version of MedCAT.'
                # returning False drops the record
                return record.getMessage() != s

        logging.getLogger('medcat.cdb').addFilter(NoCdbExportFilter())

    def _assert_installed(self):
        """Invoke the :obj:`installer` the first time this is called."""
        if not self._installed:
            self.installer()
            self._installed = True

    def _override_config(self, targ: Config, src: Dict[str, Dict[str, Any]]):
        """Apply each top level entry of ``src`` to the attribute of the same
        name on ``targ``.

        Dictionaries are updated in place, :class:`~medcat.config.MixingConfig`
        instances are merged, and anything else is replaced outright.

        :param targ: the configuration to modify

        :param src: the overrides keyed by attribute name of ``targ``

        """
        src_top: str
        src_conf: Dict[str, Any]
        for src_top, src_conf in src.items():
            targ_any: Any = getattr(targ, src_top)
            if logger.isEnabledFor(logging.DEBUG):
                logger.debug(f"updating dict '{src_top}' ({type(targ_any)}): " +
                             f"<{targ_any}> with <{src_conf}>")
            if isinstance(targ_any, dict):
                targ_any.update(src_conf)
            elif isinstance(targ_any, MixingConfig):
                targ_any.merge_config(src_conf)
            else:
                setattr(targ, src_top, src_conf)

    def _add_filters(self, config: Config, cdb: CDB):
        """Set the CUI linking filter on ``config`` from :obj:`filter_tuis`
        and :obj:`filter_groups`.

        Nothing is set on ``config`` when no TUIs result from either.

        :param config: the configuration that receives the CUI filter

        :param cdb: used to map each TUI to its CUIs

        """
        filter_tuis = set()
        if self.filter_tuis is not None:
            filter_tuis.update(self.filter_tuis)
        if self.filter_groups is not None:
            df: pd.DataFrame = self.groups
            # group names are joined as regular expression alternatives
            reg = '.*(' + '|'.join(self.filter_groups) + ')'
            df = df[df['name'].str.match(reg)]
            filter_tuis.update(df['tui'].tolist())
        if logger.isEnabledFor(logging.INFO):
            logger.info(f'filtering on tuis: {", ".join(filter_tuis)}')
        if filter_tuis:
            cui_filters = set()
            for tui in filter_tuis:
                cui_filters.update(cdb.addl_info['type_id2cuis'][tui])
            config.linking['filters']['cuis'] = cui_filters

    @property
    @persisted('_tuis')
    def tuis(self) -> Dict[str, str]:
        """A mapping of type identifiers (TUIs) to their descriptions."""
        self._assert_installed()
        path: Path = self.installer[self.umls_tuis]
        # pipe delimited file without a header row
        df = pd.read_csv(path, delimiter='|', header=None)
        df.columns = 'abbrev tui desc'.split()
        df_tups = df[['tui', 'desc']].itertuples(name=None, index=False)
        return frozendict(df_tups)

    # NOTE(review): unlike ``_tuis`` and ``_cat``, no ``_groups``
    # PersistedWork is created in ``__post_init__`` and ``clear`` does not
    # clear it -- confirm this is intended
    @property
    @persisted('_groups')
    def groups(self) -> pd.DataFrame:
        """A dataframe of TUIs, their abbreviations, descriptions and a group
        name associated with each.

        """
        self._assert_installed()
        path: Path = self.installer[self.umls_groups]
        # pipe delimited file without a header row
        df = pd.read_csv(path, delimiter='|', header=None)
        df.columns = 'abbrev name tui desc'.split()
        return df

    @property
    @persisted('_cat')
    def cat(self) -> CAT:
        """The MedCAT NER tagger instance.  When this property is accessed, all
        models are downloaded first, then loaded, if not already.

        :raises OSError: if creating the tagger fails for a reason other than
                         a missing model

        :raises APIError: if a model is missing and can not be installed

        """
        self._assert_installed()
        # Load the vocab model you downloaded
        vocab = Vocab.load(self.installer[self.vocab_resource])
        # Load the cdb model you downloaded
        cdb = CDB.load(self.installer[self.cdb_resource])
        # mc status model
        mc_status = MetaCAT.load(self.installer[self.mc_status_resource])
        # enable sentence boundary annotation; a component that is not in the
        # disabled list is already enabled, so there is nothing to remove
        disabled = cdb.config.general['spacy_disabled_components']
        for name in self.spacy_enable_components:
            if name in disabled:
                disabled.remove(name)
        # override configuration
        if self.cat_config is not None:
            self._override_config(cdb.config, self.cat_config)
        # add TUI filters (i.e. filter out non-medical terms)
        self._add_filters(cdb.config, cdb)
        # ensure models are installed
        self._assert_spacy_models()
        # create cat - each cdb comes with a config that was used to train it;
        # you can change that config in any way you want, before or after
        # creating cat
        try:
            cat = CAT(cdb=cdb, config=cdb.config, vocab=vocab,
                      meta_cats=[mc_status])
        except OSError as e:
            if "Can't find model" not in str(e):
                # not a missing model: propagate with the original traceback
                raise
            logger.info('no scispacy model found--attempting to install')
            self._install_model()
            cat = CAT(cdb=cdb, config=cdb.config, vocab=vocab,
                      meta_cats=[mc_status])
        return cat

    def _install_model(self):
        """Pip install every requirements file in :obj:`requirements_dir`.

        :raises APIError: if :obj:`requirements_dir` is not set

        """
        if self.requirements_dir is None:
            raise APIError('model not installed and no requirements found')
        # NOTE(review): ``pip._internal`` is not a supported API; pip's user
        # guide recommends running ``sys.executable -m pip`` in a subprocess
        from pip._internal import main as pipmain
        req_file: Path
        for req_file in self.requirements_dir.iterdir():
            pipmain(['install', '--use-deprecated=legacy-resolver',
                     '-r', str(req_file), '--no-deps'])

    def _get_model_requirements(self) -> Iterable[Tuple[str, str]]:
        """Scan the files in :obj:`requirements_dir` for lines matching
        :obj:`_MODEL_REGEX`.

        :return: tuples of ``(<text before the " @ ">, <stripped line>)``;
                 nothing if :obj:`requirements_dir` is not set

        """
        if self.requirements_dir is None:
            # no requirements to consult: the caller reports the unmapped
            # model rather than failing with an AttributeError here
            return
        path: Path
        for path in self.requirements_dir.iterdir():
            with open(path) as f:
                line: str
                for line in map(str.strip, f):
                    m: re.Match = self._MODEL_REGEX.match(line)
                    if m is not None:
                        yield (m.group(1), line)

    def _install_dependency(self, dep: str):
        """Pip install a single requirement.

        :param dep: the requirement specifier passed to ``pip install``

        """
        # NOTE(review): ``pip._internal`` is not a supported API; pip's user
        # guide recommends running ``sys.executable -m pip`` in a subprocess
        from pip._internal import main as pipmain
        pipmain(['install', '--use-deprecated=legacy-resolver',
                 dep, '--no-deps'])

    def _assert_spacy_models(self):
        """Install each model in :obj:`auto_install_models` that spaCy does
        not report as an installed package.

        :raises APIError: if a missing model has no matching line in the
                          requirements files

        """
        model_name: str
        missing: Set[str] = {
            model_name for model_name in self.auto_install_models
            if not spacy.util.is_package(model_name)}
        if missing:
            if logger.isEnabledFor(logging.INFO):
                logger.info(f'installing missing models: {missing}')
            reqs: Dict[str, str] = dict(self._get_model_requirements())
            for model_name in missing:
                dep: str = reqs.get(model_name)
                if dep is None:
                    raise APIError(
                        f'Resource needs unmapped model: {model_name}')
                self._install_dependency(dep)

    def clear(self):
        """Clear the cached :obj:`tuis` and :obj:`cat` data."""
        self._tuis.clear()
        self._cat.clear()
# Runs at import time: adds the filter on the ``medcat.cdb`` logger that drops
# the "exported by an unknown version of MedCAT" message.
MedCatResource._filter_medcat_logger()