Source code for zensols.mednlp.parser
"""Medical language parser.
"""
__author__ = 'Paul Landes'
from typing import Type, Iterable, Dict, Set
from dataclasses import dataclass, field
import logging
import collections
import textwrap as tw
from spacy.tokens.doc import Doc
from spacy.language import Language
from zensols.nlp import FeatureToken, FeatureDocumentParser
from zensols.nlp.sparser import SpacyFeatureDocumentParser
from . import MedNLPError, MedCatResource, MedicalFeatureToken
from .domain import _MedicalEntity
logger = logging.getLogger(__name__)
[docs]
@dataclass
class MedCatFeatureDocumentParser(SpacyFeatureDocumentParser):
    """A spaCy based document parser that uses a MedCAT resource to associate
    medical concept entities with the tokens it creates.

    """
    TOKEN_FEATURE_IDS = frozenset(
        FeatureDocumentParser.TOKEN_FEATURE_IDS |
        MedicalFeatureToken.FEATURE_IDS)
    """The default token feature ID set for the medical parser: the base
    parser's feature IDs combined with the medical token feature IDs.

    """
    token_feature_ids: Set[str] = field(default=TOKEN_FEATURE_IDS)
    """The features to keep from spaCy tokens.

    :see: :obj:`TOKEN_FEATURE_IDS`

    """
    token_class: Type[FeatureToken] = field(default=MedicalFeatureToken)
    """The class to use for instances created by :meth:`features`."""

    medcat_resource: MedCatResource = field(default=None)
    """The MedCAT factory resource, which is required."""

    def __post_init__(self):
        """Verify a MedCAT resource was given, then initialize the parent.

        :raises MedNLPError: if :obj:`medcat_resource` is ``None``

        """
        resource = self.medcat_resource
        if resource is None:
            raise MedNLPError('No medcat resource set')
        super().__post_init__()

    def _create_model_key(self) -> str:
        """Return the model key, which is built from this parser's name."""
        model_key: str = f'name-{self.name}'
        return model_key

    def _create_model(self) -> Language:
        """Return the spaCy language model held by the MedCAT resource."""
        resource: MedCatResource = self.medcat_resource
        return resource.cat.pipe.spacy_nlp

    def _normalize_tokens(self, doc: Doc) -> Iterable[FeatureToken]:
        """Index the document's entities by token, then delegate token
        normalization to the parent parser.

        :param doc: the spaCy document whose tokens are normalized

        :return: the tokens created by the parent implementation, which is
                 passed the MedCAT resource (``res``) and the token to entity
                 mapping (``ix2ent``)

        """
        if logger.isEnabledFor(logging.INFO):
            logger.info(f'parsing: {tw.shorten(str(doc), 60)}')
        # medical entities keyed by each token's ``idx``; an entry is created
        # the first time a key is accessed
        entity_by_idx: Dict[int, _MedicalEntity] = \
            collections.defaultdict(_MedicalEntity)
        for span in doc.ents:
            # every token covered by the entity refers back to the same span
            for token_ix in range(span.start, span.end):
                entity_by_idx[doc[token_ix].idx].concept_span = span
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'normalizing with: {self.token_normalizer}')
        return super()._normalize_tokens(
            doc, res=self.medcat_resource, ix2ent=entity_by_idx)