Source code for zensols.mimic.tokenizer

"""Modify the spaCy parser configuration to deal with the MIMIC-III dataset.

"""
__author__ = 'Paul Landes'

from typing import Tuple, Union, Optional, ClassVar, List
from dataclasses import dataclass, field
import logging
import re
from frozendict import frozendict
from spacy.language import Language
from spacy.lang.char_classes import ALPHA
from spacy.util import compile_infix_regex
from zensols.nlp import Component, FeatureTokenDecorator, FeatureToken

logger = logging.getLogger(__name__)


@dataclass
class MimicTokenizerComponent(Component):
    """Modifies the spaCy tokenizer to split on colons (``:``) to capture
    more MIMIC-III mask tokens.

    """
    def init(self, model: Language):
        inf = list(model.Defaults.infixes)
        SCHARS = ',:;/=@#%+.-'
        # split on newlines; handle newline as an infix token
        inf.insert(0, r'\n')
        # split on special characters before
        inf.insert(1, r"(?<=\*\*\])(?:[{s}])(?=[{a}0-9])".format(
            a=ALPHA, s=SCHARS))
        inf.insert(2, r"(?<=\*\*\])(?=[{a}0-9])".format(a=ALPHA))
        # split on special characters after
        inf.insert(3, r"(?<=[{a}0-9])(?:[{s}])(?=\[\*\*)".format(
            a=ALPHA, s=SCHARS))
        inf.insert(4, r"(?<=[{a}0-9])(?=\[\*\*)".format(a=ALPHA))
        # split on what look to be ranges or hospital1-hospital2
        inf.insert(3, r"(?<=\*\*\])(?:[{s}])(?=\[\*\*)".format(s=SCHARS))
        infix_re = compile_infix_regex(inf)
        model.tokenizer.infix_finditer = infix_re.finditer
    def __hash__(self) -> int:
        return super().__hash__()
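

# A usage sketch, not part of the original module: it demonstrates the infix
# splitting technique that ``init`` applies, assuming only spaCy is installed
# and using just the first "split on special characters before" rule from
# above.  The function name and sample text are hypothetical.
def _demo_mimic_infix_split() -> List[str]:
    import spacy
    nlp: Language = spacy.blank('en')
    inf = list(nlp.Defaults.infixes)
    # split a special character that directly follows a closing mask '**]'
    inf.insert(0, r"(?<=\*\*\])(?:[,:;/=@#%+.-])(?=[{a}0-9])".format(a=ALPHA))
    nlp.tokenizer.infix_finditer = compile_infix_regex(inf).finditer
    # the colon after '[**2154-4-12**]' now becomes its own token
    return [t.text for t in nlp('Admitted [**2154-4-12**]:rule out MI')]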


@dataclass
class MimicTokenDecorator(FeatureTokenDecorator):
    """Contains the MIMIC-III regular expressions and other patterns to
    annotate and normalize feature tokens.  The class finds mask tokens and
    separators (such as a long string of dashes or asterisks).

    Attribute :obj:`onto_mapping` is a mapping from the MIMIC symbol in
    :obj:`token_entities` (2nd value in tuple) to OntoNotes 5, which is used
    as the NER symbol in spaCy.

    """
    TOKEN_FEATURE_ID: ClassVar[str] = 'mimic_'
    """The feature ID to use for MIMIC-III tokens."""

    ONTO_FEATURE_ID: ClassVar[str] = 'onto_'
    """The feature ID to use for the OntoNotes 5 entity
    (:obj:`onto_mapping`).

    """
    MASK_REGEX: ClassVar[re.Pattern] = re.compile(r'\[\*\*([^\*]+)\*\*\]')
    """Matches mask tokens."""

    MASK_TOKEN_FEATURE: ClassVar[str] = 'mask'
    """The value given from entity :obj:`TOKEN_FEATURE_ID` for mask tokens
    (i.e. ``[**First Name**]``).

    """
    SEPARATOR_TOKEN_FEATURE: ClassVar[str] = 'separator'
    """The value name of separators defined by :obj:`SEP_REGEX`."""

    SEP_REGEX: ClassVar[re.Pattern] = re.compile(r'(_{5,}|[*]{5,}|[-]{5,})')
    """Matches text based separators such as a long string of dashes."""

    UNKNOWN_ENTITY: ClassVar[str] = '<UNKNOWN>'
    """The normalized token form used for mask tokens whose entity text
    (e.g. ``First Name``) matches no pattern in :obj:`token_entities`.

    """
    _REGEXES: ClassVar[List] = [
        [MASK_REGEX, MASK_TOKEN_FEATURE],
        [SEP_REGEX, SEPARATOR_TOKEN_FEATURE]]

    token_entities: Tuple[Tuple[Union[re.Pattern, str], str, Optional[str]],
                          ...] = \
        field(default=(
            (re.compile(r'^First Name'), 'FIRSTNAME', 'PERSON'),
            (re.compile(r'^Last Name'), 'LASTNAME', 'PERSON'),
            (re.compile(r'^21\d{2}-\d{1,2}-\d{1,2}$'), 'DATE', 'DATE')))
    """A list of pseudo token patterns, the string to replace the matched
    mask with, and an optional OntoNotes 5 entity name.

    """
    token_replacements: Tuple[Tuple[Union[re.Pattern, str], str], ...] = \
        field(default=())
    """A list of (pattern, replacement) pairs used to replace the normalized
    token text.

    """
    def __post_init__(self):
        self.onto_mapping = {}
        self._compile_regexes('token_entities')
        self._compile_regexes('token_replacements')
        self.onto_mapping = frozendict(self.onto_mapping)

    def _compile_regexes(self, attr: str):
        repls: List[Tuple[re.Pattern, str]] = []
        pat: Union[re.Pattern, str]
        ent: str
        onto_name: Optional[str]
        # entries are (<pattern>, <replacement>[, <OntoNotes 5 name>]); only
        # :obj:`token_entities` entries carry the third element, so index
        # rather than unpack to support both arities
        for entry in getattr(self, attr):
            pat, ent = entry[0], entry[1]
            onto_name = entry[2] if len(entry) > 2 else None
            if isinstance(pat, str):
                pat = re.compile(pat)
            repls.append((pat, ent))
            if onto_name is not None:
                self.onto_mapping[ent] = onto_name
        setattr(self, attr, tuple(repls))
    def decorate(self, token: FeatureToken):
        pat: re.Pattern
        ent: str
        oid: str = FeatureToken.NONE
        matched: bool = False
        for pat, ent in self._REGEXES:
            m: re.Match = pat.match(token.norm)
            if m is not None:
                matched = True
                setattr(token, self.TOKEN_FEATURE_ID, ent)
                if ent == self.MASK_TOKEN_FEATURE:
                    token.norm: str = self.UNKNOWN_ENTITY
                    mask_val: str = m.group(1)
                    for regex, repl in self.token_entities:
                        if regex.match(mask_val) is not None:
                            oid = self.onto_mapping.get(
                                repl, FeatureToken.NONE)
                            if logger.isEnabledFor(logging.DEBUG):
                                logger.debug(
                                    f'dec: {self.TOKEN_FEATURE_ID} ' +
                                    f' -> {ent}, norm -> {mask_val}')
                            token.norm = repl
                            break
                break
        if not matched:
            setattr(token, self.TOKEN_FEATURE_ID, FeatureToken.NONE)
            repl: str
            for pat, repl in self.token_replacements:
                m: re.Match = pat.match(token.norm)
                if m is not None:
                    matched = True
                    token.norm = repl
                    break
        setattr(token, self.ONTO_FEATURE_ID, oid)
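

# A usage sketch, not part of the original module: it exercises the mask and
# separator patterns above directly; in production, ``decorate`` is called by
# the zensols feature token pipeline.  The function name is hypothetical.
def _demo_mimic_patterns() -> None:
    m: re.Match = MimicTokenDecorator.MASK_REGEX.match('[**First Name**]')
    # prints 'First Name', the entity text captured inside the mask
    print(m.group(1))
    # a run of five or more dashes, underscores or asterisks is a separator
    print(MimicTokenDecorator.SEP_REGEX.match('-' * 10) is not None)  # True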