Source code for zensols.deepnlp.model.sequence

"""Utility classes for mapping, aggregating, and collating sequence (i.e. NER)
labels.

"""
__author__ = 'Paul Landes'

from typing import Tuple, List
from dataclasses import dataclass, field
import logging
import sys
from io import TextIOBase
from spacy.tokens.doc import Doc
from spacy.tokens import Token
from zensols.persist import persisted, PersistableContainer
from zensols.config import Dictable
from zensols.nlp import FeatureDocument, FeatureToken, FeatureSentence

logger = logging.getLogger(__name__)



[docs]
@dataclass
class SequenceAnnotation(PersistableContainer, Dictable):
    """A labeled group of feature tokens together with the document they
    belong to.  Each feature token can be paired with the spaCy token found at
    the same document index (see :obj:`token_matches`).

    """
    label: str = field()
    """The label given to the annotated tokens."""

    doc: FeatureDocument = field()
    """The feature document the annotated tokens come from."""

    tokens: Tuple[FeatureToken] = field()
    """The feature tokens that carry ``label``."""

    @property
    @persisted('_sent', transient=True)
    def sent(self) -> FeatureSentence:
        """The one sentence of :obj:`doc` holding the annotated tokens.  An
        assertion fails when the tokens do not resolve to exactly one sentence.

        """
        found = self.doc.sentences_for_tokens(self.tokens)
        assert len(found) == 1
        return found[0]

    @property
    @persisted('_token_matches', transient=True)
    def token_matches(self) -> Tuple[FeatureToken, Token]:
        """A tuple of ``(feature token, spaCy token)`` pairs, one per token in
        :obj:`tokens`.  This is useful for annotating spaCy documents.

        """
        spacy_doc: Doc = self.doc.spacy_doc
        # the spaCy token is looked up by the feature token's ``i`` attribute
        return tuple((ftok, spacy_doc[ftok.i]) for ftok in self.tokens)

    @property
    def mention(self) -> str:
        """The annotated tokens as strings joined by single spaces."""
        return ' '.join(str(ftok) for ftok in self.tokens)

    def write(self, depth: int = 0, writer: TextIOBase = sys.stdout,
              short: bool = False):
        """Write this annotation using ``_write_line``.

        :param depth: the indentation level passed on to ``_write_line``

        :param writer: where the output is written

        :param short: if ``True`` write a single summary line (mention, label
                      and the document index of the first token), otherwise
                      write the label followed by one line per token

        """
        if not short:
            self._write_line(f'label: {self.label}', depth, writer)
            ftok: FeatureToken
            for ftok in self.tokens:
                # the sentence index is reported only by tokens that have one
                if hasattr(ftok, 'sent_i'):
                    sent = f'sent index={ftok.sent_i}, '
                else:
                    sent = ''
                self._write_line(f'{ftok.text}: {sent}index in doc={ftok.i}',
                                 depth + 1, writer)
            return
        s = f'{self.mention}: {self.label} ({self.tokens[0].i})'
        self._write_line(s, depth, writer)

    def __str__(self):
        return f'{self.mention} ({self.label})'




[docs]
@dataclass
class SequenceDocumentAnnotation(Dictable):
    """Holds a :class:`~zensols.nlp.FeatureDocument` along with the
    :class:`.SequenceAnnotation` instances that annotate it.

    """
    doc: FeatureDocument = field()
    """The annotated feature document."""

    sequence_anons: Tuple[SequenceAnnotation] = field()
    """The annotations that belong to :obj:`doc`."""

    @property
    def spacy_doc(self) -> Doc:
        """The spaCy document of :obj:`doc`."""
        return self.doc.spacy_doc

    @property
    @persisted('_token_matches', transient=True)
    def token_matches(self) -> Tuple[str, FeatureToken, Token]:
        """A tuple of ``(label, feature token, spaCy token)`` triples gathered
        from every annotation in :obj:`sequence_anons`, in order.  This is
        useful for annotating spaCy documents.

        """
        # each annotation yields (feature token, spaCy token) pairs; the
        # annotation's label is prepended to make the triple
        return tuple(
            (anon.label, *pair)
            for anon in self.sequence_anons
            for pair in anon.token_matches)

    def write(self, depth: int = 0, writer: TextIOBase = sys.stdout,
              short: bool = False):
        """Write a header line for the document followed by each annotation
        one indentation level deeper.

        :param depth: the indentation level of the header line

        :param writer: where the output is written

        :param short: passed on to each annotation's ``write`` method

        """
        self._write_line(f'doc: {self.doc} S={short}', depth, writer)
        child_depth: int = depth + 1
        sanon: SequenceAnnotation
        for sanon in self.sequence_anons:
            sanon.write(child_depth, writer, short=short)





[docs]
@dataclass
class BioSequenceAnnotationMapper(object):
    """Matches feature documents/tokens with spaCy document/tokens and entity
    labels.

    Labels are read from BIO style classes of the form ``<tag>-<label>`` (for
    example ``B-PER``) where the tag is one of :obj:`begin_tag` or
    :obj:`in_tag`, or from the bare :obj:`out_tag`.

    """
    begin_tag: str = field(default='B')
    """The sequence ``begin`` tag class."""

    in_tag: str = field(default='I')
    """The sequence ``in`` tag class."""

    out_tag: str = field(default='O')
    """The sequence ``out`` tag class."""

    def _map_entities(self, classes: Tuple[List[str]],
                      docs: Tuple[FeatureDocument]) -> \
            List[Tuple[str, int, Tuple[int, int]]]:
        """Map BIO entities and documents to a pairing of both.

        An entity starts at a :obj:`begin_tag` class and ends (exclusively) at
        the next :obj:`out_tag` class, at the next :obj:`begin_tag` class or at
        the end of the sequence, whichever comes first.

        :param classes: the classes (labels, or usually, predictions), one
                        list per document

        :param docs: the feature documents to assign labels

        :return: a list of ``(label, index of the document in docs, (start,
                 end))`` tuples where ``start`` (inclusive) and ``end``
                 (exclusive) are positions in the document's ``tokens``

        """
        ents: List[Tuple[str, int, Tuple[int, int]]] = []
        # tok.i is not reliable since holes exist from filtered space and
        # possibly other removed tokens, so positions in ``doc.tokens`` are
        # used instead
        for six, (cls, doc) in enumerate(zip(classes, docs)):
            start_ix: int = None
            start_lab: str = None
            # number of (class, token) pairs seen; the exclusive end of an
            # entity that is still open when the sequence runs out
            n_seen: int = 0
            ent: str
            for stix, (ent, _tok) in enumerate(zip(cls, doc.tokens)):
                n_seen = stix + 1
                bio, sep, lab = ent.partition('-')
                is_begin: bool = bool(sep) and bio == self.begin_tag
                # an open entity is closed by an out tag and also by the
                # start of the next entity (adjacent entities)
                if start_ix is not None and \
                   (is_begin or ent == self.out_tag):
                    ents.append((start_lab, six, (start_ix, stix)))
                    start_ix = None
                    start_lab = None
                if is_begin:
                    start_ix = stix
                    start_lab = lab
            # an entity reaching the last token has no out tag to close it
            if start_ix is not None:
                ents.append((start_lab, six, (start_ix, n_seen)))
        return ents

    def _collate(self, docs: Tuple[FeatureDocument],
                 ents: List[Tuple[str, int, Tuple[int, int]]]) -> \
            List[SequenceAnnotation]:
        """Collate entity tokens in to groups.

        :param docs: the feature documents to assign labels

        :param ents: ``(label, index of the document in docs, (start, end))``
                     tuples as returned by :meth:`_map_entities`

        :return: one :class:`.SequenceAnnotation` per entry in ``ents``, in
                 the same order, holding the label, the document and the
                 ``tokens[start:end]`` slice of that document

        """
        anons: List[SequenceAnnotation] = []
        for lab, six, (start, end) in ents:
            doc: FeatureDocument = docs[six]
            ent_toks: Tuple[FeatureToken] = doc.tokens[start:end]
            anons.append(SequenceAnnotation(lab, doc, ent_toks))
        return anons

    def map(self, classes: Tuple[List[str]],
            docs: Tuple[FeatureDocument]) -> List[SequenceDocumentAnnotation]:
        """Map BIO entities and documents to pairings as annotations.

        :param classes: a tuple of lists, each list containing the class of the
                        token in BIO format

        :param docs: the feature documents to assign labels

        :return: a list of document annotations, one for each run of
                 consecutive annotations that share a document; documents
                 without any entity produce no entry

        """
        ents: List[Tuple[str, int, Tuple[int, int]]] = \
            self._map_entities(classes, docs)
        sanons: List[SequenceAnnotation] = self._collate(docs, ents)
        col_sanons: List[SequenceAnnotation] = []
        danons: List[SequenceDocumentAnnotation] = []
        last_doc: FeatureDocument = None
        sanon: SequenceAnnotation
        for sanon in sanons:
            # flush the previous document's annotations *before* adding the
            # current one so it is grouped with its own document
            if col_sanons and sanon.doc != last_doc:
                danons.append(SequenceDocumentAnnotation(
                    last_doc, tuple(col_sanons)))
                col_sanons = []
            col_sanons.append(sanon)
            last_doc = sanon.doc
        if col_sanons:
            danons.append(SequenceDocumentAnnotation(
                last_doc, tuple(col_sanons)))
        return danons