"""Utility classes for mapping aggregating and collating sequence (i.e. NER)
labels.
"""
__author__ = 'Paul Landes'
from typing import Tuple, List
from dataclasses import dataclass, field
import logging
import sys
from io import TextIOBase
from spacy.tokens.doc import Doc
from spacy.tokens import Token
from zensols.persist import persisted, PersistableContainer
from zensols.config import Dictable
from zensols.nlp import FeatureDocument, FeatureToken, FeatureSentence
logger = logging.getLogger(__name__)
[docs]
@dataclass
class SequenceAnnotation(PersistableContainer, Dictable):
    """A labeled group of feature tokens from one document, with access to
    the spaCy tokens they line up with.

    """
    label: str = field()
    """The label given to :obj:`tokens`."""

    doc: FeatureDocument = field()
    """The feature document the annotated tokens come from."""

    tokens: Tuple[FeatureToken] = field()
    """The feature tokens that carry :obj:`label`."""

    @property
    @persisted('_sent', transient=True)
    def sent(self) -> FeatureSentence:
        """The single sentence that holds all of the annotated tokens.

        The document is asked for the sentences covering :obj:`tokens` and
        exactly one is expected (enforced with an assertion).

        """
        containing = self.doc.sentences_for_tokens(self.tokens)
        assert len(containing) == 1
        return containing[0]

    @property
    @persisted('_token_matches', transient=True)
    def token_matches(self) -> Tuple[FeatureToken, Token]:
        """A tuple of ``(feature token, spaCy token)`` pairs, one per
        annotated token.  The spaCy token is looked up in the document's
        spaCy doc by the feature token's ``i`` index.  This is useful for
        annotating spaCy documents.

        """
        spacy_doc: Doc = self.doc.spacy_doc
        return tuple((ftok, spacy_doc[ftok.i]) for ftok in self.tokens)

    @property
    def mention(self) -> str:
        """The annotated tokens rendered as strings and joined by single
        spaces.

        """
        return ' '.join(str(ftok) for ftok in self.tokens)

    def write(self, depth: int = 0, writer: TextIOBase = sys.stdout,
              short: bool = False):
        """Write a human readable description of this annotation.

        :param depth: the indentation level handed to ``_write_line``

        :param writer: the sink to write to

        :param short: if ``True`` write one summary line (mention, label and
                      the ``i`` index of the first token), otherwise write
                      the label followed by one line per token

        """
        if short:
            summary = f'{self.mention}: {self.label} ({self.tokens[0].i})'
            self._write_line(summary, depth, writer)
            return
        self._write_line(f'label: {self.label}', depth, writer)
        ftok: FeatureToken
        for ftok in self.tokens:
            # the sentence index is only reported for tokens that have one
            sent = (f'sent index={ftok.sent_i}, '
                    if hasattr(ftok, 'sent_i') else '')
            self._write_line(f'{ftok.text}: {sent}index in doc={ftok.i}',
                             depth + 1, writer)

    def __str__(self):
        return f'{self.mention} ({self.label})'
[docs]
@dataclass
class SequenceDocumentAnnotation(Dictable):
    """Holds the :class:`.SequenceAnnotation` instances that belong to one
    :class:`~zensols.nlp.FeatureDocument`.

    """
    doc: FeatureDocument = field()
    """The feature document the annotations were made on."""

    sequence_anons: Tuple[SequenceAnnotation] = field()
    """The annotations that go with :obj:`doc`."""

    @property
    def spacy_doc(self) -> Doc:
        """The spaCy document of :obj:`doc`."""
        return self.doc.spacy_doc

    @property
    @persisted('_token_matches', transient=True)
    def token_matches(self) -> Tuple[str, FeatureToken, Token]:
        """A tuple of ``(label, feature token, spaCy token)`` triples, one
        for every token pair of every annotation in :obj:`sequence_anons`.
        This is useful for annotating spaCy documents.

        """
        return tuple(
            (anon.label, *pair)
            for anon in self.sequence_anons
            for pair in anon.token_matches)

    def write(self, depth: int = 0, writer: TextIOBase = sys.stdout,
              short: bool = False):
        """Write the document line and then every annotation one level
        deeper.

        :param depth: the indentation level handed to ``_write_line``

        :param writer: the sink to write to

        :param short: passed on to each annotation's ``write`` method

        """
        self._write_line(f'doc: {self.doc} S={short}', depth, writer)
        child_depth = depth + 1
        for entry in self.sequence_anons:
            entry.write(child_depth, writer, short=short)
[docs]
@dataclass
class BioSequenceAnnotationMapper(object):
    """Matches feature documents/tokens with spaCy document/tokens and entity
    labels.

    """
    begin_tag: str = field(default='B')
    """The sequence ``begin`` tag class."""

    in_tag: str = field(default='I')
    """The sequence ``in`` tag class."""

    out_tag: str = field(default='O')
    """The sequence ``out`` tag class."""

    def _map_entities(self, classes: Tuple[List[str]],
                      docs: Tuple[FeatureDocument]) -> \
            List[Tuple[str, int, Tuple[int, int]]]:
        """Map BIO entities and documents to a pairing of both.

        An entity is opened by a ``<begin_tag>-<label>`` class and is closed
        by the next :obj:`out_tag`, by the next ``<begin_tag>-`` class
        (which immediately opens a new entity), or by the end of the
        sequence.

        :param classes: the classes (labels, or usually, predictions), one
                        list per document

        :param docs: the feature documents to assign labels

        :return: a list of ``(label, sentence index, (start, end))`` where
                 ``sentence index`` is the position of the document in
                 ``docs`` and ``(start, end)`` is a half open interval of
                 positions in that document's ``tokens``

        """
        ents: List[Tuple[str, int, Tuple[int, int]]] = []
        doc: FeatureDocument
        # tok.i is not reliable since holes exist from filtered space and
        # possibly other removed tokens, so positions in doc.tokens are used
        for six, (cls, doc) in enumerate(zip(classes, docs)):
            start_ix = None
            start_lab = None
            # number of (class, token) pairs seen; zip stops at the shorter
            n_seen = 0
            ent: str
            for stix, (ent, _tok) in enumerate(zip(cls, doc.tokens)):
                n_seen = stix + 1
                bio, sep, lab = ent.partition('-')
                if sep and bio == self.begin_tag:
                    # a new entity starting right after an open one must
                    # close the open one rather than overwrite it
                    if start_ix is not None:
                        ents.append((start_lab, six, (start_ix, stix)))
                    start_ix, start_lab = stix, lab
                elif ent == self.out_tag and start_ix is not None:
                    ents.append((start_lab, six, (start_ix, stix)))
                    start_ix = None
                    start_lab = None
            # an entity that runs to the end of the sequence has no trailing
            # out tag to close it
            if start_ix is not None:
                ents.append((start_lab, six, (start_ix, n_seen)))
        return ents

    def _collate(self, docs: Tuple[FeatureDocument],
                 ents: Tuple[str, int, Tuple[int, int]]) -> \
            List[SequenceAnnotation]:
        """Collate entity tokens in to groups.

        :param docs: the feature documents to assign labels

        :param ents: a sequence of ``(label, sentence index, (start, end))``
                     as returned by :meth:`_map_entities`

        :return: one :class:`.SequenceAnnotation` per entry in ``ents``,
                 holding the label, the document and the slice of the
                 document's tokens given by the interval

        """
        anons: List[SequenceAnnotation] = []
        for lab, six, (start, end) in ents:
            doc: FeatureDocument = docs[six]
            ent_toks: Tuple[FeatureToken] = doc.tokens[start:end]
            anons.append(SequenceAnnotation(lab, doc, ent_toks))
        return anons

    def map(self, classes: Tuple[List[str]],
            docs: Tuple[FeatureDocument]) -> List[SequenceDocumentAnnotation]:
        """Map BIO entities and documents to pairings as annotations.

        Consecutive annotations are grouped together for as long as their
        documents compare equal.  A document without any entity produces no
        entry in the result.

        :param classes: a tuple of lists, each list containing the class of
                        the token in BIO format

        :param docs: the feature documents to assign labels

        :return: a list of document annotations, each with coupling of
                 label, feature token and spaCy token

        """
        ents = self._map_entities(classes, docs)
        sanons: List[SequenceAnnotation] = self._collate(docs, ents)
        col_sanons: List[SequenceAnnotation] = []
        danons: List[SequenceDocumentAnnotation] = []
        last_doc: FeatureDocument = None
        sanon: SequenceAnnotation
        for sanon in sanons:
            # close the running group *before* adding the annotation so the
            # first annotation of a new document is not filed under the
            # previous document
            if last_doc is not None and sanon.doc != last_doc:
                danons.append(SequenceDocumentAnnotation(
                    last_doc, tuple(col_sanons)))
                col_sanons = []
            col_sanons.append(sanon)
            last_doc = sanon.doc
        if col_sanons:
            danons.append(SequenceDocumentAnnotation(
                last_doc, tuple(col_sanons)))
        return danons