Source code for zensols.nlp.decorate

"""Contains useful classes for decorating feature sentences.

"""
__author__ = 'Paul Landes'

from typing import List, Tuple, Set, Dict, Any
from dataclasses import dataclass, field
import re
from . import (
    NLPError, LexicalSpan, FeatureToken, TokenContainer,
    FeatureSentence, FeatureDocument, FeatureTokenContainerDecorator,
    FeatureSentenceDecorator, FeatureDocumentDecorator
)



[docs]
@dataclass
class SplitTokenSentenceDecorator(FeatureSentenceDecorator):
    """A decorator that splits feature tokens by white space.

    The normalized text (``norm``) of every token is scanned for runs of
    non-whitespace characters.  A token with exactly one run is kept as is; any
    other token is replaced by one clone per run, so a token without any
    non-whitespace text contributes no tokens at all.

    """
    def _split_tok(self, ftok: FeatureToken,
                   matches: Tuple[re.Match, ...]) -> List[FeatureToken]:
        """Create one clone of ``ftok`` for each regular expression match.

        :param ftok: the token to split

        :param matches: the non-whitespace matches found in ``ftok.norm``

        :return: the clones, each with ``norm`` set to the matched text,
                 ``lexspan`` offset from the beginning of the span of ``ftok``
                 by the match position, and ``idx`` set to the beginning of the
                 new span

        """
        toks: List[FeatureToken] = []
        match: re.Match
        for match in matches:
            ctok: FeatureToken = ftok.clone()
            ctok.norm = match.group(0)
            # match offsets are relative to the token's text, so shift them by
            # the start of the original token's span
            ctok.lexspan = LexicalSpan(ftok.lexspan.begin + match.start(0),
                                       ftok.lexspan.begin + match.end(0))
            ctok.idx = ctok.lexspan.begin
            toks.append(ctok)
        return toks

    def decorate(self, sent: FeatureSentence):
        """Replace the tokens of ``sent`` with their white space split
        counterparts.  The sentence's tokens are only reassigned when at least
        one token was split or dropped.

        :param sent: the sentence to modify in place

        """
        split_toks: List[FeatureToken] = []
        # track modifications explicitly: comparing token counts misses the
        # case where one token is dropped while another is split in two
        changed: bool = False
        ftok: FeatureToken
        for ftok in sent.token_iter():
            matches: Tuple[re.Match, ...] = tuple(
                re.finditer(r'\S+', ftok.norm))
            if len(matches) == 1:
                split_toks.append(ftok)
            else:
                split_toks.extend(self._split_tok(ftok, matches))
                changed = True
        if changed:
            sent.tokens = tuple(split_toks)





[docs]
@dataclass
class StripTokenContainerDecorator(FeatureTokenContainerDecorator):
    """A decorator that strips whitespace from a sentence (or any other
    :class:`.TokenContainer`) by delegating to the container.

    :see: :meth:`.TokenContainer.strip`

    """
    def decorate(self, container: TokenContainer) -> None:
        """Invoke :meth:`.TokenContainer.strip` on ``container``.

        :param container: the container to strip

        """
        container.strip()





[docs]
@dataclass
class FilterTokenSentenceDecorator(FeatureSentenceDecorator):
    """A decorator that removes tokens from a sentence.  Which tokens are
    dropped is controlled by the ``remove_*`` fields, all of which are off by
    default.

    """
    remove_stop: bool = field(default=False)
    """Whether to remove stop words (tokens with ``is_stop`` set)."""

    remove_space: bool = field(default=False)
    """Whether to remove white space tokens (i.e. new lines)."""

    remove_pronouns: bool = field(default=False)
    """Whether to remove pronouns (i.e. ``he``), which are tokens having a
    ``pos_`` of ``PRON``.

    """
    remove_punctuation: bool = field(default=False)
    """Whether to remove punctuation (i.e. periods)."""

    remove_determiners: bool = field(default=False)
    """Whether to remove determiners (i.e. ``the``), which are tokens having a
    ``tag_`` of ``DT``.

    """
    remove_empty: bool = field(default=False)
    """Whether to remove 0-length tokens (using normalized text)."""

    def decorate(self, sent: FeatureSentence) -> None:
        """Drop every token of ``sent`` matched by an enabled ``remove_*``
        field.  The sentence's tokens are reassigned only when at least one
        token was dropped.

        :param sent: the sentence to modify in place

        """
        def keep(tok: FeatureToken) -> bool:
            # a token attribute is only consulted when its flag is enabled
            if self.remove_stop and tok.is_stop:
                return False
            if self.remove_space and tok.is_space:
                return False
            if self.remove_pronouns and tok.pos_ == 'PRON':
                return False
            if self.remove_punctuation and tok.is_punctuation:
                return False
            if self.remove_determiners and tok.tag_ == 'DT':
                return False
            if self.remove_empty and len(tok.norm) == 0:
                return False
            return True

        kept: Tuple[FeatureToken, ...] = tuple(
            tok for tok in sent if keep(tok))
        if len(kept) != sent.token_len:
            sent.tokens = kept





[docs]
@dataclass
class FilterEmptySentenceDocumentDecorator(FeatureDocumentDecorator):
    """Remove the sentences of a document that have no tokens.

    """
    filter_space: bool = field(default=True)
    """Whether to ignore space tokens when deciding if a sentence is empty."""

    def _filter_empty_sentences(self, fsent: FeatureSentence) -> bool:
        """Return whether ``fsent`` has tokens and should be kept.

        :param fsent: the sentence to test

        :return: ``True`` if the sentence has at least one token, not counting
                 space tokens when :obj:`filter_space` is set

        """
        if self.filter_space:
            non_space: List[FeatureToken] = [
                tok for tok in fsent.token_iter() if not tok.is_space]
            return len(non_space) > 0
        return len(fsent.tokens) > 0

    def decorate(self, doc: FeatureDocument) -> None:
        """Drop the empty sentences of ``doc``.  The document's sentences are
        reassigned only when the number kept differs from ``len(doc)``.

        :param doc: the document to modify in place

        """
        before: int = len(doc)
        kept: Tuple[FeatureSentence, ...] = tuple(
            sent for sent in doc.sents if self._filter_empty_sentences(sent))
        if len(kept) != before:
            doc.sents = kept





[docs]
@dataclass
class UpdateTokenContainerDecorator(FeatureTokenContainerDecorator):
    """Updates document indexes and spans (see fields).  Each enabled field
    invokes the container method of the same name.

    """
    update_indexes: bool = field(default=True)
    """Whether to update the document indexes with
    :meth:`.FeatureDocument.update_indexes`.

    """
    update_entity_spans: bool = field(default=True)
    """Whether to update the entity spans with
    :meth:`.FeatureDocument.update_entity_spans`.

    """
    reindex: bool = field(default=False)
    """Whether to invoke :meth:`TokenContainer.reindex` after."""

    def decorate(self, container: TokenContainer) -> None:
        """Invoke the enabled update methods on ``container`` in the order
        ``update_indexes``, ``update_entity_spans`` and ``reindex``.

        :param container: the container to update in place

        """
        # each flag on this decorator shares its name with the container
        # method it enables; a method is only looked up when its flag is set
        name: str
        for name in ('update_indexes', 'update_entity_spans', 'reindex'):
            if getattr(self, name):
                getattr(container, name)()





[docs]
@dataclass
class CopyFeatureTokenContainerDecorator(FeatureTokenContainerDecorator):
    """Copies feature(s) for each token in the container.  Every source /
    target pair in :obj:`feature_ids` is applied to every token.  A token that
    lacks the source feature raises an exception; a source feature that is
    present with a :obj:`.FeatureToken.NONE` value counts as present and is
    copied.

    """
    feature_ids: Tuple[Tuple[str, str], ...] = field()
    """The features to copy in the form ((`<source>`, `<target>`), ...)."""

    def decorate(self, container: TokenContainer) -> None:
        """Copy the configured features on each token of ``container``.

        :param container: the container with the tokens to modify in place

        :raises NLPError: if a token does not have a source feature

        """
        # unique marker to tell an absent attribute from any legitimate value
        absent: Any = object()
        pairs: Tuple[Tuple[str, str], ...] = self.feature_ids
        tok: FeatureToken
        for tok in container.token_iter():
            for source, target in pairs:
                value: Any = getattr(tok, source, absent)
                if value is absent:
                    raise NLPError(
                        f"Missing feature ID '{source}' for token {tok}")
                tok.set_feature(target, value)





[docs]
@dataclass
class RemoveFeatureTokenContainerDecorator(FeatureTokenContainerDecorator):
    """Removes features from each token in the container.

    """
    exclude_feature_ids: Set[str] = field()
    """The features to remove from the tokens."""

    def decorate(self, container: TokenContainer) -> None:
        """Delete every feature in :obj:`exclude_feature_ids` from the
        instance dictionary of each token in ``container``.

        :param container: the container with the tokens to modify in place

        :raises KeyError: if a token does not have one of the features

        """
        doomed: Set[str] = self.exclude_feature_ids
        tok: FeatureToken
        for tok in container.token_iter():
            feats: Dict[str, Any] = tok.__dict__
            name: str
            for name in doomed:
                del feats[name]