Source code for zensols.nlp.domain

"""Interfaces, contracts and errors.

"""
from __future__ import annotations
__author__ = 'Paul Landes'
from typing import (
    Tuple, Union, Optional, ClassVar, Set, Iterable, List, Type, TYPE_CHECKING
)
if TYPE_CHECKING:
    from .tok import FeatureToken
from abc import ABCMeta
import sys
from io import TextIOBase
import textwrap as tw
from spacy.tokens import Token
from spacy.tokens import Span
from spacy.tokens import Doc
from zensols.util import APIError
from zensols.config import Dictable



[docs]
class NLPError(APIError):
    """Raised for any error that originates in this library."""




[docs]
class ParseError(APIError):
    """Raised for any error that occurs while parsing."""




[docs]
class MissingFeatureError(NLPError):
    """Raised on attempting to access a non-existent feature in
    :class:`.FeatureToken`.

    """
    def __init__(self, token: FeatureToken, feature_id: str,
                 msg: Optional[str] = None):
        """Initialize.

        :param token: the token for which access was attempted

        :param feature_id: the feature_id that is missing in ``token``

        :param msg: optional text appended, after a space, to the generated
                    error message

        """
        s = f"Missing feature '{feature_id}' in token {token}"
        if msg is not None:
            s = f'{s} {msg}'
        super().__init__(s)
        # kept on the instance so handlers can inspect what was missing
        self.token = token
        self.feature_id = feature_id





[docs]
class LexicalSpan(Dictable):
    """A lexical character span of text in a document.  The span has two
    positions: :obj:`begin` and :obj:`end`, which is indexed respectively as an
    operator as well.  The left (:obj:`begin) is inclusive and the right
    (:obj:`end`) is exclusive to conform to Python array slicing conventions.

    One span is less than the other when the beginning position is less.  When
    the beginning positions are the same, the one with the smaller end position
    is less.

    The length of the span is the distance between the end and the beginning
    positions.

    """
    _DICTABLE_ATTRIBUTES: ClassVar[Set[str]] = {'begin', 'end'}

    EMPTY_SPAN: ClassVar[LexicalSpan] = None
    """The span ``(0, 0)``."""


[docs]
    def __init__(self, begin: int, end: int):
        """Initialize the interval.

        :param begin: the begin of the span

        :param end: the end of the span

        """
        self.begin = begin
        self.end = end


    @property
    def astuple(self) -> Tuple[int, int]:
        """The span as a ``(begin, end)`` tuple."""
        return (self.begin, self.end)


[docs]
    @classmethod
    def from_tuples(cls: Type, tups: Iterable[Tuple[int, int]]) -> \
            Iterable[LexicalSpan]:
        """Create spans from tuples.

        :param tups: an iterable of ``(<begin>, <end)`` tuples

        """
        return map(lambda t: cls(*t), tups)



[docs]
    @classmethod
    def from_token(cls, tok: Union[Token, Span]) -> LexicalSpan:
        """Create a span from a spaCy :class:`~spacy.tokens.Token` or
        :class:`~spacy.tokens.Span`.

        :param tok: the token or span whose character offsets are used

        :return: a new instance of this class that starts at the first
                 character of ``tok`` and ends one past its last character

        """
        if isinstance(tok, Span):
            doc: Doc = tok.doc
            # ``tok.end`` is used as an exclusive token index, so the last
            # token of the span is the one right before it
            # NOTE(review): an empty span (``tok.start == tok.end``) is not
            # handled specially here -- confirm callers never pass one
            etok = doc[tok.end - 1]
            start = doc[tok.start].idx
            end = etok.idx + len(etok.orth_)
        else:
            start = tok.idx
            end = tok.idx + len(tok.orth_)
        return cls(start, end)



[docs]
    @staticmethod
    def overlaps(a0: int, a1: int, b0: int, b1: int, inclusive: bool = True):
        """Return whether or not one text span overlaps with another.

        :param inclusive: whether to check include +1 on the end component

        :return: any overlap detected returns ``True``

        """
        if inclusive:
            m = (a0 <= b0 and a1 >= b0) or (b0 <= a0 and b1 >= a0)
        else:
            m = (a0 <= b0 and a1 > b0) or (b0 <= a0 and b1 > a0)
        return m



[docs]
    def overlaps_with(self, other: LexicalSpan,
                      inclusive: bool = True) -> bool:
        """Return whether or not one text span overlaps non-inclusively with
        another.

        :param other: the other location

        :param inclusive: whether to check include +1 on the end component

        :return: any overlap detected returns ``True``

        """
        return self.overlaps(
            self.begin, self.end, other.begin, other.end, inclusive)



[docs]
    def narrow(self, other: LexicalSpan) -> Optional[LexicalSpan]:
        """Return the shortest span that inclusively fits in both this and
        ``other``.

        :param other: the second span to narrow with this span

        :retun: a span so that beginning is maximized and end is minimized or
                ``None`` if the two spans do not overlap

        """
        nar: LexicalSpan = None
        if self.overlaps_with(other):
            beg = max(self.begin, other.begin)
            end = min(self.end, other.end)
            if beg == self.begin and end == self.end:
                nar = self
            elif beg == other.begin and end == other.end:
                nar = other
            else:
                nar = LexicalSpan(beg, end)
        return nar



[docs]
    @staticmethod
    def widen(others: Iterable[LexicalSpan]) -> Optional[LexicalSpan]:
        """Take the span union by using the left most :obj:`begin` and the right
        most :obj:`end`.

        :param others: the spans to union

        :return: the widest span that inclusively aggregates ``others``, or None
                 if an empty sequence is passed

        """
        begs = sorted(others, key=lambda s: s.begin)
        if len(begs) > 0:
            ends = sorted(begs, key=lambda s: s.end)
            return LexicalSpan(begs[0].begin, ends[-1].end)



[docs]
    @staticmethod
    def gaps(spans: Iterable[LexicalSpan], end: Optional[int] = None) -> \
            List[LexicalSpan]:
        """Return the spans for the "holes" in ``spans``.  For example, if
        ``spans`` is ``((0, 5), (10, 12), (15, 17))``, then return ``((5, 10),
        (12, 15))``.

        :param spans: the spans used to find gaps

        :param end: an end position for the last gap so that if the last item in
                    ``spans`` end does not match, another is added

        :return: a list of spans that "fill" any holes in ``spans``

        """
        spans: List[LexicalSpan] = sorted(spans)
        gaps: List[LexicalSpan] = []
        spiter: Iterable[LexicalSpan] = iter(spans)
        last: LexicalSpan = next(spiter)
        if last.begin > 0:
            last = LexicalSpan(0, last.begin)
            gaps.append(last)
            spiter = iter(spans)
        ns: LexicalSpan
        for ns in spiter:
            gap: int = ns.begin - last.end
            if gap > 0:
                gs = LexicalSpan(last.end, ns.begin)
                gaps.append(gs)
            last = ns
        # add ending span if the last didn't cover it
        if end is not None and end > last.end:
            gaps.append(LexicalSpan(last.end, end))
        return gaps



[docs]
    def write(self, depth: int = 0, writer: TextIOBase = sys.stdout):
        """Write the string form of this span as a single line.

        :param depth: passed through to ``_write_line`` (presumably the
                      indentation level -- confirm against the base class)

        :param writer: the sink passed through to ``_write_line``

        """
        line = str(self)
        self._write_line(line, depth, writer)


    def _from_dictable(self, *args, **kwargs):
        """Return the superclass result converted to a plain :class:`dict`."""
        # a plain dict gives prettier printing
        base = super()._from_dictable(*args, **kwargs)
        return dict(base)

    def __eq__(self, other: LexicalSpan) -> bool:
        if self is other:
            return True
        return isinstance(other, LexicalSpan) and \
            self.begin == other.begin and self.end == other.end

    def __lt__(self, other):
        if self.begin == other.begin:
            return self.end < other.end
        else:
            return self.begin < other.begin

    def __hash__(self) -> int:
        return hash(self.begin) + (13 * hash(self.end))

    def __setattr__(self, name, value):
        if hasattr(self, 'end'):
            raise AttributeError(f'{self.__class__.__name__} is immutable')
        super().__setattr__(name, value)

    def __getitem__(self, ix: int) -> int:
        if ix == 0:
            return self.begin
        elif ix == 1:
            return self.end
        raise KeyError(f'LexicalSpan index: {ix}')

    def __len__(self) -> int:
        return self.end - self.begin

    def __str__(self) -> str:
        return f'({self.begin}, {self.end})'

    def __repr__(self):
        return self.__str__()



# ``EMPTY_SPAN`` is declared as ``None`` in the class body and bound here
# instead, since an instance can only be created once the class exists
LexicalSpan.EMPTY_SPAN = LexicalSpan(0, 0)



[docs]
class TextContainer(Dictable, metaclass=ABCMeta):
    """A *writable* class that has a ``text`` property or attribute.  All
    subclasses need a ``norm`` attribute or property.

    """
    _DEFAULT_TOSTR_LEN: ClassVar[int] = 80
    """Default length of string when rendering :meth:`__str__`."""

    def write(self, depth: int = 0, writer: TextIOBase = sys.stdout,
              include_original: bool = True, include_normalized: bool = True):
        """Write the original and/or the normalized text.

        When ``text`` and ``norm`` are equal and at least one of the two
        flags is set, a single line labeled ``[T]`` is written.  Otherwise the
        original is written with the label ``[O]`` and the normalized text
        with the label ``[N]``, each only if its flag is set.

        :param depth: passed through to ``_write_line`` (presumably the
                      indentation level -- confirm against the base class)

        :param writer: the sink passed through to ``_write_line``

        :param include_original: whether to write ``text``

        :param include_normalized: whether to write ``norm``

        """
        if (include_original or include_normalized) and self.text == self.norm:
            # both forms are the same, so one line is enough
            self._write_line(f'[T]: {self.text}', depth, writer)
        else:
            if include_original:
                self._write_line(f'[O]: {self.text}', depth, writer)
            if include_normalized:
                self._write_line(f'[N]: {self.norm}', depth, writer)

    def __str__(self):
        """Return ``norm`` shortened and wrapped in angle brackets."""
        # the two brackets count toward the total length, hence the ``-2``
        return f'<{tw.shorten(self.norm, width=self._DEFAULT_TOSTR_LEN-2)}>'

    def __repr__(self):
        """Same rendering as :meth:`__str__`."""
        return self.__str__()