Source code for zensols.nlp.spannorm
"""Normalize spans (of tokens) into strings by reconstructing based on language
rules from the normalized form of the tokens. This is needed after any token
manipulation from :class:`.TokenNormalizer` or other changes to
:obj:`.FeatureToken.norm`.
For now, only English is supported, but the module is provided for other
languages and future enhancements of normalization configuration.
"""
__author__ = 'Paul Landes'
from typing import Set, Iterable, Tuple
from dataclasses import dataclass, field
from abc import ABCMeta, abstractmethod
import re
from io import StringIO
from . import ParseError, FeatureToken
class SpanNormalizer(metaclass=ABCMeta):
"""Subclasses normalize feature tokens on a per :class:`spacy.Language`.
All subclasses must be re-entrant.
"""
@abstractmethod
def get_norm(self, tokens: Iterable[FeatureToken], use_norm: bool) -> str:
"""Create a string that follows the langauge spacing rules.
:param tokens: the tokens to normalize
:param use_norm: whether to use the token normalized or orthographic
text
"""
pass
@abstractmethod
def get_canonical(self, tokens: Iterable[FeatureToken]) -> str:
"""A canonical representation of the container, which are non-space
tokens separated by :obj:`CANONICAL_DELIMITER`.
"""
@dataclass(frozen=True)
class EnglishSpanNormalizer(SpanNormalizer):
"""An implementation of a span normalizer for the Enlish language.
"""
post_space_skip: Set[str] = field(default=frozenset("""`‘“[({<-"""))
"""Characters after which no space is added for span normalization."""
pre_space_skip: Set[str] = field(default=frozenset(
"'s n't 'll 'm 've 'd 're -".split()))
"""Characters before whcih no space is added for span normalization."""
keep_space_skip: Set[str] = field(default=frozenset("""_"""))
"""Characters that retain space on both sides."""
canonical_delimiter: str = field(default='|')
"""The token delimiter used in :obj:`canonical`."""
def __post_init__(self):
        # bypass the frozen dataclass setattr guard by writing directly to
        # the instance dict; cache the length of the longest pre-space token
        # (3 with the defaults, i.e. len("n't")) so ``get_norm`` can skip
        # set membership tests for longer tokens
        self.__dict__['_longest_pre_space_skip'] = \
            max(map(len, self.pre_space_skip))
def get_norm(self, tokens: Iterable[FeatureToken], use_norm: bool) -> str:
        nsent: str
        # matches whitespace tokens that contain a newline
        ws_re: re.Pattern = re.compile(r'\s*\n\s*')
        # drop newline tokens entirely
        toks: Tuple[FeatureToken, ...] = tuple(
            filter(lambda t: ws_re.match(t.text) is None, tokens))
        tlen: int = len(toks)
        has_punc = tlen > 0 and hasattr(toks[0], 'is_punctuation')
        if has_punc:
            post_space_skip: Set[str] = self.post_space_skip
            pre_space_skip: Set[str] = self.pre_space_skip
            keep_space_skip: Set[str] = self.keep_space_skip
            n_pre_space_skip: int = self._longest_pre_space_skip
            sio = StringIO()
            last_avoid = False
            tix: int
            tok: FeatureToken
            for tix, tok in enumerate(toks):
                ttext: str = tok.norm if use_norm else tok.text
                if ttext is None:
                    raise ParseError(f'Token {tok.text} has no norm')
                if tix > 0:
                    nlen: int = len(ttext)
                    if nlen == 1 and ttext in keep_space_skip:
                        # tokens such as '_' keep a space on both sides
                        sio.write(' ')
                    else:
                        # whether the token that follows this one should not
                        # be preceded by a space (i.e. after an open bracket)
                        do_post_space_skip: bool = False
                        if nlen == 1:
                            do_post_space_skip = ttext in post_space_skip
                        # add a space unless this token attaches to the
                        # previous one: punctuation, a contraction such as
                        # "n't", or a post-space-skip character came before
                        if (not tok.is_punctuation or do_post_space_skip) and \
                           not last_avoid and \
                           not (nlen <= n_pre_space_skip and
                                ttext in pre_space_skip):
                            sio.write(' ')
                        last_avoid = do_post_space_skip or ttext == '--'
                sio.write(ttext)
            nsent = sio.getvalue()
        else:
            # without punctuation features, fall back to joining on space
            nsent = ' '.join(
                map(lambda t: t.norm if use_norm else t.text, toks))
return nsent.strip()
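
    # An illustrative trace of the spacing rules above (an editorial example,
    # not part of the original source): given tokens whose norms are
    #   ['Do', "n't", 'buy', '(', 'cheap', ')', 'stock', '.']
    # ``get_norm`` writes "Do", attaches "n't" (a ``pre_space_skip`` token),
    # spaces before "buy" and "(", suppresses the space after "(" via
    # ``last_avoid``, and attaches the punctuation ")" and ".", yielding:
    #   "Don't buy (cheap) stock."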
def get_canonical(self, tokens: Iterable[FeatureToken]) -> str:
return self.canonical_delimiter.join(
map(lambda t: t.text,
filter(lambda t: not t.is_space, tokens)))
def __getstate__(self):
raise RuntimeError(f'Instances of {type(self)} are not picklable')
DEFAULT_FEATURE_TOKEN_NORMALIZER = EnglishSpanNormalizer()
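

# A minimal usage sketch (an editorial addition, not part of the original
# module).  ``get_norm`` and ``get_canonical`` only access the ``text``,
# ``norm``, ``is_punctuation`` and ``is_space`` attributes, so duck-typed
# stand-in tokens suffice; ``_DemoToken`` is hypothetical and exists only for
# this demonstration.  Run with ``python -m zensols.nlp.spannorm`` so the
# relative import at the top of the module resolves.
if __name__ == '__main__':
    @dataclass
    class _DemoToken:
        text: str
        is_punctuation: bool = False
        is_space: bool = False

        @property
        def norm(self) -> str:
            return self.text

    toks = [_DemoToken('Do'), _DemoToken("n't"), _DemoToken('quote'),
            _DemoToken('(', is_punctuation=True), _DemoToken('me'),
            _DemoToken(')', is_punctuation=True),
            _DemoToken('.', is_punctuation=True)]
    normalizer = DEFAULT_FEATURE_TOKEN_NORMALIZER
    # prints: Don't quote (me).
    print(normalizer.get_norm(toks, use_norm=True))
    # prints: Do|n't|quote|(|me|)|.
    print(normalizer.get_canonical(toks))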