Source code for zensols.nlp.spannorm
"""Normalize spans (of tokens) into strings by reconstructing based on language
rules from the normalized form of the tokens. This is needed after any token
manipulation from :class:`.TokenNormalizer` or other changes to
:obj:`.FeatureToken.norm`.
For now, only English is supported, but the module is provided for other
languages and future enhancements of normalization configuration.
"""
__author__ = 'Paul Landes'
from typing import Set, Iterable, Tuple
from dataclasses import dataclass, field
from abc import ABCMeta, abstractmethod
from io import StringIO
from . import ParseError, FeatureToken
class SpanNormalizer(metaclass=ABCMeta):
    """Subclasses normalize feature tokens on a per :class:`spacy.Language`
    basis.  All subclasses must be re-entrant.

    """
    @abstractmethod
    def get_norm(self, tokens: Iterable[FeatureToken]) -> str:
        """Create a string that follows the language spacing rules."""
        pass

    @abstractmethod
    def get_canonical(self, tokens: Iterable[FeatureToken]) -> str:
        """Create a canonical representation of the container: the non-space
        tokens separated by :obj:`EnglishSpanNormalizer.canonical_delimiter`.

        """

@dataclass(frozen=True)
class EnglishSpanNormalizer(SpanNormalizer):
    """An implementation of a span normalizer for the English language.

    """
    post_space_skip: Set[str] = field(default=frozenset("""`‘“[({<-"""))
    """Characters after which no space is added for span normalization."""

    pre_space_skip: Set[str] = field(default=frozenset(
        "'s n't 'll 'm 've 'd 're -".split()))
    """Tokens before which no space is added for span normalization."""

    keep_space_skip: Set[str] = field(default=frozenset("""_"""))
    """Characters that retain space on both sides."""

    canonical_delimiter: str = field(default='|')
    """The token delimiter used by :meth:`get_canonical`."""
    def __post_init__(self):
        # bypass frozen setattr guards to cache the length of the longest
        # entry in :obj:`pre_space_skip` (3 with the defaults, e.g. "n't")
        self.__dict__['_longest_pre_space_skip'] = \
            max(map(len, self.pre_space_skip))

    def get_norm(self, tokens: Iterable[FeatureToken]) -> str:
        nsent: str
        # drop newline tokens since spacing is reconstructed from scratch
        toks: Tuple[FeatureToken, ...] = tuple(
            filter(lambda t: t.text != '\n', tokens))
        tlen: int = len(toks)
        # the spacing rules need punctuation features on the tokens
        has_punc = tlen > 0 and hasattr(toks[0], 'is_punctuation')
        if has_punc:
            post_space_skip: Set[str] = self.post_space_skip
            pre_space_skip: Set[str] = self.pre_space_skip
            keep_space_skip: Set[str] = self.keep_space_skip
            n_pre_space_skip: int = self._longest_pre_space_skip
            sio = StringIO()
            last_avoid = False
            tix: int
            tok: FeatureToken
            for tix, tok in enumerate(toks):
                norm: str = tok.norm
                if norm is None:
                    raise ParseError(f'Token {tok.text} has no norm')
                # no space is ever written before the first token
                if tix > 0 and tix < tlen:
                    nlen: int = len(norm)
                    if nlen == 1 and norm in keep_space_skip:
                        sio.write(' ')
                    else:
                        do_post_space_skip: bool = False
                        if nlen == 1:
                            do_post_space_skip = norm in post_space_skip
                        # add a space unless the previous token suppressed
                        # it, or the current token is punctuation or a
                        # pre-space skip token (i.e. a contraction)
                        if (not tok.is_punctuation or do_post_space_skip) and \
                           not last_avoid and \
                           not (nlen <= n_pre_space_skip and
                                norm in pre_space_skip):
                            sio.write(' ')
                        last_avoid = do_post_space_skip or tok.norm == '--'
                sio.write(norm)
            nsent = sio.getvalue()
        else:
            # without punctuation features, fall back to space-joined norms
            nsent = ' '.join(map(lambda t: t.norm, toks))
        return nsent.strip()

    def get_canonical(self, tokens: Iterable[FeatureToken]) -> str:
        return self.canonical_delimiter.join(
            map(lambda t: t.text,
                filter(lambda t: not t.is_space, tokens)))
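
    # as an illustration: non-space tokens with texts ['do', "n't", 'go']
    # produce the canonical form "do|n't|go" with the default delimiter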
    def __getstate__(self):
        raise RuntimeError(f'Instances of {type(self)} are not picklable')

DEFAULT_FEATURE_TOKEN_NORMALIZER = EnglishSpanNormalizer()
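
# a minimal usage sketch (an illustration, not part of the library's API):
# ``SimpleToken`` is a hypothetical stand-in exposing only the ``text``,
# ``norm``, ``is_punctuation`` and ``is_space`` attributes the normalizer
# reads; in practice the tokens come from a parsed feature document
if __name__ == '__main__':
    @dataclass
    class SimpleToken:
        text: str
        norm: str
        is_punctuation: bool = False
        is_space: bool = False

    toks = [SimpleToken('do', 'do'),
            SimpleToken("n't", "n't"),
            SimpleToken('go', 'go'),
            SimpleToken('(', '(', is_punctuation=True),
            SimpleToken('now', 'now'),
            SimpleToken(')', ')', is_punctuation=True)]
    normalizer = DEFAULT_FEATURE_TOKEN_NORMALIZER
    print(normalizer.get_norm(toks))       # -> don't go (now)
    print(normalizer.get_canonical(toks))  # -> do|n't|go|(|now|)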