Source code for zensols.nlp.spannorm
"""Normalize spans (of tokens) into strings by reconstructing based on language
rules from the normalized form of the tokens. This is needed after any token
manipulation from :class:`.TokenNormalizer` or other changes to
:obj:`.FeatureToken.norm`.
For now, only English is supported, but the module is provided for other
languages and future enhancements of normalization configuration.
"""
__author__ = 'Paul Landes'
from typing import Set, Iterable, Tuple
from dataclasses import dataclass, field
from abc import ABCMeta, abstractmethod
import re
from io import StringIO
from . import ParseError, FeatureToken
class SpanNormalizer(metaclass=ABCMeta):
"""Subclasses normalize feature tokens on a per :class:`spacy.Language`.
All subclasses must be re-entrant.
"""
@abstractmethod
def get_norm(self, tokens: Iterable[FeatureToken], use_norm: bool) -> str:
"""Create a string that follows the langauge spacing rules.
:param tokens: the tokens to normalize
:param use_norm: whether to use the token normalized or orthographic
text
"""
pass
@abstractmethod
def get_canonical(self, tokens: Iterable[FeatureToken]) -> str:
"""A canonical representation of the container, which are non-space
tokens separated by :obj:`CANONICAL_DELIMITER`.
"""
@dataclass(frozen=True)
class EnglishSpanNormalizer(SpanNormalizer):
"""An implementation of a span normalizer for the Enlish language.
"""
post_space_skip: Set[str] = field(default=frozenset("""`‘“[({<-"""))
"""Characters after which no space is added for span normalization."""
pre_space_skip: Set[str] = field(default=frozenset(
"'s n't 'll 'm 've 'd 're -".split()))
"""Characters before whcih no space is added for span normalization."""
keep_space_skip: Set[str] = field(default=frozenset("""_"""))
"""Characters that retain space on both sides."""
canonical_delimiter: str = field(default='|')
"""The token delimiter used in :obj:`canonical`."""
def __post_init__(self):
        # bypass the frozen dataclass setattr guard by writing directly to
        # the instance dict; cache the length of the longest pre-space token
        # (3 with the defaults, i.e. len("n't")) so ``get_norm`` can skip
        # set membership tests for longer tokens
        self.__dict__['_longest_pre_space_skip'] = \
            max(map(len, self.pre_space_skip))
def get_norm(self, tokens: Iterable[FeatureToken], use_norm: bool) -> str:
        nsent: str
        # matches whitespace tokens that contain a newline
        ws_re: re.Pattern = re.compile(r'\s*\n\s*')
        # drop newline tokens entirely
        toks: Tuple[FeatureToken, ...] = tuple(
            filter(lambda t: ws_re.match(t.text) is None, tokens))
        tlen: int = len(toks)
        has_punc = tlen > 0 and hasattr(toks[0], 'is_punctuation')
        if has_punc:
            post_space_skip: Set[str] = self.post_space_skip
            pre_space_skip: Set[str] = self.pre_space_skip
            keep_space_skip: Set[str] = self.keep_space_skip
            n_pre_space_skip: int = self._longest_pre_space_skip
            sio = StringIO()
            last_avoid = False
            tix: int
            tok: FeatureToken
            for tix, tok in enumerate(toks):
                ttext: str = tok.norm if use_norm else tok.text
                if ttext is None:
                    raise ParseError(f'Token {tok.text} has no norm')
                if tix > 0:
                    nlen: int = len(ttext)
                    if nlen == 1 and ttext in keep_space_skip:
                        # tokens such as '_' keep a space on both sides
                        sio.write(' ')
                    else:
                        # whether the token that follows this one should not
                        # be preceded by a space (i.e. after an open bracket)
                        do_post_space_skip: bool = False
                        if nlen == 1:
                            do_post_space_skip = ttext in post_space_skip
                        # add a space unless this token attaches to the
                        # previous one: punctuation, a contraction such as
                        # "n't", or a post-space-skip character came before
                        if (not tok.is_punctuation or do_post_space_skip) and \
                           not last_avoid and \
                           not (nlen <= n_pre_space_skip and
                                ttext in pre_space_skip):
                            sio.write(' ')
                        last_avoid = do_post_space_skip or ttext == '--'
                sio.write(ttext)
            nsent = sio.getvalue()
        else:
            # without punctuation features, fall back to joining on space
            nsent = ' '.join(
                map(lambda t: t.norm if use_norm else t.text, toks))
return nsent.strip()
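
    # An illustrative trace of the spacing rules above (an editorial example,
    # not part of the original source): given tokens whose norms are
    #   ['Do', "n't", 'buy', '(', 'cheap', ')', 'stock', '.']
    # ``get_norm`` writes "Do", attaches "n't" (a ``pre_space_skip`` token),
    # spaces before "buy" and "(", suppresses the space after "(" via
    # ``last_avoid``, and attaches the punctuation ")" and ".", yielding:
    #   "Don't buy (cheap) stock."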
def get_canonical(self, tokens: Iterable[FeatureToken]) -> str:
return self.canonical_delimiter.join(
map(lambda t: t.text,
filter(lambda t: not t.is_space, tokens)))
def __getstate__(self):
raise RuntimeError(f'Instances of {type(self)} are not picklable')
DEFAULT_FEATURE_TOKEN_NORMALIZER = EnglishSpanNormalizer()
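

# A minimal usage sketch (an editorial addition, not part of the original
# module).  ``get_norm`` and ``get_canonical`` only access the ``text``,
# ``norm``, ``is_punctuation`` and ``is_space`` attributes, so duck-typed
# stand-in tokens suffice; ``_DemoToken`` is hypothetical and exists only for
# this demonstration.  Run with ``python -m zensols.nlp.spannorm`` so the
# relative import at the top of the module resolves.
if __name__ == '__main__':
    @dataclass
    class _DemoToken:
        text: str
        is_punctuation: bool = False
        is_space: bool = False

        @property
        def norm(self) -> str:
            return self.text

    toks = [_DemoToken('Do'), _DemoToken("n't"), _DemoToken('quote'),
            _DemoToken('(', is_punctuation=True), _DemoToken('me'),
            _DemoToken(')', is_punctuation=True),
            _DemoToken('.', is_punctuation=True)]
    normalizer = DEFAULT_FEATURE_TOKEN_NORMALIZER
    # prints: Don't quote (me).
    print(normalizer.get_norm(toks, use_norm=True))
    # prints: Do|n't|quote|(|me|)|.
    print(normalizer.get_canonical(toks))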