"""Normalize spans (of tokens) into strings by reconstructing based on languagerules from the normalized form of the tokens. This is needed after any tokenmanipulation from :class:`.TokenNormalizer` or other changes to:obj:`.FeatureToken.norm`.For now, only English is supported, but the module is provided for otherlanguages and future enhancements of normalization configuration."""__author__='Paul Landes'fromtypingimportSet,Iterable,Tuplefromdataclassesimportdataclass,fieldfromabcimportABCMeta,abstractmethodimportrefromioimportStringIOfrom.importParseError,FeatureToken


class SpanNormalizer(metaclass=ABCMeta):
    """Subclasses normalize feature tokens on a per :class:`spacy.Language`
    basis.  All subclasses must be re-entrant.

    """

    @abstractmethod
    def get_norm(self, tokens: Iterable[FeatureToken], use_norm: bool) -> str:
        """Create a string that follows the language spacing rules.

        :param tokens: the tokens to normalize

        :param use_norm: whether to use the token normalized or orthographic
                         text

        """
        pass

    @abstractmethod
    def get_canonical(self, tokens: Iterable[FeatureToken]) -> str:
        """Create a canonical representation of the container: the non-space
        tokens separated by :obj:`CANONICAL_DELIMITER`.

        """
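

# A minimal illustrative subclass and an editorial sketch (not part of the
# original module): for a language whose tokens are always separated by a
# single space, both abstract methods reduce to simple joins.  The class name
# and the hard-coded '|' delimiter are assumptions for illustration only.
class WhitespaceSpanNormalizer(SpanNormalizer):
    """A hypothetical normalizer that joins all tokens on whitespace."""

    def get_norm(self, tokens: Iterable[FeatureToken], use_norm: bool) -> str:
        # use the normalized form when requested, else the orthographic text
        return ' '.join((t.norm if use_norm else t.text) for t in tokens)

    def get_canonical(self, tokens: Iterable[FeatureToken]) -> str:
        # non-space tokens separated by the canonical delimiter
        return '|'.join(t.norm for t in tokens if not t.text.isspace())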


@dataclass(frozen=True)
class EnglishSpanNormalizer(SpanNormalizer):
    """An implementation of a span normalizer for the English language.

    """
    post_space_skip: Set[str] = field(default=frozenset("""`‘“[({<-"""))
    """Characters after which no space is added for span normalization."""

    pre_space_skip: Set[str] = field(default=frozenset(
        "'s n't 'll 'm 've 'd 're -".split()))
    """Characters before which no space is added for span normalization."""

    keep_space_skip: Set[str] = field(default=frozenset("""_"""))
    """Characters that retain space on both sides."""

    canonical_delimiter: str = field(default='|')
    """The token delimiter used in :obj:`canonical`."""

    def __post_init__(self):
        # bypass frozen setattr guards to cache the length of the longest
        # pre-space skip token
        self.__dict__['_longest_pre_space_skip'] = \
            max(map(len, self.pre_space_skip))
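
    # Since the dataclass is frozen, normal attribute assignment in
    # ``__post_init__`` would raise ``FrozenInstanceError``; writing through
    # ``self.__dict__`` (or equivalently ``object.__setattr__``) sidesteps
    # the generated ``__setattr__`` guard for this derived, cached value.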

    def get_norm(self, tokens: Iterable[FeatureToken], use_norm: bool) -> str:
        nsent: str
        ws_re: re.Pattern = re.compile(r'\s*\n\s*')
        # remove whitespace (newline) only tokens
        toks: Tuple[FeatureToken, ...] = tuple(
            filter(lambda t: ws_re.match(t.text) is None, tokens))
        tlen: int = len(toks)
        has_punc = tlen > 0 and hasattr(toks[0], 'is_punctuation')
        if has_punc:
            post_space_skip: Set[str] = self.post_space_skip
            pre_space_skip: Set[str] = self.pre_space_skip
            keep_space_skip: Set[str] = self.keep_space_skip
            n_pre_space_skip: int = self._longest_pre_space_skip
            sio = StringIO()
            last_avoid = False
            tix: int
            tok: FeatureToken
            for tix, tok in enumerate(toks):
                ttext: str = tok.norm if use_norm else tok.text
                if ttext is None:
                    raise ParseError(f'Token {tok.text} has no norm')
                if tix > 0 and tix < tlen:
                    nlen: int = len(ttext)
                    if nlen == 1 and ttext in keep_space_skip:
                        sio.write(' ')
                    else:
                        do_post_space_skip: bool = False
                        if nlen == 1:
                            do_post_space_skip = ttext in post_space_skip
                        # add a space unless the token is (non-opening)
                        # punctuation, the previous token suppresses trailing
                        # space (i.e. an open quote or bracket), or the token
                        # is a contraction suffix such as "n't"
                        if (not tok.is_punctuation or do_post_space_skip) and \
                           not last_avoid and \
                           not (nlen <= n_pre_space_skip and
                                ttext in pre_space_skip):
                            sio.write(' ')
                        last_avoid = do_post_space_skip or ttext == '--'
                sio.write(ttext)
            nsent = sio.getvalue()
        else:
            # without punctuation features, fall back to whitespace joining
            nsent = ' '.join(map(lambda t: t.norm, toks))
        return nsent.strip()
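

if __name__ == '__main__':
    # A minimal usage sketch (an editorial addition, not part of the original
    # module).  ``get_norm`` only reads ``text``, ``norm`` and
    # ``is_punctuation``, so a hypothetical duck-typed stand-in for
    # :class:`.FeatureToken` is enough to demonstrate the spacing rules; the
    # sketch assumes ``EnglishSpanNormalizer`` is concrete (i.e. it also
    # implements :meth:`get_canonical`).
    @dataclass
    class _StubToken:
        """A hypothetical stand-in for :class:`.FeatureToken`."""
        text: str
        is_punctuation: bool

        @property
        def norm(self) -> str:
            return self.text

    toks = [_StubToken('He', False), _StubToken('said', False),
            _StubToken(',', True), _StubToken('“', True),
            _StubToken('wait', False), _StubToken('”', True),
            _StubToken('.', True)]
    # no space before the comma or period and none after the open quote;
    # prints: He said, “wait”.
    print(EnglishSpanNormalizer().get_norm(toks, use_norm=True))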