"""Contains useful classes for decorating feature sentences."""__author__='Paul Landes'fromtypingimportList,Tuple,Set,Dict,Anyfromdataclassesimportdataclass,fieldimportrefrom.import(NLPError,LexicalSpan,FeatureToken,TokenContainer,FeatureSentence,FeatureDocument,FeatureTokenContainerDecorator,FeatureSentenceDecorator,FeatureDocumentDecorator)
@dataclass
class SplitTokenSentenceDecorator(FeatureSentenceDecorator):
    """A decorator that splits feature tokens by white space.

    """
    def _split_tok(self, ftok: FeatureToken, matches: Tuple[re.Match]):
        """Clone ``ftok`` once for every regular expression match.

        Each clone receives the matched text as its ``norm``, a lexical span
        made of the match offsets shifted by the beginning of the span of
        ``ftok``, and an ``idx`` equal to the beginning of that new span.

        :param ftok: the token to clone for each match

        :param matches: the regular expression matches whose offsets are
                        added to the beginning of the span of ``ftok``

        :return: the cloned tokens in the same order as ``matches``

        """
        pieces: List[FeatureToken] = []
        for found in matches:
            piece: FeatureToken = ftok.clone()
            piece.norm = found.group(0)
            # match offsets are relative, so shift them by the parent's start
            base = ftok.lexspan.begin
            start, end = found.span(0)
            piece.lexspan = LexicalSpan(base + start, base + end)
            piece.idx = piece.lexspan.begin
            pieces.append(piece)
        return pieces
@dataclass
class FilterTokenSentenceDecorator(FeatureSentenceDecorator):
    """A decorator that strips whitespace from sentences.  The kinds of
    tokens to remove are selected with the boolean fields of this class, all
    of which are off by default.

    :see: :meth:`.TokenContainer.strip`

    """
    remove_stop: bool = field(default=False)
    """Whether to remove stop words."""

    remove_space: bool = field(default=False)
    """Whether to remove white space (i.e. new lines)."""

    remove_pronouns: bool = field(default=False)
    """Whether to remove pronouns (i.e. ``he``)."""

    remove_punctuation: bool = field(default=False)
    """Whether to remove punctuation (i.e. periods)."""

    remove_determiners: bool = field(default=False)
    """Whether to remove determiners (i.e. ``the``)."""

    remove_empty: bool = field(default=False)
    """Whether to remove 0-length tokens (using normalized text)."""
@dataclass
class FilterEmptySentenceDocumentDecorator(FeatureDocumentDecorator):
    """Filter zero length sentences.

    """
    filter_space: bool = field(default=True)
    """Whether to filter space tokens when comparing zero length sentences."""

    def _filter_empty_sentences(self, fsent: FeatureSentence) -> bool:
        """Return whether ``fsent`` has at least one token to count.

        When :obj:`filter_space` is set, tokens whose ``is_space`` attribute
        is true are ignored, so a sentence made up of only space tokens is
        reported as empty.

        :param fsent: the sentence to test

        :return: ``True`` if the sentence has at least one counted token,
                 ``False`` if it is empty

        """
        if self.filter_space:
            # stop at the first non-space token rather than materializing
            # every token only to take the length of the result
            return any(not tok.is_space for tok in fsent.token_iter())
        return len(fsent.tokens) > 0
@dataclass
class UpdateTokenContainerDecorator(FeatureTokenContainerDecorator):
    """Updates document indexes and spans (see fields).

    """
    update_indexes: bool = field(default=True)
    """Whether to update the document indexes with
    :meth:`.FeatureDocument.update_indexes`.

    """
    update_entity_spans: bool = field(default=True)
    """Whether to update the document entity spans with
    :meth:`.FeatureDocument.update_entity_spans`.

    """
    reindex: bool = field(default=False)
    """Whether to invoke :meth:`TokenContainer.reindex` after."""
@dataclass
class CopyFeatureTokenContainerDecorator(FeatureTokenContainerDecorator):
    """Copies feature(s) for each token in the container.  For each token,
    each source / target tuple pair in :obj:`feature_ids` is copied.  If the
    source feature is missing on a token (an existing
    :obj:`.FeatureToken.NONE` value does not count as missing) an exception
    is raised.

    """
    feature_ids: Tuple[Tuple[str, str], ...] = field()
    """The features to copy in the form ((`<source>`, `<target>`), ...)."""

    def decorate(self, container: TokenContainer):
        """Copy every configured source feature to its target on each token.

        :param container: the container whose tokens are updated in place

        :raises NLPError: if a token has no attribute named by a source
                          feature ID

        """
        # private sentinel: distinguishes "attribute absent" from any real
        # value (including ``None``) with a single attribute lookup
        missing = object()
        fids: Tuple[Tuple[str, str], ...] = self.feature_ids
        tok: FeatureToken
        for tok in container.token_iter():
            source: str
            target: str
            for source, target in fids:
                value: Any = getattr(tok, source, missing)
                if value is missing:
                    raise NLPError(
                        f"Missing feature ID '{source}' for token {tok}")
                tok.set_feature(target, value)
@dataclass
class RemoveFeatureTokenContainerDecorator(FeatureTokenContainerDecorator):
    """Removes features from each token in the container.

    """
    exclude_feature_ids: Set[str] = field()
    """The features to remove from the tokens."""