"""Feature token and related base classes"""from__future__importannotations__author__='Paul Landes'fromtypingimport(List,Tuple,Set,Iterable,Dict,Sequence,Union,Optional,Any,ClassVar,Type,)fromdataclassesimportdataclass,fieldfromfunctoolsimportreducefromitertoolsimportchainimportsysfromioimportTextIOBasefromfrozendictimportfrozendictfromspacy.tokens.tokenimportTokenfromspacy.tokens.docimportDocfromspacy.tokens.spanimportSpanfromzensols.persistimportPersistableContainerfrom.importNLPError,MissingFeatureError,TextContainer,LexicalSpan
@dataclass
class FeatureToken(PersistableContainer, TextContainer):
    """A container class for features about a token.  Subclasses such as
    :class:`.SpacyFeatureToken` extract only a subset of features from the
    heavy spaCy C data structures, which are hard/expensive to pickle.
    Instances of this token class are almost always *detached*, meaning the
    underlying in-memory data structures have been copied as pure Python types
    to facilitate serialization of spaCy tokens.

    **Feature note**: features :obj:`i`, :obj:`idx` and :obj:`i_sent` are
    always added to feature tokens to be able to reconstruct sentences (see
    :meth:`.FeatureDocument.uncombine_sentences`), and are always included.

    """
    _DICTABLE_WRITABLE_DESCENDANTS: ClassVar[bool] = True
    """Use write method."""

    _PERSITABLE_PROPERTIES: ClassVar[Set[str]] = {'lexspan'}
    """Cache lexspan in pickled files."""

    REQUIRED_FEATURE_IDS: ClassVar[Set[str]] = frozenset(
        'i idx i_sent norm lexspan'.split())
    """Features retained regardless of configuration for basic functionality.

    """
    FEATURE_IDS_BY_TYPE: ClassVar[Dict[str, Set[str]]] = frozendict({
        'bool': frozenset(('is_space is_stop is_ent is_wh is_contraction ' +
                           'is_superlative is_pronoun').split()),
        'int': frozenset(('i idx i_sent sent_i is_punctuation tag ' +
                          'ent ent_iob dep shape norm_len').split()),
        'str': frozenset(('norm lemma_ tag_ pos_ ent_ ent_iob_ ' +
                          'dep_ shape_').split()),
        'list': frozenset('children'.split()),
        'object': frozenset('lexspan'.split())})
    """Map of class type to set of feature IDs."""

    TYPES_BY_FEATURE_ID: ClassVar[Dict[str, str]] = frozendict(
        chain.from_iterable(
            map(lambda itm: map(lambda f: (f, itm[0]), itm[1]),
                FEATURE_IDS_BY_TYPE.items())))
    """A map of feature ID to string type.  This is used by
    :meth:`.FeatureToken.write_attributes` to dump the type features.

    """
    FEATURE_IDS: ClassVar[Set[str]] = frozenset(
        reduce(lambda res, x: res | x, FEATURE_IDS_BY_TYPE.values()))
    """All default available feature IDs."""

    SKIP_COMPARE_FEATURE_IDS: ClassVar[Set[str]] = set()
    """A set of feature IDs to avoid comparing in :meth:`__eq__`."""

    WRITABLE_FEATURE_IDS: ClassVar[Tuple[str, ...]] = tuple(
        ('text norm idx sent_i i i_sent tag pos ' +
         'is_wh entity dep children').split())
    """Feature IDs that are dumped on :meth:`write` and
    :meth:`write_attributes`.

    """
    NONE: ClassVar[str] = '-<N>-'
    """Default string for *not a feature*, or missing features."""

    i: int = field()
    """The index of the token within the parent document."""

    idx: int = field()
    """The character offset of the token within the parent document."""

    i_sent: int = field()
    """The index of the token within the parent sentence.  This is not to be
    confused with the index of the sentence to which the token belongs, which
    is :obj:`sent_i`.

    """
    norm: str = field()
    """Normalized text, which is the text/orth or the named entity if tagged
    as a named entity.

    """
    lexspan: LexicalSpan = field()
    """The character offsets of the beginning and end of the token.  This is
    set as (``start``, ``end``) as (:obj:`idx`, :obj:`idx` + ``len(text)``).
    The ``begin`` is usually the same as :obj:`idx` but can be updated for
    normalized text or when the text is moved/reindexed in the document.

    """
    def __post_init__(self):
        super().__init__()
        self._detached_feature_ids = None
        # self._lexspan = LexicalSpan(self.idx, self.idx + len(self.text))
    def detach(self, feature_ids: Set[str] = None,
               skip_missing: bool = False,
               cls: Type[FeatureToken] = None) -> FeatureToken:
        """Create a detached token (i.e. from spaCy artifacts).

        :param feature_ids: the features to copy, which defaults to
                            :obj:`FEATURE_IDS`

        :param skip_missing: whether to only keep ``feature_ids``

        :param cls: the type of the new instance

        """
        cls = FeatureToken if cls is None else cls
        if feature_ids is None:
            feature_ids = set(self.FEATURE_IDS)
        else:
            feature_ids = set(feature_ids)
        feature_ids.update(self.REQUIRED_FEATURE_IDS)
        feats: Dict[str, Any] = self.get_features(feature_ids, skip_missing)
        clone = FeatureToken.__new__(cls)
        clone.__dict__.update(feats)
        if hasattr(self, '_text'):
            clone.text = self._text
        if feature_ids is not None:
            clone._detached_feature_ids = feature_ids
        return clone
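    # A minimal usage sketch for ``detach`` (the parse setup is illustrative;
    # ``doc_parser`` stands in for however a ``FeatureDocument`` is created):
    #
    #   tok = doc_parser.parse('Obama was president.').tokens[0]
    #   detached = tok.detach(feature_ids={'norm', 'pos_'})
    #   # the REQUIRED_FEATURE_IDS (i, idx, i_sent, norm, lexspan) are kept too
    #   assert detached.is_detached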
    @property
    def is_detached(self) -> bool:
        """Whether this token has been detached."""
        return self._detached_feature_ids is not None

    @property
    def default_detached_feature_ids(self) -> Optional[Set[str]]:
        """The default set of feature IDs used when cloning or detaching with
        :meth:`clone` or :meth:`detach`.

        """
        return self._detached_feature_ids

    @default_detached_feature_ids.setter
    def default_detached_feature_ids(self, feature_ids: Set[str]):
        """The default set of feature IDs used when cloning or detaching with
        :meth:`clone` or :meth:`detach`.

        """
        self._detached_feature_ids = feature_ids
    def clone(self, cls: Type = None, **kwargs) -> FeatureToken:
        """Clone an instance of this token.

        :param cls: the type of the new instance

        :param kwargs: arguments to add as attributes to the clone

        :return: the cloned instance of this instance

        """
        clone = self.detach(self._detached_feature_ids, cls=cls)
        clone.__dict__.update(kwargs)
        return clone
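    # Hypothetical example: ``clone`` applies keyword overrides after
    # detaching, since ``__dict__.update`` runs on the fresh copy:
    #
    #   upper_tok = tok.clone(norm=tok.norm.upper())
    #   assert upper_tok.norm == tok.norm.upper()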
    @property
    def text(self) -> str:
        """The initial text before being normalized by any
        :class:`.TokenNormalizer`.

        """
        if hasattr(self, '_text'):
            return self._text
        else:
            return self.norm

    @text.setter
    def text(self, text: str):
        """The initial text before being normalized by any
        :class:`.TokenNormalizer`.

        """
        self._text = text

    @property
    def is_none(self) -> bool:
        """Return whether or not this token is represented as none or empty."""
        return self._is_none(self.norm)

    @classmethod
    def _is_none(cls, targ: Any) -> bool:
        return targ is None or targ == cls.NONE or targ == 0
    def get_feature(self, feature_id: str, expect: bool = True,
                    check_none: bool = False, message: str = None) -> \
            Optional[Any]:
        """Return a feature by the feature ID.

        :param feature_id: the ID of the feature to retrieve

        :param expect: whether to raise an error if the feature is missing

        :param check_none: whether to return ``None`` when the feature exists
                           but has an unset value such as :obj:`NONE`, as
                           determined by :meth:`is_none`

        :param message: additional context to append to the error message

        :raises MissingFeatureError: if ``expect`` is ``True`` and the feature
                                     does not exist

        """
        val: Any = None
        has_attr: bool = hasattr(self, feature_id)
        if not has_attr:
            if expect:
                raise MissingFeatureError(self, feature_id, message)
        else:
            val = getattr(self, feature_id)
            if check_none and self._is_none(val):
                val = None
        return val
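    # Illustrative calls (``tok`` is any FeatureToken):
    #
    #   tok.get_feature('norm')                    # -> the normalized text
    #   tok.get_feature('missing', expect=False)   # -> None, no error raised
    #   tok.get_feature('ent_', check_none=True)   # -> None when ent_ == NONE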
    def set_feature(self, feature_id: str, value: Any):
        """Set, or add if non-existent, a feature on this token instance.  If
        the token has been detached, the feature ID will be added to
        :obj:`default_detached_feature_ids`.

        :param feature_id: the ID of the feature to set

        :param value: the new or replaced value of the feature

        """
        setattr(self, feature_id, value)
        if self._detached_feature_ids is not None:
            self._detached_feature_ids.add(feature_id)
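    # Illustrative: on a detached token the new feature ID is also tracked, so
    # subsequent ``clone``/``detach`` calls carry it along ('sentiment' is a
    # made-up feature ID):
    #
    #   detached.set_feature('sentiment', 0.75)
    #   assert 'sentiment' in detached.default_detached_feature_ids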
    def get_features(self, feature_ids: Iterable[str] = None,
                     skip_missing: bool = False) -> Dict[str, Any]:
        """Get features as a :class:`dict`.

        :param feature_ids: the features to get, which defaults to
                            :obj:`FEATURE_IDS`

        :param skip_missing: whether to only keep features in ``feature_ids``
                             that exist on this token

        """
        feature_ids = self.FEATURE_IDS if feature_ids is None else feature_ids
        if skip_missing:
            feature_ids = filter(lambda fid: hasattr(self, fid), feature_ids)
        return {k: getattr(self, k) for k in feature_ids}
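    # Illustrative: restrict to a few features, skipping any that are not set
    # on this (possibly detached) instance:
    #
    #   feats = tok.get_features({'norm', 'pos_', 'dep_'}, skip_missing=True)
    #   assert set(feats.keys()) <= {'norm', 'pos_', 'dep_'}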
    def split(self, positions: Iterable[int]) -> List[FeatureToken]:
        """Split on text normal index positions.  This needs and updates the
        ``idx`` and ``lexspan`` attributes.

        :param positions: 0-based indexes into :obj:`norm` indicating where to
                          split

        :return: new (cloned) tokens along the boundaries of ``positions``

        """
        splits: List[FeatureToken] = []
        norms: List[Tuple[int, str]] = []
        idx: int = self.idx
        start: int = 0
        end: int
        for end in positions:
            norms.append((start, self.norm[start:end]))
            start = end
        norms.append((start, self.norm[start:]))
        norm: str
        for start, norm in norms:
            offset: int = idx + start
            split_tok = self.clone()
            split_tok.norm = norm
            split_tok.idx = offset
            split_tok.lexspan = LexicalSpan(offset, offset + len(norm))
            splits.append(split_tok)
        return splits
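    # Illustrative: splitting a token with norm 'cannot' at position 3 yields
    # two cloned tokens with contiguous lexical spans (assuming idx == 10):
    #
    #   can, nt = tok.split([3])
    #   # can.norm == 'can', can.lexspan == LexicalSpan(10, 13)
    #   # nt.norm == 'not', nt.lexspan == LexicalSpan(13, 16)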
    def to_vector(self, feature_ids: Sequence[str] = None) -> Iterable[str]:
        """Return an iterable of feature data.

        """
        if feature_ids is None:
            feature_ids = set(self.__dict__.keys()) - \
                {'_detached_feature_ids'}
        return map(lambda a: getattr(self, a), sorted(feature_ids))
    def write_attributes(self, depth: int = 0,
                         writer: TextIOBase = sys.stdout,
                         include_type: bool = True,
                         feature_ids: Iterable[str] = None,
                         inline: bool = False,
                         include_none: bool = True):
        """Write feature attributes.

        :param depth: the starting indentation depth

        :param writer: the writer to dump the content of this writable

        :param include_type: if ``True``, write the type of the value (if
                             available)

        :param feature_ids: the features to write, which defaults to
                            :obj:`WRITABLE_FEATURE_IDS`

        :param inline: whether to print attributes all on the same line

        :param include_none: whether to write features with unset (none)
                             values (see :meth:`is_none`)

        """
        if feature_ids is None:
            feature_ids = self._detached_feature_ids
        if feature_ids is None:
            feature_ids = self.WRITABLE_FEATURE_IDS
        dct = self.get_features(feature_ids, True)
        if 'text' in dct and dct['norm'] == dct['text']:
            del dct['text']
        for i, k in enumerate(sorted(dct.keys())):
            val: str = dct[k]
            ptype: str = None
            if not include_none and self._is_none(val):
                continue
            if include_type:
                ptype = self.TYPES_BY_FEATURE_ID.get(k)
                if ptype is not None:
                    ptype = f' ({ptype})'
            ptype = '' if ptype is None else ptype
            sout = f'{k}={val}{ptype}'
            if inline:
                if i == 0:
                    writer.write(self._sp(depth))
                else:
                    writer.write(', ')
                writer.write(sout)
            else:
                self._write_line(sout, depth, writer)
        if inline:
            self._write_empty(writer)
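    # Illustrative output (the exact features depend on what was detached):
    #
    #   tok.write_attributes(inline=True)
    #   # => i=0 (int), idx=0 (int), norm=Obama (str), ...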
    def __eq__(self, other: FeatureToken) -> bool:
        if self is other:
            return True
        if self.i == other.i and self.idx == other.idx:
            a = dict(self.__dict__)
            b = dict(other.__dict__)
            del a['_detached_feature_ids']
            del b['_detached_feature_ids']
            for attr in self.SKIP_COMPARE_FEATURE_IDS:
                a.pop(attr, None)
                b.pop(attr, None)
            return a == b
        return False

    def __lt__(self, other: FeatureToken) -> int:
        return self.idx < other.idx

    def __hash__(self) -> int:
        return ((self.i + 1) * 13) + \
            ((self.idx + 1) * 29) + \
            ((self.i_sent + 1) * 71)

    def __str__(self) -> str:
        return TextContainer.__str__(self)

    def __repr__(self) -> str:
        return self.__str__()

    # speed up none compares by using interned NONE
    def __getstate__(self) -> Dict[str, Any]:
        state = super().__getstate__()
        if self.norm == self.NONE:
            del state['norm']
        return state

    # speed up none compares by using interned NONE
    def __setstate__(self, state: Dict[str, Any]):
        if 'norm' not in state:
            state['norm'] = self.NONE
        super().__setstate__(state)
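# Round-trip sketch: detached tokens hold only plain Python data, so they
# pickle; the ``norm``/``NONE`` state handling above keeps that cheap:
#
#   import pickle
#   detached = tok.detach()
#   assert pickle.loads(pickle.dumps(detached)) == detached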
@dataclass(init=False)
class SpacyFeatureToken(FeatureToken):
    """Contains and provides the same features as a spaCy
    :class:`~spacy.tokens.Token`.

    """
    spacy_token: Union[Token, Span] = field(repr=False, compare=False)
    """The parsed spaCy token (or span if an entity) on which this feature set
    is based.

    :see: :meth:`.FeatureDocument.spacy_doc`

    """
    def __getstate__(self):
        raise NLPError('Not persistable')

    @property
    def token(self) -> Token:
        """Return the spaCy token.

        """
        tok = self.spacy_token
        if isinstance(tok, Span):
            tok = self._doc[tok.start]
        return tok

    @property
    def is_wh(self) -> bool:
        """Return ``True`` if this is a WH word (i.e. what, where).

        """
        return self.token.tag_.startswith('W')

    @property
    def is_stop(self) -> bool:
        """Return ``True`` if this is a stop word.

        """
        return not self.is_ent and self.token.is_stop

    @property
    def is_punctuation(self) -> bool:
        """Return ``True`` if this is a punctuation (i.e. '?') token.

        """
        return self.token.is_punct

    @property
    def is_pronoun(self) -> bool:
        """Return ``True`` if this is a pronoun (i.e. 'he') token.

        """
        return False if self.is_ent else self.spacy_token.pos_ == 'PRON'

    @staticmethod
    def _is_apos(tok: Token) -> bool:
        """Return whether or not ``tok`` is an apostrophe (') symbol.

        :param tok: the token to compare

        """
        # compare the string orth to the lemma (the original compared the
        # integer ``orth`` hash to the string lemma, which is always unequal)
        return (tok.orth_ != tok.lemma_) and (tok.orth_.find('\'') >= 0)

    @property
    def lemma_(self) -> str:
        """Return the string lemma or the text of the named entity if tagged
        as a named entity.

        """
        return self.spacy_token.orth_ if self.is_ent \
            else self.spacy_token.lemma_

    @property
    def is_contraction(self) -> bool:
        """Return ``True`` if this token is a contraction.

        """
        if self.is_ent:
            return False
        else:
            t = self.spacy_token
            if self._is_apos(t):
                return True
            else:
                doc = t.doc
                dl = len(doc)
                return ((t.i + 1) < dl) and self._is_apos(doc[t.i + 1])

    @property
    def ent(self) -> int:
        """Return the entity numeric value or 0 if this is not an entity.

        """
        return self.spacy_token.label if self.is_ent else 0

    @property
    def ent_(self) -> str:
        """Return the entity string label or :obj:`NONE` if this token has no
        entity.

        """
        return self.spacy_token.label_ if self.is_ent else self.NONE

    @property
    def ent_iob(self) -> int:
        """Return the entity IOB tag, which is ``I`` for in, ``O`` for out and
        ``B`` for begin.

        """
        return self.token.ent_iob if self.is_ent else 0

    @property
    def ent_iob_(self) -> str:
        """Return the entity IOB nominal index for :obj:`ent_iob`.

        """
        return self.token.ent_iob_ if self.is_ent else 'O'
    def conll_iob_(self) -> str:
        """Return the CoNLL formatted IOB tag, such as ``B-ORG`` for a
        beginning organization token.

        """
        if not self.is_ent:
            return 'O'
        return f'{self.token.ent_iob_}-{self.token.ent_type_}'
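    # For example, the first token of the entity "New York" tagged as GPE
    # yields 'B-GPE', the second 'I-GPE', and any non-entity token 'O'.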
    @property
    def is_superlative(self) -> bool:
        """Return ``True`` if this token is a superlative.

        """
        return self.token.tag_ == 'JJS'

    @property
    def is_space(self) -> bool:
        """Return ``True`` if this token is white space only.

        """
        return self.token.is_space

    @property
    def sent_i(self) -> int:
        """The index of the sentence to which the token belongs.  This is not
        to be confused with the index of the token in the respective sentence,
        which is :obj:`.FeatureToken.i_sent`.

        This attribute does not exist in a spaCy token, and was named as such
        to follow the naming conventions of their API.

        """
        targ = self.i
        for six, sent in enumerate(self._doc.sents):
            for tok in sent:
                if tok.i == targ:
                    return six

    @property
    def tag(self) -> int:
        """Fine-grained part-of-speech tag.

        """
        return self.token.tag

    @property
    def tag_(self) -> str:
        """Fine-grained part-of-speech text.

        """
        return self.token.tag_

    @property
    def pos(self) -> int:
        """The simple UPOS part-of-speech tag.

        """
        return self.token.pos

    @property
    def pos_(self) -> str:
        """The simple UPOS part-of-speech tag.

        """
        return self.token.pos_

    @property
    def shape(self) -> int:
        """Transform of the token's string to show orthographic features.  For
        example, "Xxxx" or "dd".

        """
        return self.token.shape

    @property
    def shape_(self) -> str:
        """Transform of the token's string to show orthographic features.  For
        example, "Xxxx" or "dd".

        """
        return self.token.shape_

    @property
    def children(self):
        """A sequence of the token's immediate syntactic children.

        """
        return [c.i for c in self.token.children]

    @property
    def dep(self) -> int:
        """Syntactic dependency relation.

        """
        return self.token.dep

    @property
    def dep_(self) -> str:
        """Syntactic dependency relation string representation.

        """
        return self.token.dep_

    @property
    def norm_len(self) -> int:
        """The length of the norm in characters."""
        return len(self.norm)

    def __str__(self):
        if hasattr(self, 'spacy_token'):
            tokstr = self.spacy_token
        else:
            tokstr = self.norm
        return f'{tokstr} ({self.norm})'
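# End-to-end sketch (assumes an installed spaCy model; the construction of a
# SpacyFeatureToken is not shown in this module, so how the wrapper is built
# here is an assumption):
#
#   import spacy
#   nlp = spacy.load('en_core_web_sm')
#   sdoc = nlp('Obama was president.')
#   # a SpacyFeatureToken wraps a live sdoc token and proxies tag_/pos_/dep_
#   # etc.; calling detach() on it yields a pure-Python, picklable
#   # FeatureToken with those values copied out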