"""Contains a base class for vectorizers for indexing document."""
__author__ = 'Paul Landes'

from typing import Tuple, Iterable, Any
from abc import ABC, ABCMeta, abstractmethod
from dataclasses import dataclass, field
import logging
from itertools import chain
from pathlib import Path
from zensols.util import time
from zensols.persist import (
    persisted,
    PersistedWork,
    PersistableContainer,
    Primeable,
)
from zensols.nlp import FeatureToken, FeatureDocument
from zensols.deepnlp.vectorize import FeatureDocumentVectorizer

logger = logging.getLogger(__name__)
@dataclass
class IndexedDocumentFactory(ABC):
    """Produces the corpus of training documents from which indexed features
    are derived (e.g. latent dirichlet allocation, latent semantic indexing,
    etc.).

    :see: :class:`.DocumentIndexVectorizer`

    """
    @abstractmethod
    def create_training_docs(self) -> Iterable[FeatureDocument]:
        """Return the documents to be indexed by the model at training time.

        """
        pass
@dataclass
class DocumentIndexVectorizer(FeatureDocumentVectorizer,
                              PersistableContainer, Primeable,
                              metaclass=ABCMeta):
    """A vectorizer whose features are computed from an index built over the
    training set's documents.  For example, latent dirichlet allocation may be
    used to generate a distribution of the likelihood a document belongs to a
    topic.

    Subclasses of this abstract class are both vectorizers and models.  The
    model is created once, then cached on disk.  To clear the cache and force
    retraining, use :meth:`clear`.

    The method :meth:`_create_model` must be implemented.

    :see: :class:`.TopicModelDocumentIndexerVectorizer`

    .. document private functions
    .. automethod:: _create_model

    """
    doc_factory: IndexedDocumentFactory = field()
    """The document factory used to create training documents for the model
    vectorizer.

    """
    index_path: Path = field()
    """The path to the pickled cache file of the trained model.

    """
    def __post_init__(self):
        # initialize the persistable machinery before any persisted access
        PersistableContainer.__init__(self)
        # make sure the cache file's directory exists before the model is
        # written there
        self.index_path.parent.mkdir(parents=True, exist_ok=True)
        # back the ``model`` property's @persisted('_model') cache with the
        # on-disk pickle at ``index_path``
        self._model = PersistedWork(self.index_path, self)

    @staticmethod
    def feat_to_tokens(docs: Tuple[FeatureDocument, ...]) -> Tuple[str, ...]:
        """Flatten a set of documents in to a tuple of lower-cased lemma
        strings suitable for document indexing.  Space, stop and punctuation
        tokens are dropped.

        **Important**: this method must remain static since the LSI instance
        of this class uses it as a factory function in the a vectorizer.

        """
        return tuple(
            tok.lemma_.lower()
            for tok in chain.from_iterable(doc.tokens for doc in docs)
            # equivalent (De Morgan) to: not is_space and not is_stop and
            # not is_punctuation, preserving the short-circuit order
            if not (tok.is_space or tok.is_stop or tok.is_punctuation))

    @abstractmethod
    def _create_model(self, docs: Iterable[FeatureDocument]) -> Any:
        """Create the model for this indexer.  The model is implementation
        specific.  It must be picklable since it is cached as :obj:`model`.

        """
        pass

    @property
    @persisted('_model')
    def model(self):
        """The trained model for this vectorizer.  See the class docs on how
        it is cached and cleared.

        """
        training_docs: Iterable[FeatureDocument] = \
            self.doc_factory.create_training_docs()
        with time('trained model'):
            if logger.isEnabledFor(logging.INFO):
                logger.info(f'creating model at {self.index_path}')
            return self._create_model(training_docs)

    def __getstate__(self):
        # NOTE(review): returning the raw __dict__ bypasses
        # PersistableContainer.__getstate__; presumably intentional so the
        # persisted work pickles with the instance -- confirm
        return self.__dict__