# Source code for zensols.deepnlp.index.domain

"""Contains a base class for vectorizers for indexing documents.

"""
__author__ = 'Paul Landes'

from typing import Tuple, Iterable, Any
from abc import ABC, ABCMeta, abstractmethod
from dataclasses import dataclass, field
import logging
from itertools import chain
from pathlib import Path
from zensols.util import time
from zensols.persist import (
    persisted,
    PersistedWork,
    PersistableContainer,
    Primeable
)
from zensols.nlp import FeatureToken, FeatureDocument
from zensols.deepnlp.vectorize import FeatureDocumentVectorizer

logger = logging.getLogger(__name__)


@dataclass
class IndexedDocumentFactory(ABC):
    """Supplies the training documents from which indexed features are
    generated (i.e. latent dirichlet allocation, latent semantic indexing
    etc).

    :see: :class:`.DocumentIndexVectorizer`

    """
    @abstractmethod
    def create_training_docs(self) -> Iterable[FeatureDocument]:
        """Create the documents used to index in the model during training.

        :return: the documents the model is trained on

        """
@dataclass
class DocumentIndexVectorizer(FeatureDocumentVectorizer,
                              PersistableContainer, Primeable,
                              metaclass=ABCMeta):
    """A vectorizer that generates vectorized features based on the indexed
    documents of the training set.  For example, latent dirichlet allocation
    may be used to generate a distribution of the likelihood that a document
    belongs to a topic.

    Subclasses of this abstract class are both vectorizers and models.  The
    model is created once and then cached.  To clear the cache and force it to
    be retrained, use :meth:`clear`.

    The method :meth:`_create_model` must be implemented.

    :see: :class:`.TopicModelDocumentIndexerVectorizer`

    .. document private functions
    .. automethod:: _create_model

    """
    doc_factory: IndexedDocumentFactory = field()
    """The document factory used to create training documents for the model
    vectorizer.

    """
    index_path: Path = field()
    """The path to the pickled cache file of the trained model.

    """
    def __post_init__(self):
        PersistableContainer.__init__(self)
        # the cache file's directory must exist before anything is written
        cache_dir: Path = self.index_path.parent
        cache_dir.mkdir(parents=True, exist_ok=True)
        # backs the ``model`` property, which is persisted under ``_model``
        self._model = PersistedWork(self.index_path, self)

    @staticmethod
    def feat_to_tokens(docs: Tuple[FeatureDocument, ...]) -> Tuple[str, ...]:
        """Create a tuple of string tokens from a set of documents suitable for
        document indexing.  The strings are the lower cased lemmas of the
        tokens, with space, stop word and punctuation tokens left out.

        **Important**: this method must remain static since the LSI instance of
        this class uses it as a factory function in a vectorizer.

        :param docs: the documents whose tokens are flattened into one
                     sequence

        :return: the lower cased lemma of every token that is kept, in
                 document then token order

        """
        all_tokens: Iterable[FeatureToken] = chain.from_iterable(
            doc.tokens for doc in docs)
        return tuple(
            tok.lemma_.lower()
            for tok in all_tokens
            if not (tok.is_space or tok.is_stop or tok.is_punctuation))

    @abstractmethod
    def _create_model(self, docs: Iterable[FeatureDocument]) -> Any:
        """Create the model for this indexer.  The model is implementation
        specific.  The model must be picklable and is cached as :obj:`model`.

        :param docs: the training documents used to build the model

        :return: the trained, implementation specific model

        """

    @property
    @persisted('_model')
    def model(self):
        """Return the trained model for this vectorizer.  See the class docs on
        how it is cached and cleared.

        """
        # the training documents are fetched before the timed section starts
        training_docs: Iterable[FeatureDocument] = \
            self.doc_factory.create_training_docs()
        with time('trained model'):
            if logger.isEnabledFor(logging.INFO):
                logger.info(f'creating model at {self.index_path}')
            return self._create_model(training_docs)

    def __getstate__(self):
        # the instance's attribute dictionary is handed over as is
        return vars(self)

    def prime(self):
        """Prime this vectorizer by accessing :obj:`model`."""
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'priming {self}')
        # the property access alone is the point; its value is not needed here
        self.model

    def clear(self):
        """Clear the cached model held by the persisted work in ``_model``."""
        self._model.clear()