Source code for zensols.deepnlp.vectorize.embed

"""This file contains a stash used to load an embedding layer.  It creates
features in batches of matrices and persists matrix only (sans features) for
efficient retrival.

"""
__author__ = 'Paul Landes'

from typing import Tuple, Iterable, List, Union
from dataclasses import dataclass, field
import logging
from itertools import chain
import torch
from torch import Tensor
from zensols.config import Dictable
from zensols.persist import persisted, Primeable
from zensols.deeplearn.vectorize import FeatureContext, TensorFeatureContext
from zensols.nlp import FeatureToken, FeatureDocument, FeatureSentence
from zensols.deepnlp.embed import WordEmbedModel
from zensols.deepnlp.vectorize import TextFeatureType
from . import FoldingDocumentVectorizer

logger = logging.getLogger(__name__)


@dataclass
class EmbeddingFeatureVectorizer(FoldingDocumentVectorizer, Primeable,
                                 Dictable):
    """Vectorize a :class:`~zensols.nlp.container.FeatureDocument` as a vector
    of embedding indexes.  Later, these indexes are used in a
    :class:`~zensols.deepnlp.layer.embed.EmbeddingLayer` to create the input
    word embedding during execution of the model.

    """
    embed_model: Union[WordEmbedModel, 'TransformerEmbedding'] = field()
    """The word vector model.  Types for this value include:

      * :class:`~zensols.deepnlp.embed.domain.WordEmbedModel`

      * :class:`~zensols.deepnlp.transformer.embed.TransformerEmbedding`

    """
    decode_embedding: bool = field(default=False)
    """Whether or not to decode the embedding during the decode phase, which
    is helpful when caching batches; otherwise, the data is decoded from
    indexes to embeddings each epoch.

    Note that this option and functionality cannot be replaced by that
    implemented with the :obj:`encode_transformed` attribute.  The difference
    is whether the additional work is done during decoding rather than
    encoding.  An example of when this is useful is for large word embeddings
    (e.g. Google 300D pretrained) where the index to tensor embedding
    transform is done while decoding rather than in ``forward`` so it is not
    repeated every epoch.

    """
    def _get_shape(self) -> Tuple[int, int]:
        return self.manager.token_length, self.embed_model.vector_dimension
    def prime(self):
        if isinstance(self.embed_model, Primeable):
            self.embed_model.prime()
    def _get_dictable_attributes(self) -> Iterable[Tuple[str, str]]:
        return chain.from_iterable(
            [super()._get_dictable_attributes(),
             [('model', 'embed_model')]])
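
# A minimal usage sketch of the encode/decode round trip described in the
# class docstring above (illustrative only: ``doc`` is a parsed
# FeatureDocument, and the ``vec_manager['wvembed']`` lookup and its key are
# hypothetical):
#
#   vec: EmbeddingFeatureVectorizer = vec_manager['wvembed']
#   ctx: FeatureContext = vec.encode(doc)  # long tensor of embedding indexes
#   arr: Tensor = vec.decode(ctx)          # indexes, or embedding vectors
#                                          # when ``decode_embedding`` is set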

@dataclass
class WordVectorEmbeddingFeatureVectorizer(EmbeddingFeatureVectorizer):
    """Vectorize sentences using an embedding model (:obj:`embed_model`) of
    type :class:`.WordEmbedModel`.

    The encoder returns the indices of the word embedding for each token in
    the input :class:`.FeatureDocument`.  The decoder returns the
    corresponding word embedding vectors if :obj:`decode_embedding` is
    ``True``.  Otherwise it returns the same indices, which are later used by
    the embedding layer (usually
    :class:`~zensols.deepnlp.layer.EmbeddingLayer`).

    """
    DESCRIPTION = 'word vector document embedding'
    FEATURE_TYPE = TextFeatureType.EMBEDDING

    token_feature_id: str = field(default='norm')
    """The :class:`~zensols.nlp.tok.FeatureToken` attribute used to index the
    embedding vectors.

    """
    def _encode(self, doc: FeatureDocument) -> FeatureContext:
        emodel = self.embed_model
        tw: int = self.manager.get_token_length(doc)
        sents: Tuple[FeatureSentence] = doc.sents
        shape: Tuple[int, int] = (len(sents), tw)
        tfid: str = self.token_feature_id
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'using token length: {tw} with shape: {shape}, ' +
                         f'sents: {len(sents)}')
        arr = self.torch_config.empty(shape, dtype=torch.long)
        row: int
        sent: FeatureSentence
        for row, sent in enumerate(sents):
            # truncate each sentence to the configured token length
            tokens: List[FeatureToken] = sent.tokens[0:tw]
            slen: int = len(tokens)
            if logger.isEnabledFor(logging.DEBUG):
                logger.debug(f'row: {row}, toks: ' +
                             ' '.join(map(lambda x: x.norm, tokens)))
            tokens = list(map(lambda t: getattr(t, tfid), tokens))
            if slen < tw:
                # pad short sentences with the zero (null) token
                tokens += [WordEmbedModel.ZERO] * (tw - slen)
            for i, tok in enumerate(tokens):
                # map each token to its embedding index, falling back to the
                # unknown token's index when out of vocabulary
                arr[row][i] = emodel.word2idx_or_unk(tok)
        return TensorFeatureContext(self.feature_id, arr)

    @property
    @persisted('_vectors')
    def vectors(self) -> Tensor:
        embed_model: WordEmbedModel = self.embed_model
        return embed_model.to_matrix(self.torch_config)

    def _decode(self, context: FeatureContext) -> Tensor:
        x: Tensor = super()._decode(context)
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'indexes: {x.shape} ({x.dtype}), ' +
                         f'will decode in vectorizer: {self.decode_embedding}')
        if self.decode_embedding:
            if logger.isEnabledFor(logging.DEBUG):
                logger.debug(f'decoding using: {self.decode_embedding}')
            src_vecs: Tensor = self.vectors
            batches: List[Tensor] = []
            vecs: List[Tensor] = []
            # replace each embedding index with its corresponding vector
            for batch_idx in x:
                for idxt in batch_idx:
                    vecs.append(src_vecs[idxt])
                batches.append(torch.stack(vecs))
                vecs.clear()
            x = torch.stack(batches)
        return x
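
# The nested loop in ``_decode`` materializes one embedding vector per index.
# A minimal equivalent sketch using plain PyTorch advanced indexing (assumed
# shapes: ``x`` is a (batch, tokens) long tensor of indexes and ``src_vecs``
# is the (vocab, dim) embedding matrix; the sizes below are made up):
#
#   import torch
#   src_vecs = torch.randn(100, 300)        # (vocab, dim) embedding matrix
#   x = torch.randint(0, 100, (8, 20))      # (batch, tokens) index tensor
#   decoded = src_vecs[x]                   # (batch, tokens, dim)
#   # equivalently: torch.nn.functional.embedding(x, src_vecs)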