Source code for zensols.deepnlp.embed.doc
"""A :class:`zensols.nlp.container.FeatureDocument` decorator that populates
sentence and token embeddings.
"""
__author__ = 'Paul Landes'
from typing import Optional, Union, List
from dataclasses import dataclass, field
import numpy as np
import torch
from torch import Tensor
from zensols.deeplearn import TorchConfig
from zensols.nlp import (
    FeatureToken, FeatureSentence, FeatureDocument, FeatureDocumentDecorator
)
from . import WordEmbedModel
@dataclass
class WordEmbedDocumentDecorator(FeatureDocumentDecorator):
    """Populates sentence and token embeddings in the documents.  Tokens have
    shape ``(1, d)`` where ``d`` is the embedding dimension, and the first
    dimension is always 1 to be compatible with word piece embeddings
    populated by :class:`..transformer.WordPieceDocumentDecorator`.

    :see: :class:`.WordEmbedModel`

    """
    model: WordEmbedModel = field()
    """The word embedding model for populating tokens and sentences."""

    torch_config: Optional[TorchConfig] = field(default=None)
    """The Torch configuration used to allocate the embeddings on either the
    GPU or the CPU.  If ``None``, NumPy :class:`numpy.ndarray` arrays are used
    instead of :class:`torch.Tensor`.

    """
    token_embeddings: bool = field(default=True)
    """Whether to add :class:`.WordPieceFeatureToken.embeddings`.

    """
    sent_embeddings: bool = field(default=True)
    """Whether to add :class:`.WordPieceFeatureSentence.embeddings`.

    """
    skip_oov: bool = field(default=False)
    """Whether to skip out-of-vocabulary tokens that have no embeddings."""
    def _add_sent_embedding(self, sent: FeatureSentence):
        use_np: bool = self.torch_config is None
        add_tok_emb: bool = self.token_embeddings
        model: WordEmbedModel = self.model
        # the embedding matrix is a numpy array when no torch config is given
        emb: Union[np.ndarray, Tensor]
        sembs: List[Union[np.ndarray, Tensor]] = []
        if use_np:
            # already a numpy array
            emb = model.matrix
        else:
            # convert to a torch tensor based on the configuration (i.e. device)
            emb = model.to_matrix(self.torch_config)
        tok: FeatureToken
        for tok in sent.token_iter():
            norm: str = tok.norm
            # the index is None for out-of-vocabulary tokens
            idx: Optional[int] = model.word2idx(norm)
            if not self.skip_oov or idx is not None:
                if idx is None:
                    idx = model.unk_idx
                vec: Union[np.ndarray, Tensor] = emb[idx]
                sembs.append(vec)
                if add_tok_emb:
                    # add a leading dimension of 1 for word piece compatibility
                    if use_np:
                        vec = np.expand_dims(vec, axis=0)
                    else:
                        vec = vec.unsqueeze(dim=0)
                    tok.embedding = vec
        # sentence embeddings are the centroid of the (non-contextual) token
        # embeddings
        if len(sembs) > 0 and self.sent_embeddings:
            if use_np:
                sent.embedding = np.stack(sembs).mean(axis=0)
            else:
                sent.embedding = torch.stack(sembs).mean(dim=0)
    def decorate(self, doc: FeatureDocument):
        assert isinstance(self.model, WordEmbedModel)
        if self.token_embeddings or self.sent_embeddings:
            sent: FeatureSentence
            for sent in doc.sents:
                self._add_sent_embedding(sent)
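

# A minimal usage sketch, not part of the module source.  It assumes
# ``embed_model`` is an already-configured ``WordEmbedModel`` and ``doc`` a
# previously parsed ``FeatureDocument``; passing ``use_gpu=False`` to
# ``TorchConfig`` to allocate CPU tensors is also an assumption of this
# sketch.
#
#     decorator = WordEmbedDocumentDecorator(
#         model=embed_model,
#         torch_config=TorchConfig(use_gpu=False),
#         skip_oov=True)
#     decorator.decorate(doc)
#     # each token now has a ``(1, d)`` embedding and each sentence the
#     # ``(d,)`` centroid of its token embeddings
#     sent: FeatureSentence = doc.sents[0]
#     print(sent.embedding.shape)
#     print(next(sent.token_iter()).embedding.shape)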