Source code for zensols.deepnlp.embed.doc
"""A :class:`zensols.nlp.container.FeatureDocument` decorator that populates
sentence and token embeddings.
"""
__author__ = 'Paul Landes'
from typing import Optional, Union, List
from dataclasses import dataclass, field
import numpy as np
import torch
from torch import Tensor
from zensols.deeplearn import TorchConfig
from zensols.nlp import (
    FeatureToken, FeatureSentence, FeatureDocument, FeatureDocumentDecorator
)
from . import WordEmbedModel
@dataclass
class WordEmbedDocumentDecorator(FeatureDocumentDecorator):
    """Populates sentence and token embeddings in the documents.  Tokens have
    shape ``(1, d)`` where ``d`` is the embedding dimension, and the first
    dimension is always 1 to be compatible with word piece embeddings
    populated by :class:`..transformer.WordPieceDocumentDecorator`.

    :see: :class:`.WordEmbedModel`

    """
    model: WordEmbedModel = field()
    """The word embedding model for populating tokens and sentences."""

    torch_config: Optional[TorchConfig] = field(default=None)
    """The Torch configuration used to allocate the embeddings on either the
    GPU or the CPU.  If ``None``, NumPy :class:`numpy.ndarray` arrays are used
    instead of :class:`torch.Tensor`.

    """
    token_embeddings: bool = field(default=True)
    """Whether to add :class:`.WordPieceFeatureToken.embeddings`.

    """
    sent_embeddings: bool = field(default=True)
    """Whether to add :class:`.WordPieceFeatureSentence.embeddings`.

    """
    skip_oov: bool = field(default=False)
    """Whether to skip out-of-vocabulary tokens that have no embeddings."""
    def _add_sent_embedding(self, sent: FeatureSentence):
        use_np: bool = self.torch_config is None
        add_tok_emb: bool = self.token_embeddings
        model: WordEmbedModel = self.model
        # the embedding matrix is a numpy array when no torch config is given
        emb: Union[np.ndarray, Tensor]
        sembs: List[Union[np.ndarray, Tensor]] = []
        if use_np:
            # already a numpy array
            emb = model.matrix
        else:
            # convert to a torch tensor based on the configuration (i.e. device)
            emb = model.to_matrix(self.torch_config)
        tok: FeatureToken
        for tok in sent.token_iter():
            norm: str = tok.norm
            # the index is None for out-of-vocabulary tokens
            idx: Optional[int] = model.word2idx(norm)
            if not self.skip_oov or idx is not None:
                if idx is None:
                    idx = model.unk_idx
                vec: Union[np.ndarray, Tensor] = emb[idx]
                sembs.append(vec)
                if add_tok_emb:
                    # add a leading dimension of 1 for word piece compatibility
                    if use_np:
                        vec = np.expand_dims(vec, axis=0)
                    else:
                        vec = vec.unsqueeze(dim=0)
                    tok.embedding = vec
        # sentence embeddings are the centroid of the (non-contextual) token
        # embeddings
        if len(sembs) > 0 and self.sent_embeddings:
            if use_np:
                sent.embedding = np.stack(sembs).mean(axis=0)
            else:
                sent.embedding = torch.stack(sembs).mean(dim=0)
    def decorate(self, doc: FeatureDocument):
        assert isinstance(self.model, WordEmbedModel)
        if self.token_embeddings or self.sent_embeddings:
            sent: FeatureSentence
            for sent in doc.sents:
                self._add_sent_embedding(sent)
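

# A minimal usage sketch, not part of the module source.  It assumes
# ``embed_model`` is an already-configured ``WordEmbedModel`` and ``doc`` a
# previously parsed ``FeatureDocument``; passing ``use_gpu=False`` to
# ``TorchConfig`` to allocate CPU tensors is also an assumption of this
# sketch.
#
#     decorator = WordEmbedDocumentDecorator(
#         model=embed_model,
#         torch_config=TorchConfig(use_gpu=False),
#         skip_oov=True)
#     decorator.decorate(doc)
#     # each token now has a ``(1, d)`` embedding and each sentence the
#     # ``(d,)`` centroid of its token embeddings
#     sent: FeatureSentence = doc.sents[0]
#     print(sent.embedding.shape)
#     print(next(sent.token_iter()).embedding.shape)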