Source code for zensols.deepnlp.embed.domain

"""Interface file for word vectors, aka non-contextual word embeddings.

"""
__author__ = 'Paul Landes'

from typing import List, Dict, Tuple, Iterable, ClassVar, Optional
from dataclasses import dataclass, field
from abc import ABCMeta, abstractmethod
import logging
import numpy as np
import torch
from torch import Tensor
import gensim
from gensim.models.keyedvectors import Word2VecKeyedVectors, KeyedVectors
from zensols.persist import persisted, PersistableContainer, PersistedWork
from zensols.deeplearn import TorchConfig, DeepLearnError

logger = logging.getLogger(__name__)


class WordEmbedError(DeepLearnError):
    """Raised for any errors pertaining to word vectors."""
    pass


@dataclass
class WordVectorModel(object):
    """Vector data from the model."""

    vectors: np.ndarray = field()
    """The word vectors."""

    word2vec: Dict[str, np.ndarray] = field()
    """The word to word vector mapping."""

    words: List[str] = field()
    """The vocabulary."""

    word2idx: Dict[str, int] = field()
    """The word to word vector index mapping."""

    def __post_init__(self):
        self.tensors = {}

    def to_matrix(self, torch_config: TorchConfig) -> torch.Tensor:
        dev = torch_config.device
        if dev in self.tensors:
            if logger.isEnabledFor(logging.INFO):
                logger.info(f'reusing already cached vectors from {torch_config}')
            vecs = self.tensors[dev]
        else:
            if logger.isEnabledFor(logging.INFO):
                logger.info(f'created tensor vector matrix on {torch_config}')
            vecs = torch_config.from_numpy(self.vectors)
            self.tensors[dev] = vecs
        return vecs
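
# A minimal usage sketch (illustrative only; ``torch_config`` is assumed to be
# a ``TorchConfig`` instance created elsewhere by the application).  It shows
# that ``to_matrix`` converts the numpy vectors once per device and then
# reuses the cached tensor on subsequent calls:
#
#     model = WordVectorModel(vectors, word2vec, words, word2idx)
#     mat_a = model.to_matrix(torch_config)
#     mat_b = model.to_matrix(torch_config)
#     assert mat_a is mat_b  # second call returns the per-device cached tensor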


@dataclass
class _WordEmbedVocabAdapter(object):
    """Adapts a :class:`.WordEmbedModel` to a gensim :class:`.KeyedVectors`,
    which is used in :meth:`.WordEmbedModel._create_keyed_vectors`.

    """
    model: WordVectorModel = field()

    def __post_init__(self):
        self._index = -1

    @property
    def index(self):
        return self._index

    def __iter__(self):
        words: List[str] = self.model.words
        return iter(words)

    def get(self, word: str, default: int):
        self._index = self.model.word2idx.get(word, default)

    def __getitem__(self, word: str):
        self._index = self.model.word2idx[word]
        return self
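
# Note on the adapter above: pre-4.0 versions of gensim look words up with an
# expression along the lines of ``keyed_vectors.vocab[word].index`` to find a
# word's row in the vector matrix.  ``__getitem__`` therefore records the
# index and returns ``self`` so the subsequent ``.index`` attribute access
# resolves to that row.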


@dataclass
class WordEmbedModel(PersistableContainer, metaclass=ABCMeta):
    """This is an abstract base class that represents a set of word vectors
    (e.g. GloVe).

    """
    UNKNOWN: ClassVar[str] = '<unk>'
    """The unknown symbol used for out of vocabulary words."""

    ZERO: ClassVar[str] = UNKNOWN
    """The zero vector symbol used for padding vectors."""

    _CACHE: ClassVar[Dict[str, WordVectorModel]] = {}
    """Contains cached embedding models that point to the same source."""

    name: str = field()
    """The name of the model given by the configuration, which must be unique
    across word vector type and dimension.

    """
    cache: bool = field(default=True)
    """If ``True``, globally cache all data structures, which should be
    ``False`` if more than one embedding across a model type is used.

    """
    lowercase: bool = field(default=False)
    """If ``True``, downcase each word for all methods that take a word as
    input.  Use this for embeddings that are only lower case in order to find
    more hits when querying for words that have uppercase characters.

    """
    def __post_init__(self):
        super().__init__()
        self._data_inst = PersistedWork('_data_inst', self, transient=True)

    @abstractmethod
    def _get_model_id(self) -> str:
        """Return a string that uniquely identifies this instance of the
        embedding model.  This should have the type, size and dimension of the
        embedding.

        :see: :obj:`model_id`

        """
        pass

    @abstractmethod
    def _create_data(self) -> WordVectorModel:
        """Return the vector data from the model as a :class:`.WordVectorModel`
        containing the vectors, word to vector mapping, vocabulary and word to
        index mapping.

        """
        pass

    def clear_cache(self):
        for model in self._CACHE.values():
            self._try_deallocate(model)
        self._CACHE.clear()

    def deallocate(self):
        self.clear_cache()
        super().deallocate()

    @property
    def model_id(self) -> str:
        """Return a string that uniquely identifies this instance of the
        embedding model.  This should have the type, size and dimension of the
        embedding.

        This string is used to cache models in both CPU and GPU memory so the
        layers can have the benefit of reusing the same in memory word
        embedding matrix.

        """
        return self._get_model_id()

    @persisted('_data_inst', transient=True)
    def _data(self) -> WordVectorModel:
        model_id = self.model_id
        wv_model: WordVectorModel = self._CACHE.get(model_id)
        if wv_model is None:
            wv_model = self._create_data()
            if self.cache:
                self._CACHE[model_id] = wv_model
        return wv_model

    @property
    def matrix(self) -> np.ndarray:
        """The word vector matrix."""
        return self._data().vectors

    @property
    def shape(self) -> Tuple[int, int]:
        """The shape of the word vector :obj:`matrix`."""
        return self.matrix.shape

    def to_matrix(self, torch_config: TorchConfig) -> Tensor:
        """Return a matrix that represents the entire vector embedding as a
        tensor.

        :param torch_config: indicates where to load the new tensor

        """
        return self._data().to_matrix(torch_config)

    @property
    def vectors(self) -> Dict[str, np.ndarray]:
        """Return all word vectors with the string words as keys."""
        return self._data().word2vec

    @property
    def vector_dimension(self) -> int:
        """Return the dimension of the word vectors."""
        return self.matrix.shape[1]

    def keys(self) -> Iterable[str]:
        """Return the keys, which are the word2vec words."""
        return self.vectors.keys()

    @property
    @persisted('_unk_idx')
    def unk_idx(self) -> int:
        """The index of the out-of-vocabulary (unknown) token."""
        model: WordVectorModel = self._data()
        word2idx: Dict[str, int] = model.word2idx
        return word2idx.get(self.UNKNOWN)

    def word2idx(self, word: str, default: int = None) -> Optional[int]:
        """Return the index of ``word`` or ``default`` if it is not indexed.

        """
        if self.lowercase:
            word = word.lower()
        model: WordVectorModel = self._data()
        word2idx: Dict[str, int] = model.word2idx
        idx: int = word2idx.get(word)
        if idx is None:
            idx = default
        return idx

    def word2idx_or_unk(self, word: str) -> int:
        """Return the index of ``word`` or the :obj:`UNKNOWN` index if it is
        not indexed.

        """
        return self.word2idx(word, self.unk_idx)
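
    # A hypothetical usage sketch: converting tokens to vocabulary indices for
    # an embedding lookup, with out-of-vocabulary tokens mapped to the unknown
    # index (the token list and the ``emb`` instance are illustrative only):
    #
    #     tokens = ['the', 'quick', 'brown', 'fox']
    #     indices = [emb.word2idx_or_unk(t) for t in tokens]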

    def prime(self):
        pass

    def get(self, key: str, default: np.ndarray = None) -> np.ndarray:
        """Just like ``dict.get()``, but return the vector for a word.

        :param key: the word for which to get the vector

        :param default: what to return if ``key`` doesn't exist in the dict

        :return: the word vector

        """
        if self.lowercase:
            key = key.lower()
        return self.vectors.get(key, default)

    @property
    @persisted('_keyed_vectors', transient=True)
    def keyed_vectors(self) -> KeyedVectors:
        """Adapt instances of this class to a gensim keyed vector instance."""
        return self._create_keyed_vectors()

    def _create_keyed_vectors(self) -> KeyedVectors:
        kv = Word2VecKeyedVectors(vector_size=self.vector_dimension)
        if gensim.__version__[0] >= '4':
            kv.key_to_index = self._data().word2idx
        else:
            kv.vocab = _WordEmbedVocabAdapter(self._data())
        kv.vectors = self.matrix
        kv.index2entity = list(self._data().words)
        return kv

    def __getitem__(self, key: str):
        if self.lowercase:
            key = key.lower()
        return self.vectors[key]

    def __contains__(self, key: str):
        if self.lowercase:
            key = key.lower()
        return key in self.vectors

    def __len__(self):
        return self.matrix.shape[0]

    def __str__(self):
        s = f'{self.__class__.__name__} ({self.name}): id={self.model_id}'
        if self._data_inst.is_set():
            s += f', num words={len(self)}, dim={self.vector_dimension}'
        return s
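

# A hypothetical usage sketch of a concrete ``WordEmbedModel`` subclass
# (``emb`` stands for an embedding instance configured elsewhere, e.g. a GloVe
# model; the words and calls below are illustrative only):
#
#     emb.prime()
#     idx = emb.word2idx_or_unk('the')    # vocabulary index or the unk index
#     vec = emb.get('the')                # numpy vector, or None if absent
#     has = 'the' in emb                  # membership test via __contains__
#     kv = emb.keyed_vectors              # gensim KeyedVectors adapter
#     sims = kv.most_similar('the')       # standard gensim similarity query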