Source code for zensols.deepnlp.embed.glove
"""This module contains the definition of a class that operates like a dict to
retrieve GloVe word embeddings.  It also creates, stores and reads a binary
representation for quick loading on startup.
"""
__author__ = 'Paul Landes'
from dataclasses import dataclass, field
import logging
from pathlib import Path
from . import TextWordEmbedModel, TextWordModelMetadata
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
[docs]
@dataclass
class GloveWordEmbedModel(TextWordEmbedModel):
    """A ``dict`` like accessor for the Stanford pretrained GloVe word vectors.

    The vectors are read from a text file, after which a binary file is
    created so that subsequent uses load quickly.

    An example configuration would be::

        [glove_embedding]
        class_name = zensols.deepnlp.embed.GloveWordEmbedModel
        path = path: ${default:corpus_dir}/glove
        desc = 6B
        dimension = 50

    """
    desc: str = field(default='6B')
    """The size description of the trained vectors (i.e. ``6B`` for the vectors
    trained on six billion words).

    """
    dimension: int = field(default=50)
    """The dimension of each word vector."""

    vocab_size: int = field(default=400000)
    """The size of the vocabulary."""

    def _install(self) -> Path:
        """Invoke the installer and return the directory that contains the
        resource.

        :return: the parent of the path the installer maps to
                 :obj:`resource`

        """
        self.installer()
        resource_path = self.installer[self.resource]
        return resource_path.parent

    def _get_metadata(self) -> TextWordModelMetadata:
        """Create the metadata that describes this model.

        The vector text file is resolved under :obj:`path` with a name of
        the form ``glove.<desc>.<dimension>d.txt``.

        :return: metadata built from the model name, :obj:`desc`,
                 :obj:`dimension`, :obj:`vocab_size` and the text file path

        """
        model_name = 'glove'
        file_name = f'{model_name}.{self.desc}.{self.dimension}d.txt'
        return TextWordModelMetadata(
            model_name,
            self.desc,
            self.dimension,
            self.vocab_size,
            self.path / file_name)