Source code for zensols.deepnlp.embed.wordtext

"""Contains an abstract class that makes it easier to implement load word
vectors from text files.

"""
__author__ = 'Paul Landes'

from typing import List, Dict
from dataclasses import dataclass, field, InitVar
from abc import abstractmethod, ABCMeta
import logging
from pathlib import Path
import pickle
import numpy as np
import h5py
from h5py import Dataset
from zensols.util import time
from zensols.config import Dictable
from zensols.persist import Primeable
from zensols.install import Installer, Resource
from zensols.deepnlp.embed import WordVectorModel, WordEmbedModel
from . import WordEmbedError

logger = logging.getLogger(__name__)


@dataclass
class TextWordModelMetadata(Dictable):
    """Describes a text based :class:`.WordEmbedModel`.  The information in
    this class is used to construct paths to both the text source vector file
    and all generated binary files.

    """
    name: str = field()
    """The name of the word vector set (i.e. glove)."""

    desc: str = field()
    """A descriptor about this particular word vector set (i.e. 6B)."""

    dimension: int = field()
    """The dimension of the word vectors."""

    n_vocab: int = field()
    """The number of words in the vocabulary."""

    source_path: Path = field()
    """The path to the text file."""

    sub_directory: InitVar[Path] = field(default=None)
    """The subdirectory to be appended to :obj:`self.bin_dir`, which defaults
    to the directory ``bin/<description>.<dimension>``.

    """
    def __post_init__(self, sub_directory: Path):
        if sub_directory is None:
            fname: str = f'{self.name}.{self.desc}.{self.dimension}'
            sub_directory = Path('bin', fname)
        self.bin_dir = self.source_path.parent / sub_directory
        self.bin_file = self.bin_dir / 'vec.dat'
        self.words_file = self.bin_dir / 'words.dat'
        self.idx_file = self.bin_dir / 'idx.dat'
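
# A minimal sketch (not in the original module) of how the metadata derives
# the binary file locations from the source text file; the corpus path is
# hypothetical:
#
#     meta = TextWordModelMetadata(
#         name='glove', desc='6B', dimension=50, n_vocab=400000,
#         source_path=Path('corpus/glove/glove.6B.50d.txt'))
#     assert meta.bin_dir == Path('corpus/glove/bin/glove.6B.50')
#     assert meta.bin_file == meta.bin_dir / 'vec.dat'
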
@dataclass
class TextWordEmbedModel(WordEmbedModel, Primeable, metaclass=ABCMeta):
    """Extensions of this class read a text vectors file and compile, then
    write a binary representation for fast loading.

    """
    DATASET_NAME = 'vec'
    """Name of the dataset in the HDF5 file."""

    path: Path = field(default=None)
    """The path to the model file(s)."""

    installer: Installer = field(default=None)
    """The installer used to install the text vector zip file."""

    resource: Resource = field(default=None)
    """The zip resource used to find the path to the model files."""

    @abstractmethod
    def _get_metadata(self) -> TextWordModelMetadata:
        """Create the metadata used to construct paths to both the text source
        vector file and all generated binary files.

        """
        pass

    def _install(self) -> Path:
        """Install any missing word vector models."""
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'install resource for {self.name}: {self.resource}')
        self.installer()
        return self.installer[self.resource]

    @property
    def metadata(self) -> TextWordModelMetadata:
        """Return the metadata used to construct paths to both the text source
        vector file and all generated binary files.

        """
        if not hasattr(self, '_metadata'):
            if self.path is None and self.installer is None:
                raise WordEmbedError('No path is set')
            if self.installer is not None and self.resource is None:
                raise WordEmbedError("Installer given but not 'resource'")
            if self.installer is not None:
                self.path = self._install()
            self._metadata = self._get_metadata()
        return self._metadata

    def _get_model_id(self) -> str:
        """Return a string used to uniquely identify this model."""
        meta = self.metadata
        return f'{meta.name}: description={meta.desc}, dim={meta.dimension}'

    def _populate_vec_lines(self, words: List[str],
                            word2idx: Dict[str, int], ds: Dataset):
        """Add word vectors to the h5py dataset, vocab and vocab index.

        :param words: the list of vocabulary words

        :param word2idx: dictionary of word to word vector index (row)

        :param ds: the h5py data structure to add the word vectors

        """
        meta = self.metadata
        idx = 0
        lc = 0
        with open(meta.source_path, 'rb') as f:
            for rix, ln in enumerate(f):
                lc += 1
                line = ln.decode().strip().split(' ')
                word = line[0]
                words.append(word)
                word2idx[word] = idx
                idx += 1
                try:
                    ds[rix, :] = line[1:]
                except Exception as e:
                    raise WordEmbedError(
                        f'Could not parse line {lc} (word: {word}): ' +
                        f'{e}; line: {ln}') from e

    def _write_vecs(self):
        """Write the h5py binary files.  Only when they do not exist on the
        file system already are they calculated and written.

        """
        meta = self.metadata
        meta.bin_dir.mkdir(parents=True, exist_ok=True)
        words = []
        word2idx = {}
        if logger.isEnabledFor(logging.INFO):
            logger.info(f'writing binary vectors {meta.source_path} ' +
                        f'-> {meta.bin_dir}')
        shape = (meta.n_vocab, meta.dimension)
        if logger.isEnabledFor(logging.INFO):
            logger.info(f'creating h5py binary vec files with shape {shape}:')
            meta.write_to_log(logger, logging.INFO, 1)
        with time(f'wrote h5py to {meta.bin_file}'):
            with h5py.File(meta.bin_file, 'w') as f:
                dset: Dataset = f.create_dataset(
                    self.DATASET_NAME, shape, dtype='float64')
                self._populate_vec_lines(words, word2idx, dset)
        with open(meta.words_file, 'wb') as f:
            pickle.dump(words[:], f)
        with open(meta.idx_file, 'wb') as f:
            pickle.dump(word2idx, f)

    def _assert_binary_vecs(self):
        """Ensure the binary vector files exist, writing them if missing."""
        meta = self.metadata
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'{meta.bin_file} exists: {meta.bin_file.exists()}')
        if not meta.bin_file.exists():
            if logger.isEnabledFor(logging.INFO):
                logger.info(f'writing binary vectors to: {meta.bin_file}')
            self._write_vecs()
    def prime(self):
        self._assert_binary_vecs()
    def _create_data(self) -> WordVectorModel:
        """Read the binary h5py vector, vocabulary and index files from disk.

        """
        self._assert_binary_vecs()
        meta = self.metadata
        if logger.isEnabledFor(logging.INFO):
            logger.info(f'reading binary vector file: {meta.bin_file}')
        with time('loaded {cnt} vectors'):
            with h5py.File(meta.bin_file, 'r') as f:
                ds: Dataset = f[self.DATASET_NAME]
                vectors: np.ndarray = ds[:]
            if logger.isEnabledFor(logging.DEBUG):
                logger.debug(f'word embedding type: {vectors.dtype}')
            with open(meta.words_file, 'rb') as f:
                words = pickle.load(f)
            with open(meta.idx_file, 'rb') as f:
                word2idx = pickle.load(f)
            cnt = len(word2idx)
        with time('prepared vectors'):
            unknown_vec: np.ndarray = np.expand_dims(
                np.zeros(self.dimension), axis=0)
            if logger.isEnabledFor(logging.DEBUG):
                logger.debug(f'unknown type: {unknown_vec.dtype}')
            vectors: np.ndarray = np.concatenate((vectors, unknown_vec))
            word2idx[self.UNKNOWN] = len(words)
            words.append(self.UNKNOWN)
            word2vec = {w: vectors[word2idx[w]] for w in words}
        return WordVectorModel(vectors, word2vec, words, word2idx)
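
# A standalone sketch (not in the original module) of the h5py round trip
# that ``_write_vecs`` and ``_create_data`` rely on: vectors are compiled
# once into a ``float64`` dataset, then read back as a single numpy array.
# The file name is hypothetical:
#
#     vecs = np.random.rand(3, 50)
#     with h5py.File('vec.dat', 'w') as f:
#         ds = f.create_dataset('vec', vecs.shape, dtype='float64')
#         ds[:, :] = vecs
#     with h5py.File('vec.dat', 'r') as f:
#         loaded = f['vec'][:]
#     assert np.allclose(vecs, loaded)
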
@dataclass
class DefaultTextWordEmbedModel(TextWordEmbedModel):
    """This class uses the Stanford pretrained GloVe embeddings as a ``dict``
    like Python object.  It loads the GloVe vectors from a text file and then
    creates a binary file that's quick to load on subsequent uses.

    An example configuration would be::

        [glove_embedding]
        class_name = zensols.deepnlp.embed.GloveWordEmbedModel
        path = path: ${default:corpus_dir}/glove
        desc = 6B
        dimension = 50

    """
    name: str = field(default='unknown_name')
    """The name of the word vector set (i.e. glove)."""

    desc: str = field(default='unknown_desc')
    """The size description (i.e. 6B for the six billion word trained
    vectors).

    """
    dimension: int = field(default=50)
    """The word vector dimension."""

    vocab_size: int = field(default=0)
    """Vocabulary size."""

    file_name_pattern: str = field(default='{name}.{desc}.{dimension}d.txt')
    """The format of the file to create."""

    @property
    def file_name(self) -> str:
        return self.file_name_pattern.format(
            name=self.name, desc=self.desc, dimension=self.dimension)

    def _get_metadata(self) -> TextWordModelMetadata:
        name: str = self.name
        dim: int = self.dimension
        desc: str = self.desc
        path: Path = self.path / self.file_name
        return TextWordModelMetadata(name, desc, dim, self.vocab_size, path)
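
# A hedged usage sketch (not in the original module), assuming a GloVe text
# file at the hypothetical path ``corpus/glove/glove.6B.50d.txt``: the first
# access compiles the text vectors to binary files, and later loads read only
# the binaries.  The lookup assumes the ``dict`` like access described in the
# class docstring and provided by :class:`.WordEmbedModel`:
#
#     model = DefaultTextWordEmbedModel(
#         name='glove', desc='6B', dimension=50, vocab_size=400000,
#         path=Path('corpus/glove'))
#     model.prime()        # writes bin/glove.6B.50/vec.dat if missing
#     vec = model['the']   # a 50 dimension numpy array
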