# Source code for zensols.deepnlp.embed.fasttext

"""Fast text word vector implementation.

"""
# NOTE(review): this name has three leading underscores; the conventional
# module attribute is ``__author__`` -- confirm before renaming
___author__ = 'Paul Landes'

from typing import List, Dict
from dataclasses import dataclass, field
import logging
from h5py import Dataset
from zensols.deepnlp.embed import TextWordEmbedModel, TextWordModelMetadata

# module-level logger named after this module
logger = logging.getLogger(__name__)



# [docs]
@dataclass
class FastTextEmbedModel(TextWordEmbedModel):
    """This class reads the FastText word vector text data format and provides
    an instance of a :class:`.WordEmbedModel`.  Files that have the format that
    look like ``crawl-300d-2M.vec`` can be downloaded with the link below.

    The first line of the file is read as a header of two integers (the
    vocabulary size followed by the vector dimension).  Every following line
    is read as a word followed by its space separated vector components.

    :see: `English word vectors <https://fasttext.cc/docs/en/english-vectors.html>`_

    """
    desc: str = field(default='2M')
    """The size description (i.e. ``2M`` as in the ``crawl-300d-2M.vec`` file
    name).

    """
    dimension: int = field(default=300)
    """The word vector dimension."""

    corpus: str = field(default='crawl')
    """The corpus the embeddings were trained on, such as ``crawl`` and ``web``.

    """
    def _get_metadata(self) -> TextWordModelMetadata:
        """Create the model metadata from the header line of the vector file.

        The vocabulary size and the dimension are taken from the first line of
        the file at ``self.path`` rather than from :obj:`dimension`.

        :return: the metadata built from the ``fasttext`` name, the
                 ``<corpus>-<desc>`` description, the header values and the
                 file path

        :raises ValueError: if the header is not exactly two integers

        """
        name = 'fasttext'
        # crawl-300d-2M.vec
        path = self.path
        desc = f'{self.corpus}-{self.desc}'
        with open(path, encoding='utf-8',
                  newline='\n', errors='ignore') as f:
            vocab_size, dim = map(int, f.readline().split())
        return TextWordModelMetadata(name, desc, dim, vocab_size, path)

    def _populate_vec_lines(self, words: List[str], word2idx: Dict[str, int],
                            ds: Dataset):
        """Read every vector line of the file and populate the passed
        containers.

        :param words: appended with each word in file order

        :param word2idx: updated with each word mapped to its zero based row
                         index; a repeated word keeps the index of its last
                         occurrence

        :param ds: the dataset that receives one row per vector line; the
                   components are assigned as strings, so the conversion to
                   the dataset's type is presumably left to ``h5py`` (TODO
                   confirm)

        :raises ValueError: if the header is not exactly two integers

        :raises Exception: whatever the dataset assignment raised, after the
                           offending line was logged

        """
        meta = self.metadata
        with open(meta.source_path, encoding='utf-8',
                  newline='\n', errors='ignore') as f:
            # consume the header line; it is still parsed so a malformed
            # header fails here instead of being read as a vector line
            _n_vocab, _dim = map(int, f.readline().split())
            for rix, ln in enumerate(f):
                line = ln.rstrip().split(' ')
                word = line[0]
                words.append(word)
                # the row index doubles as the word index
                word2idx[word] = rix
                try:
                    ds[rix, :] = line[1:]
                except Exception as e:
                    # report one based data line numbers (header excluded)
                    logger.error(f'could not parse line {rix + 1} ' +
                                 f'(word: {word}): {e}; line: {ln}')
                    raise