"""Fast text word vector implementation.
"""
# Conventional module author attribute (dunder form: two underscores per side).
__author__ = 'Paul Landes'
from typing import List, Dict
from dataclasses import dataclass, field
import logging
from h5py import Dataset
from zensols.deepnlp.embed import TextWordEmbedModel, TextWordModelMetadata
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# [docs] -- documentation-viewer link residue, kept as a comment so it is not evaluated
@dataclass
class FastTextEmbedModel(TextWordEmbedModel):
    """This class reads the FastText word vector text data format and provides
    an instance of a :class:`.WordEmbedModel`.  Files in this format, such as
    ``crawl-300d-2M.vec``, can be downloaded with the link below.

    The first line of the file is read as a header of two whitespace separated
    integers: the vocabulary size followed by the vector dimension.  Every
    following line is read as a word followed by its space separated vector
    components.

    :see: `English word vectors <https://fasttext.cc/docs/en/english-vectors.html>`_

    """
    desc: str = field(default='2M')
    """The size description (i.e. ``2M`` as in ``crawl-300d-2M.vec``)."""

    dimension: int = field(default=300)
    """The word vector dimension."""

    corpus: str = field(default='crawl')
    """The corpus the embeddings were trained on, such as ``crawl`` and ``web``.

    """
    def _get_metadata(self) -> TextWordModelMetadata:
        """Create the metadata of the model from the header of the vector file.

        The vocabulary size and the dimension are taken from the first line of
        the file at :obj:`path` rather than from :obj:`dimension`.

        :return: the metadata named ``fasttext`` with a description made of
                 the corpus and the size description

        :raises ValueError: if the first line is not exactly two integers

        """
        name = 'fasttext'
        # crawl-300d-2M.vec
        path = self.path
        desc = f'{self.corpus}-{self.desc}'
        with open(path, encoding='utf-8',
                  newline='\n', errors='ignore') as f:
            vocab_size, dim = map(int, f.readline().split())
        return TextWordModelMetadata(name, desc, dim, vocab_size, path)

    def _populate_vec_lines(self, words: List[str], word2idx: Dict[str, int],
                            ds: Dataset):
        """Read every vector line of the source file in to the passed
        containers.

        :param words: appended with each word in file order

        :param word2idx: populated with each word mapped to its zero based
                         row index

        :param ds: the dataset to populate, one row per word, with the vector
                   components

        :raises Exception: whatever the dataset assignment raises for a line
                           that can not be stored, after logging that line

        """
        meta = self.metadata
        with open(meta.source_path, encoding='utf-8',
                  newline='\n', errors='ignore') as f:
            # consume the header line; the values are not needed here, but
            # parsing them still fails fast on a file without a valid header
            _n_vocab, _dim = map(int, f.readline().split())
            # the row index doubles as the word index since exactly one word
            # is added per data line
            for rix, ln in enumerate(f):
                # split on a single space only so words that contain other
                # whitespace characters are kept intact
                line = ln.rstrip().split(' ')
                word = line[0]
                words.append(word)
                word2idx[word] = rix
                try:
                    ds[rix, :] = line[1:]
                except Exception as e:
                    # report the one based data line (header not counted)
                    logger.error(f'could not parse line {rix + 1} ' +
                                 f'(word: {word}): {e}; line: {ln}')
                    # bare raise keeps the original traceback
                    raise