Source code for zensols.mednlp.cui2vec
"""This module contains the embedding subclass for cui2vec embeddings.
"""
__author__ = 'Paul Landes'
from typing import Dict, List
from dataclasses import dataclass, field
import logging
import csv
from h5py import Dataset
from zensols.deepnlp.embed import (
WordEmbedError, TextWordEmbedModel, TextWordModelMetadata
)
logger = logging.getLogger(__name__)
[docs]
@dataclass
class Cui2VecEmbedModel(TextWordEmbedModel):
"""This class uses the pretrained cui2vec embeddings.
"""
dimension: str = field(default=500)
"""The word vector dimension."""
vocab_size: int = field(default=109053)
"""Vocabulary size."""
def _populate_vec_lines(self, words: List[str], word2idx: Dict[str, int],
ds: Dataset):
idx = 0
lc = 0
meta = self.metadata
with open(meta.source_path) as csvfile:
csv_reader = csv.reader(csvfile)
next(csv_reader)
for rix, line in enumerate(csv_reader):
lc += 1
word = line[0]
words.append(word)
word2idx[word] = idx
idx += 1
try:
ds[rix, :] = tuple(map(float, line[1:]))
except Exception as e:
raise WordEmbedError(
f'Could not parse line {lc} (word: {word}): ' +
f'{e}; line: {line}') from e
def _get_metadata(self) -> TextWordModelMetadata:
name = 'cui2vec'
dim = self.dimension
path = self.path.parent / self.resource.check_path
return TextWordModelMetadata(
name, 'default', dim, self.vocab_size, path,
sub_directory='cui2vec-bin')