"""Fast text word vector implementation."""___author__='Paul Landes'fromtypingimportList,Dictfromdataclassesimportdataclass,fieldimportloggingfromh5pyimportDatasetfromzensols.deepnlp.embedimportTextWordEmbedModel,TextWordModelMetadatalogger=logging.getLogger(__name__)
@dataclass
class FastTextEmbedModel(TextWordEmbedModel):
    """This class reads the FastText word vector text data format and provides
    an instance of a :class:`.WordEmbedModel`.  Files that have the format
    that look like ``crawl-300d-2M.vec`` can be downloaded with the link below.

    The first line of such a file is a header with two integers (vocabulary
    size and vector dimension); every following line is a word followed by its
    space separated vector components.

    :see: `English word vectors
          <https://fasttext.cc/docs/en/english-vectors.html>`_

    """
    desc: str = field(default='2M')
    """The size description (e.g. ``2M`` as in ``crawl-300d-2M.vec``)."""

    dimension: int = field(default=300)
    """The word vector dimension."""

    corpus: str = field(default='crawl')
    """The corpus the embeddings were trained on, such as ``crawl`` and
    ``web``.

    """
    def _get_metadata(self) -> TextWordModelMetadata:
        """Create the model metadata by reading the header line of the vector
        file at ``self.path``.

        :return: metadata named ``fasttext`` whose description is
                 ``<corpus>-<desc>`` and whose dimension and vocabulary size
                 are taken from the file header

        :raises ValueError: if the header line is not exactly two integers

        """
        name = 'fasttext'
        # crawl-300d-2M.vec
        path = self.path
        desc = f'{self.corpus}-{self.desc}'
        with open(path, encoding='utf-8', newline='\n', errors='ignore') as f:
            # header line: "<vocabulary size> <dimension>"
            vocab_size, dim = map(int, f.readline().split())
        return TextWordModelMetadata(name, desc, dim, vocab_size, path)

    def _populate_vec_lines(self, words: List[str], word2idx: Dict[str, int],
                            ds: Dataset):
        """Read every vector line of the source file and populate the word
        list, the word to index mapping and the vector dataset.

        :param words: the list to which each word is appended in file order

        :param word2idx: the mapping updated with each word's 0-based row
                         index

        :param ds: the dataset whose row ``i`` is assigned the vector
                   components of the ``i``-th word

        :raises Exception: whatever the dataset assignment raises for a line
                           that can not be stored; the error is logged first

        """
        meta = self.metadata
        with open(meta.source_path, encoding='utf-8', newline='\n',
                  errors='ignore') as f:
            # consume (and sanity check) the header so only vector lines
            # remain in the iteration below
            _n_vocab, _dim = map(int, f.readline().split())
            for rix, ln in enumerate(f):
                line = ln.rstrip().split(' ')
                word = line[0]
                words.append(word)
                # the dataset row and the word index are the same 0-based
                # position, so a separate counter is not needed
                word2idx[word] = rix
                try:
                    ds[rix, :] = line[1:]
                except Exception as e:
                    # 1-based count of vector lines (the header is excluded)
                    lc = rix + 1
                    logger.error(f'could not parse line {lc} ' +
                                 f'(word: {word}): {e}; line: {ln}')
                    # bare raise keeps the original traceback intact
                    raise