@dataclass
class TopicModelDocumentIndexerVectorizer(DocumentIndexVectorizer):
    """Train a model using LDA for topic modeling.

    Citation:

    Hoffman, M., Bach, F., and Blei, D. 2010. Online Learning for Latent
    Dirichlet Allocation. Advances in Neural Information Processing Systems
    23.

    :shape: ``(topics, )`` when ``decode_as_flat`` is ``True``, otherwise,
            ``(1, topics)``

    :see: :class:`gensim.models.ldamodel.LdaModel`

    """
    DESCRIPTION = 'latent semantic indexing'
    FEATURE_TYPE = TextFeatureType.DOCUMENT

    topics: int = field(default=20)
    """The number of topics (usually denoted ``K``)."""

    decode_as_flat: bool = field(default=True)
    """If ``True``, flatten the tensor after decoding."""

    def _get_shape(self) -> Tuple[int, int]:
        """Return the decoded tensor shape: ``(topics,)`` when
        :obj:`decode_as_flat` is set, otherwise ``(1, topics)``.

        """
        if self.decode_as_flat:
            return self.topics,
        else:
            return 1, self.topics

    def _create_model(self, docs: Iterable[FeatureDocument]) -> Any:
        """Train an LDA model over ``docs``.

        :param docs: the documents whose tokens build the bag-of-words corpus

        :return: a dict with the trained ``lda`` model and the ``corpus`` and
                 ``id2word`` dictionary used to train it

        """
        if logger.isEnabledFor(logging.INFO):
            logger.info(f'creating {self.topics} topics')
        # convert each document to its token string sequence
        docs = tuple(map(lambda doc: self.feat_to_tokens(doc), docs))
        id2word = corpora.Dictionary(docs)
        corpus = tuple(map(lambda doc: id2word.doc2bow(doc), docs))
        # seed gensim from the torch configuration for reproducible topics;
        # gensim requires an integer, so fall back to 0 when unseeded
        rand_state = TorchConfig.get_random_seed()
        if rand_state is None:
            rand_state = 0
        params = {
            'corpus': corpus,
            'id2word': id2word,
            'num_topics': self.topics,
            'random_state': rand_state,
            'update_every': 1,
            'chunksize': 100,
            'passes': 10,
            'alpha': 'auto',
            'per_word_topics': True}
        with time(f'modeled {self.topics} across {len(docs)} documents'):
            lda = LdaModel(**params)
        return {'lda': lda,
                'corpus': corpus,
                'id2word': id2word}
[docs]defquery(self,tokens:Tuple[str])->Tuple[float]:"""Return a distribution over the topics for a query set of tokens. :param tokens: the string list of tokens to use for inferencing in the model :return: a list of tuples in the form ``(topic_id, probability)`` """lda=self.model['lda']id2word=self.model['id2word']docs_q=[tokens]corpus_q=tuple(map(lambdadoc:id2word.doc2bow(doc),docs_q))returnlda.get_document_topics(corpus_q,minimum_probability=0)[0]