Source code for zensols.deepnlp.classify.pred

"""Prediction mapper support for NLP applications.

"""
__author__ = 'Paul Landes'

from typing import Tuple, List, Iterable
from dataclasses import dataclass, field
from itertools import chain as ch
import numpy as np
from zensols.config import Settings
from zensols.nlp import FeatureSentence, FeatureDocument
from zensols.deeplearn.vectorize import CategoryEncodableFeatureVectorizer
from zensols.deeplearn.model import PredictionMapper
from zensols.deeplearn.result import ResultsContainer
from zensols.deepnlp.vectorize import FeatureDocumentVectorizerManager
from . import LabeledFeatureDocument


@dataclass
class ClassificationPredictionMapper(PredictionMapper):
    """A prediction mapper for text classification.  This mapper works at any
    level (document, sentence, token).

    Text given to :meth:`_create_features` is parsed in to documents, which
    are retained by this instance and later annotated with the predicted label
    and the softmax of the model output in :meth:`map_results`.

    """
    vec_manager: FeatureDocumentVectorizerManager = field()
    """The vectorizer manager used to parse and get the label vectorizer."""

    label_feature_id: str = field()
    """The feature ID for the label vectorizer."""

    pred_attribute: str = field(default='pred')
    """The prediction attribute to set on the :class:`.FeatureDocument` returned
    from :meth:`map_results`.

    """
    softmax_logit_attribute: str = field(default='softmax_logit')
    """The softmax of the logits attribute to set on the
    :class:`.FeatureDocument` returned from :meth:`map_results`.

    :see: `On Calibration of Modern Neural Networks <https://arxiv.org/abs/1706.04599>`_

    """
    def __post_init__(self):
        super().__post_init__()
        # every document parsed by ``_create_features``, in call order; these
        # are paired positionally with the predictions in ``map_results``
        self._docs: List[FeatureDocument] = []

    @property
    def label_vectorizer(self) -> CategoryEncodableFeatureVectorizer:
        """The label vectorizer used to map nominal predictions to their
        string classes in :meth:`_map_classes`.

        """
        return self.vec_manager[self.label_feature_id]

    def _create_features(self, sent_text: str) -> Tuple[FeatureDocument, ...]:
        """Parse ``sent_text`` in to a document, remember it for
        :meth:`map_results` and return it as the single data point.

        :param sent_text: the text to parse with :obj:`vec_manager`

        :return: a one element sequence having the parsed document

        """
        doc: FeatureDocument = self.vec_manager.parse(sent_text)
        self._docs.append(doc)
        # NOTE: a list is returned even though the annotation advertises a
        # tuple; kept as is since the callers are not visible from here
        return [doc]

    def _map_classes(self, result: ResultsContainer) -> List[List[str]]:
        """Return the label string values for the predictions in ``result``.

        :param result: the container whose ``batch_predictions`` hold the
                       nominal predictions, one array per batch

        :return: a list for every batch in ``result.batch_predictions``, each
                 with the classes given by the label vectorizer

        """
        vec: CategoryEncodableFeatureVectorizer = self.label_vectorizer
        nominals: List[np.ndarray] = result.batch_predictions
        return [vec.get_classes(batch).tolist() for batch in nominals]

    def map_results(self, result: ResultsContainer) -> \
            Tuple[LabeledFeatureDocument, ...]:
        """Map class predictions and logits on to the documents generated
        during use of this instance.  Each data point is aggregated across
        batches.

        For every document the predicted class is set on the attribute named
        by :obj:`pred_attribute` and a ``dict`` of label to softmax value is
        set on the attribute named by :obj:`softmax_logit_attribute`.

        :param result: the container with the ``batch_predictions`` and
                       ``batch_outputs`` of the model

        :return: the documents parsed by this instance, annotated with the
                 prediction and softmax attributes

        """
        class_groups: List[List[str]] = self._map_classes(result)
        # flatten the per batch groups so they line up with the documents
        classes: Iterable[str] = ch.from_iterable(class_groups)
        logits: Iterable[np.ndarray] = ch.from_iterable(result.batch_outputs)
        docs: List[FeatureDocument] = self._docs
        labels: List[str] = self.label_vectorizer.label_encoder.classes_
        # zip stops at the shortest of the three; documents without a
        # matching prediction are left without the attributes
        for cl, doc, logit in zip(classes, docs, logits):
            # subtracting the maximum leaves the softmax unchanged but keeps
            # ``exp`` from overflowing to inf (and the ratio to nan) on large
            # logits
            exps = np.exp(logit - np.max(logit, axis=0))
            conf = exps / np.sum(exps, axis=0)
            sms = dict(zip(labels, conf))
            setattr(doc, self.pred_attribute, cl)
            setattr(doc, self.softmax_logit_attribute, sms)
        return tuple(docs)
@dataclass
class SequencePredictionMapper(ClassificationPredictionMapper):
    """A mapper for sequence predictions.  The results are given as a
    :class:`~zensols.config.serial.Settings` with a ``classes`` entry having
    the token level predictions and a ``docs`` entry having the documents
    parsed from the sentence text.

    """
    def _create_features(self, sent_text: str) -> Tuple[FeatureSentence, ...]:
        """Parse ``sent_text``, keep the resulting document for
        :meth:`map_results` and return its sentences as the data points.

        :param sent_text: the text to parse with the vectorizer manager

        :return: the sentences of the parsed document

        """
        parsed: FeatureDocument = self.vec_manager.parse(sent_text)
        self._docs.append(parsed)
        return parsed.sents

    def map_results(self, result: ResultsContainer) -> Settings:
        """Bundle the mapped classes with the documents parsed so far.

        :param result: the container with the model's batch predictions

        :return: a settings instance with ``classes``, a tuple with one list
                 of labels per batch, and ``docs``, a tuple of the parsed
                 documents

        """
        batch_labels: List[List[str]] = self._map_classes(result)
        predicted = tuple(batch_labels)
        parsed_docs = tuple(self._docs)
        return Settings(classes=predicted, docs=parsed_docs)