Source code for zensols.deepnlp.transformer.pred

"""Predictions output for transformer models.

"""
__author__ = 'Paul Landes'

from typing import Callable, List, Tuple, Dict, Iterable, Any
from dataclasses import dataclass, field
import logging
import pandas as pd
from torch import Tensor
from zensols.nlp import FeatureToken, FeatureSentence, FeatureDocument
from zensols.deeplearn.batch import Batch, DataPoint
from zensols.deeplearn.result import SequencePredictionsDataFrameFactory
from . import TokenizedDocument

logger = logging.getLogger(__name__)


@dataclass
class TransformerSequencePredictionsDataFrameFactory(
        SequencePredictionsDataFrameFactory):
    """Like the super class, but creates predictions for transformer sequence
    models.  By default, transformer input is truncated at the model's maximum
    token length (usually 512 word piece tokens).  It then truncates the
    tokens that are added as the ``text`` column by the (configured by
    default) :class:`..classify.TokenClassifyModelFacade`.  For all
    predictions where the sequence exceeded the model's maximum, this class
    maps the last word piece token output to the respective token in the
    :obj:`predictions_dataframe_factory_class` instance's ``transform``
    output.

    """
    embedded_document_attribute: str = field(default=None)
    """The :obj:`~zensols.deeplearn.batch.domain.Batch` attribute key for the
    tensor that contains the vectorized document.

    """
    def _trunc_tokens(self, batch: Batch) -> Iterable[int]:
        """Return the number of tokens kept for each data point after
        truncating its sentence at the length of the last word piece token.

        :param batch: contains the ``data_points`` with sentences to truncate

        :return: the truncated token count for each data point in ``batch``

        """
        # merge documents from the data points into a document for the batch
        dps_doc: FeatureDocument = FeatureDocument.combine_documents(
            map(lambda dp: dp.doc, batch.data_points))
        # re-hydrate the vectorized document from the batch tensor
        emb: Tensor = batch[self.embedded_document_attribute]
        tdoc: TokenizedDocument = TokenizedDocument.from_tensor(emb)
        # map word piece tokens to feature document tokens
        sent_maps: List[Dict[str, Any]] = tdoc.map_to_word_pieces(
            sentences=dps_doc, includes={'map', 'sent'})
        for dpix, dp in enumerate(batch.data_points):
            tmap: Tuple[FeatureToken, Tuple[Tuple[str, int, int], ...]] = \
                sent_maps[dpix]['map']
            yield len(tmap)

    def _calc_len(self, batch: Batch) -> int:
        # cache per data point truncation lengths for _transform_dataframe
        trunc_lens: Tuple[int, ...] = tuple(self._trunc_tokens(batch))
        batch._trunc_lens = trunc_lens
        return sum(trunc_lens)

    def _transform_dataframe(self, batch: Batch, labs: List[str],
                             preds: List[str]):
        dfs: List[pd.DataFrame] = []
        start: int = 0
        transform: Callable = self.data_point_transform
        self._assert_label_pred_batch_size(batch, labs, preds, False)
        dp: DataPoint
        tl: int
        for dp, tl in zip(batch.data_points, batch._trunc_lens):
            end: int = start + tl
            df = pd.DataFrame({
                self.ID_COL: dp.id,
                self.LABEL_COL: labs[start:end],
                self.PREDICTION_COL: preds[start:end]})
            dp_data: Tuple[Tuple[str, ...]] = transform(dp)
            if len(dp_data) != tl and logger.isEnabledFor(logging.WARNING):
                sent_str: str = ''
                if hasattr(dp, 'doc'):
                    doc: FeatureDocument = dp.doc
                    sent_str = f' for document: {doc}'
                logger.warning(
                    f'trimming outcomes from {len(dp_data)} ' +
                    f'to word piece max (equivalent) {tl}{sent_str}')
            # align the transformed data point output with the word piece
            # truncated length so the column assignment matches the frame's
            # row count
            dp_data = dp_data[:tl]
            df[list(self.column_names)] = dp_data
            dfs.append(df)
            start = end
        return pd.concat(dfs)
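The standalone sketch below is not part of the module above; it illustrates with plain pandas how ``_transform_dataframe`` slices the flat label and prediction sequences into one frame per data point using the per data point lengths that ``_calc_len`` caches.  The labels, predictions, truncation lengths, and the ``partition_predictions`` helper are hypothetical stand-ins chosen for the example, not names from the library.

# illustrative sketch only; mirrors the slicing bookkeeping above
from typing import List, Tuple
import pandas as pd


def partition_predictions(labs: List[str], preds: List[str],
                          trunc_lens: Tuple[int, ...]) -> pd.DataFrame:
    """Slice flat label/prediction sequences into one frame per data point,
    analogous to the loop in ``_transform_dataframe``.

    """
    dfs: List[pd.DataFrame] = []
    start: int = 0
    for dp_id, tl in enumerate(trunc_lens):
        end: int = start + tl
        dfs.append(pd.DataFrame({
            'id': dp_id,
            'label': labs[start:end],
            'pred': preds[start:end]}))
        start = end
    return pd.concat(dfs, ignore_index=True)


if __name__ == '__main__':
    # two data points: the first keeps 3 tokens after word piece truncation,
    # the second keeps 2
    print(partition_predictions(
        labs=['B', 'I', 'O', 'O', 'B'],
        preds=['B', 'O', 'O', 'O', 'B'],
        trunc_lens=(3, 2)))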