Source code for zensols.deepnlp.transformer.mask

"""Classes to predict fill-mask tasks.

"""
__author__ = 'Paul Landes'

from typing import Tuple, List, Iterable, Dict, Any
from dataclasses import dataclass, field
import logging
import sys
from collections import OrderedDict
from io import TextIOBase
import pandas as pd
import torch
from torch import Tensor
from torch.return_types import topk
from transformers import PreTrainedTokenizer, PreTrainedModel
from zensols.config import Dictable
from zensols.nlp import FeatureToken, TokenContainer
from zensols.deeplearn import TorchConfig
from zensols.deepnlp.transformer import TransformerResource
from . import TransformerError

logger = logging.getLogger(__name__)


@dataclass
class TokenPrediction(Dictable):
    """Pairs a masked token with one predicted replacement for it and the
    score of that prediction.

    """
    token: FeatureToken
    """The masked token to which the prediction belongs."""

    prediction: str
    """The text predicted in place of the mask."""

    score: float
    """The score of the prediction."""

    def __str__(self) -> str:
        """Render as ``<token> -> <prediction> (<score>)`` with the score
        rounded to four decimal places.

        """
        return f"{self.token} -> {self.prediction} ({self.score:.4f})"
@dataclass
class Prediction(Dictable):
    """A container class for masked token predictions produced by
    :class:`.MaskFiller`.  This class offers many ways to get the predictions,
    including getting the sentences as instances of
    :class:`~zensols.nlp.container.TokenContainer` by using it as an iterable.

    The sentences are also available as the ``pred_sentences`` key when using
    :meth:`~zensols.config.dictable.Dictable.asdict`.

    **Important:** every sentence is produced by rewriting the norms of
    :obj:`masked_tokens` in place and returning the *same* :obj:`cont`
    instance (see :meth:`get_container`).  Consume each container (i.e. read
    its ``norm``) before asking for the next one.

    """
    cont: TokenContainer = field()
    """The document, sentence or span to predict masked tokens."""

    masked_tokens: Tuple[FeatureToken] = field()
    """The masked tokens matched."""

    df: pd.DataFrame = field()
    """The predictions with dataframe columns:

      * ``k``: the *k* in the top-*k* highest scored masked token match
      * ``mask_id``: the N-th masked token in the source ordered by position
      * ``token``: the predicted token
      * ``score``: the score of the prediction (``[0, 1]``, higher the better)

    """
    def _get_rank(self, k: int) -> pd.DataFrame:
        """Return the rows of :obj:`df` for rank ``k`` ordered by mask
        position so they line up with :obj:`masked_tokens`.

        """
        df: pd.DataFrame = self.df
        return df[df['k'] == k].sort_values('mask_id')

    def _get_sentence_norms(self) -> Tuple[str]:
        """Return the norm of the predicted sentence for each rank.

        The norm is read while iterating because every iteration mutates and
        returns the same container.

        """
        return tuple(map(lambda t: t.norm, self))

    def get_container(self, k: int = 0) -> TokenContainer:
        """Get the *k*-th top scored sentence.

        The norms of :obj:`masked_tokens` are overwritten with the *k*-th
        ranked predictions and :obj:`cont` itself is returned, so the same
        instance is returned (and modified) on every invocation.  A client may
        call this method as many times as necessary (i.e. for multiple values
        of ``k``) since the masked tokens are retained in
        :obj:`masked_tokens`, but the returned container always reflects only
        the most recent call.

        :param k: as *k* increases the less likely the mask substitutions, and
                  thus sentence; *k* = 0 is the most likely given the sentence
                  and masks

        :raises TransformerError: if there are no predictions

        :raises IndexError: if ``k`` is negative or not less than the number
                            of ranks available

        """
        cont: TokenContainer = self.cont
        if len(self.df) == 0:
            raise TransformerError(f'No predictions found for <{cont.text}>')
        n_preds: int = len(self)
        # a rank with no rows would otherwise return the container unmodified
        if k < 0 or k >= n_preds:
            raise IndexError(
                f'Only {n_preds} predictions but asked for k={k}')
        df: pd.DataFrame = self._get_rank(k)
        # iterate over the masked tokens, then for each, populate the
        # prediction
        tok: FeatureToken
        repl: str
        for tok, repl in zip(self.masked_tokens, df['token']):
            if logger.isEnabledFor(logging.DEBUG):
                logger.debug(f'{repl} -> {tok.norm}')
            # modify the previously matched token clobbering the norm for each
            # iteration
            tok.norm = repl
        # clear to force a container level norm to be generated
        cont.clear()
        return cont

    def get_tokens(self, k: int = 0) -> Iterable[TokenPrediction]:
        """Return an iterable of the prediction coupled with the token it
        belongs to and its score.

        Only the rows of rank ``k`` are used so that each masked token is
        paired with a prediction for its own mask.

        :param k: the rank of the predictions to return; *k* = 0 is the
                  highest scored prediction for each mask; a rank without any
                  rows yields an empty iterable

        """
        preds: Iterable[Tuple[str, float]] = self._get_rank(k)[
            'token score'.split()].itertuples(name=None, index=False)
        return (TokenPrediction(tok, pred, score)
                for tok, (pred, score) in zip(self.masked_tokens, preds))

    @property
    def masked_token_dicts(self) -> Tuple[Dict[str, Any]]:
        """A tuple of :class:`dict` each having token index, norm and text
        data.

        """
        feats: Tuple[str] = ('i', 'idx', 'i_sent', 'norm', 'text')
        return tuple(t.get_features(feats) for t in self.masked_tokens)

    def write(self, depth: int = 0, writer: TextIOBase = sys.stdout,
              include_masked_tokens: bool = True,
              include_predicted_tokens: bool = True,
              include_predicted_sentences: bool = True):
        """Write the source text and, optionally, the masked tokens, the
        predicted tokens per rank and the predicted sentences.

        :param depth: the starting indentation depth

        :param writer: the writer to dump the content of this writable

        :param include_masked_tokens: whether to write the masked tokens

        :param include_predicted_tokens: whether to write the predicted tokens
                                         and scores for each rank

        :param include_predicted_sentences: whether to write the predicted
                                            sentence for each rank

        """
        self._write_line(f'source: {self.cont.text}', depth, writer)
        if include_masked_tokens:
            self._write_line('masked:', depth, writer)
            for mt in self.masked_token_dicts:
                self._write_dict(mt, depth + 1, writer, one_line=True)
        if include_predicted_tokens:
            self._write_line('predicted:', depth, writer)
            for k, df in self.df.groupby('k'):
                # one entry per mask in mask position order
                df = df.sort_values('mask_id')
                scs: List[str] = [
                    f'{token} ({score:.4f})'
                    for token, score in zip(df['token'], df['score'])]
                self._write_line(f'k={k}: ' + ', '.join(scs),
                                 depth + 1, writer)
        if include_predicted_sentences:
            self._write_line('sentences:', depth, writer)
            self._write_iterable(self._get_sentence_norms(),
                                 depth + 1, writer)

    def _from_dictable(self, *args, **kwargs):
        return OrderedDict(
            [['source', self.cont.text],
             ['masked_tokens', self.masked_token_dicts],
             ['pred_tokens', self.df.to_dict('records')],
             ['pred_sentences', self._get_sentence_norms()]])

    def __iter__(self) -> Iterable[TokenContainer]:
        # lazy: each step rewrites and yields the same container instance
        return map(self.get_container, range(len(self)))

    def __getitem__(self, i: int) -> TokenContainer:
        return self.get_container(i)

    def __len__(self) -> int:
        # the number of distinct ranks (``k``) in the predictions
        return self.df['k'].nunique()

    def __str__(self) -> str:
        return self.get_container().norm
@dataclass
class MaskFiller(object):
    """The class fills masked tokens with the prediction of the underlying
    masked model.  Masked tokens with attribute :obj:`feature_id` having value
    :obj:`feature_value` (:obj:`~zensols.nlp.tok.FeatureToken.norm` and
    ``MASK`` by default respectively) are substituted with model values.

    To use this class, parse a sentence with a
    :class:`~zensols.nlp.parser.FeatureDocumentParser` with masked tokens
    using the string :obj:`feature_value`.

    For example (with class defaults), the sentence::

        Paris is the MASK of France.

    becomes::

        Paris is the <mask> of France.

    The ``<mask>`` string becomes the
    :obj:`~transformers.PreTrainedTokenizer.mask_token` for the model's
    tokenizer.

    """
    resource: TransformerResource = field()
    """A container class with the Huggingface tokenizer and model."""

    k: int = field(default=1)
    """The number of top K predicted masked words per mask.  The total number
    of predictions will be <number of masks> X ``k`` in the source document.

    """
    feature_id: str = field(default='norm')
    """The :class:`~zensols.nlp.FeatureToken` feature ID to match on masked
    tokens.

    :see: :obj:`feature_value`

    """
    feature_value: str = field(default='MASK')
    """The value of feature ID :obj:`feature_id` to match on masked tokens."""

    def _predict(self, text: str) -> pd.DataFrame:
        """Predict the top :obj:`k` tokens for each mask found in ``text``.

        :param text: the text to predict, which has the tokenizer's mask token
                     in place of each token to predict

        :return: a dataframe with columns ``k``, ``mask_id``, ``token`` and
                 ``score`` having one row per mask and rank

        """
        tc: TorchConfig = self.resource.torch_config
        # models are created in the resource
        tokenizer: PreTrainedTokenizer = self.resource.tokenizer
        model: PreTrainedModel = self.resource.model
        # rows of the dataframe are the k, nth mask tok, token str, score/proba
        rows: List[Tuple[int, int, str, float]] = []
        # tokenization produces the vocabulary wordpiece ids
        input_ids: Tensor = tc.to(tokenizer.encode(text, return_tensors='pt'))
        # get the wordpiece positions of the masks
        mask_token_index: Tensor = torch.where(
            input_ids == tokenizer.mask_token_id)[1]
        # predict and get the masked wordpiece token logits; this is inference
        # only, so do not track gradients
        with torch.no_grad():
            token_logits: Tensor = model(input_ids)[0]
        mask_token_logits: Tensor = token_logits[0, mask_token_index, :]
        # normalize the logits over the vocabulary dimension
        mask_token_probs: Tensor = torch.softmax(mask_token_logits, dim=1)
        # get the top K matches for each masked token
        top: topk = torch.topk(mask_token_probs, k=self.k, dim=1)
        mix: int
        token_ids: List[int]
        scores: List[float]
        # iterate over masks
        for mix, (token_ids, scores) in enumerate(
                zip(top.indices.tolist(), top.values.tolist())):
            token_id: int
            score: float
            # iterate over the top K tokens
            for k, (token_id, score) in enumerate(zip(token_ids, scores)):
                token: str = tokenizer.decode([token_id]).strip()
                rows.append((k, mix, token, score))
        return pd.DataFrame(rows, columns='k mask_id token score'.split())

    def predict(self, source: TokenContainer) -> Prediction:
        """Predict substitution values for token masks.

        **Important:** ``source`` is modified as a side-effect of this method.
        Use :meth:`~zensols.nlp.TokenContainer.clone` on the ``source``
        document passed to this method to preserve the original if necessary.

        :param source: the source document, sentence, or span for which to
                       substitute values

        :raises TransformerError: if the tokenizer has no mask token

        """
        mask_tok: str = self.resource.tokenizer.mask_token
        if mask_tok is None:
            # fail early rather than writing ``None`` norms into the source
            raise TransformerError(
                'Tokenizer has no mask token, which is needed to fill masks')
        fid: str = self.feature_id
        fval: str = self.feature_value
        # identify the masked tokens
        masked_tokens: Tuple[FeatureToken] = tuple(
            t for t in source.token_iter() if getattr(t, fid) == fval)
        # substitute the tokenizer's token mask needed for prediction
        tok: FeatureToken
        for tok in masked_tokens:
            tok.norm = mask_tok
        # clear to force a new norm with the tokenizer mask pattern
        source.clear()
        df: pd.DataFrame = self._predict(source.norm)
        return Prediction(source, masked_tokens, df)