# Source code for zensols.deepnlp.cli

"""Facade application implementations for NLP use.

"""
__author__ = 'Paul Landes'

from typing import Tuple, Any, List
from dataclasses import dataclass, field
import sys
from io import TextIOBase
import logging
from pathlib import Path
import pandas as pd
from zensols.persist import dealloc, Stash
from zensols.config import Settings, Writable
from zensols.cli import ActionCliManager, ApplicationError
from zensols.nlp import FeatureDocument
from zensols.deeplearn import ModelError
from zensols.deeplearn.batch import Batch, DataPoint
from zensols.deeplearn.model import ModelFacade, ModelUnpacker
from zensols.deeplearn.cli import FacadeApplication
from zensols.deepnlp.classify import (
    LabeledFeatureDocumentDataPoint, LabeledFeatureDocument
)

logger = logging.getLogger(__name__)


@dataclass
class NLPFacadeBatchApplication(FacadeApplication):
    """A facade application for creating mini-batches for training.

    """
    CLI_META = ActionCliManager.combine_meta(
        FacadeApplication,
        {'mnemonic_overrides': {'dump_batches': 'dumpbatch'}})

    def _add_row(self, split_name: str, batch: Batch, dp: DataPoint):
        """Create one row of batch metadata for data point ``dp``.

        :param split_name: the dataset split (i.e. ``train``, ``test``)

        :param batch: the batch containing ``dp``

        :param dp: the data point to render as a row

        :return: a tuple of ``(batch ID, data point ID, split, label, text)``

        """
        label: str = None
        text: str = None
        if isinstance(dp, LabeledFeatureDocumentDataPoint):
            label = dp.label
        if hasattr(dp, 'doc') and isinstance(dp.doc, FeatureDocument):
            doc: FeatureDocument = dp.doc
            text = doc.text
            # bug fix: the original called ``isinstance(LabeledFeatureDocument)``
            # with a single argument, which raises TypeError at runtime; the
            # intent is to test whether ``doc`` carries a label
            if label is None and \
               (isinstance(doc, LabeledFeatureDocument) or
                    hasattr(doc, 'label')):
                label = doc.label
        if label is None and hasattr(dp, 'label'):
            label = dp.label
        if text is None:
            text = str(dp)
        return (batch.id, dp.id, split_name, label, text)

    def dump_batches(self):
        """Dump the batch dataset with IDs, splits, labels and text.

        """
        rows: List[Any] = []
        with dealloc(self.create_facade()) as facade:
            self._enable_cli_logging(facade)
            out_csv = Path(f'{facade.model_settings.normal_model_name}.csv')
            split_name: str
            ss: Stash
            for split_name, ss in facade.dataset_stash.splits.items():
                batch: Batch
                for batch in ss.values():
                    dp: DataPoint
                    for dp in batch.data_points:
                        rows.append(self._add_row(split_name, batch, dp))
            df = pd.DataFrame(
                rows,
                columns='batch_id data_point_id split label text'.split())
            df.to_csv(out_csv)
            if logger.isEnabledFor(logging.INFO):
                logger.info(f'wrote {out_csv}')
@dataclass
class NLPFacadeModelApplication(FacadeApplication):
    """A base class facade application for predicting tokens or text.

    """
    CLI_META = ActionCliManager.combine_meta(
        FacadeApplication,
        {'mnemonic_overrides': {'predict_text': 'predict'},
         'option_overrides': {'verbose': {'long_name': 'verbose',
                                          'short_name': None}}})

    def _get_sentences(self, text_input: str) -> Tuple[str]:
        """Read sentences from standard in, or passed command line string
        ``text_input`` if not ``-``.

        :param text_input: the text to classify, or ``-`` to read lines from
                           standard input

        :return: a tuple of the sentences to predict

        """
        def map_sents(din: TextIOBase):
            # bug fix: the original read from ``sys.stdin`` regardless of the
            # stream passed as ``din``, making the parameter dead
            return map(lambda ln: ln.strip(), din.readlines())

        if text_input == '-':
            return tuple(map_sents(sys.stdin))
        else:
            # return a tuple for consistency with the stdin branch and the
            # declared ``Tuple[str]`` return type
            return (text_input,)

    def _predict(self, facade: ModelFacade, data: Any) -> Any:
        """Use ``facade`` to predict on ``data``.

        :raises ApplicationError: if the underlying model raises a
                                  :class:`~zensols.deeplearn.ModelError`,
                                  which usually means no trained model exists

        """
        try:
            return facade.predict(data)
        except ModelError as e:
            raise ApplicationError(
                'Could not predict, probably need to train a model ' +
                f'first: {e}') from e
@dataclass
class NLPClassifyFacadeModelApplication(NLPFacadeModelApplication):
    """A facade application for predicting text (for example sentiment
    classification tasks).

    """
    # consistency fix: the sibling facade application classes in this module
    # are all declared with ``@dataclass``; this one was missing the decorator

    def predict_text(self, text: str, verbose: bool = False):
        """Classify ``text`` and output the results.

        :param text: the sentence to classify or standard in a dash (-)

        :param verbose: if given, print the long format version of the
                        document

        """
        sents = self._get_sentences(text)
        with dealloc(self.create_facade()) as facade:
            docs: Tuple[FeatureDocument] = self._predict(facade, sents)
            for doc in docs:
                if verbose:
                    doc.write()
                else:
                    print(doc)
@dataclass
class NLPSequenceClassifyFacadeModelApplication(NLPFacadeModelApplication):
    """A facade application for predicting tokens (for example NER tasks).

    """
    model_path: Path = field(default=None)
    """The path to the model or use the last trained model if not provided.

    """
    def predict_text(self, text: str, verbose: bool = False):
        """Classify ``text`` and output the results.

        :param text: the sentence to classify or standard in a dash (-)

        :param verbose: if given, print the long format version of the
                        document

        """
        sentences: Tuple[str] = self._get_sentences(text)
        with dealloc(self.create_facade()) as facade:
            prediction: Settings = self._predict(facade, sentences)
            # the prediction pairs each parsed document with the sequence of
            # per-token class labels inferred for it
            for doc_labels, document in zip(prediction.classes,
                                            prediction.docs):
                for lab, token in zip(doc_labels, document.token_iter()):
                    print(lab, token)
@dataclass
class NLPClassifyPackedModelApplication(object):
    """Classifies data used a packed model.  The :obj:`unpacker` is used to
    install the model (if not already), then provide access to it.  A
    :class:`~zensols.deeplearn.model.facade.ModelFacade` is created from
    packaged model that is downloaded.  The model then uses the facade's
    :meth:`zensols.deeplearn.model.facade.ModelFacade.predict` method to
    output the predictions.

    """
    CLI_META = {
        'option_excludes': {'unpacker'},
        'option_overrides': {
            'text_or_file': {'long_name': 'input',
                             'metavar': '<TEXT|FILE>'},
            'verbose': {'short_name': None}},
        'mnemonic_excludes': {'predict'},
        'mnemonic_overrides': {
            'write_predictions': 'predict',
            # careful of 'info' name collision in FacadeInfoApplication;
            # override with something shorter in subclass decorator
            'write_model_info': 'modelstat'}}

    unpacker: ModelUnpacker = field()
    """The model source."""

    @property
    def facade(self) -> ModelFacade:
        """The packaged model's facade."""
        return self.unpacker.facade

    def predict(self, sents: Tuple[str]) -> Tuple[Any]:
        """Predict sentiment for each sentence in ``sents``."""
        return self.facade.predict(sents)

    def write_predictions(self, text_or_file: str, verbose: bool = False):
        """Predict sentiment of sentence(s).

        :param text_or_file: newline delimited file of sentences or a sentence

        :param verbose: write verbose prediction output

        """
        # treat the argument as a single sentence unless it names an existing
        # file, in which case each (stripped) line is a sentence
        sents: Tuple[str] = (text_or_file,)
        path = Path(text_or_file)
        if path.is_file():
            with open(path) as f:
                sents = tuple(map(str.strip, f.readlines()))
        try:
            for pred in self.predict(sents):
                if verbose:
                    if isinstance(pred, Writable):
                        pred.write()
                    else:
                        print(repr(pred))
                else:
                    print(pred)
        except BrokenPipeError:
            # don't complain for UNIX pipe (i.e. head)
            pass

    def write_model_info(self):
        """Write the model information and metrics."""
        self.unpacker.write()