Source code for zensols.deeplearn.model.format

"""Contains a class to write performance metrics.

"""
__author__ = 'Paul Landes'

from typing import Tuple, Dict, Iterable, Any, Union
from dataclasses import dataclass, field
import logging
import sys
import re
from io import TextIOBase
from pathlib import Path
import yaml
import pandas as pd
from zensols.config import Writable
from zensols.deeplearn.model import ModelFacade
from zensols.deeplearn.result import (
    ClassificationMetrics, PredictionsDataFrameFactory,
    ModelResultError, ModelResultManager, ModelResultReporter,
)

logger = logging.getLogger(__name__)


@dataclass
class PerformanceMetricsDumper(Writable):
    """Formats performance metrics, which can be used in papers.

    :see: :class:`.LatexPerformanceMetricsDumper`

    """
    facade: ModelFacade = field()
    """The facade used to fetch previously written results."""

    summary_columns: Tuple[str, ...] = field(
        default=tuple('mF1t mPt mRt MF1t MPt MRt'.split()))
    """The columns used in the summary report."""

    by_label_columns: Tuple[str, ...] = field(
        default=tuple('mF1 mP mR MF1 MP MR acc count'.split()))
    """The columns used in the by-label report."""

    name_replace: Tuple[str, str] = field(default=None)
    """If provided, a tuple of ``(regular expression, replacement)`` string
    given to :func:`re.sub` in the name column of generated tables.

    """
    sort_column: str = field(default='mF1')
    """The column to sort, with the exception of the majority label, which is
    always first.

    """
    majority_label_res_id: Union[str, bool] = field(default=True)
    """Indicates how to create (if any) the majority label performance metrics.
    If a string, use as the result id (``res_id``) of previous result set used
    to compute the majority label statistics to include in the summary.  If
    ``True`` use the results from the last tested model.  If ``None`` or
    ``False`` the majority label is not added.

    """
    precision: int = field(default=3)
    """The number of digits after the decimal point used to format results."""

    @staticmethod
    def format_thousand(x: int, apply_k: bool = True,
                        add_comma: bool = True) -> str:
        """Format a count as a string.

        :param x: the number to format

        :param apply_k: if ``True``, values greater than 10,000 are divided by
                        1,000, rounded and suffixed with ``K``

        :param add_comma: if ``True``, add a comma as thousands separator

        :return: the formatted number

        """
        add_k = False
        if x > 10000:
            if apply_k:
                x = round(x / 1000)
                add_k = True
        if add_comma:
            x = f'{x:,}'
        else:
            x = str(x)
        if add_k:
            x += 'K'
        return x

    @staticmethod
    def capitalize(name: str) -> str:
        """Split ``name`` on spaces, underscores and hyphens, capitalize each
        piece with :meth:`str.capitalize` and join the pieces with a space.

        """
        return ' '.join(map(lambda s: s.capitalize(),
                            re.split(r'[ _-]', name)))

    @staticmethod
    def _map_col(col: str) -> Union[str, None]:
        """Return a phrase describing metric column ``col`` taken from
        :obj:`ModelResultReporter.METRIC_DESCRIPTIONS`, or ``None`` when the
        column has no description.

        """
        desc = ModelResultReporter.METRIC_DESCRIPTIONS.get(col)
        if desc is not None:
            return f'{col} is the {desc}'
        return None

    def _map_name(self, name: str) -> str:
        """Strip the run index from a result name of the form
        ``<model name>: <run index>`` and apply :obj:`name_replace`.

        :raises ModelResultError: if ``name`` does not have the expected form
                                  or the run index is not ``1``

        """
        m: re.Match = re.match(r'^(.+): (\d+)$', name)
        if m is None:
            raise ModelResultError(f'Unknown model name format: {name}')
        run_idx = int(m.group(2))
        if run_idx != 1:
            raise ModelResultError(
                f'Multiple runs not supported: {name} ({run_idx})')
        name = m.group(1)
        if self.name_replace is not None:
            name = re.sub(*self.name_replace, name)
        return name

    def _get_report_dataframe(self) -> pd.DataFrame:
        """Return the result reporter's data frame with validation results
        excluded.

        """
        rm: ModelResultManager = self.facade.result_manager
        reporter = ModelResultReporter(rm)
        reporter.include_validation = False
        return reporter.dataframe

    @property
    def summary_dataframe(self) -> pd.DataFrame:
        """A data frame with one row per result set, the columns
        :obj:`summary_columns` formatted to :obj:`precision` decimal places,
        and optionally a leading majority label row.

        """
        pcols = list(self.summary_columns)
        # the report columns drop the last character of each summary column
        # name (the trailing ``t`` in the defaults)
        rcols = [c[:-1] for c in pcols]
        df: pd.DataFrame = self._get_report_dataframe()
        df = df[['name'] + pcols]
        df = df.rename(columns=dict(zip(pcols, rcols)))
        if self.sort_column is not None:
            # sort while the values are still numeric
            df = df.sort_values(self.sort_column)
        df['name'] = df['name'].apply(self._map_name)
        res_id = self.majority_label_res_id
        # ``False`` is "not None", so it must be excluded explicitly or it
        # would be treated the same as ``True``
        if res_id is not None and res_id is not False:
            params = {}
            if isinstance(res_id, str):
                params['name'] = res_id
            pred_factory: PredictionsDataFrameFactory = \
                self.facade.get_predictions_factory(**params)
            mets: ClassificationMetrics = pred_factory.majority_label_metrics
            majlab = pred_factory.metrics_to_series('Majority Label', mets)
            majlab = majlab.rename({
                PredictionsDataFrameFactory.LABEL_COL: 'name'})
            dfm = pd.DataFrame([majlab[['name'] + rcols]])
            # the majority label row always comes first
            df = pd.concat((dfm, df), ignore_index=True)
        fmt = '{x:.%sf}' % self.precision
        for c in rcols:
            df[c] = df[c].apply(lambda x: fmt.format(x=x))
        df = df.rename(columns={'name': 'Name'})
        return df

    def _get_best_results(self) -> pd.DataFrame:
        """Return the metrics data frame of the result set having the highest
        value in the ``wF1t`` column of the results report.

        """
        df: pd.DataFrame = self._get_report_dataframe()
        ix = df['wF1t'].idxmax()
        file_name = df.loc[ix, 'file']
        return self.facade.get_predictions_factory(
            name=file_name).metrics_dataframe

    @property
    def by_label_dataframe(self) -> pd.DataFrame:
        """A data frame with one row per label for the result set selected by
        :meth:`_get_best_results`, with the columns :obj:`by_label_columns`
        (all but ``count``) formatted to :obj:`precision` decimal places.

        """
        cols = list(self.by_label_columns)
        # copy the column subset so the assignments below do not write to a
        # slice of the factory's data frame
        df: pd.DataFrame = self._get_best_results()[['label'] + cols].copy()
        if self.sort_column is not None:
            # sort before the columns are renamed and turned in to strings:
            # the sort column is given with its original name and should be
            # ordered by numeric value
            col = 'label' if self.sort_column == 'name' else self.sort_column
            df = df.sort_values(col)
        fmt = '{x:.%sf}' % self.precision
        for c in cols:
            if c == 'count':
                continue
            df[c] = df[c].apply(lambda x: fmt.format(x=x))
        crenames = {c: self.capitalize(c)
                    for c in 'label correct acc count'.split()}
        df = df.rename(columns=crenames)
        return df

    def write(self, depth: int = 0, writer: TextIOBase = sys.stdout,
              indent: int = 0):
        """Write the summary and by-label tables as text.

        :param depth: the indentation depth of the section headers

        :param writer: where to write the output

        :param indent: added to ``depth`` for the table content

        """
        from tabulate import tabulate
        self._write_line('summary:', depth, writer)
        df = self.summary_dataframe
        content = tabulate(df, headers=df.columns, disable_numparse=True)
        self._write_block(content, depth + indent, writer)
        self._write_empty(writer)
        self._write_line('label:', depth, writer)
        df = self.by_label_dataframe
        content = tabulate(df, headers=df.columns, disable_numparse=True)
        self._write_block(content, depth + indent, writer)

    def __call__(self):
        """Write the metrics using the defaults of :meth:`write`."""
        self.write()
@dataclass
class LatexPerformanceMetricsDumper(PerformanceMetricsDumper):
    """Writes model performance metrics in data formats then used to import to
    the LaTeX typesetting system used by the Zensols build framework.  The
    class writes a YAML configuration used by `mklatextbl.py` script in the
    Zensols Build repo, which generates a LaTeX table.  The output is a
    ``.sty`` style file with the table, which is included with ``usepackage``
    and then added with a command.

    :see: `Zensols Build <https://github.com/plandes/zenbuild>`_

    :see: `mklatextbl.py <https://github.com/plandes/zenbuild/blob/master/bin/mklatextbl.py>`_

    """
    results_dir: Path = field(default=Path('results/perf'))
    """The path to the output CSV files with performance metrics."""

    config_dir: Path = field(default=Path('../config'))
    """The path to the YAML configuration files used by the ``mklatextbl.py``
    Zensols LaTeX table generator.

    """
    def _create_table(self, name: str, output_csv: Path, caption: str,
                      cols: Iterable[str]) -> Dict[str, Any]:
        """Create the table definition that is written as YAML.

        :param name: used to create the table key ``metrics<name>tab``

        :param output_csv: the CSV file the table definition points to

        :param caption: the table caption, in which ``{desc}`` is replaced
                        with the descriptions of ``cols``

        :param cols: the metric columns to describe in the caption; columns
                     without a description are skipped

        :return: the table definition

        """
        desc = ', '.join(filter(lambda x: x is not None,
                                map(self._map_col, cols)))
        return {
            f'metrics{name}tab':
            {'path': f'../model/{output_csv}',
             # 'type': 'slack',
             # 'slack_col': 0,
             'caption': caption.format(desc=desc),
             'placement': 'VAR',
             'size': 'small',
             'single_column': False,
             'uses': 'zentable'}}

    def _dump_table(self, name: str, file_stem: str, df: pd.DataFrame,
                    caption: str, cols: Iterable[str]) -> Tuple[Path, Path]:
        """Write ``df`` as a CSV file and its table definition as a YAML file,
        creating the parent directories as needed.

        :param name: the table name given to :meth:`_create_table`

        :param file_stem: the output files are named
                          ``metrics-<file_stem>.csv`` and
                          ``metrics-<file_stem>-table.yml``

        :param df: the metrics to write

        :param caption: the caption given to :meth:`_create_table`

        :param cols: the columns given to :meth:`_create_table`

        :return: a tuple of the output CSV and YAML files

        """
        output_csv: Path = self.results_dir / f'metrics-{file_stem}.csv'
        output_yml: Path = self.config_dir / f'metrics-{file_stem}-table.yml'
        table_def = self._create_table(name, output_csv, caption, cols)
        for path in (output_csv, output_yml):
            path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_yml, 'w') as f:
            yaml.dump(table_def, f)
        logger.info(f'wrote: {output_yml}')
        df.to_csv(output_csv, index=False)
        logger.info(f'wrote: {output_csv}')
        return (output_csv, output_yml)

    def dump_summary(self) -> Tuple[Path, Path]:
        """Dump summary of metrics to a LaTeX mktable YAML and CSV files.

        :return: a tuple of the output CSV and YAML files

        """
        df = self.summary_dataframe
        caption = 'Summarization of performance metrics where {desc}.'
        # every column but the first (the name) is a metric to describe
        rcols = df.columns.to_list()[1:]
        return self._dump_table('summary', 'summary', df, caption, rcols)

    def dump_by_label(self) -> Tuple[Path, Path]:
        """Dump per label of metrics of the highest performing model to a LaTeX
        mktable YAML and CSV files.

        :return: a tuple of the output CSV and YAML files

        """
        df = self.by_label_dataframe
        caption = 'By label performance metrics where {desc}.'
        cols = self.by_label_columns
        return self._dump_table('label', 'by-label', df, caption, cols)

    def __call__(self):
        """Dump both the summary and the by-label metrics."""
        self.dump_summary()
        self.dump_by_label()