Source code for zensols.deeplearn.model.format

"""Contains a class to write performance metrics.

"""
__author__ = 'Paul Landes'

from typing import Tuple, Dict, Iterable, Any, Union
from dataclasses import dataclass, field
import logging
import sys
import re
from io import TextIOBase
from pathlib import Path
import yaml
import pandas as pd
from zensols.config import Writable
from zensols.deeplearn.model import ModelFacade
from zensols.deeplearn.result import (
    ClassificationMetrics, PredictionsDataFrameFactory,
    ModelResultError, ModelResultManager, ModelResultReporter,
)

logger = logging.getLogger(__name__)



[docs]
@dataclass
class PerformanceMetricsDumper(Writable):
    """Formats performance metrics, which can be used in papers.

    :see: :class:`.LatexPerformanceMetricsDumper`

    """
    facade: ModelFacade = field()
    """The facade used to fetch previously written results."""

    summary_columns: Tuple[str, ...] = field(
        default=tuple('mF1t mPt mRt MF1t MPt MRt'.split()))
    """The columns used in the summary report.  The last character of each
    name is dropped to create the column name used in the generated table.

    """
    by_label_columns: Tuple[str, ...] = field(
        default=tuple('mF1 mP mR MF1 MP MR acc count'.split()))
    """The columns used in the by-label report."""

    name_replace: Union[Tuple[str, str], None] = field(default=None)
    """If provided, a tuple of ``(regular expression, replacement)`` string
    given to :func:`re.sub` in the name column of generated tables.

    """
    sort_column: str = field(default='mF1')
    """The column to sort by, with the exception of the majority label, which
    is always first.  If ``None``, the rows are not sorted.

    """
    majority_label_res_id: Union[str, bool, None] = field(default=True)
    """Indicates how to create (if any) the majority label performance metrics.
    If a string, use as the result id (``res_id``) of previous result set used
    to compute the majority label statistics to include in the summary.  If
    ``True`` use the results from the last tested model.  If ``None`` the
    majority label is not added.  Note that only ``None`` disables the majority
    label: ``False`` is handled the same way as ``True``.

    """
    precision: int = field(default=3)
    """The number of digits after the decimal point used to format results."""


[docs]
    @staticmethod
    def format_thousand(x: int, apply_k: bool = True,
                        add_comma: bool = True) -> str:
        add_k = False
        if x > 10000:
            if apply_k:
                x = round(x / 1000)
                add_k = True
        if add_comma:
            x = f'{x:,}'
        else:
            x = str(x)
        if add_k:
            x += 'K'
        return x



[docs]
    @staticmethod
    def capitalize(name: str) -> str:
        return ' '.join(map(lambda s: s.capitalize(),
                            re.split(r'[ _-]', name)))


    @staticmethod
    def _map_col(col: str) -> str:
        """Create the caption phrase that describes a metric column.

        :param col: the column name to look up in
                    ``ModelResultReporter.METRIC_DESCRIPTIONS``

        :return: the text ``<col> is the <description>``, or ``None`` when
                 ``col`` has no description

        """
        description = ModelResultReporter.METRIC_DESCRIPTIONS.get(col)
        return None if description is None else f'{col} is the {description}'

    def _map_name(self, name: str) -> str:
        m: re.Match = re.match(r'^(.+): (\d+)$', name)
        if m is None:
            raise ModelResultError(f'Unknown model name format: {name}')
        run_idx = int(m.group(2))
        if run_idx != 1:
            raise ModelResultError(
                f'Multiple runs not supported: {name} ({run_idx})')
        name = m.group(1)
        if self.name_replace is not None:
            name = re.sub(*self.name_replace, name)
        return name

    @property
    def summary_dataframe(self) -> pd.DataFrame:
        """A data frame with one row per reported result containing the
        :obj:`summary_columns` metrics formatted as fixed precision strings.
        When :obj:`majority_label_res_id` is not ``None``, a ``Majority Label``
        row is added as the first row.

        """
        report_cols = list(self.summary_columns)
        # the table column names are the report names without the last char
        metric_cols = [col[:-1] for col in report_cols]
        reporter = ModelResultReporter(self.facade.result_manager)
        reporter.include_validation = False
        df: pd.DataFrame = reporter.dataframe
        df = df[['name'] + report_cols]
        df = df.rename(columns=dict(zip(report_cols, metric_cols)))
        if self.sort_column is not None:
            df = df.sort_values(self.sort_column)
        df['name'] = df['name'].apply(self._map_name)
        res_id = self.majority_label_res_id
        if res_id is not None:
            # only a string selects a specific result; otherwise the factory
            # is created with its defaults
            factory_params = {'name': res_id} if isinstance(res_id, str) else {}
            pred_factory: PredictionsDataFrameFactory = \
                self.facade.get_predictions_factory(**factory_params)
            mets: ClassificationMetrics = pred_factory.majority_label_metrics
            majority_row = pred_factory.metrics_to_series(
                'Majority Label', mets)
            majority_row = majority_row.rename({
                PredictionsDataFrameFactory.LABEL_COL: 'name'})
            majority_df = pd.DataFrame([majority_row[['name'] + metric_cols]])
            # the majority label always comes first, regardless of the sort
            df = pd.concat((majority_df, df), ignore_index=True)
        fmt = '{x:.%sf}' % self.precision
        for col in metric_cols:
            df[col] = df[col].apply(lambda x: fmt.format(x=x))
        return df.rename(columns={'name': 'Name'})

    def _get_best_results(self) -> pd.DataFrame:
        """Return the metrics data frame of the reported result that has the
        highest ``wF1t`` value.

        :return: the ``metrics_dataframe`` of the predictions factory created
                 for the best result's ``file`` entry

        """
        reporter = ModelResultReporter(self.facade.result_manager)
        reporter.include_validation = False
        results: pd.DataFrame = reporter.dataframe
        best_ix = results['wF1t'].idxmax()
        # only the file name is needed to look up the predictions
        _, file_name = results.loc[best_ix, ['name', 'file']]
        pred_factory = self.facade.get_predictions_factory(name=file_name)
        return pred_factory.metrics_dataframe

    @property
    def by_label_dataframe(self) -> pd.DataFrame:
        cols = list(self.by_label_columns)
        df: pd.DataFrame = self._get_best_results().copy()
        df = df[['label'] + cols]
        fmt = '{x:.%sf}' % self.precision
        for c in cols:
            if c == 'count':
                continue
            df[c] = df[c].apply(lambda x: fmt.format(x=x))
        crenames = dict(map(lambda c: (c, self.capitalize(c)),
                            'label correct acc count'.split()))
        df = df.rename(columns=crenames)
        if self.sort_column is not None:
            col = self.sort_column
            if self.sort_column == 'name':
                col = 'label'
            df = df.sort_values(col)
        return df


[docs]
    def write(self, depth: int = 0, writer: TextIOBase = sys.stdout,
              indent: int = 0):
        """Write the summary and by-label metrics as text tables.

        :param depth: the depth passed to the line writing methods for the
                      section titles

        :param writer: the sink to write to

        :param indent: added to ``depth`` for the table content

        """
        from tabulate import tabulate

        def as_table(df: pd.DataFrame) -> str:
            return tabulate(df, headers=df.columns, disable_numparse=True)

        content_depth = depth + indent
        # each title is written before its data frame is created
        self._write_line('summary:', depth, writer)
        self._write_block(
            as_table(self.summary_dataframe), content_depth, writer)
        self._write_empty(writer)

        self._write_line('label:', depth, writer)
        self._write_block(
            as_table(self.by_label_dataframe), content_depth, writer)


    def __call__(self):
        """Write the metrics by calling :meth:`write` with its defaults."""
        self.write()




[docs]
@dataclass
class LatexPerformanceMetricsDumper(PerformanceMetricsDumper):
    """Writes model performance metrics in data formats that are then imported
    by the LaTeX typesetting system used by the Zensols build framework.  The
    class writes a YAML configuration used by the `mklatextbl.py` script in
    the Zensols Build repo, which generates a LaTeX table.  The output is a
    ``.sty`` style file with the table, which is included with ``usepackage``
    and then added with a command.

    :see: `Zensols Build <https://github.com/plandes/zenbuild>`_

    :see: `mklatextbl.py <https://github.com/plandes/zenbuild/blob/master/bin/mklatextbl.py>`_

    """
    results_dir: Path = field(default=Path('results/perf'))
    """The path to the output CSV files with performance metrics."""

    config_dir: Path = field(default=Path('../config'))
    """The path to the YAML configuration files used by the ``mklatextbl.py``
    Zensols LaTeX table generator.

    """
    def _create_table(self, name: str, output_csv: Path, caption: str,
                      cols: Iterable[str]) -> Dict[str, Any]:
        desc = ', '.join(filter(lambda x: x is not None,
                                map(self._map_col, cols)))
        return {
            f'metrics{name}tab':
            {'path': f'../model/{output_csv}',
             # 'type': 'slack',
             # 'slack_col': 0,
             'caption': caption.format(**dict(desc=desc)),
             'placement': 'VAR',
             'size': 'small',
             'single_column': False,
             'uses': 'zentable'}}


[docs]
    def dump_summary(self) -> Tuple[Path, Path]:
        """Dump the summary of metrics to LaTeX mktable YAML and CSV files.
        The parent directories of both files are created if needed.

        :return: a tuple of the output CSV and YAML files

        """
        csv_path: Path = self.results_dir / 'metrics-summary.csv'
        yml_path: Path = self.config_dir / 'metrics-summary-table.yml'
        df = self.summary_dataframe
        # every column except the first is described in the caption
        described_cols = df.columns.to_list()[1:]
        table_def = self._create_table(
            'summary', csv_path,
            'Summarization of performance metrics where {desc}.',
            described_cols)
        csv_path.parent.mkdir(parents=True, exist_ok=True)
        yml_path.parent.mkdir(parents=True, exist_ok=True)
        with open(yml_path, 'w') as yml_file:
            yaml.dump(table_def, yml_file)
        logger.info(f'wrote: {yml_path}')
        df.to_csv(csv_path, index=False)
        logger.info(f'wrote: {csv_path}')
        return (csv_path, yml_path)



[docs]
    def dump_by_label(self) -> Tuple[Path, Path]:
        """Dump the per label metrics of the highest performing model to LaTeX
        mktable YAML and CSV files.  The parent directories of both files are
        created if needed.

        :return: a tuple of the output CSV and YAML files

        """
        paths: Tuple[Path, Path] = (
            self.results_dir / 'metrics-by-label.csv',
            self.config_dir / 'metrics-by-label-table.yml')
        csv_path, yml_path = paths
        df = self.by_label_dataframe
        # the caption describes the configured columns
        table_def = self._create_table(
            'label', csv_path, 'By label performance metrics where {desc}.',
            self.by_label_columns)
        for parent in (csv_path.parent, yml_path.parent):
            parent.mkdir(parents=True, exist_ok=True)
        with open(yml_path, 'w') as yml_file:
            yaml.dump(table_def, yml_file)
        logger.info(f'wrote: {yml_path}')
        df.to_csv(csv_path, index=False)
        logger.info(f'wrote: {csv_path}')
        return paths


    def __call__(self):
        """Write the summary files with :meth:`dump_summary`, then the by-label
        files with :meth:`dump_by_label`.

        """
        self.dump_summary()
        self.dump_by_label()