"""A utility class to summarize all results in a directory.
"""
__author__ = 'Paul Landes'
from typing import Dict, Tuple, ClassVar
from dataclasses import dataclass, field
from pathlib import Path
import logging
import pandas as pd
from zensols.util.time import time
from zensols.datdesc import DataFrameDescriber
from zensols.deeplearn import DatasetSplitType
from . import (
ModelResult, EpochResult, DatasetResult, ModelResultManager, ArchivedResult,
Metrics, PredictionsDataFrameFactory,
)
logger = logging.getLogger(__name__)
[docs]
@dataclass
class ModelResultReporter(object):
"""Summarize all results in a directory from the output of model execution
from :class:`~zensols.deeplearn.model.ModelExectuor`.
The class iterates through the pickled binary output files from the run and
summarizes in a Pandas dataframe, which is handy for reporting in papers.
"""
METRIC_DESCRIPTIONS: ClassVar[Dict[str, str]] = \
PredictionsDataFrameFactory.METRIC_DESCRIPTIONS
"""Dictionary of performance metrics column names to human readable
descriptions.
"""
result_manager: ModelResultManager = field()
"""Contains the results to report on--and specifically the path to directory
where the results were persisted.
"""
include_validation: bool = field(default=True)
"""Whether or not to include validation performance metrics."""
@property
def dataframe(self) -> pd.DataFrame:
"""Return the summarized results (see class docs).
:return: the Pandas dataframe of the results
"""
rows = []
cols = 'name file start train_duration converged features'.split()
cols.extend('wF1t wPt wRt mF1t mPt mRt MF1t MPt MRt acct'.split())
if self.include_validation:
cols.extend('wF1v wPv wRv mF1v mPv mRv MF1v MPv MRv accv'.split())
cols.extend('train_occurs validation_occurs test_occurs'.split())
dpt_key = 'n_total_data_points'
arch_res: ArchivedResult
for fname, arch_res in self.result_manager.results_stash.items():
res: ModelResult = arch_res.model_result
train: DatasetResult = res.dataset_result.get(
DatasetSplitType.train)
validate: DatasetResult = res.dataset_result.get(
DatasetSplitType.validation)
test: DatasetResult = res.dataset_result.get(DatasetSplitType.test)
if train is not None:
dur = train.end_time - train.start_time
hours, remainder = divmod(dur.seconds, 3600)
minutes, seconds = divmod(remainder, 60)
dur = f'{hours:02}:{minutes:02}:{seconds:02}'
if validate is not None:
conv_epoch: int = validate.statistics['n_epoch_converged']
ver: EpochResult = validate.converged_epoch
else:
conv_epoch = None
ver: EpochResult = None
if test is not None:
vm: Metrics = ver.metrics
tm: Metrics = test.metrics
features = ', '.join(res.decoded_attributes)
row = [res.name, fname, train.start_time, dur,
conv_epoch, features]
row.extend([
tm.weighted.f1, tm.weighted.precision, tm.weighted.recall,
tm.micro.f1, tm.micro.precision, tm.micro.recall,
tm.macro.f1, tm.macro.precision, tm.macro.recall,
tm.accuracy])
if self.include_validation:
row.extend([
vm.weighted.f1, vm.weighted.precision,
vm.weighted.recall,
vm.micro.f1, vm.micro.precision, vm.micro.recall,
vm.macro.f1, vm.macro.precision, vm.macro.recall,
vm.accuracy])
row.extend([
train.statistics[dpt_key], validate.statistics[dpt_key],
test.statistics[dpt_key]])
rows.append(row)
if logger.isEnabledFor(logging.INFO):
logger.info('result calculation complete for ' +
f'{res.name} ({fname})')
return pd.DataFrame(rows, columns=cols)
@property
def dataframe_describer(self) -> DataFrameDescriber:
"""Get a dataframe describer of metrics (see :obj:`metrics_dataframe`).
"""
df: pd.DataFrame = self.dataframe
meta: Tuple[Tuple[str, str], ...] = \
tuple(map(lambda c: (c, self.METRIC_DESCRIPTIONS[c]), df.columns))
name: str = (self.result_manager.name.capitalize() +
' Summarized Model Results')
return DataFrameDescriber(
name='Summarized Model Results',
df=df,
desc=name,
meta=meta)
[docs]
def dump(self, path: Path) -> pd.DataFrame:
"""Create the summarized results and write them to the file system.
"""
with time(f'wrote results summary: {path}'):
df: pd.DataFrame = self.dataframe
df.to_csv(path)
return df