# Source code for zensols.deeplearn.result.hypsig

"""Model hypothesis significance testing.  This module has a small framework for
hypothesis testing of model results (typically the results from the test
dataset).  Disproving the null hypothesis (which is that two classifiers
perform the same) means that one classifier has statistically significantly
better (or worse) performance than the other.

"""
__author__ = 'Paul Landes'

from typing import (
    Set, Tuple, List, Sequence, Dict, Any, Iterable, Type, ClassVar
)
from dataclasses import dataclass, field
from abc import ABCMeta, abstractmethod
import sys
import logging
import math
from itertools import chain
from io import TextIOBase
import numpy as np
import pandas as pd
from zensols.util import APIError
from zensols.persist import persisted
from zensols.deeplearn.dataframe import DataFrameDictable
from zensols.datdesc import DataFrameDescriber

logger = logging.getLogger(__name__)



[docs]
class SignificanceError(APIError):
    """Signals that the data given to a significance test is inconsistent or
    otherwise unusable.

    """




[docs]
@dataclass
class Evaluation(DataFrameDictable):
    """The outcome of a hypothesis test as produced by an implementation of
    :class:`.SignificanceTest`.

    """
    _DICTABLE_ATTRIBUTES: ClassVar[Set[str]] = frozenset({'disprove_null_hyp'})

    pvalue: float = field()
    """The probability value (p-value) computed by the test."""

    alpha: float = field()
    """The significance threshold that :obj:`pvalue` is compared against."""

    statistic: float = field(default=None)
    """A test specific statistic, which defaults to ``None``."""

    @property
    def disprove_null_hyp(self) -> bool:
        """``True`` when :obj:`pvalue` is strictly below :obj:`alpha`, which
        means the test disproves the null hypothesis.

        """
        threshold: float = self.alpha
        return self.pvalue < threshold

    def _write_key_value(self, k: Any, v: Any, depth: int, writer: TextIOBase):
        # floats are rendered in scientific notation, anything else verbatim
        rendered: Any = format(v, 'e') if isinstance(v, float) else v
        self._write_line(f'{k}: {rendered}', depth, writer)




[docs]
@dataclass
class SignificanceTestData(DataFrameDictable):
    """Metadata needed to create significance tests.

    :see: :class:`.SignificanceTest`.

    """
    a: pd.DataFrame = field()
    """Test set results from the first model."""

    b: pd.DataFrame = field()
    """Test set results from the second model."""

    id_col: str = field(default='id')
    """The dataset column that contains the unique identifier of the data point.
    If this is not ``None``, an assertion on the id's of :obj:`a` and :obj:`b`
    is performed and the IDs are added to :obj:`correct_table`.

    """
    gold_col: str = field(default='label')
    """The column of the gold label/data."""

    pred_col: str = field(default='pred')
    """The column of the prediction."""

    alpha: float = field(default=0.05)
    """Used to compare with the p-value to disprove the null hypothesis."""

    null_hypothesis: str = field(default=(
        'classifiers have a similar proportion of errors on the test set'))
    """A human readable string of the hypothesis."""

    def _assert_data(self):
        """Validate that :obj:`a` and :obj:`b` describe the same data points.

        The comparison is positional: the lengths, the IDs (only when
        :obj:`id_col` is not ``None``) and the gold labels must all match.

        :raises SignificanceError: if any of the checks fail

        """
        dfa: pd.DataFrame = self.a
        dfb: pd.DataFrame = self.b
        # raise rather than ``assert`` so the check survives ``python -O``
        if len(dfa) != len(dfb):
            raise SignificanceError(
                'Test results differ in size: ' +
                f'{len(dfa)} (a) != {len(dfb)} (b)')
        if self.id_col is not None:
            if dfa[self.id_col].tolist() != dfb[self.id_col].tolist():
                raise SignificanceError(
                    f"Test result IDs do not match for column '{self.id_col}'")
        if dfa[self.gold_col].tolist() != dfb[self.gold_col].tolist():
            raise SignificanceError(
                f"Test result labels do not match for column '{self.gold_col}'")

    @property
    @persisted('_correct_table')
    def correct_table(self) -> pd.DataFrame:
        """A dataframe with boolean columns ``a_correct`` and ``b_correct``
        that tell whether each model's prediction equals the gold label.  When
        :obj:`id_col` is not ``None`` the dataframe also has an ``id`` column
        with the IDs taken from :obj:`a`.

        """
        dfa: pd.DataFrame = self.a
        dfb: pd.DataFrame = self.b
        names: List[str] = []
        series: List[pd.Series] = []
        # the ID column is optional; indexing with ``None`` would fail
        if self.id_col is not None:
            names.append('id')
            series.append(dfa[self.id_col])
        # each classifier's correct classification
        names.extend(('a_correct', 'b_correct'))
        series.append(dfa[self.gold_col] == dfa[self.pred_col])
        series.append(dfb[self.gold_col] == dfb[self.pred_col])
        # NOTE(review): concat aligns on index labels, so ``a`` and ``b`` are
        # assumed to share the same index -- confirm with callers
        return pd.concat(series, axis=1, keys=names)

    @property
    @persisted('_contingency_table')
    def contingency_table(self) -> pd.DataFrame:
        """The contingency table computed as a cross tabulation of the
        ``a_correct`` (rows, named ``a``) and ``b_correct`` (columns, named
        ``b``) columns of :obj:`correct_table`.

        """
        dfc: pd.DataFrame = self.correct_table
        df_cont: pd.DataFrame = pd.crosstab(
            dfc['a_correct'], dfc['b_correct'],
            rownames=['a'],
            colnames=['b'])
        return df_cont



# subclass this into a regression/ranking from classification when needed

[docs]
@dataclass
class SignificanceTest(DataFrameDictable, metaclass=ABCMeta):
    """Base class of statistical significance hypothesis tests that compare
    models using their test set data results.

    """
    _DICTABLE_ATTRIBUTES: ClassVar[Set[str]] = frozenset({'evaluation'})

    data: SignificanceTestData = field()
    """Contains the data to be used for the significance hypothesis testing."""

    @property
    def name(self) -> str:
        """The name of the test, which is read from the ``_NAME`` class
        attribute.

        """
        return self._NAME

    @abstractmethod
    def _compute_significance(self, data: SignificanceTestData) -> Evaluation:
        """Compute the significance of the result ``data``.

        :param data: the (already validated) test results to compare

        :return: the evaluation of the test

        """

    @property
    @persisted('_evaluation')
    def evaluation(self) -> Evaluation:
        """The evaluation of this test, which is computed after :obj:`data`
        has been validated.

        """
        test_data: SignificanceTestData = self.data
        test_data._assert_data()
        return self._compute_significance(test_data)

    def write_conclusion(self, depth: int = 0, writer: TextIOBase = sys.stdout):
        """Write an intuitive explanation of the results.

        :param depth: the starting indentation depth

        :param writer: the writer to dump the content of this writable

        """
        data: SignificanceTestData = self.data
        disproved: bool = self.evaluation.disprove_null_hyp
        if disproved:
            outcome = 'disproved'
        else:
            outcome = 'did not disprove'
        self._write_line(f'{outcome} the null hypothesis:', depth, writer)
        self._write_line(
            f"'{data.null_hypothesis}' is {not disproved}", depth, writer)

    def write(self, depth: int = 0, writer: TextIOBase = sys.stdout,
              include_contingency: bool = True,
              include_conclusion: bool = True):
        """Write the evaluation of this test.

        :param depth: the starting indentation depth

        :param writer: the writer to dump the content of this writable

        :param include_contingency: whether to write the contingency table

        :param include_conclusion: whether to write the output of
                                   :meth:`write_conclusion`

        """
        nested: int = depth + 1
        if include_contingency:
            self._write_line('contingency:', depth, writer)
            self._write_dataframe(self.data.contingency_table, nested, writer)
        self._write_line('evaluation:', depth, writer)
        self._write_object(self.evaluation, nested, writer)
        if include_conclusion:
            self._write_line('hypothesis:', depth, writer)
            self.write_conclusion(nested, writer)





[docs]
@dataclass
class SignificanceTestSuite(DataFrameDictable):
    """A suite that runs one or more :class:`.SignificanceTest` on the same
    data.

    """
    _TESTS: ClassVar[Dict[str, Type[SignificanceTest]]] = {}
    """A mapping of all available significance tests."""

    data: SignificanceTestData = field()
    """Contains the data to be used for the significance hypothesis testing."""

    test_names: Tuple[str, ...] = field(default=None)
    """The test names (:obj:`.SignificanceTest.name`) to be in this suite.  All
    registered tests (sorted by name) are used when this is ``None``.

    """
    @classmethod
    def _register_test(cls: Type, test: Type[SignificanceTest]):
        """Make ``test`` available to suites under its ``_NAME``."""
        test_name: str = test._NAME
        cls._TESTS[test_name] = test

    @property
    def available_test_names(self) -> Set[str]:
        """All available names of tests (see :obj:`test_names`)."""
        return set(self._TESTS)

    @property
    def tests(self) -> Tuple[SignificanceTest, ...]:
        """The tests used in this suite, which are newly created on each
        access.

        """
        names: Sequence[str] = self.test_names
        if names is None:
            names = sorted(self.available_test_names)
        registry: Dict[str, Type[SignificanceTest]] = self._TESTS
        return tuple(registry[name](data=self.data) for name in names)

    @property
    def describer(self) -> DataFrameDescriber:
        """A dataframe describer of all significance evaluations."""
        def to_row(test: SignificanceTest) -> Tuple[Any, ...]:
            evl: Evaluation = test.evaluation
            return (test.name, evl.pvalue, evl.statistic,
                    evl.disprove_null_hyp)

        rows: List[Tuple[Any, ...]] = [to_row(test) for test in self.tests]
        cols: List[str] = ['name', 'pvalue', 'stat', 'disprove']
        return DataFrameDescriber(
            name='significance-tests',
            df=pd.DataFrame(rows, columns=cols),
            desc='Model Result Significance Tests',
            meta=(('name', 'the name of the significance test'),
                  ('pvalue', "the test's resulting p-value"),
                  ('stat', "the test's resulting statistic"),
                  ('disprove', 'if true, the null hypothesis is disproven')))

    def write(self, depth: int = 0, writer: TextIOBase = sys.stdout):
        """Write the contingency table followed by each test's results.

        :param depth: the starting indentation depth

        :param writer: the writer to dump the content of this writable

        """
        nested: int = depth + 1
        self._write_line('contingency:', depth, writer)
        self._write_dataframe(self.data.contingency_table, nested, writer)
        test: SignificanceTest
        for test in self.tests:
            self._write_line(f'{test.name}:', depth, writer)
            # the table was already written once for the whole suite
            test.write(nested, writer, include_contingency=False)





[docs]
class StudentTTestSignificanceTest(SignificanceTest):
    """Student's T-Test, which measures the difference in the mean.  This test
    violates the independence assumption, but it is included as it is still used
    in papers as a metric.

    Citation:

      `Student (1908)`_ The Probable Error of a Mean. Biometrika, 6(1):1–25.

    .. _Student (1908): https://www.jstor.org/stable/2331554


    """
    _NAME: ClassVar[str] = 'student-ttest'

    def _compute_significance(self, data: SignificanceTestData) -> Evaluation:
        """Run :func:`scipy.stats.ttest_ind` on the per data point correctness
        of both models.

        """
        from scipy.stats import ttest_ind
        correct: pd.DataFrame = data.correct_table
        a_correct: pd.Series = correct['a_correct']
        b_correct: pd.Series = correct['b_correct']
        outcome = ttest_ind(a_correct, b_correct)
        return Evaluation(
            statistic=outcome.statistic,
            pvalue=outcome.pvalue,
            alpha=data.alpha)



SignificanceTestSuite._register_test(StudentTTestSignificanceTest)



[docs]
class AnovaSignificanceTest(SignificanceTest):
    """One-way ANOVA test."""
    _NAME: ClassVar[str] = 'anova'

    def _compute_significance(self, data: SignificanceTestData) -> Evaluation:
        """Run :func:`scipy.stats.f_oneway` on the per data point correctness
        of both models.

        """
        from scipy.stats import f_oneway
        correct: pd.DataFrame = data.correct_table
        outcome = f_oneway(correct['a_correct'], correct['b_correct'])
        # the result is a (statistic, p-value) pair
        return Evaluation(
            statistic=outcome[0],
            pvalue=outcome[1],
            alpha=data.alpha)



SignificanceTestSuite._register_test(AnovaSignificanceTest)



[docs]
class WilcoxSignificanceTest(SignificanceTest):
    """Wilcoxon signed-rank test, which is a non-parametric version of Student's
    T-Test.

    Citation:

      `Frank Wilcoxon (1945)`_ Individual Comparisons by Ranking
      Methods. Biometrics Bulletin, 1(6):80–83.

    .. _Frank Wilcoxon (1945): https://www.jstor.org/stable/3001968

    """
    _NAME: ClassVar[str] = 'wilcoxon'

    def _compute_significance(self, data: SignificanceTestData) -> Evaluation:
        """Run :func:`scipy.stats.wilcoxon` on the per data point correctness
        of both models after mapping it to ``1`` (correct) and ``0``.

        """
        from scipy.stats import wilcoxon

        def as_int(flag: Any) -> int:
            return 1 if flag else 0

        correct: pd.DataFrame = data.correct_table
        a_ints, b_ints = (correct[col].apply(as_int)
                          for col in ('a_correct', 'b_correct'))
        outcome = wilcoxon(a_ints, b_ints)
        return Evaluation(
            statistic=outcome.statistic,
            pvalue=outcome.pvalue,
            alpha=data.alpha)



SignificanceTestSuite._register_test(WilcoxSignificanceTest)



[docs]
class McNemarSignificanceTest(SignificanceTest):
    """McNemar's test, which is computed with the chi-square approximation and
    continuity correction on the 2x2 contingency table.

    Citation:

      `Quinn McNemar (1947)`_ Note on the sampling error of the difference
      between correlated proportions or percentages. Psychometrika,
      12(2):153–157, June.

    .. _Quinn McNemar (1947): https://doi.org/10.1007/BF02295996

    """
    _NAME: ClassVar[str] = 'mcnemar'

    def _compute_significance(self, data: SignificanceTestData) -> Evaluation:
        """Compute McNemar's test from the contingency table of ``data``.

        :param data: the test results to compare

        :return: the evaluation; when the classifiers never disagree on
                 correctness, a p-value of ``1`` and statistic of ``0``

        """
        from statsmodels.stats.contingency_tables import mcnemar
        dfc: pd.DataFrame = data.correct_table
        # the cross tabulation only has rows/columns for outcomes that occur,
        # so force a full 2x2 table for when a classifier is always right (or
        # always wrong); otherwise the lookups below raise a KeyError
        outcomes: List[bool] = [False, True]
        df_cont: pd.DataFrame = data.contingency_table.reindex(
            index=outcomes, columns=outcomes, fill_value=0)
        # Yes/No is the count of test instances that Classifier1 got correct and
        # Classifier2 got incorrect, and No/Yes is the count of test instances
        # that Classifier1 got incorrect and Classifier2 got correct
        yes_no = df_cont.loc[True][False]
        no_yes = df_cont.loc[False][True]
        # sanity check the table against the raw correctness data
        assert yes_no == len(dfc[dfc['a_correct'] & ~dfc['b_correct']])
        assert no_yes == len(dfc[~dfc['a_correct'] & dfc['b_correct']])
        if yes_no + no_yes == 0:
            # the statistic divides by the number of discordant pairs, which
            # is undefined here; no disagreement is no evidence of a
            # difference
            return Evaluation(1.0, data.alpha, 0.0)
        # compute stat and pvalue
        res = mcnemar(df_cont, exact=False, correction=True)
        return Evaluation(res.pvalue, data.alpha, res.statistic)



SignificanceTestSuite._register_test(McNemarSignificanceTest)



[docs]
@dataclass
class ChiSquareEvaluation(Evaluation):
    """The statistics gathered from :func:`scipy.stats.chi2_contingency` as
    created in :class:`.ChiSquareSignificanceTest`.

    """
    dof: int = field(default=None)
    """Degrees of freedom"""

    expected: pd.DataFrame = field(default=None)
    """The expected frequencies, based on the marginal sums of the table.  It
    has the same index and columns as :obj:`contingency_table`.

    """
    contingency_table: pd.DataFrame = field(default=None)
    """The contingency table (observations) used for the results."""

    @property
    def associated(self) -> bool:
        """Whether or not the variables are associated (rejection of the null
        hypothesis).  Unlike :obj:`disprove_null_hyp`, the comparison with
        :obj:`alpha` is inclusive.

        """
        return self.pvalue <= self.alpha

    @property
    def raw_residuals(self) -> pd.DataFrame:
        """The raw residuals as computed as the difference between the
        observations and the expected cell values.

        """
        return self.contingency_table - self.expected

    @property
    def contribs(self) -> pd.DataFrame:
        """The contribution of each cell to the results of the chi-square
        computation, which is the squared :obj:`pearson_residuals`.

        """
        return self.pearson_residuals ** 2

    @property
    def pearson_residuals(self) -> pd.DataFrame:
        """Pearson residuals, aka *standardized* residuals, which are the
        :obj:`raw_residuals` divided by the square root of :obj:`expected`.

        """
        return self.raw_residuals / np.sqrt(self.expected)

    @property
    def adjusted_residuals(self) -> pd.DataFrame:
        """The adjusted residuals, which for each cell is the raw residual
        divided by ``sqrt(expected * (1 - row_total / n) * (1 - col_total /
        n))`` where ``n`` is the sum of all observations.

        """
        obs_df: pd.DataFrame = self.contingency_table
        obs: np.ndarray = obs_df.to_numpy()
        exp: np.ndarray = self.expected.to_numpy()
        raw_res: np.ndarray = self.raw_residuals.to_numpy()
        n = obs.sum()
        # summing over axis 1 collapses the columns, giving one total per row
        row_totals: np.ndarray = obs.sum(axis=1)
        col_totals: np.ndarray = obs.sum(axis=0)
        # cell (r, c) gets (1 - row_totals[r] / n) * (1 - col_totals[c] / n)
        mul: np.ndarray = np.outer(
            1. - (row_totals / n), 1. - (col_totals / n))
        to_sqrt: np.ndarray = exp * mul
        for rix, cix in zip(*np.nonzero(to_sqrt == 0)):
            # the division below is by zero for these cells
            logger.warning(
                'bad multiplier: expected=%s, row_total=%s, col_total=%s, ' +
                'n=%s, mul=%s', exp[rix, cix], row_totals[rix],
                col_totals[cix], n, mul[rix, cix])
        arr: np.ndarray = raw_res / np.sqrt(to_sqrt)
        return pd.DataFrame(arr, columns=obs_df.columns, index=obs_df.index)

    def _get_dictable_attributes(self) -> Iterable[Tuple[str, str]]:
        # add the computed properties to those of the super class
        fs = ('associated expected contribs ' +
              'adjusted_residuals pearson_residuals').split()
        return chain(
            super()._get_dictable_attributes(), map(lambda x: (x, x), fs))

    def write_associated(self, depth: int = 0, writer: TextIOBase = sys.stdout):
        """Write how the variables relate as a result of the chi-square
        computation.

        :param depth: the starting indentation depth

        :param writer: the writer to dump the content of this writable

        :see: :meth:`write`

        """
        if self.associated:
            assoc = 'variables are associated (reject H0)'
        else:
            assoc = 'variables are not associated (fail to reject H0)'
        self._write_line(f'associated: {assoc}', depth, writer)

    def write(self, depth: int = 0, writer: TextIOBase = sys.stdout):
        """Write the scalar results followed by each dataframe and the
        output of :meth:`write_associated`.

        :param depth: the starting indentation depth

        :param writer: the writer to dump the content of this writable

        """
        dct = super().asdict()
        # dataframes are written separately below; tolerate absent keys
        for k in ('expected contribs contingency_table ' +
                  'adjusted_residuals pearson_residuals').split():
            dct.pop(k, None)
        self._write_dict(dct, depth, writer)
        self._write_line('expected:', depth, writer)
        self._write_dataframe(self.expected, depth + 1, writer)
        self._write_line('contributions:', depth, writer)
        self._write_dataframe(self.contribs, depth + 1, writer)
        self._write_line('pearson_residuals:', depth, writer)
        self._write_dataframe(self.pearson_residuals, depth + 1, writer)
        self._write_line('adjusted_residuals:', depth, writer)
        self._write_dataframe(self.adjusted_residuals, depth + 1, writer)
        self.write_associated(depth, writer)





[docs]
class ChiSquareSignificanceTest(SignificanceTest):
    """A chi-square test that uses the contingency table of the data as input.

    """
    _NAME: ClassVar[str] = 'chisquare'

    def _compute_significance(self, data: SignificanceTestData) -> Evaluation:
        """Run :func:`scipy.stats.chi2_contingency` on the contingency table of
        ``data`` and wrap the results in a :class:`.ChiSquareEvaluation`.

        """
        from scipy.stats import chi2_contingency
        observed: pd.DataFrame = data.contingency_table
        statistic, pvalue, dof, expected_freq = chi2_contingency(observed)
        # label the expected frequencies like the observations
        expected_df = pd.DataFrame(
            expected_freq, index=observed.index, columns=observed.columns)
        return ChiSquareEvaluation(
            statistic=statistic,
            pvalue=pvalue,
            alpha=data.alpha,
            dof=dof,
            contingency_table=observed,
            expected=expected_df)



SignificanceTestSuite._register_test(ChiSquareSignificanceTest)