Source code for zensols.nlp.nerscore

"""Wraps the `SemEval-2013 Task 9.1`_ NER evaluation API as a
:class:`~zensols.nlp.score.ScoreMethod`.

From the `David Batista`_ blog post:

  The SemEval’13 introduced four different ways to measure
  precision/recall/f1-score results based on the metrics defined by MUC:

    * *Strict*: exact boundary surface string match and entity type

    * *Exact*: exact boundary match over the surface string, regardless of the
      type

    * *Partial*: partial boundary match over the surface string, regardless of
      the type

    * *Type*: some overlap between the system tagged entity and the gold
      annotation is required

  Each of these ways to measure the performance accounts for correct, incorrect,
  partial, missed and spurious in different ways. Let’s look in detail and see
  how each of the metrics defined by MUC falls into each of the scenarios
  described above.


:see: `SemEval-2013 Task 9.1 <https://web.archive.org/web/20150131105418/https://www.cs.york.ac.uk/semeval-2013/task9/data/uploads/semeval_2013-task-9_1-evaluation-metrics.pdf>`_

:see: `David Batista <http://www.davidsbatista.net/blog/2018/05/09/Named_Entity_Evaluation/>`_

"""
from __future__ import annotations
__author__ = 'Paul Landes'
from typing import (
    Tuple, Dict, Set, List, Optional, Any, Iterable, ClassVar, Type
)
from dataclasses import dataclass, field, fields
import numpy as np
from zensols.nlp import TokenContainer, FeatureSpan
from zensols.nlp.score import (
    Score, ErrorScore, ScoreMethod, ScoreContext, HarmonicMeanScore
)


@dataclass
class SemEvalHarmonicMeanScore(HarmonicMeanScore):
    """A harmonic mean score with the additional SemEval computed scores (see
    module :mod:`zensols.nlp.nerscore` docs).

    """
    # populated right after the class definition, since an instance can only
    # be created once the class exists
    NAN_INSTANCE: ClassVar[SemEvalHarmonicMeanScore] = None

    correct: int = field()
    """The number of correct (COR): both are the same."""

    incorrect: int = field()
    """The number of incorrect (INC): the output of a system and the golden
    annotation don’t match.

    """
    partial: int = field()
    """The number of partial (PAR): system and the golden annotation are
    somewhat “similar” but not the same.

    """
    missed: int = field()
    """The number of missed (MIS): a golden annotation is not captured by a
    system.

    """
    spurious: int = field()
    """The number of spurious (SPU): system produces a response which does not
    exist in the golden annotation.

    """
    possible: int = field()
    """The number of possible (POS): the golden annotations that contribute to
    the final score, which the SemEval/MUC scheme defines as ``correct +
    incorrect + partial + missed``.

    """
    actual: int = field()
    """The number of actual (ACT): the annotations produced by the system,
    which the SemEval/MUC scheme defines as ``correct + incorrect + partial +
    spurious``.

    """
# NaN-filled instance: all ten positional constructor arguments are ``nan``
# (seven SemEval counts declared on the class; the remaining three are
# presumably the inherited harmonic mean fields -- confirm against
# ``HarmonicMeanScore``).
SemEvalHarmonicMeanScore.NAN_INSTANCE = SemEvalHarmonicMeanScore(
    *([np.nan] * 10))
@dataclass
class SemEvalScore(Score):
    """Contains all four harmonic mean SemEval scores (see module
    :mod:`zensols.nlp.nerscore` docs).  This score has four harmonic means
    providing various levels of accuracy.

    """
    # populated right after the class definition, since an instance can only
    # be created once the class exists
    NAN_INSTANCE: ClassVar[SemEvalScore] = None

    strict: SemEvalHarmonicMeanScore = field()
    """Exact boundary surface string match and entity type."""

    exact: SemEvalHarmonicMeanScore = field()
    """Exact boundary match over the surface string, regardless of the type."""

    partial: SemEvalHarmonicMeanScore = field()
    """Partial boundary match over the surface string, regardless of the type.

    """
    ent_type: SemEvalHarmonicMeanScore = field()
    """Some overlap between the system tagged entity and the gold annotation is
    required.

    """
    def asrow(self, meth: str) -> Dict[str, float]:
        """Merge the rows of every dataclass field of this score into one row.

        Each field's score is asked for its own row using
        ``<meth>_<field name>`` as the method name, and the results are
        combined in field declaration order.

        :param meth: the name of the scoring method, used as the prefix

        :return: the combined row

        """
        merged: Dict[str, Any] = {}
        name: str
        for name in (fld.name for fld in fields(self)):
            part: Score = getattr(self, name)
            merged.update(part.asrow(f'{meth}_{name}'))
        return merged
# NaN-filled score: every one of the four harmonic means is the shared
# NaN-filled harmonic mean instance.
SemEvalScore.NAN_INSTANCE = SemEvalScore(
    strict=SemEvalHarmonicMeanScore.NAN_INSTANCE,
    exact=SemEvalHarmonicMeanScore.NAN_INSTANCE,
    partial=SemEvalHarmonicMeanScore.NAN_INSTANCE,
    ent_type=SemEvalHarmonicMeanScore.NAN_INSTANCE)
@dataclass
class SemEvalScoreMethod(ScoreMethod):
    """A Semeval-2013 Task 9.1 score (see module :mod:`zensols.nlp.nerscore`
    docs).  This score has four harmonic means providing various levels of
    accuracy.

    Sentence pairs are ordered as ``(<gold>, <prediction>)``.

    """
    labels: Optional[Set[str]] = field(default=None)
    """The NER labels on which to evaluate.  If not provided, text is evaluated
    under a (stubbed tag) label.

    """
    @classmethod
    def _get_external_modules(cls: Type) -> Tuple[str, ...]:
        """Return the names of the external modules this method uses."""
        return ('nervaluate',)

    def _score_pair(self, gold: TokenContainer, pred: TokenContainer) -> \
            SemEvalScore:
        """Score one gold/prediction pair with the ``nervaluate`` evaluator.

        :param gold: the container with the gold annotations

        :param pred: the container with the predicted annotations

        :return: the four harmonic means of the first element of the
                 evaluator's output

        """
        # imported here rather than at module level
        from nervaluate import Evaluator

        def as_span(tok: Any, label: str) -> Dict[str, Any]:
            # the span dictionary handed to the evaluator for one token
            return {'label': label,
                    'start': tok.lexspan.begin,
                    'end': tok.lexspan.end}

        def by_token(cont: TokenContainer, label: str) -> \
                Tuple[Dict[str, Any], ...]:
            # every token of the container tagged with the same label
            return tuple(as_span(tok, label) for tok in cont.token_iter())

        def by_entity(cont: TokenContainer) -> \
                Tuple[Tuple[Dict[str, Any], ...], ...]:
            # one sequence per entity, holding a span for each of its tokens
            # labeled with the token's own ``ent_``
            span: FeatureSpan
            return tuple(
                tuple(as_span(tok, tok.ent_) for tok in span)
                for span in cont.entities)

        tag_names: Tuple[str, ...]
        gold_seqs: Tuple[Tuple[Dict[str, Any], ...], ...]
        pred_seqs: Tuple[Tuple[Dict[str, Any], ...], ...]
        if self.labels is not None:
            gold_seqs = by_entity(gold)
            pred_seqs = by_entity(pred)
            tag_names = tuple(self.labels)
        else:
            # no labels configured: use a stub tag and wrap each token
            # sequence as the single sequence given to the evaluator
            stub: str = '_'
            gold_toks: Tuple[Dict[str, Any], ...] = by_token(gold, stub)
            pred_toks: Tuple[Dict[str, Any], ...] = by_token(pred, stub)
            gold_seqs = (gold_toks,)
            pred_seqs = (pred_toks,)
            tag_names = (stub,)
        evaluator = Evaluator(gold_seqs, pred_seqs, tags=tag_names)
        overall: Dict[str, Any] = evaluator.evaluate()[0]
        means: Dict[str, SemEvalHarmonicMeanScore] = {}
        scheme: str
        stats: Dict[str, float]
        for scheme, stats in overall.items():
            # the evaluator's ``f1`` entry is passed to the score as the
            # ``f_score`` keyword
            stats['f_score'] = stats.pop('f1')
            means[scheme] = SemEvalHarmonicMeanScore(**stats)
        return SemEvalScore(**means)

    def _score(self, meth: str, context: ScoreContext) -> \
            Iterable[SemEvalScore]:
        """Yield a score for each pair of the context.

        A pair that raises an exception yields an
        :class:`~zensols.nlp.score.ErrorScore` wrapping the exception and the
        NaN-filled score instead of stopping the iteration.

        :param meth: the name of the scoring method

        :param context: provides the ``(<gold>, <prediction>)`` pairs

        """
        gold_cont: TokenContainer
        pred_cont: TokenContainer
        for gold_cont, pred_cont in context.pairs:
            try:
                yield self._score_pair(gold_cont, pred_cont)
            except Exception as err:
                yield ErrorScore(meth, err, SemEvalScore.NAN_INSTANCE)