# Source code for zensols.calamr.resource

"""Client facade access to annotated AMR documents and alignment.

"""
__author__ = 'Paul Landes'
from typing import ContextManager, Dict, Sequence, Optional, Type, Union
from dataclasses import dataclass, field
import logging
import traceback
from types import TracebackType
from pathlib import Path

from zensols.util import APIError
from zensols.persist import Stash
from zensols.amr import AmrFeatureDocument
from zensols.amr.serial import AmrSerializedFactory

from . import (
    DocumentGraph, FlowGraphResult, DocumentGraphFactory, DocumentGraphAligner
)

logger = logging.getLogger(__name__)


class _corpus_resource(object):
    def __init__(self, resource: 'Resource'):
        self._resource = resource

    def __enter__(self) -> Stash:
        return self._resource

    def __exit__(self, cls: Type[Exception], value: Optional[Exception],
                 trace: traceback):
        if value is not None:
            raise value


class _adhoc_resource(object):
    def __init__(self, resource: 'Resource',
                 corpus: Sequence[Dict[str, str]],
                 corpus_id: str = None, clear: bool = False):
        self._resource = resource
        self._doc_stash: 'AdhocAnnotatedAmrDocumentStash' = resource.documents
        self._clear = clear
        self._corpus = corpus
        self._corpus_id = corpus_id
        logger.info('setting corpus')
        self._doc_stash.set_corpus(self._corpus, self._corpus_id)

    def __enter__(self) -> Stash:
        logger.info('priming corpus')
        self._doc_stash.prime()
        return self._resource

    def __exit__(self, cls: Type[Exception], value: Optional[Exception],
                 trace: traceback):
        try:
            if self._clear:
                logger.info('clearing corpus')
                self._doc_stash.clear()
        except Exception as e:
            logger.error(f'Could not clear stash in {self.__class__}: {e}',
                         exc_info=True)
        try:
            logger.info('restoring corpus')
            self._doc_stash.restore()
        except Exception as e:
            logger.error(f'Could not restore state in {self.__class__}: {e}',
                         exc_info=True)
        if value is not None:
            raise value



[docs]
@dataclass
class Resource(object):
    """Bundles the stashes used to read AMR annotated documents and their
    alignments.  Instances of this class are created by :class:`.Resources`.

    """
    documents: Stash = field()
    """A stash (:class:`dict` like) that maps AMR document IDs to
    :class:`~zensols.amr.container.AmrFeatureDocument` instances.

    """
    alignments: Stash = field()
    """A stash (:class:`dict` like) that maps AMR document IDs to
    :class:`~zensols.calamr.flow.FlowGraphResult` instances.

    """
    _resources: 'Resources' = field()
    """The :class:`.Resources` that created this instance."""

    def align(self, doc: Union[AmrFeatureDocument, str],
              render_level: int = None, directory: Path = None) -> \
            FlowGraphResult:
        """Align a document directly with the aligner rather than reading a
        cached result from :obj:`alignments`, which allows the alignment to be
        rendered.

        The aligner's ``render_level`` and ``output_dir`` are overridden only
        for the duration of this call and are put back afterward, even when
        the alignment fails.

        :param doc: either the document to align or the unique document ID
                    used to look it up in :obj:`documents`

        :param render_level: how many graphs to render (0 - 10), higher means
                             more; ``None`` keeps the aligner's current value

        :param directory: the output directory; ``None`` keeps the aligner's
                          current value

        :return: the result of aligning the document's graph

        """
        factory: DocumentGraphFactory = self._resources.doc_graph_factory
        aligner: DocumentGraphAligner = self._resources.doc_graph_aligner
        if isinstance(doc, str):
            # a string is a document ID, so resolve it to the document
            doc = self.documents[doc]
        graph: DocumentGraph = factory(doc)
        # remember the aligner's settings so they can be put back afterward
        saved_level: int = aligner.render_level
        saved_dir: Path = aligner.output_dir
        try:
            if render_level is not None:
                aligner.render_level = render_level
            if directory is not None:
                aligner.output_dir = directory
            result: FlowGraphResult = aligner.align(graph)
        finally:
            aligner.render_level = saved_level
            aligner.output_dir = saved_dir
        return result





[docs]
@dataclass
class Resources(object):
    """A client facade (GoF) for Calamr annotated AMR corpus access and
    alignment.

    The :meth:`corpus` and :meth:`adhoc` methods return context managers that
    provide access to documents and alignments through a :class:`.Resource`.
    Use the stashes provided by those methods to clear respective cached data.

    :see: :class:`.AdhocAnnotatedAmrDocumentStash`

    """
    serialized_factory: AmrSerializedFactory = field()
    """Creates a :class:`.Serialized` from :class:`.AmrDocument`,
    :class:`.AmrSentence` or :class:`.AnnotatedAmrDocument`.

    """
    doc_graph_factory: DocumentGraphFactory = field()
    """Create document graphs."""

    doc_graph_aligner: DocumentGraphAligner = field()
    """Align document graphs."""

    _anon_doc_stash: Stash = field()
    """Contains human annotated AMRs.  This could be from the adhoc (micro)
    corpus (small toy corpus), AMR 3.0 Proxy Report corpus, Little Prince, or
    the Bio AMR corpus.

    """
    _adhoc_doc_stash: Stash = field()
    """A :class:`~zensols.calamr.adhoc.AdhocAnnotatedAmrDocumentStash` instance.
    It is used to generate documents without setting up a corpus.

    """
    _flow_results_stash: Stash = field()
    """Creates cached instances of :class:`.FlowGraphResult`."""

    def corpus(self) -> ContextManager[Resource]:
        """Return a context manager for corpus access.  A corpus must be created
        before using this method, which amounts to using an AMR parser to create
        the parenthetical text files.  These files are then made available as
        resource to be downloaded or available on the file system.

        Example:

        .. code-block:: python

           from zensols.calamr import Resources, ApplicationFactory

           resources: Resources = ApplicationFactory.get_resources()

           with resources.corpus() as r:
               # print the keys of the annotated AMR documents
               print(tuple(r.documents.keys()))
               # determine if a document is in the stash
               print('some_key' in r.documents)
               # write an AMR document
               r.documents['some_key'].write()

        :return: a context manager that yields a :class:`.Resource` whose
                 ``documents`` is the annotated corpus stash

        """
        return _corpus_resource(
            resource=Resource(
                documents=self._anon_doc_stash,
                alignments=self._flow_results_stash,
                _resources=self))

    def adhoc(self, corpus: Optional[Sequence[Dict[str, str]]] = None,
              corpus_id: Optional[str] = None,
              clear: bool = False) -> ContextManager[Resource]:
        """Return a context manager for parsing and aligning adhoc documents.
        This sets the corpus documents that will be used for parsing and
        annotating.  The data will immediately be parsed into AMRs in this call
        and the data that writes to the file system will be updated to point to
        a new ``.../adhoc`` directory to not interfere with any corpus
        documents.

        The ``corpus`` input can be a file name that contains parsed
        parenthetical AMRs, a single document, or a sequence of documents.  The
        keys of each dictionary are the case-insensitive enumeration values of
        :class:`~zensols.amr.annotate.SentenceType`.  Keys ``id`` and
        ``comment`` are the unique document identifier and a comment that is
        added to the AMR sentence metadata.

        The following example JSON creates a document with ID ``ex1``, a
        ``comment`` metadata, one
        :obj:`~zensols.amr.annotate.SentenceType.SUMMARY` and two
        :obj:`~zensols.amr.annotate.SentenceType.BODY` sentences::

            corpus = [{
                "id": "ex1",
                "comment": "very short",
                "body": "The man ran to make the train. He just missed it.",
                "summary": "A man got caught in a train he just missed."
            }]

        This source / summary text can then be AMR parsed, aligned, and rendered
        with:

        .. code-block:: python

           from zensols.calamr import Resources, ApplicationFactory

           resources: Resources = ApplicationFactory.get_resources()

           with resources.adhoc(corpus, clear=True) as r:
               # render an aligned document
               r.alignments['some_key'].render()

        The ``clear=True`` means to delete all cached files generated in the
        block.

        Either ``corpus`` and/or ``corpus_id`` must be given.  If ``corpus`` is
        not given but ``corpus_id`` is, it will assume there is an existing set
        of data files to use for accessing.

        :param corpus: the AMR summary documents, which is usually a sequence of
                       :class:`~typing.Dict` instances (see
                       :class:`~zensols.amr.annotate.AnnotatedAmrFeatureDocumentFactory`
                       for data structure details)

        :param corpus_id: a unique identifier for ``corpus``, or ``None`` to use
                          a hashed string, which in turn, is used as the
                          directory name for the cached data

        :param clear: whether or not to delete the cached files (parsed
                      documents, aligned graphs etc) after leaving the lexical
                      boundaries of the context manager

        :return: a context manager that yields a :class:`.Resource` whose
                 ``documents`` is the adhoc document stash

        :raises APIError: if neither ``corpus`` nor ``corpus_id`` is given

        """
        if corpus is None:
            if corpus_id is None:
                raise APIError('Either a corpus or corpus_id must be given')
            # only an ID was given: pass an empty corpus along with it
            corpus = ()
        return _adhoc_resource(
            resource=Resource(
                documents=self._adhoc_doc_stash,
                alignments=self._flow_results_stash,
                _resources=self),
            corpus=corpus,
            corpus_id=corpus_id,
            clear=clear)

    def restore(self, res: FlowGraphResult):
        """Restore the information on a flow graph result needed to render it.
        Without it, :meth:`.FlowGraphResult.render` will raise errors.  This is
        only needed when unpickling a :class:`.FlowGraphResult`.

        :param res: the instance to have additional context information set

        """
        # imported inside the method rather than with the module's other
        # imports
        from .stash import FlowGraphRestoreStash
        stash: FlowGraphRestoreStash = self._flow_results_stash
        stash.restore(res)