Source code for zensols.nlp.serial

"""Serializes :class:`.FeatureToken` and :class:`.TokenContainer` instances
using the :class:`~zensols.config.dictable.Dictable` interface.

"""
__author__ = 'Paul Landes'

from typing import Tuple, Any, Dict, List, Set, Union
from dataclasses import dataclass, field
from abc import ABCMeta, abstractmethod
from enum import Enum, auto
from collections import OrderedDict
from zensols.config import Dictable
from . import (
    NLPError, FeatureToken, TokenContainer,
    FeatureSpan, FeatureSentence, FeatureDocument
)


class Include(Enum):
    """Selects which pieces of a container are written out at a given
    serialization level.

    """
    original = auto()
    """Emit the original text of the container."""

    normal = auto()
    """Emit the normalized form of the container's text."""

    tokens = auto()
    """Emit the tokens of the :class:`.TokenContainer`."""

    sentences = auto()
    """Emit the sentences of the :class:`.FeatureDocument`."""
@dataclass
class Serialized(Dictable, metaclass=ABCMeta):
    """Abstract strategy that converts a :class:`.TokenContainer` in to a
    dictionary.  Subclasses supply the conversion in :meth:`_serialize`.

    """
    container: TokenContainer = field()
    """The container whose content is serialized."""

    includes: Set[Include] = field()
    """Which parts of the container the subclass serializer emits."""

    feature_ids: Tuple[str, ...] = field()
    """The token feature IDs to emit when tokens are serialized."""

    @abstractmethod
    def _serialize(self) -> Dict[str, Any]:
        """Build and return the dictionary form of :obj:`container`."""

    def _from_dictable(self, recurse: bool, readable: bool,
                       class_name_param: str = None) -> Dict[str, Any]:
        # NOTE(review): presumably the hook Dictable invokes for ``asdict``;
        # every argument is ignored and the subclass output is returned as is
        serialized: Dict[str, Any] = self._serialize()
        return serialized
@dataclass
class SerializedTokenContainer(Serialized):
    """A serializer for any :class:`.TokenContainer`, which is what spans and
    sentences are serialized with.

    """
    def _feature_tokens(self, container: TokenContainer) -> \
            List[Dict[str, Any]]:
        """Return one feature dictionary per token of ``container``, leaving
        out tokens that yield no features for :obj:`feature_ids`.

        """
        per_token = (tok.get_features(self.feature_ids)
                     for tok in container.token_iter())
        return [feats for feats in per_token if len(feats) > 0]

    def _serialize(self) -> Dict[str, Any]:
        # text entries are table driven; the attribute is only read when the
        # matching include is requested
        text_attrs = ((Include.original, 'text'), (Include.normal, 'norm'))
        out = OrderedDict()
        for inc, attr_name in text_attrs:
            if inc in self.includes:
                out[inc.name] = getattr(self.container, attr_name)
        if Include.tokens in self.includes:
            out[Include.tokens.name] = self._feature_tokens(self.container)
        return out
@dataclass
class SerializedFeatureDocument(Serialized):
    """Serializes feature documents.  The :obj:`container` must be a
    :class:`.FeatureDocument` instance.

    """
    sentence_includes: Set[Include] = field()
    """What to include for each sentence of the document."""

    def _sentence_serializer(self, sent: FeatureSentence) -> \
            SerializedTokenContainer:
        """Create the serializer used for a single sentence."""
        return SerializedTokenContainer(
            container=sent,
            includes=self.sentence_includes,
            feature_ids=self.feature_ids)

    def _serialize(self) -> Dict[str, Any]:
        # document level content goes under the ``doc`` key
        doc_level = SerializedTokenContainer(
            container=self.container,
            includes=self.includes,
            feature_ids=self.feature_ids)
        out = OrderedDict(doc=doc_level.asdict())
        if Include.sentences in self.includes:
            out[Include.sentences.name] = [
                self._sentence_serializer(s).asdict()
                for s in self.container.sents]
        return out
@dataclass
class SerializedTokenContainerFactory(Dictable):
    """Creates instances of :class:`.Serialized` from instances of
    :class:`.TokenContainer`.  These can then be used as
    :class:`~zensols.config.dictable.Dictable` instances, specifically with the
    ``asdict`` and ``asjson`` methods.

    """
    sentence_includes: Set[Union[Include, str]] = field()
    """The things to be included in sentences.  String entries are converted
    to :class:`.Include` members by name on initialization.

    """
    document_includes: Set[Union[Include, str]] = field()
    """The things to be included in documents.  String entries are converted
    to :class:`.Include` members by name on initialization.

    """
    feature_ids: Tuple[str, ...] = field(default=None)
    """The feature IDs used when serializing tokens."""

    def __post_init__(self):
        """Normalize both include sets so they contain only :class:`.Include`
        members.

        :raises KeyError: if a string entry is not the name of an
                          :class:`.Include` member

        """
        def map_thing(x):
            if isinstance(x, str):
                x = Include.__members__[x]
            return x

        # convert strings to enums for easy app configuration
        for ai in ('sentence', 'document'):
            attr = f'{ai}_includes'
            setattr(self, attr, {map_thing(x) for x in getattr(self, attr)})

    def create(self, container: TokenContainer) -> Serialized:
        """Create a serializer from ``container`` (see class docs).

        :param container: the container to be serialized

        :return: an object that can be serialized using ``asdict`` and
                 ``asjson`` method.

        :raises NLPError: if ``container`` is neither a
                          :class:`.FeatureDocument` nor a
                          :class:`.FeatureSpan`

        """
        serialized: Serialized
        if isinstance(container, FeatureDocument):
            serialized = SerializedFeatureDocument(
                container=container,
                includes=self.document_includes,
                sentence_includes=self.sentence_includes,
                feature_ids=self.feature_ids)
        elif isinstance(container, FeatureSpan):
            # spans (and sentences) have no nested sentence level, so they use
            # the plain token container serializer with the sentence includes
            serialized = SerializedTokenContainer(
                container=container,
                includes=self.sentence_includes,
                feature_ids=self.feature_ids)
        else:
            raise NLPError(f'No serialization method for {type(container)}')
        return serialized

    def __call__(self, container: TokenContainer) -> Serialized:
        """See :meth:`create`."""
        return self.create(container)