"""Vectorization base classes and basic functionality.
"""
__author__ = 'Paul Landes'
from typing import Tuple, Any, Union, ClassVar
from dataclasses import dataclass, field
from abc import abstractmethod, ABCMeta
import logging
import sys
import warnings
from io import TextIOBase
from scipy import sparse
from scipy.sparse import csr_matrix
import torch
from torch import Tensor
from zensols.persist import PersistableContainer
from zensols.config import ConfigFactory, Writable
from zensols.deeplearn import DeepLearnError, TorchConfig
logger = logging.getLogger(__name__)
class VectorizerError(DeepLearnError):
    """Raised by :class:`.FeatureVectorizer` instances when an encoding or
    decoding operation fails.

    """
@dataclass
class ConfigurableVectorization(PersistableContainer, Writable):
    """A base class for vectorization components instantiated from the
    application configuration.

    """
    name: str = field()
    """The name of the section given in the configuration."""

    config_factory: ConfigFactory = field(repr=False)
    """The configuration factory that created this instance and used for
    serialization functions."""

    def __post_init__(self, *args, **kwargs):
        # forward to the (non-dataclass) base class initializers
        super().__init__(*args, **kwargs)

    def write(self, depth: int = 0, writer: TextIOBase = sys.stdout):
        """Write this instance's configuration section name to ``writer``."""
        self._write_line(f'name: {self.name}', depth, writer)
@dataclass
class FeatureVectorizer(ConfigurableVectorization, metaclass=ABCMeta):
    """An abstract base class that transforms a Python object in to a PyTorch
    tensor.

    """
    feature_id: str = field()
    """Uniquely identifies this vectorizer."""

    def _allow_config_adds(self) -> bool:
        return True

    @abstractmethod
    def _get_shape(self) -> Tuple[int, int]:
        """Return the shape of the tensor created by ``transform``."""
        pass

    @property
    def shape(self) -> Tuple[int, int]:
        """Return the shape of the tensor created by ``transform``.

        """
        return self._get_shape()

    @property
    def description(self) -> str:
        """A short human readable name.

        :see: obj:`feature_id`

        """
        # NOTE(review): ``DESCRIPTION`` appears to be a class attribute set by
        # concrete subclasses; it is not defined in this class -- confirm
        return self.DESCRIPTION

    def write(self, depth: int = 0, writer: TextIOBase = sys.stdout):
        """Write the vectorizer's name, description, feature ID and shape."""
        super().write(depth, writer)
        self._write_line(f'description: {self.description}', depth, writer)
        self._write_line(f'feature_id: {self.feature_id}', depth, writer)
        self._write_line(f'shape: {self.shape}', depth, writer)

    def __str__(self):
        return (f'{self.feature_id} ({type(self)}), ' +
                f'desc={self.description}, shape: {self.shape}')

    def __repr__(self):
        return f'{self.__class__}: {self.__str__()}'
@dataclass
class FeatureContext(PersistableContainer):
    """Data created by coding and meant to be pickled on the file system.

    :see EncodableFeatureVectorizer.encode:

    """
    feature_id: str = field()
    """The feature id of the :class:`.FeatureVectorizer` that created this
    context."""

    def __str__(self):
        # e.g. "TensorFeatureContext (my_feature_id)"
        return '{} ({})'.format(type(self).__name__, self.feature_id)
@dataclass
class NullFeatureContext(FeatureContext):
    """A no-op feature context used for cases such as prediction batches with
    data points that have no labels.

    :see: :meth:`~zensols.deeplearn.batch.BatchStash.create_prediction`

    :see: :class:`~zensols.deeplearn.batch.Batch`

    """
@dataclass
class TensorFeatureContext(FeatureContext):
    """A context that encodes data directly to a tensor.  The tensor might be
    encoded as a sparse matrix, which becomes dense during the decoding
    process.

    """
    tensor: Tensor = field()
    """The output tensor of the encoding phase."""

    def deallocate(self):
        """Release the held tensor so its memory can be reclaimed."""
        super().deallocate()
        if hasattr(self, 'tensor'):
            del self.tensor

    def write(self, depth: int = 0, writer: TextIOBase = sys.stdout):
        """Write the shape of the held tensor."""
        super().write(depth, writer)
        self._write_line(f'tensor shape: {self.tensor.shape}', depth, writer)

    def __str__(self):
        if self.tensor is None:
            tstr = '<none>'
        else:
            tstr = f'{self.tensor.shape}'
        return f'{super().__str__()}: {tstr}'

    def __repr__(self):
        return self.__str__()
@dataclass
class SparseTensorFeatureContext(FeatureContext):
    """Contains data that was encoded from a dense matrix as a sparse matrix
    and back.  Using torch sparse matrices currently leads to deadlocking in
    child processes, so scipy :class:`~scipy.sparse.csr_matrix` is used
    instead.

    """
    USE_SPARSE: ClassVar[bool] = True
    """Whether or not to enable sparse matrix serialization.  Otherwise, torch
    tensors (no conversion) are used."""

    sparse_data: Union[Tuple[Tuple[csr_matrix, int]], Tensor] = field()
    """The sparse array data."""

    @property
    def sparse_arr(self) -> Union[Tuple[csr_matrix], Tensor]:
        """The sparse matrices created by :meth:`to_sparse`, or the tensor
        itself when sparse serialization was disabled (:obj:`USE_SPARSE` is
        ``False``).

        """
        # bug fix: when USE_SPARSE is False, ``sparse_data`` is the tensor
        # itself; the previous ``self.sparse_data[0]`` returned only the
        # tensor's first row and then failed the tuple assertion below
        if isinstance(self.sparse_data, Tensor):
            return self.sparse_data
        assert isinstance(self.sparse_data[0], tuple)
        return self.sparse_data[0]

    @classmethod
    def to_sparse(cls, arr: Tensor) -> Tuple[Tuple[csr_matrix], int]:
        """Convert a tensor of at most three dimensions to scipy CSR matrices.

        :param arr: the (CPU) tensor to convert

        :return: a tuple of ``(csr matrices, original dimension count)``

        :raises VectorizerError: if ``arr`` has more than three dimensions

        """
        narr = arr.numpy()
        tdim = len(arr.shape)
        if tdim == 3:
            # one 2-D matrix per "page" along the first dimension
            narrs = tuple(map(lambda i: narr[i], range(narr.shape[0])))
        elif tdim == 2 or tdim == 1:
            narrs = (narr,)
        else:
            raise VectorizerError('Tensors of dimensions higher than ' +
                                  f'3 not supported: {arr.shape}')
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'creating sparse matrix: {arr.shape}')
        mats = tuple(map(lambda m: sparse.csr_matrix(m), narrs))
        return (mats, tdim)

    @classmethod
    def instance(cls, feature_id: str, arr: Tensor,
                 torch_config: TorchConfig):
        """Create an instance from a tensor, converting to sparse form when
        :obj:`USE_SPARSE` is enabled.

        :param feature_id: the feature id of the creating vectorizer

        :param arr: the tensor to encode

        :param torch_config: the device configuration (currently unused here)

        :raises VectorizerError: for half precision tensors, which scipy's
                                 sparse conversion does not support

        """
        arr = arr.cpu()
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'encoding in to sparse tensor: {arr.shape}')
        if cls.USE_SPARSE:
            if arr.dtype == torch.float16:
                raise VectorizerError(
                    ('Type float 16 is not supported (scipy 1.9.3); set ' +
                     'SparseTensorFeatureContext.USE_SPARSE to False or ' +
                     'deeplearn_default.fp_precision higher'))
            with warnings.catch_warnings():
                # silence the numpy warnings in the scipy package
                #
                # scipy/sparse/_sputils.py:44: DeprecationWarning:
                # np.find_common_type is deprecated. Please use
                # `np.result_type` or `np.promote_types`.
                warnings.filterwarnings(
                    'ignore',
                    message=r"^np.find_common_type is deprecated",
                    category=DeprecationWarning)
                sarr = cls.to_sparse(arr)
        else:
            sarr = arr
        return cls(feature_id, sarr)

    def to_tensor(self, torch_config: TorchConfig) -> Tensor:
        """Decode back to a dense tensor with the originally encoded shape.

        :param torch_config: the device configuration (currently unused here)

        """
        if isinstance(self.sparse_arr, Tensor):
            # non-sparse path: the payload is already a tensor
            tarr = self.sparse_arr
        else:
            narr: Tuple[csr_matrix]
            tdim: int
            narr, tdim = self.sparse_data
            narrs = tuple(map(lambda sm: torch.from_numpy(sm.todense()), narr))
            if len(narrs) == 1:
                tarr = narrs[0]
            else:
                tarr = torch.stack(narrs)
            # csr matrices are always 2-D, so squeeze/unsqueeze leading
            # dimensions to restore the dimension count recorded at encode
            dim_diff = len(tarr.shape) - tdim
            if dim_diff > 0:
                for _ in range(dim_diff):
                    tarr = tarr.squeeze(0)
            elif dim_diff < 0:
                for _ in range(-dim_diff):
                    tarr = tarr.unsqueeze(0)
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'decoded sparce matrix to: {tarr.shape}')
        return tarr
@dataclass
class MultiFeatureContext(FeatureContext):
    """A composite context that contains a tuple of other contexts.

    """
    contexts: Tuple[FeatureContext] = field()
    """The subordinate contexts."""

    @property
    def is_empty(self) -> bool:
        """Whether every subordinate context is a :class:`.NullFeatureContext`
        (vacuously ``True`` when there are no contexts).

        """
        return all(isinstance(ctx, NullFeatureContext)
                   for ctx in self.contexts)

    def deallocate(self):
        """Deallocate the subordinate contexts and drop the reference."""
        super().deallocate()
        if hasattr(self, 'contexts'):
            self._try_deallocate(self.contexts)
            del self.contexts