Source code for zensols.deepnlp.vectorize.manager

"""An extension of a feature vectorizer manager that parses and vectorized
natural language.

"""
__author__ = 'Paul Landes'

from typing import List, Union, Set, Dict, Tuple, Sequence
from dataclasses import dataclass, field
from enum import Enum, auto
from abc import abstractmethod, ABCMeta
import logging
import collections
import torch
from torch import Tensor
from zensols.persist import persisted, PersistedWork
from zensols.deeplearn.vectorize import (
    FeatureContext,
    FeatureVectorizerManager,
    VectorizerError,
    TransformableFeatureVectorizer,
    MultiFeatureContext,
)
from zensols.nlp import FeatureSentence, FeatureDocument, FeatureDocumentParser
from . import SpacyFeatureVectorizer

# module-level logger named after this module (``__name__``)
logger = logging.getLogger(__name__)


class TextFeatureType(Enum):
    """The type of feature a :class:`.FeatureDocumentVectorizer` generates.

    Each vectorizer reports one of these members through its ``feature_type``
    property.

    """
    TOKEN = auto()
    """Token level with a shape congruent with the number of tokens, typically
    concatenated with the embedding layer.

    """

    DOCUMENT = auto()
    """Document level, typically added to a join layer."""

    MULTI_DOCUMENT = auto()
    """Multiple documents for the purposes of aggregating shared features."""

    EMBEDDING = auto()
    """Embedding layer, typically used as the input layer."""

    NONE = auto()
    """Other type, which tells the framework to ignore the vectorized features.

    :see: :class:`~zensols.deepnlp.layer.embed.EmbeddingNetworkModule`

    """
@dataclass
class FeatureDocumentVectorizer(TransformableFeatureVectorizer,
                                metaclass=ABCMeta):
    """Base class for vectorizers that create document or sentence level
    features from instances of :class:`.TokenContainer`.

    Subclasses implement the vectorization of a single document in
    :meth:`_encode` and decide how that document is vectorized.

    An aggregate of documents, given as a list or tuple, is also accepted by
    :meth:`encode`.  Only document level vectorization is supported in order
    to provide one standard contract across framework components and
    vectorizers.  When an aggregate is given, it is first combined in to a
    single document with
    :meth:`~zensols.nlp.container.FeatureDocument.combine_documents` (called
    with only the documents as its argument).

    :see: :class:`.FoldingDocumentVectorizer`

    """
    @abstractmethod
    def _encode(self, doc: FeatureDocument) -> FeatureContext:
        pass

    def _is_mult(self, doc: Union[Tuple[FeatureDocument, ...],
                                  FeatureDocument]) -> bool:
        """Return whether the input is a tuple or list, which is taken to be
        an aggregate of (multiple) documents.

        """
        return isinstance(doc, (tuple, list))

    def _is_doc(self, doc: Union[Tuple[FeatureDocument, ...],
                                 FeatureDocument]) -> bool:
        """Return whether ``doc`` is a :class:`.FeatureDocument`, or an
        aggregate (possibly nested) made up of only such documents.

        """
        if self._is_mult(doc):
            # every member has to qualify; stops at the first that does not
            return all(self._is_doc(member) for member in doc)
        return isinstance(doc, FeatureDocument)

    def _combine_documents(self, docs: Tuple[FeatureDocument, ...]) -> \
            FeatureDocument:
        """Merge ``docs`` in to a single document."""
        return FeatureDocument.combine_documents(docs)

    def encode(self, doc: Union[Tuple[FeatureDocument, ...],
                                FeatureDocument]) -> FeatureContext:
        """Encode a document using the super class's encode functionality.

        When a tuple (or list) of documents is passed, they are first combined
        in to one monolithic document.

        :param doc: the document or aggregate of documents to encode

        :raises VectorizerError: if ``doc`` is not a document or an aggregate
                                 of documents

        """
        self._assert_doc(doc)
        single = self._combine_documents(doc) if self._is_mult(doc) else doc
        return super().encode(single)

    def _assert_doc(self, doc: Union[Tuple[FeatureDocument, ...],
                                     FeatureDocument]):
        """Raise an error if ``doc`` does not pass :meth:`_is_doc`.

        :raises VectorizerError: if the check fails

        """
        if self._is_doc(doc):
            return
        raise VectorizerError(
            f'Expecting FeatureDocument, but got type: {type(doc)}')

    def _assert_decoded_doc_dim(self, arr: Tensor, expect: int):
        """Check the number of dimensions of a decoded document tensor and
        raise an error when it does not match.

        :param arr: the decoded tensor to check

        :param expect: the expected number of dimensions of ``arr``

        :raises VectorizerError: if the dimension count differs

        """
        n_dims = len(arr.size())
        if n_dims != expect:
            raise VectorizerError(
                f'Expecting {expect} tensor dimensions, ' +
                f'but got shape: {arr.shape}')

    @property
    def feature_type(self) -> TextFeatureType:
        """The type of feature this vectorizer generates, which is read from
        the ``FEATURE_TYPE`` attribute.  This is used by classes such as
        :class:`~zensols.deepnlp.layer.EmbeddingNetworkModule` to determine
        where to add the features, such as concating to the embedding layer,
        join layer etc.

        """
        return self.FEATURE_TYPE

    @property
    def token_length(self) -> int:
        """The number of token features (if token level) generated, as
        configured on the owning manager.

        """
        return self.manager.token_length

    def __str__(self):
        base = super().__str__()
        return f'{base}, feature type: {self.feature_type.name} '
@dataclass
class FoldingDocumentVectorizer(FeatureDocumentVectorizer, metaclass=ABCMeta):
    """A :class:`.FeatureDocumentVectorizer` with more options for how
    multiple documents are folded in to a single document for vectorization.

    The value of :obj:`fold_method` determines how a sequence of
    :class:`~zensols.nlp.container.FeatureDocument` instances is encoded.

    Subclasses must implement :meth:`_encode`.

    *Note*: this is not to be confused with the
    :class:`.MultiDocumentVectorizer` vectorizer, which vectorizes multiple
    documents in to document level features.

    """
    _FOLD_METHODS = frozenset(
        {'raise', 'concat_tokens', 'sentence', 'separate'})

    fold_method: str = field()
    """How multiple documents are merged in to a single document for
    vectorization, which is one of:

        * ``raise``: raise an error, allowing only single documents to be
          vectorized

        * ``concat_tokens``: concatenate tokens of each document in to
          singleton sentence documents; uses
          :meth:`~zensols.nlp.container.FeatureDocument.combine_documents`
          with ``concat_tokens = True``

        * ``sentence``: all sentences of all documents become singleton
          sentence documents; uses
          :meth:`~zensols.nlp.container.FeatureDocument.combine_documents`
          with ``concat_tokens = False``

        * ``separate``: every sentence of each document is encoded
          separately, then each sentence's output is concatenated as the
          respective document during decoding; this uses :meth:`_encode` for
          each sentence of each document and :meth:`_decode` to decode back in
          to the same document structure as the original

    """
    def __post_init__(self):
        super().__post_init__()
        is_known = self.fold_method in self._FOLD_METHODS
        if not is_known:
            raise VectorizerError(f'No such fold method: {self.fold_method}')

    def _combine_documents(self, docs: Tuple[FeatureDocument, ...]) -> \
            FeatureDocument:
        """Merge ``docs`` in to one document as directed by
        :obj:`fold_method`.

        :raises VectorizerError: if :obj:`fold_method` is ``raise`` and more
                                 than one document is given

        """
        method = self.fold_method
        if method == 'raise' and len(docs) > 1:
            raise VectorizerError(
                f'Configured to support single document but got {len(docs)}')
        by_token = method == 'concat_tokens'
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'foldl method: {self.fold_method}, ' +
                         f'concat_tokens={by_token}')
        return FeatureDocument.combine_documents(
            docs, concat_tokens=by_token)

    def _encode_sentence(self, sent: FeatureSentence) -> FeatureContext:
        """Encode a single sentence by first turning it in to a document."""
        return super().encode(sent.to_document())

    def _encode_sentences(self, doc: FeatureDocument) -> FeatureContext:
        """Encode every sentence of every given document separately.

        :param doc: a document, or a tuple or list of documents

        :return: a multi-context with one multi-context per document, each of
                 which has one context per sentence

        """
        members: Sequence[FeatureDocument] = \
            doc if self._is_mult(doc) else [doc]
        per_doc: List[MultiFeatureContext] = []
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'encoding {len(members)} documents')
        # iterate over each document passed (usually as an aggregate from the
        # batch framework)
        member: FeatureDocument
        for member in members:
            per_sent: List[FeatureContext] = []
            # the encoded sentences, in order, make up the document
            sent: FeatureSentence
            for sent in member.sents:
                sent_ctx: FeatureContext = self._encode_sentence(sent)
                if logger.isEnabledFor(logging.DEBUG):
                    logger.debug(f'encoded {sent}: {sent_ctx}')
                per_sent.append(sent_ctx)
            # add the multi-context of the sentences
            per_doc.append(MultiFeatureContext(
                feature_id=None, contexts=tuple(per_sent)))
        return MultiFeatureContext(self.feature_id, tuple(per_doc))

    def encode(self, doc: Union[Tuple[FeatureDocument, ...],
                                FeatureDocument]) -> FeatureContext:
        """Encode ``doc`` using the strategy selected by :obj:`fold_method`.

        :param doc: the document or aggregate of documents to encode

        :raises VectorizerError: if :obj:`fold_method` is ``raise`` and a
                                 tuple or list is given, or ``doc`` is not
                                 made up of documents

        """
        ctx: FeatureContext
        method = self.fold_method
        if method == 'separate':
            self._assert_doc(doc)
            ctx = self._encode_sentences(doc)
        elif method in ('concat_tokens', 'sentence'):
            ctx = super().encode(doc)
        elif method == 'raise':
            if self._is_mult(doc):
                raise VectorizerError(
                    f'Expecting single document but got: {len(doc)} documents')
            ctx = super().encode(doc)
        return ctx

    def _create_decoded_pad(self, shape: Tuple[int, ...]) -> Tensor:
        """Create the zero tensor that decoded documents are copied in to."""
        return self.torch_config.zeros(shape)

    def _decode_sentence(self, sent_ctx: FeatureContext) -> Tensor:
        """Decode the context of a single sentence with the super class."""
        return super().decode(sent_ctx)

    def _decode_sentences(self, context: MultiFeatureContext,
                          sent_dim: int = 1) -> Tensor:
        """Decode a context created by :meth:`_encode_sentences`.

        :param context: the per-document multi-contexts of sentence contexts

        :param sent_dim: the dimension along which the decoded sentences of a
                         document are concatenated

        :return: a zero padded tensor with one entry per document in the first
                 dimension

        """
        doc_arrs: List[Tensor] = []
        # each multi-context represents a document with sentence context
        # elements
        doc_ctx: MultiFeatureContext
        for doc_ctx in context.contexts:
            decoded: List[Tensor] = []
            # decode each sentence and track their decoded tensors for later
            # concatenation
            sent_ctx: FeatureContext
            for sent_ctx in doc_ctx.contexts:
                sent_out = self._decode_sentence(sent_ctx)
                if logger.isEnabledFor(logging.DEBUG):
                    logger.debug(f'decoded sub context: {sent_ctx} ' +
                                 f'-> {sent_out.size()}')
                decoded.append(sent_out)
            if logger.isEnabledFor(logging.DEBUG):
                logger.debug(f'concat {len(decoded)} along dim {sent_dim}')
            # concat all sentences for this document in to one long vector with
            # shape (batch, |tokens|, transformer dim)
            joined: Tensor = torch.cat(decoded, dim=sent_dim)
            if logger.isEnabledFor(logging.DEBUG):
                logger.debug(f'sentence cat: {joined.size()}')
            doc_arrs.append(joined)
        # create document array of shape (batch, |tokens|, transformer dim) by
        # first finding the longest document token count
        longest = max(t.size(sent_dim) for t in doc_arrs)
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'max sent len: {longest}')
        pad_shape = (len(context.contexts), longest,
                     doc_arrs[0][0].size(-1))
        out = self._create_decoded_pad(pad_shape)
        # copy over each document (from sentence concats) to the decoded tensor
        for row, doc_arr in enumerate(doc_arrs):
            if logger.isEnabledFor(logging.DEBUG):
                logger.debug(f'sent array: {doc_arr.shape}')
            out[row, :doc_arr.size(1), :] = doc_arr
        # drop trailing dimensions until the rank matches this vectorizer's
        # declared shape
        n_extra = len(out.shape) - len(self.shape)
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'squeezing {n_extra}, {out.shape} -> {self.shape}')
        for _ in range(n_extra):
            out = out.squeeze(dim=-1)
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'vectorized shape: {out.shape}')
        return out

    def decode(self, context: FeatureContext) -> Tensor:
        """Decode ``context``, using sentence level decoding when
        :obj:`fold_method` is ``separate`` and the super class otherwise.

        """
        if self.fold_method == 'separate':
            return self._decode_sentences(context)
        return super().decode(context)
@dataclass
class MultiDocumentVectorizer(FeatureDocumentVectorizer, metaclass=ABCMeta):
    """Vectorizes multiple documents in to document level features.  Features
    generated by subclasses are sometimes used in join layers.  Examples
    include :class:`.OverlappingFeatureDocumentVectorizer`.

    This is not to be confused with :class:`.FoldingDocumentVectorizer`, which
    merges multiple documents in to a single document for vectorization.

    """
    FEATURE_TYPE = TextFeatureType.DOCUMENT

    def encode(self, docs: Tuple[FeatureDocument, ...]) -> FeatureContext:
        """Encode the documents by handing them, as given, to :meth:`_encode`.

        Unlike :meth:`.FeatureDocumentVectorizer.encode`, the documents are
        neither type checked nor combined in to a single document first.

        :param docs: the documents to vectorize

        """
        context: FeatureContext = self._encode(docs)
        return context
@dataclass
class FeatureDocumentVectorizerManager(FeatureVectorizerManager):
    """Creates and manages instances of :class:`.FeatureDocumentVectorizer` and
    parses text in to feature based documents.

    This is used to manage the relationship of a given set of parsed features,
    keeping in mind that parsing will usually happen as a preprocessing step.
    A second step is the vectorization of those features, which can be any
    subset of the features parsed in the previous step.  This subset
    relationship is checked when the instance is created.  However, these
    checks, of course, are not necessary if pickling isn't used across the
    parse and vectorization steps.

    Instances can set a hard fixed token length with :obj:`token_length`, which
    is used as a dimension in decoded tensors.  However, this can also be set
    to use the longest sentence of the document, which is useful when computing
    vectorized tensors from the document as a batch, even if the input data
    are batched as a group of sentences in a document.

    :see: :class:`.FeatureDocumentVectorizer`

    :see: :meth:`parse`

    """
    doc_parser: FeatureDocumentParser = field()
    """Used to :meth:`parse` documents."""

    token_length: int = field()
    """The length of tokens used in fixed length features.  This is used as a
    dimension in decoded tensors.

    If this value is negative (by convention ``-1``), use the longest sentence
    of the document as the token length, which is usually counted as the
    batch.

    :see: :meth:`get_token_length`

    """
    token_feature_ids: Set[str] = field(default=None)
    """Indicates which spaCy parsed features to generate in the vectorizers held
    in this instance.  Examples include ``norm``, ``ent``, ``dep``, ``tag``.

    If this is not set, it defaults to the ``token_feature_ids`` in
    :obj:`doc_parser`.

    :see: :obj:`.SpacyFeatureVectorizer.VECTORIZERS`

    """
    def __post_init__(self):
        super().__post_init__()
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug('creating fd vec manager')
        if self.token_feature_ids is None:
            self.token_feature_ids = self.doc_parser.token_feature_ids
        else:
            # features configured on this manager that the parser does not
            # provide
            feat_diff = self.token_feature_ids - \
                self.doc_parser.token_feature_ids
            if feat_diff:
                # sorted so the listing is the same from run to run
                fdiffs = ', '.join(sorted(feat_diff))
                # NOTE(review): the wording below reads inverted relative to
                # the set difference it reports; left as is since callers may
                # match on the text
                raise VectorizerError(
                    'Parser token features do not exist in vectorizer: ' +
                    f'{self.token_feature_ids} - ' +
                    f'{self.doc_parser.token_feature_ids} = {fdiffs}')
        # backs the ``spacy_vectorizers`` property and is checked in
        # ``deallocate``
        self._spacy_vectorizers = PersistedWork('_spacy_vectorizers', self)

    @property
    def is_batch_token_length(self) -> bool:
        """Return whether or not the token length is variable based on the
        longest token length in the batch, which is the case when
        :obj:`token_length` is negative.

        """
        return self.token_length < 0

    def get_token_length(self, doc: FeatureDocument) -> int:
        """Get the token length for the document.  If
        :obj:`is_batch_token_length` is ``True``, then the token length is
        computed based on the longest sentence in the document ``doc``.  See
        the class docs.

        :param doc: used to compute the longest sentence if
                    :obj:`is_batch_token_length` is ``True``

        :return: the (global) token length for the document

        """
        if self.is_batch_token_length:
            return doc.max_sentence_len
        return self.token_length

    def parse(self, text: Union[str, List[str]], *args, **kwargs) -> \
            FeatureDocument:
        """Parse text or a text as a list of sentences by delegating to
        :obj:`doc_parser`.

        **Important**: Parsing documents through this manager instance is
        better since safe checks are made that features are available from
        those used when documents are parsed before pickling.

        :param text: either a string or a list of strings; if the former a
                     document with one sentence will be created, otherwise a
                     document is returned with a sentence for each string in
                     the list

        :param args: passed on to the parser

        :param kwargs: passed on to the parser

        :return: the parsed document

        """
        return self.doc_parser.parse(text, *args, **kwargs)

    @property
    @persisted('_spacy_vectorizers')
    def spacy_vectorizers(self) -> Dict[str, SpacyFeatureVectorizer]:
        """Return vectorizers based on the :obj:`token_feature_ids` configured
        on this instance.  Keys are token level feature ids found in
        :obj:`.SpacyFeatureVectorizer.VECTORIZERS`.

        :return: an :class:`collections.OrderedDict` of vectorizers with keys
                 in sorted order

        """
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug('creating spacy vectorizers')
        # only features that are both configured and have a vectorizer class;
        # sorted once and reused for the log and the iteration below
        token_feature_ids: List[str] = sorted(
            set(SpacyFeatureVectorizer.VECTORIZERS) & self.token_feature_ids)
        vectorizers = collections.OrderedDict()
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'creating token features: {token_feature_ids}')
        for feature_id in token_feature_ids:
            cls = SpacyFeatureVectorizer.VECTORIZERS[feature_id]
            inst = cls(name=f'spacy vectorizer: {feature_id}',
                       config_factory=self.config_factory,
                       feature_id=feature_id,
                       torch_config=self.torch_config,
                       vocab=self.doc_parser.model.vocab)
            vectorizers[feature_id] = inst
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'created {len(vectorizers)} vectorizers')
        return vectorizers

    def deallocate(self):
        """Deallocate the spaCy vectorizers, if they were created, and then
        the super class's resources.

        """
        # check first so that deallocation does not create the vectorizers
        if self._spacy_vectorizers.is_set():
            vecs = self.spacy_vectorizers
            for vec in vecs.values():
                vec.deallocate()
            vecs.clear()
        super().deallocate()