Source code for zensols.deepnlp.vectorize.spacy

"""Feature (ID) normalization.

"""
from __future__ import annotations
__author__ = 'Paul Landes'
from typing import Tuple, Dict, Any, Sequence, Iterable, Union, ClassVar
from dataclasses import dataclass, field
import sys
import logging
import math
import itertools as it
from frozendict import frozendict
from spacy.language import Language
from torch import Tensor
from zensols.deeplearn import TorchConfig
from zensols.deeplearn.vectorize import FeatureVectorizer
from zensols.deeplearn.vectorize import VectorizerError

logger = logging.getLogger(__name__)


[docs] @dataclass(repr=False) class SpacyFeatureVectorizer(FeatureVectorizer): """This normalizes feature IDs of parsed token features in to a number between [0, 1]. This is useful for normalized feature vectors as input to neural networks. Input to this would be strings like ``token.ent_`` found on a :class:`zensols.nlp.feature.TokenAttributes` instance. The class is also designed to create features using indexes, so there are methods to resolve to a unique ID from an identifier. Instances of this class behave like a ``dict``. All symbols are taken from :obj:`spacy.glossary.GLOSSARY`. :param vocab: the vocabulary used for ``from_spacy`` to compute the normalized feature from the spacy ID (i.e. ``token.ent_``, ``token.tag_`` etc.) :see: :obj:`spacy.glossary.GLOSSARY` :see: :class:`zensols.nlp.feature.TokenAttributes` """ description: str = field() """A short human readable name. :see: obj:`feature_id` """ torch_config: TorchConfig = field() """The torch configuration used to create tensors.""" model: Language = field() """The spaCy vocabulary used to create IDs from strings. :see meth:`id_from_spacy_symbol` """ symbols: Union[str, Sequence[str]] = field() """The list of symbols to vectorize and provided by spaCy as a feature if a tuple or list. If a string, then use it as the name of the pipe with the ``labels`` attribute. """ def _map_symbols(self, symbols: Union[str, Sequence[str]]) -> Sequence[str]: if isinstance(symbols, str): symbols = self.model.get_pipe(symbols).labels elif isinstance(symbols, list): symbols = tuple(symbols) elif not isinstance(symbols, tuple): raise VectorizerError( f'Wrong type for symbols: {type(symbols)}') if len(symbols) <= 1: raise VectorizerError( f'Symbol list is too short: {len(symbols)}') return symbols def _initialize(self, symbols: Union[str, Sequence[str]]): if logger.isEnabledFor(logging.DEBUG): logger.debug(f'configuring spacy vectorizer: {self.feature_id}') self.symbol_to_id: Dict[str, int] = dict(zip(symbols, it.count())) n: int = len(self.symbol_to_id) q: int = n - 1 arr: Tensor = self._to_hot_coded_matrix(n) rows: Iterable[Tuple[str, int], ...] = \ zip(self.symbol_to_id, map(lambda i: arr[i], range(n))) self.symbol_to_vector: Dict[str, int] = frozendict(rows) self.symbol_to_norm: Dict[str, float] = \ frozendict(map(lambda t: (t[0], t[1] / q), self.symbol_to_id.items())) @property def _symbols(self) -> Sequence[str]: return self._symbols_val @_symbols.setter def _symbols(self, symbols: Union[str, Sequence[str]]): symbols = self._map_symbols(symbols) self._symbols_val = symbols self._initialize(symbols) @property def _description(self) -> str: return self._description_val @_description.setter def _description(self, description: str): self._description_val = description def _is_settable(self, name: str, value: Any) -> bool: return False def _to_hot_coded_matrix(self, rows: int) -> Tensor: arr: Tensor = self.torch_config.zeros((rows, rows)) for i in range(rows): arr[i][i] = 1 return arr def _to_binary_matrix(self, rows: int): cols = math.ceil(math.log2(rows)) arr = self.torch_config.empty((rows, rows)) for i in range(rows): sbin = '{0:b}'.format(i).zfill(cols) arr[i] = self.torch_config.from_iterable(map(float, sbin)) return arr def _get_shape(self) -> Tuple[int, int]: return 1, len(self.symbols)
[docs] def transform(self, symbol: str) -> Tensor: return self.symbol_to_vector[symbol]
[docs] def dist(self, symbol: str) -> float: """Return a normalized feature float if ``symbol`` is found. :return: a normalized value between [0 - 1] or ``None`` if the symbol isn't found """ return self.symbol_to_norm[symbol]
[docs] def id_from_spacy_symbol(self, id: int, default: int = -1) -> str: """Return the Spacy text symbol for it's ID (``token.ent`` -> ``token.ent_``). """ strs: str = self.model.vocab.strings if id in strs: return strs[id] else: return default
[docs] def from_spacy(self, id: int) -> Tensor: """Return a binary feature from a Spacy ID or ``None`` if it doesn't have a mapping the ID. """ symbol: str = self.id_from_spacy_symbol(id) return self.symbol_to_vector.get(symbol, None)
[docs] def id_from_spacy(self, id: int, default: int = -1) -> int: """Return the ID of this vectorizer for the Spacy ID or -1 if not found. """ symbol = self.id_from_spacy_symbol(id) return self.symbol_to_id.get(symbol, default)
[docs] def write(self, writer=sys.stdout): """Pretty print a human readable representation of this feature vectorizer. """ syms = self.symbol_to_id writer.write(f'{self.description}:\n') for k in sorted(syms.keys()): writer.write(f' {k} => {syms[k]} ({self.transform(k)})\n')
def __str__(self) -> str: return self.feature_id def __repr__(self) -> str: return f'{self.feature_id}: {self.description}, len={len(self.symbols)}'
SpacyFeatureVectorizer.description = SpacyFeatureVectorizer._description SpacyFeatureVectorizer.symbols = SpacyFeatureVectorizer._symbols
[docs] @dataclass class NamedEntityRecognitionFeatureVectorizer(SpacyFeatureVectorizer): """A feature vectorizor for NER tags. :see: :class:`.SpacyFeatureVectorizer` """ DESCRIPTION: ClassVar[str] = 'named entity recognition' LANG: ClassVar[str] = 'en' FEATURE_ID: ClassVar[str] = 'ent' SYMBOLS: ClassVar[str] = """PERSON NORP FACILITY FAC ORG GPE LOC PRODUCT EVENT WORK_OF_ART LAW LANGUAGE DATE TIME PERCENT MONEY QUANTITY ORDINAL CARDINAL PER MISC"""
[docs] @dataclass class DependencyFeatureVectorizer(SpacyFeatureVectorizer): """A feature vectorizor for dependency head trees. :see: :class:`.SpacyFeatureVectorizer` """ DESCRIPTION: ClassVar[str] = 'dependency' LANG: ClassVar[str] = 'en' FEATURE_ID: ClassVar[str] = 'dep' SYMBOLS: ClassVar[str] = """acl acomp advcl advmod agent amod appos attr aux auxpass case cc ccomp clf complm compound conj cop csubj csubjpass dative dep det discourse dislocated dobj expl fixed flat goeswith hmod hyph infmod intj iobj list mark meta neg nmod nn npadvmod nsubj nsubjpass nounmod npmod num number nummod oprd obj obl orphan parataxis partmod pcomp pobj poss possessive preconj prep prt punct quantmod rcmod relcl reparandum root vocative xcomp ROOT"""
[docs] @dataclass class PartOfSpeechFeatureVectorizer(SpacyFeatureVectorizer): """A feature vectorizor for POS tags. :see: :class:`.SpacyFeatureVectorizer` """ DESCRIPTION: ClassVar[str] = 'part of speech' LANG: ClassVar[str] = 'en' FEATURE_ID: ClassVar[str] = 'tag' SYMBOLS: ClassVar[str] = """ADJ ADP ADV AUX CONJ CCONJ DET INTJ NOUN NUM PART PRON PROPN PUNCT SCONJ SYM VERB X EOL SPACE . , -LRB- -RRB- `` " ' $ # AFX CC CD DT EX FW HYPH IN JJ JJR JJS LS MD NIL NN NNP NNPS NNS PDT POS PRP PRP$ RB RBR RBS RP TO UH VB VBD VBG VBN VBP VBZ WDT WP WP$ WRB SP ADD NFP GW XX BES HVS NP PP VP ADVP ADJP SBAR PRT PNP"""