Source code for zensols.nlp.dataframe

"""Create Pandas dataframes from features.  This must be imported by absolute
module name (:mod:`zensols.nlp.dataframe`).

"""
__author__ = 'Paul Landes'


from typing import Set, List, Tuple
from dataclasses import dataclass, field
import pandas as pd
from zensols.nlp import FeatureToken, FeatureDocument


@dataclass
class FeatureDataFrameFactory(object):
    """Creates a Pandas dataframe of features from a document's annotations.
    Each feature ID is given a column in the output
    :class:`pandas.DataFrame` and each token of the document is given a row.

    """
    token_feature_ids: Set[str] = field(
        default=(FeatureToken.FEATURE_IDS | {'text'}))
    """The feature IDs to add to the :class:`pandas.DataFrame`."""

    priority_feature_ids: Tuple[str, ...] = field(
        default=FeatureToken.WRITABLE_FEATURE_IDS)
    """Feature IDs that are used first in the column order in the output
    :class:`pandas.DataFrame`.

    """
    def __call__(self, doc: FeatureDocument) -> pd.DataFrame:
        """Create a dataframe with one row per token found in ``doc``.

        The columns start with the entries of :obj:`priority_feature_ids`
        that are also in :obj:`token_feature_ids` (keeping their priority
        order), followed by the remaining :obj:`token_feature_ids` in sorted
        order.  A feature a token does not have is ``None`` in its row.

        :param doc: the document whose sentences are iterated for tokens

        :return: the dataframe of token features, one row per token in
                 sentence and then token iteration order

        """
        fids = self.token_feature_ids
        # prioritized columns first, but only those that were asked for
        cols: List[str] = [
            fid for fid in self.priority_feature_ids if fid in fids]
        # everything not prioritized follows in a deterministic order
        cols.extend(sorted(fids - set(cols)))
        rows: List[tuple] = []
        for sent in doc.sents:
            for tok in sent:
                feats = tok.asdict()
                # tokens without a ``text`` feature use their ``norm`` value
                if 'text' not in feats:
                    feats['text'] = feats['norm']
                rows.append(tuple(feats.get(col) for col in cols))
        return pd.DataFrame(rows, columns=cols)