Source code for zensols.nlp.dataframe

"""Create Pandas dataframes from features.  This must be imported by absolute
module (:mod:`zensols.nlp.dataframe`).

"""
__author__ = 'Paul Landes'


from typing import Set, List, Tuple
from dataclasses import dataclass, field
import pandas as pd
from zensols.nlp import FeatureToken, FeatureDocument



[docs]
@dataclass
class FeatureDataFrameFactory(object):
    """Creates a Pandas dataframe of features from a document annotations.  Each
    feature ID is given a column in the output :class:`pandas.DataFrame`.

    """
    token_feature_ids: Set[str] = field(
        default=(FeatureToken.FEATURE_IDS | {'text'}))
    """The feature IDs to add to the :class:`pandas.DataFrame`."""

    priority_feature_ids: Tuple[str, ...] = field(
        default=FeatureToken.WRITABLE_FEATURE_IDS)
    """Feature IDs that are used first in the column order in the output
    :class:`pandas.DataFrame`.

    """
    def __call__(self, doc: FeatureDocument) -> pd.DataFrame:
        fids = self.token_feature_ids
        cols: List[str] = list(filter(lambda n: n in fids,
                                      self.priority_feature_ids))
        cols.extend(sorted(fids - set(cols)))
        rows = []
        for six, sent in enumerate(doc.sents):
            for tok in sent:
                feats = tok.asdict()
                if 'text' not in feats:
                    feats['text'] = feats['norm']
                rows.append(tuple(map(lambda f: feats.get(f), cols)))
        return pd.DataFrame(rows, columns=cols)