class WhitespaceParagraphFactory(ParagraphFactory):
    """A simple paragraph factory that splits on whitespace.

    """
    SEPARATOR_REGEX: ClassVar[re.Pattern] = re.compile(r'\n[\s.]*\n')
@dataclass
class ChunkingParagraphFactory(ParagraphFactory):
    """A paragraph factory that uses :mod:`zensols.nlp.chunker` chunking to
    split paragraphs and MIMIC lists.

    """
    MIMIC_SPAN_PATTERN: ClassVar[re.Pattern] = re.compile(
        r'(.+?)(?:(?=[\n.]{2})|\Z)', re.MULTILINE | re.DOTALL)
    """MIMIC regular expression adds period, which is used in notes to
    separate paragraphs.

    """
    min_sent_len: int = field()
    """Minimum sentence length in tokens to be kept."""

    min_list_norm_matches: int = field()
    """The minimum amount of list matches needed to use the list item chunked
    version of the section.

    """
    max_sent_list_len: int = field()
    """The maximum length a sentence can be to keep it chunked as a list.
    Otherwise very long sentences form from what appear to be front list
    syntax.

    """
    include_section_headers: bool = field()
    """Whether to include section headers in the output."""

    filter_sent_text: Set[str] = field()
    """A set of sentence norm values to filter from replaced documents."""

    def _norm_list(self, doc: FeatureDocument) -> FeatureDocument:
        """Normalize itemized or enumerated lists if found.

        :param doc: the section document to (potentially) re-chunk

        :return: the list-chunked document when it parses as a well-formed
                 list, otherwise ``doc`` unchanged

        """
        chunker = ListItemChunker(doc)
        list_doc: FeatureDocument = chunker()
        if len(list_doc.sents) > 0:
            # the longest sentence norm guards against run-away "lists" that
            # are really long prose sentences
            max_sent_len: int = max(map(lambda s: len(s.norm), list_doc.sents))
            if len(list_doc.sents) > self.min_list_norm_matches and \
               max_sent_len < self.max_sent_list_len:
                doc = list_doc
        return doc

    def _clone_norm_doc(self, doc: FeatureDocument) -> FeatureDocument:
        """Replace mangled token norms from original text."""
        clone: FeatureDocument = doc.clone()
        tok: FeatureToken
        for tok in clone.token_iter():
            tok.norm = tok.text
        clone.clear()
        return clone

    def _norm_doc(self, parent: FeatureDocument, doc: FeatureDocument) -> \
            Optional[FeatureDocument]:
        """Normalize the document.  This removes empty sentences, MIMIC
        separators (long dashes) and chunks item lists.

        :param parent: the note document

        :param doc: the section document

        :return: the normalized document, or ``None`` when every token of
                 ``doc`` was pruned (see :meth:`create`, which filters the
                 ``None`` results)

        """
        def filter_toks(t: FeatureToken) -> bool:
            # drop MIMIC separator tokens and whitespace-only tokens
            feat = t.mimic_ if hasattr(t, 'mimic_') else None
            return feat != MimicTokenDecorator.SEPARATOR_TOKEN_FEATURE and \
                len(t.norm.strip()) > 0

        def filter_sents(s: FeatureSentence) -> bool:
            # keep sentences long enough and not explicitly filtered by norm
            return s.token_len > self.min_sent_len and \
                s.norm not in self.filter_sent_text

        # remove newlines that have space around them
        sent: FeatureSentence
        for sent in doc.sents:
            sent.tokens = tuple(filter(filter_toks, sent.token_iter()))
        doc.clear()
        # remove periods on lines by themselves
        doc.sents = tuple(filter(filter_sents, doc.sents))
        doc.clear()
        # chunk enumerated and itemized lists into sentences (if any)
        if self.min_list_norm_matches > 0:
            doc = self._norm_list(doc)
        # replace mangled token norms from original text
        doc = self._clone_norm_doc(doc)
        if doc.token_len == 0:
            # everything was pruned away; return None so the caller drops
            # this paragraph (matches the Optional return annotation)
            return None
        doc.text = parent.text[doc.lexspan.begin:doc.lexspan.end]
        doc.reindex()
        return doc

    def create(self, sec: Section) -> Iterable[FeatureDocument]:
        """Create paragraph documents chunked from section ``sec``.

        :param sec: the section whose text is chunked into paragraphs

        :return: an iterable of non-empty, normalized paragraph documents

        """
        include_headers: bool = self.include_section_headers
        parent: FeatureDocument = sec.container.doc
        doc: FeatureDocument
        span: LexicalSpan
        if include_headers:
            doc, span = sec.doc, sec.lexspan
        else:
            doc, span = sec.body_doc, sec.body_span
        assert isinstance(doc, FeatureDocument)
        # some section data is in the header, and thus, has no body
        if len(doc.sents) == 0:
            return []
        # chunk sections into paragraphs
        pc = ParagraphChunker(
            pattern=self.MIMIC_SPAN_PATTERN,
            doc=parent.clone(),
            sub_doc=doc,
            char_offset=span.begin)
        # normalize documents and prune empty (resulting from pruned
        # sentences)
        return filter(lambda d: d is not None,
                      map(lambda d: self._norm_doc(parent, d), pc))