class WhitespaceParagraphFactory(ParagraphFactory):
    """A simple paragraph factory that splits on whitespace.

    """
    SEPARATOR_REGEX: ClassVar[re.Pattern] = re.compile(r'\n[\s.]*\n')
@dataclass
class ChunkingParagraphFactory(ParagraphFactory):
    """A paragraph factory that uses :mod:`zensols.nlp.chunker` chunking to
    split paragraphs and MIMIC lists.

    """
    MIMIC_SPAN_PATTERN: ClassVar[re.Pattern] = re.compile(
        r'(.+?)(?:(?=[\n.]{2})|\Z)', re.MULTILINE | re.DOTALL)
    """MIMIC regular expression adds period, which is used in notes to
    separate paragraphs.

    """
    min_sent_len: int = field()
    """Minimum sentence length in tokens to be kept."""

    min_list_norm_matches: int = field()
    """The minimum amount of list matches needed to use the list item chunked
    version of the section.

    """
    max_sent_list_len: int = field()
    """The maximum length a sentence can be to keep it chunked as a list.
    Otherwise very long sentences form from what appear to be front list
    syntax.

    """
    include_section_headers: bool = field()
    """Whether to include section headers in the output."""

    filter_sent_text: Set[str] = field()
    """A set of sentence norm values to filter from replaced documents."""

    def _norm_list(self, doc: FeatureDocument) -> FeatureDocument:
        """Normalize itemized or enumerated lists if found.

        :param doc: the section document to (potentially) re-chunk

        :return: the list-chunked document when it parses as a well-formed
                 list, otherwise ``doc`` unchanged

        """
        chunker = ListItemChunker(doc)
        list_doc: FeatureDocument = chunker()
        if len(list_doc.sents) > 0:
            # the longest sentence norm guards against run-away "lists" that
            # are really long prose sentences
            max_sent_len: int = max(map(lambda s: len(s.norm), list_doc.sents))
            if len(list_doc.sents) > self.min_list_norm_matches and \
               max_sent_len < self.max_sent_list_len:
                doc = list_doc
        return doc

    def _clone_norm_doc(self, doc: FeatureDocument) -> FeatureDocument:
        """Replace mangled token norms from original text."""
        clone: FeatureDocument = doc.clone()
        tok: FeatureToken
        for tok in clone.token_iter():
            tok.norm = tok.text
        clone.clear()
        return clone

    def _norm_doc(self, parent: FeatureDocument, doc: FeatureDocument) -> \
            Optional[FeatureDocument]:
        """Normalize the document.  This removes empty sentences, MIMIC
        separators (long dashes) and chunks item lists.

        :param parent: the note document

        :param doc: the section document

        :return: the normalized document, or ``None`` when every token of
                 ``doc`` was pruned (see :meth:`create`, which filters the
                 ``None`` results)

        """
        def filter_toks(t: FeatureToken) -> bool:
            # drop MIMIC separator tokens and whitespace-only tokens
            feat = t.mimic_ if hasattr(t, 'mimic_') else None
            return feat != MimicTokenDecorator.SEPARATOR_TOKEN_FEATURE and \
                len(t.norm.strip()) > 0

        def filter_sents(s: FeatureSentence) -> bool:
            # keep sentences long enough and not explicitly filtered by norm
            return s.token_len > self.min_sent_len and \
                s.norm not in self.filter_sent_text

        # remove newlines that have space around them
        sent: FeatureSentence
        for sent in doc.sents:
            sent.tokens = tuple(filter(filter_toks, sent.token_iter()))
        doc.clear()
        # remove periods on lines by themselves
        doc.sents = tuple(filter(filter_sents, doc.sents))
        doc.clear()
        # chunk enumerated and itemized lists into sentences (if any)
        if self.min_list_norm_matches > 0:
            doc = self._norm_list(doc)
        # replace mangled token norms from original text
        doc = self._clone_norm_doc(doc)
        if doc.token_len == 0:
            # everything was pruned away; return None so the caller drops
            # this paragraph (matches the Optional return annotation)
            return None
        doc.text = parent.text[doc.lexspan.begin:doc.lexspan.end]
        doc.reindex()
        return doc

    def create(self, sec: Section) -> Iterable[FeatureDocument]:
        """Create paragraph documents chunked from section ``sec``.

        :param sec: the section whose text is chunked into paragraphs

        :return: an iterable of non-empty, normalized paragraph documents

        """
        include_headers: bool = self.include_section_headers
        parent: FeatureDocument = sec.container.doc
        doc: FeatureDocument
        span: LexicalSpan
        if include_headers:
            doc, span = sec.doc, sec.lexspan
        else:
            doc, span = sec.body_doc, sec.body_span
        assert isinstance(doc, FeatureDocument)
        # some section data is in the header, and thus, has no body
        if len(doc.sents) == 0:
            return []
        # chunk sections into paragraphs
        pc = ParagraphChunker(
            pattern=self.MIMIC_SPAN_PATTERN,
            doc=parent.clone(),
            sub_doc=doc,
            char_offset=span.begin)
        # normalize documents and prune empty (resulting from pruned
        # sentences)
        return filter(lambda d: d is not None,
                      map(lambda d: self._norm_doc(parent, d), pc))