Source code for zensols.mednlp.app

"""A natural language medical domain parsing library.

"""
__author__ = 'Paul Landes'

from typing import Optional
from dataclasses import dataclass, field
from enum import Enum, auto
import sys
import logging
from io import TextIOBase
import re
from pprint import pprint
from pathlib import Path
import pandas as pd
from zensols.config import Dictable, ConfigFactory
from zensols.cli import ApplicationError
from zensols.nlp import FeatureDocumentParser, FeatureDocument
from zensols.nlp.dataframe import FeatureDataFrameFactory
from . import MedCatResource, MedicalLibrary

logger = logging.getLogger(__name__)



[docs]
class GroupInfo(Enum):
    """Used to group TUI information in :meth:`.Application.group`

    """
    csv = auto()
    byname = auto()




[docs]
@dataclass
class Application(Dictable):
    """A natural language medical domain parsing library.

    """
    config_factory: ConfigFactory = field()
    """Used to create a cTAKES stash."""

    doc_parser: FeatureDocumentParser = field()
    """Parses and NER tags medical terms."""

    library: MedicalLibrary = field()
    """Medical resource library that contains UMLS access, cui2vec etc.."""

    def _get_text(self, text_or_file: str) -> str:
        """Return the text from a file or the text passed based on if
        ``text_or_file`` is a file on the file system.

        """
        path = Path(text_or_file)
        if path.is_file():
            with open(path) as f:
                text_or_file = f.read()
        return text_or_file

    def _write_doc(self, doc: FeatureDocument, only_medical: bool,
                   depth: int = 0, writer: TextIOBase = sys.stdout):
        for sent in doc.sents:
            if len(sent.text.strip()) == 0:
                continue
            self._write_line(sent.text, depth, writer)
            for tok in sent:
                if not only_medical or tok.is_ent:
                    self._write_line(f'{tok.norm}:', depth + 1, writer)
                    tok.write_attributes(
                        depth + 2, writer,
                        feature_ids=self.doc_parser.token_feature_ids
                    )
            self._write_line('entities:', depth, writer)
            for ents in sent.entities:
                self._write_line(
                    ' '.join(map(lambda e: e.norm, ents)), depth + 1, writer)


[docs]
    def show(self, text_or_file: str, only_medical: bool = False):
        """Parse and output medical entities.

        :param text_or_file: natural language to be processed

        :param only_medical: only provide medical linked tokens

        """
        if logger.isEnabledFor(logging.INFO):
            logger.info(f'parsing: <{text_or_file}>...')
        text: str = self._get_text(text_or_file)
        doc: FeatureDocument = self.doc_parser.parse(text)
        self._write_doc(doc, only_medical)


    def _output_dataframe(self, df: pd.DataFrame, out: Optional[Path] = None):
        """Output the dataframe generated by other actions of the app.

        :param df: the dataframe to output:

        :param out: the output path, or ``None`` standard out
        """
        if out is None:
            out = sys.stdout
        df.to_csv(out, index=False)
        row_s = 's' if len(df) != 1 else ''
        if out != sys.stdout:
            logger.info(f'wrote {len(df)} row{row_s} to {out}')


[docs]
    def features(self, text_or_file: str, out: Path = None, ids: str = None,
                 only_medical: bool = False):
        """Dump features as CSV output.

        :param text_or_file: natural language to be processed

        :param out: the path to output the CSV file or stdout if missing

        :param ids: the comma separate feature IDs to output

        :param only_medical: only provide medical linked tokens

        """
        if logger.isEnabledFor(logging.INFO):
            logger.info(f'parsing: <{text_or_file}>...')
        params = {}
        if ids is None:
            ids = self.doc_parser.token_feature_ids
        else:
            ids = set(re.split(r'\W+', ids))
        needs = 'norm cui_ is_concept'.split()
        missing = set(filter(lambda i: i not in ids, needs))
        ids |= missing
        params['token_feature_ids'] = ids
        params['priority_feature_ids'] = needs
        df_fac = FeatureDataFrameFactory(**params)
        self.doc_parser.token_feature_ids = ids
        text: str = self._get_text(text_or_file)
        doc: FeatureDocument = self.doc_parser.parse(text)
        df: pd.DataFrame = df_fac(doc)
        if only_medical:
            df = df[df['is_concept']]
        self._output_dataframe(df, out)



[docs]
    def search(self, term: str):
        """Search the UMLS database using UTS and show results.

        :param term: the term to search for (eg 'lung cancer')

        """
        pprint(self.library.uts_client.search_term(term))



[docs]
    def atom(self, cui: str):
        """Search the UMLS database using UTS and show results.

        :param cui: the concept ID to search for (eg 'C0242379')

        """
        pprint(self.library.uts_client.get_atoms(cui))



[docs]
    def define(self, cui: str):
        """Look up an entity by CUI.  This takes a long time.

        :param cui: the concept ID to search for (eg 'C0242379')

        """
        entity = self.library.get_linked_entity(cui)
        print(entity)



[docs]
    def group(self, info: GroupInfo, query: str = None):
        """Get TUI group information.

        :param info: the type of information to return

        :param query: comma delimited name list used to subset the output data

        """
        res: MedCatResource = self.library.medcat_resource
        df: pd.DataFrame = res.groups
        if info == GroupInfo.csv:
            path = Path('tui-groups.csv')
            df.to_csv(path)
            logger.info(f'wrote TUI groups to {path}')
        elif info == GroupInfo.byname:
            if query is None:
                raise ApplicationError('Missing query string for grouping')
            reg = '.*(' + '|'.join(query.split(',')) + ')'
            df = df[df['name'].str.match(reg)]
            print(','.join(df['tui'].tolist()))
        else:
            raise ApplicationError(f'Unknown query info type: {info}')



[docs]
    def ctakes(self, text_or_file: str, only_medical: bool = False):
        """Invoke cTAKES on a directory with text files.

        :param text_or_file: natural language to be processed

        :param only_medical: only provide medical linked tokens

        """
        from .ctakes import CTakesParserStash
        text: str = self._get_text(text_or_file)
        stash: CTakesParserStash = self.library.get_new_ctakes_parser_stash()
        stash.set_documents([text])
        print(stash['0'].to_string())



[docs]
    def similarity(self, term: str):
        """Get the cosine similarity between two CUIs.

        """
        for sim in self.library.similarity_by_term(term):
            print(sim.cui)
            sim.write(1)