zensols.spanmatch package#



An API to match spans of semantically similar text across documents.

class zensols.spanmatch.app.Application(doc_parser, matcher)[source]#

An API to match spans of semantically similar text across documents.

__init__(doc_parser, matcher)#
doc_parser: FeatureDocumentParser#

The feature document parser that normalizes (whitespace) parsed documents.

match(source_file, target_file, output_format=OutputFormat.text, selection=<zensols.introspect.intsel.IntegerSelection object>, output_file=PosixPath('-'), detail=False)[source]#

Match spans across two text files.

  • source_file (Path) – the source match file

  • target_file (Path) – the target match file

  • output_format (OutputFormat) – the format to write the hyperparameters

  • selection (IntegerSelection) – the matches to output

  • output_file (Path) – the output file or - for standard out

  • detail (bool) – whether to output more information

matcher: Matcher#

Used to match spans of text.


Write the matcher’s hyperparameter documentation.


output_format (OutputFormat) – the format to write the hyperparameters

class zensols.spanmatch.app.OutputFormat(value)[source]#

An enumeration.

json = 3#
sphinx = 2#
text = 1#
yaml = 4#
class zensols.spanmatch.app.ProtoApplication(app)[source]#

CLI_META: ClassVar[Dict[str, Any]] = {'is_usage_visible': False}#
app: Application#

Used for REPL prototyping.


Command line entry point to the application.

class zensols.spanmatch.cli.ApplicationFactory(*args, **kwargs)[source]#

__init__(*args, **kwargs)[source]#
zensols.spanmatch.cli.main(args=['/Users/landes/opt/lib/python/bin/sphinx-build', '-M', 'html', '/Users/landes/view/nlp/spanmatch/target/doc/src', '/Users/landes/view/nlp/spanmatch/target/doc/build'], **kwargs)[source]#
Domain and container classes for matching document passages.

exception zensols.spanmatch.domain.DocumentMatchError[source]#

Thrown for any document matching errors.

class zensols.spanmatch.domain.Match(source_tokens=<factory>, target_tokens=<factory>, flow_values=<factory>)[source]#

A span of matching text between two documents.

__init__(source_tokens=<factory>, target_tokens=<factory>, flow_values=<factory>)#
asflatdict(*args, include_norm=False, include_text=False, **kwargs)[source]#

Like asdict() but flatten into a data structure suitable for writing to JSON or YAML.

flow_values: List[float]#

The values of each word flow.

property source_document: FeatureDocument#

The originating document.

property source_lexspan: LexicalSpan#

The originating document’s lexical span.

property source_span: FeatureSpan#

The originating document’s span.

source_tokens: Set[TokenPoint]#

The originating tokens from the document.

property target_document: FeatureDocument#

The target document.

property target_lexspan: LexicalSpan#

The target document’s lexical span.

property target_span: FeatureSpan#

The target document’s span.

target_tokens: Set[TokenPoint]#

The target tokens from the document.

to_str(tokens=False, spans=True, flow=True)[source]#
property total_flow_value: float#

The sum of the flow_values.

write(depth=0, writer=<_io.TextIOWrapper name='<stdout>' mode='w' encoding='utf-8'>, include_tokens=False, include_flow=True, char_limit=9223372036854775807)[source]#

class zensols.spanmatch.domain.MatchResult(keys, source_points, target_points, source_tokens, target_tokens, cost, dist, matches=None)[source]#

Contains the lexical text match pairs from the first to the second document given by Matcher.match().

__init__(keys, source_points, target_points, source_tokens, target_tokens, cost, dist, matches=None)#
cost: ndarray#

The earth mover distance solution, which is the cost of transportation from the first to the second document.

dist: ndarray#

The distance matrix of all tokens in the embedded space.

property flows: Tuple[WordFlow]#

The Word Mover positional flows.

keys: List[str]#

The TokenPoint.key keys of the tokens used to normalize document frequencies in the nBOW.

property mapping: Tuple[WordFlow]#

Like flows but do not duplicate sources.

matches: Tuple[Match] = None#

The matching passages between the documents.

source_points: List[TokenPoint]#

The first document’s token points.

source_tokens: Dict[str, List[TokenPoint]]#

The first document’s token points indexed by the TokenPoint.key.

target_points: List[TokenPoint]#

The second document’s token points.

target_tokens: Dict[str, List[TokenPoint]]#

The second document’s token points indexed by the TokenPoint.key.

property transit: np.ndarray#




write(depth=0, writer=<_io.TextIOWrapper name='<stdout>' mode='w' encoding='utf-8'>, include_source=True, include_target=True, include_tokens=False, include_mapping=True, match_detail=False)[source]#

class zensols.spanmatch.domain.TokenPoint(token, doc)[source]#

A token and its position in the document and in embedded space.

__init__(token, doc)#
doc: FeatureDocument#

The document that contains token.

property embedding: np.ndarray#

The token embedding.

property key: str#

The key used by Matcher to index WordFlow instances.

property position: float#

The position of the token in the document.

token: FeatureToken#

The token in document doc that is used for clustering.

class zensols.spanmatch.domain.WordFlow(value, source_key, target_key, source_tokens, target_tokens)[source]#

The flow of a word between two documents.

__init__(value, source_key, target_key, source_tokens, target_tokens)#
source_key: str#

The TokenPoint.key of the originating document.

source_tokens: Tuple[TokenPoint]#

The originating tokens that map from source_key.

target_key: str#

The TokenPoint.key of the target document.

target_tokens: Tuple[TokenPoint]#

The target tokens that map from target_key.

value: float#

The value of flow.

write(depth=0, writer=<_io.TextIOWrapper name='<stdout>' mode='w' encoding='utf-8'>, include_tokens=False)[source]#

Implements a method to match sections from documents to one another.

class zensols.spanmatch.match.Matcher(dtype=<class 'numpy.float64'>, hyp=None)[source]#

Creates matching spans of text between two documents by first using the word mover algorithm and then clustering by tokens’ positions in their respective documents.

__init__(dtype=<class 'numpy.float64'>, hyp=None)#

The floating point type used for word mover and clustering.

alias of float64

hyp: HyperparamModel = None#

The model’s hyperparameters.


:param cased: whether or not to treat text as cased
:type cased: bool

:param distance_metric: the default distance metric for
                        calculating the distance from each
                        embedded :class:`.TokenPoint`.
:type distance_metric: str; one of: descendant, ancestor, all, euclidean

:param bidirect_match: whether to order matches by a bidirectional
:type bidirect_match: str; one of: none, norm, sum

:param source_distance_threshold: the source document clustering
                                  threshold distance
:type source_distance_threshold: float

:param target_distance_threshold: the target document clustering
                                  threshold distance
:type target_distance_threshold: float

:param source_position_scale: used to scale the source document
                              positional embedding component
:type source_position_scale: float

:param target_position_scale: used to scale the target document
                              positional embedding component
:type target_position_scale: float

:param min_flow_value: the minimum match flow; any matches that
                       fall below this value are filtered
:type min_flow_value: float

:param min_source_token_span: the minimum source span length in
                              tokens to be considered for matches
:type min_source_token_span: int

:param min_target_token_span: the minimum target span length in
                              tokens to be considered for matches
:type min_target_token_span: int
match(source_doc, target_doc)[source]#

Match lexical spans of text from one document to the other.

Return type



the matched document spans from the source to the target document

Module contents#