medkit.text.segmentation.syntagma_tokenizer

medkit.text.segmentation.syntagma_tokenizer#

Classes#

SyntagmaTokenizer

Syntagma segmentation annotator based on provided separators.

Module Contents#

class medkit.text.segmentation.syntagma_tokenizer.SyntagmaTokenizer(separators: tuple[str, ...] | None = None, output_label: str = _DEFAULT_LABEL, strip_chars: str = _DEFAULT_STRIP_CHARS, attrs_to_copy: list[str] | None = None, uid: str | None = None)#

Bases: medkit.core.text.SegmentationOperation

Syntagma segmentation annotator based on provided separators.

_DEFAULT_LABEL = 'syntagma'#

_DEFAULT_STRIP_CHARS = Multiline-String#

Show Value

""".;,?!

       """

init_args#

output_label#

strip_chars#

separators#

attrs_to_copy#

run(segments: list[medkit.core.text.Segment]) → list[medkit.core.text.Segment]#

Return syntagmas detected in segments.

Parameters:

segments (list of Segment): List of segments in which to look for syntagmas

Returns:

list of Segment: Syntagma segments found in the input segments

_find_syntagmas_in_segment(segment: medkit.core.text.Segment) → Iterator[medkit.core.text.Segment]#

classmethod get_example()#

static load_syntagma_definition(filepath: pathlib.Path, encoding: str | None = None) → tuple[str, ...]#

Load the syntagma definition stored in a YAML file.

Parameters:

filepath (Path): Path to a YAML file containing the syntagma separators
encoding (str, optional): Encoding of the file to open

Returns:

tuple of str: Tuple containing the separators

static save_syntagma_definition(syntagma_seps: tuple[str, ...], filepath: pathlib.Path, encoding: str | None = None)#

Save the syntagma definition to a YAML file.

Parameters:

syntagma_seps (tuple of str): The tuple of regular expressions corresponding to separators
filepath (Path): The path of the file to save
encoding (str, optional): The encoding of the file. Default: None