medkit.text.segmentation

medkit.text.segmentation#

Submodules#

Classes#

`SectionModificationRule`
`SectionTokenizer`	Section segmentation annotator based on keyword rules.
`SentenceTokenizer`	Sentence segmentation annotator based on end punctuation rules.
`SyntagmaTokenizer`	Syntagma segmentation annotator based on provided separators.

Package Contents#

class medkit.text.segmentation.SectionModificationRule#

section_name: str#

new_section_name: str#

other_sections: list[str]#

order: typing_extensions.Literal[BEFORE, AFTER]#

class medkit.text.segmentation.SectionTokenizer(section_dict: dict[str, list[str]] | None = None, output_label: str = _DEFAULT_LABEL, section_rules: Iterable[SectionModificationRule] = (), strip_chars: str = _DEFAULT_STRIP_CHARS, uid: str | None = None)#

Bases: medkit.core.text.SegmentationOperation

Section segmentation annotator based on keyword rules.

Parameters:

section_dict: dict of str to list of str, optional: Dictionary containing the section name as key and the list of mappings as value. If None, the content of default_section_definition.yml will be used.
output_label: str, optional: Segment label to use for annotation output.
section_rules: iterable of SectionModificationRule, optional: List of rules for modifying a section name according to its order relative to the other sections. If section_dict is None, the content of default_section_definition.yml will be used.
strip_chars: str, optional: The list of characters to strip at the beginning of the returned segment.
uid: str, optional: Identifier of the tokenizer

_DEFAULT_LABEL: str = 'section'#

_DEFAULT_STRIP_CHARS: str = Multiline-String#

Show Value

""".;,?!

       """

init_args#

output_label#

strip_chars#

section_dict#

section_rules#

keyword_processor#

run(segments: list[medkit.core.text.Segment]) → list[medkit.core.text.Segment]#

Return sections detected in segments.

Each section is a segment with an attached attribute (label: <same as self.output_label>, value: <the name of the section>).

Parameters:

segments: list of Segment: List of segments into which to look for sections

Returns:

list of Segment: Section segments found in segments

_find_sections_in_segment(segment: medkit.core.text.Segment)#

_get_sections_to_rename(match: list[tuple])#

classmethod get_example()#

static load_section_definition(filepath: pathlib.Path, encoding: str | None = None) → tuple[dict[str, list[str]], tuple[SectionModificationRule, Ellipsis]]#

Load the sections definition stored in a yml file.

Parameters:

filepath: Path: Path to a yml file containing the sections (name + mappings) and rules
encoding: str, optional: Encoding of the file to open

Returns:

tuple: Tuple containing: - the dictionary where key is the section name and value is the list of all equivalent strings. - the list of section modification rules. These rules allow renaming some sections according to their order.

static save_section_definition(section_dict: dict[str, list[str]], section_rules: Iterable[SectionModificationRule], filepath: pathlib.Path, encoding: str | None = None)#

Save section yaml definition file.

Parameters:

section_dict: dict of str to list of str: Dictionary containing the section name as key and the list of mappings as value (cf. content of default_section_definition.yml as example)
section_rules: iterable of SectionModificationRule: List of rules for modifying a section name according to its order relative to the other sections.
filepath: Path: Path to the file to save
encoding: str, optional: File encoding

class medkit.text.segmentation.SentenceTokenizer(output_label: str = _DEFAULT_LABEL, punct_chars: tuple[str, Ellipsis] = _DEFAULT_PUNCT_CHARS, keep_punct: bool = False, split_on_newlines: bool = True, attrs_to_copy: list[str] | None = None, uid: str | None = None)#

Bases: medkit.core.text.SegmentationOperation

Sentence segmentation annotator based on end punctuation rules.

_DEFAULT_LABEL = 'sentence'#

_DEFAULT_PUNCT_CHARS = ('.', ';', '?', '!')#

init_args#

output_label#

punct_chars#

keep_punct#

split_on_newlines#

attrs_to_copy#

_newline_pattern#

punct_string#

_punct_pattern#

run(segments: list[medkit.core.text.Segment]) → list[medkit.core.text.Segment]#

Return sentences detected in segments.

Parameters:

segments: list of Segment: List of segments into which to look for sentences

Returns:

list of Segment: Sentence segments found in segments

_find_sentences_in_segment(segment: medkit.core.text.Segment) → Iterator[medkit.core.text.Segment]#

static _split_text(text: str, pattern: re.Pattern, keep_separator: bool) → Iterator[tuple[int, int]]#

_build_sentence(source_segment: medkit.core.text.Segment, range_: tuple[int, int]) → medkit.core.text.Segment#

class medkit.text.segmentation.SyntagmaTokenizer(separators: tuple[str, Ellipsis] | None = None, output_label: str = _DEFAULT_LABEL, strip_chars: str = _DEFAULT_STRIP_CHARS, attrs_to_copy: list[str] | None = None, uid: str | None = None)#

Bases: medkit.core.text.SegmentationOperation

Syntagma segmentation annotator based on provided separators.

_DEFAULT_LABEL = 'syntagma'#

_DEFAULT_STRIP_CHARS = Multiline-String#

Show Value

""".;,?!

       """

init_args#

output_label#

strip_chars#

separators#

attrs_to_copy#

run(segments: list[medkit.core.text.Segment]) → list[medkit.core.text.Segment]#

Return syntagmas detected in segments.

Parameters:

segments: list of Segment: List of segments into which to look for syntagmas

Returns:

list of Segment: Syntagma segments found in segments

_find_syntagmas_in_segment(segment: medkit.core.text.Segment) → Iterator[medkit.core.text.Segment]#

classmethod get_example()#

static load_syntagma_definition(filepath: pathlib.Path, encoding: str | None = None) → tuple[str, Ellipsis]#

Load the syntagma definition stored in a yml file.

Parameters:

filepath: Path: Path to a yml file containing the syntagma separators
encoding: str, optional: Encoding of the file to open

Returns:

tuple of str: Tuple containing the separators

static save_syntagma_definition(syntagma_seps: tuple[str, Ellipsis], filepath: pathlib.Path, encoding: str | None = None)#

Save syntagma yaml definition file.

Parameters:

syntagma_seps: tuple of str: The tuple of regular expressions corresponding to separators
filepath: Path: The path of the file to save
encoding: str, optional: The encoding of the file. Default: None