medkit.text.segmentation.sentence_tokenizer#

Classes#

SentenceTokenizer

Sentence segmentation annotator based on end punctuation rules.

Module Contents#

class medkit.text.segmentation.sentence_tokenizer.SentenceTokenizer(output_label: str = _DEFAULT_LABEL, punct_chars: tuple[str, ...] = _DEFAULT_PUNCT_CHARS, keep_punct: bool = False, split_on_newlines: bool = True, attrs_to_copy: list[str] | None = None, uid: str | None = None)#

Bases: medkit.core.text.SegmentationOperation

Sentence segmentation annotator based on end punctuation rules.

_DEFAULT_LABEL = 'sentence'#
_DEFAULT_PUNCT_CHARS = ('.', ';', '?', '!')#
init_args#
output_label#
punct_chars#
keep_punct#
split_on_newlines#
attrs_to_copy#
_newline_pattern#
punct_string#
_punct_pattern#
run(segments: list[medkit.core.text.Segment]) -> list[medkit.core.text.Segment]#

Return sentences detected in segments.

Parameters:
segments : list of Segment

List of segments in which to look for sentences

Returns:
list of Segment

Sentence segments found in the input segments

_find_sentences_in_segment(segment: medkit.core.text.Segment) -> Iterator[medkit.core.text.Segment]#
static _split_text(text: str, pattern: re.Pattern, keep_separator: bool) -> Iterator[tuple[int, int]]#
_build_sentence(source_segment: medkit.core.text.Segment, range_: tuple[int, int]) -> medkit.core.text.Segment#