medkit.io.brat#

Classes#

BratInputConverter

Class in charge of converting brat annotations.

BratOutputConverter

Class for converting text documents to a brat collection file.

Module Contents#

class medkit.io.brat.BratInputConverter(detect_cuis_in_notes: bool = True, notes_label: str = 'brat_note', uid: str | None = None)#

Bases: medkit.core.InputConverter

Class in charge of converting brat annotations.

Parameters:
detect_cuis_in_notesbool, default=True

If True, strings looking like CUIs in annotator notes of entities will be converted to UMLS normalization attributes rather than creating an Attribute with the whole note text as value.

notes_labelstr, default="brat_note"

Label to use for attributes created from annotator notes.

uidstr, optional

Identifier of the converter.

Attributes:
descriptionstr

Description of the operation

notes_label#
detect_cuis_in_notes#
uid#
_prov_tracer: medkit.core.ProvTracer | None = None#
property description: medkit.core.OperationDescription#
set_prov_tracer(prov_tracer: medkit.core.ProvTracer)#
load(dir_path: str | pathlib.Path, ann_ext: str = ANN_EXT, text_ext: str = TEXT_EXT) list[medkit.core.text.TextDocument]#

Load brat annotations as text documents.

Create a list of TextDocuments from a folder containing text files and associated brat annotations files.

Parameters:
dir_pathstr or Path

The path to the directory containing the text files and the annotation files (.ann)

ann_extstr, optional

The extension of the brat annotation file (e.g. .ann)

text_extstr, optional

The extension of the text file (e.g. .txt)

Returns:
list of TextDocument

The list of TextDocuments

load_doc(ann_path: str | pathlib.Path, text_path: str | pathlib.Path) medkit.core.text.TextDocument#

Load a brat annotation and text file combo as a text document.

Create a TextDocument from a .ann file and its associated .txt file.

Parameters:
ann_pathstr or Path

The path to the brat annotation file.

text_pathstr or Path

The path to the text document file.

Returns:
TextDocument

The document containing the text and the annotations

load_annotations(ann_file: str | pathlib.Path) list[medkit.core.text.TextAnnotation]#

Load a brat annotation file as a list of annotations.

Load a .ann file and return a list of Annotation objects.

Parameters:
ann_filestr or Path

Path to the .ann file.

Returns:
list of TextAnnotation

The list of text annotations

class medkit.io.brat.BratOutputConverter(anns_labels: list[str] | None = None, attrs: list[str] | None = None, notes_label: str = 'brat_note', ignore_segments: bool = True, convert_cuis_to_notes: bool = True, create_config: bool = True, top_values_by_attr: int = 50, uid: str | None = None)#

Bases: medkit.core.OutputConverter

Class for converting text documents to a brat collection file.

Hint

BRAT checks for coherence between span and text for each annotation. This converter adjusts the text and spans to get the right visualization and ensure compatibility.

Parameters:
anns_labelslist of str, optional

Labels of medkit annotations to convert into Brat annotations. If None (default) all the annotations will be converted

attrslist of str, optional

Labels of medkit attributes to add in the annotations that will be included. If None (default) all medkit attributes found in the segments or relations will be converted to Brat attributes

notes_labelstr, default="brat_note"

Label of attributes that will be converted to annotator notes.

ignore_segmentsbool, default=True

If True medkit segments will be ignored. Only entities, attributes and relations will be converted to Brat annotations. If False the medkit segments will be converted to Brat annotations as well.

convert_cuis_to_notesbool, default=True

If True, UMLS normalization attributes will be converted to annotator notes rather than attributes. For entities with multiple UMLS attributes, CUIs will be separated by spaces (e.g. "C0011849 C0004096").

create_configbool, default=True

Whether to create a configuration file for the generated collection. This file defines the types of annotations generated, it is necessary for the correct visualization on Brat.

top_values_by_attrint, default=50

Defines the number of most common values per attribute to show in the configuration. This is useful when an attribute has a large number of values: only the 'top' ones will be in the config. By default, the 50 most common values per attribute will be in the config.

uidstr, optional

Identifier of the converter

Attributes:
descriptionstr

Description for the operation

uid#
anns_labels#
attrs#
notes_label#
ignore_segments#
convert_cuis_to_notes#
create_config#
top_values_by_attr#
property description: medkit.core.OperationDescription#
save(docs: list[medkit.core.text.TextDocument], dir_path: str | pathlib.Path, doc_names: list[str] | None = None)#

Save text documents as brat files.

Convert and save a collection or list of TextDocuments into a Brat collection. For each collection or list of documents, a folder is created with '.txt' and '.ann' files. A file named 'annotation.conf' may also be saved if required.

Parameters:
docslist of TextDocument

List of medkit doc objects to convert

dir_pathstr or Path

String or path object to save the generated files

doc_nameslist of str, optional

Optional list with the names for the generated files. If 'None', 'uid' will be used as the name, where 'uid.txt' has the raw text of the document and 'uid.ann' the Brat annotation file.

_convert_medkit_anns_to_brat(segments: list[medkit.core.text.Segment], relations: list[medkit.core.text.Relation], config: medkit.io._brat_utils.BratAnnConfiguration, raw_text: str) list[medkit.io._brat_utils.BratEntity | medkit.io._brat_utils.BratAttribute | medkit.io._brat_utils.BratRelation | medkit.io._brat_utils.BratNote]#

Convert Segments, Relations and Attributes into brat data structures.

Parameters:
segmentslist of Segment

Medkit segments to convert

relationslist of Relation

Medkit relations to convert

configBratAnnConfiguration

Optional BratAnnConfiguration structure, this object is updated with the types of the generated Brat annotations.

raw_textstr

Text of reference to get the original text of the annotations

Returns:
list of BratEntity or BratAttribute or BratRelation or BratNote

A list of brat annotations

static _ensure_text_and_spans(segment: medkit.core.text.Segment, raw_text: str) tuple[str, list[tuple[int, int]]]#

Ensure consistency between the segment and the raw text.

The text of a BRAT annotation can't contain multiple white spaces (including a newline character). This method cleans the fragments' text and adjusts its spans to point to the same location in the raw text.

Parameters:
segmentSegment

Segment to ensure

raw_textstr

Text of reference

Returns:
textstr

The cleaned text

spanslist of tuple

The adjusted spans

_convert_segment_to_brat(segment: medkit.core.text.Segment, nb_segment: int, raw_text: str) medkit.io._brat_utils.BratEntity#

Get a brat entity from a medkit segment.

Parameters:
segmentSegment

A medkit segment to convert into brat format

nb_segmentint

The current counter of brat segments

raw_textstr

Text of reference to get the original text of the segment

Returns:
BratEntity

The equivalent brat entity of the medkit segment

static _convert_relation_to_brat(relation: medkit.core.text.Relation, nb_relation: int, brat_entities_by_segment_id: dict[str, medkit.io._brat_utils.BratEntity]) tuple[medkit.io._brat_utils.BratRelation, medkit.io._brat_utils.RelationConf]#

Get a brat relation from a medkit relation.

Parameters:
relationRelation

A medkit relation to convert into brat format

nb_relationint

The current counter of brat relations

brat_entities_by_segment_iddict of str to BratEntity

A dict to map medkit ID to brat annotation

Returns:
relationBratRelation

The equivalent brat relation of the medkit relation

configRelationConf

Configuration of the brat relation

Raises:
ValueError

When the source or target was not found in the mapping object

static _convert_attribute_to_brat(label: str, value: str | None, nb_attribute: int, target_brat_id: str, is_from_entity: bool) tuple[medkit.io._brat_utils.BratAttribute, medkit.io._brat_utils.AttributeConf]#

Get a brat attribute from a medkit attribute.

Parameters:
labelstr

Attribute label to convert into brat format

valuestr, optional

Attribute value

nb_attributeint

The current counter of brat attributes

target_brat_idstr

Corresponding target brat ID

Returns:
attributeBratAttribute

The equivalent brat attribute of the medkit attribute

configAttributeConf

Configuration of the brat attribute

static _convert_umls_attributes_to_brat_note(cuis: list[str], nb_note: int, target_brat_id: str) medkit.io._brat_utils.BratNote#

Get a brat note from a medkit umls norm attribute.

Parameters:
cuislist of str

CUIs to convert to a brat note

nb_noteint

The current counter of brat notes

target_brat_idstr

Corresponding target brat ID

Returns:
BratNote

The equivalent brat note of the medkit umls attribute

static _convert_attributes_to_brat_note(values: list[Any], nb_note: int, target_brat_id: str) medkit.io._brat_utils.BratNote#

Get a brat note from medkit attribute values.

Parameters:
valueslist of Any

Attribute values

nb_noteint

The current counter of brat notes

target_brat_idstr

Corresponding target brat ID

Returns:
BratNote

The equivalent brat note of the medkit attribute values