Text spans
Here are some examples showing how to use the span utilities.
from medkit.core.text.span import Span
from medkit.core.text.span_utils import replace, remove, move, extract, insert
# Sample letter used by all the examples below; it is kept untouched in
# `raw_text` so that the final entities can be displayed on top of it.
raw_text = (
    "Cher M. Dupond,\nJ’ai vu en consultation (à mon cabinet le 2019-02-01) "
    "Bertrand AGITE, né le 2008-02-25,"
    "\n\npour une suspicion de troubles du spectre autistique.\n(-) TDAH.\n"
)
# `text` is the working copy that gets edited step by step; `spans` starts
# as a single span covering the whole raw text.
text, spans = raw_text, [Span(0, len(raw_text))]
import re

# Replace "M." by "M".
# `replace` returns the edited text together with an updated `spans` list,
# so `spans` keeps track of the modifications.
# The dot is escaped so that only a literal "M." is matched; an unescaped
# "." would match "M" followed by any character.
match = re.search(r"M\.", text)
text, spans = replace(text, spans, [match.span()], ["M"])
print(text)
print(spans)
:::
:::{code}
# Drop the newline that terminates the text
trailing_newline = re.search(r"\n\Z", text, re.M)
text, spans = remove(text, spans, [trailing_newline.span()])
# Turn every remaining run of line breaks into a single space
newline_ranges = [found.span() for found in re.finditer(r"\n+", text, re.M)]
single_spaces = [" " for _ in newline_ranges]
text, spans = replace(text, spans, newline_ranges, single_spaces)
print(text)
# Extract sentences: each regex match runs up to and including the next
# period. `extract` is called once per match and its result is stored as a
# (sentence_text, sentence_spans) pair.
sentences = []
for match in re.finditer(r"[^\.]+\.", text, re.M):
    sentence_text, sentence_spans = extract(text, spans, [match.span()])
    sentences.append((sentence_text, sentence_spans))
# The following examples work on the first two sentences
text_1, spans_1 = sentences[0]
text_2, spans_2 = sentences[1]
print(text_1)
print(text_2)
# Move the text found between parentheses to the end of the 1st sentence.
# The match is taken before the insertion; the insertion happens just before
# the last character, i.e. after the matched range, so the range stays valid.
parenthesized = re.search(r" *\((.*)\)", text_1, re.M)
inner_range = parenthesized.span(1)
# Add a " ; " separator just before the last character of the sentence
text_1, spans_1 = insert(text_1, spans_1, [len(text_1) - 1], [" ; "])
# The sentence is now longer, so the destination is computed again
text_1, spans_1 = move(text_1, spans_1, inner_range, len(text_1) - 1)
print(text_1)
# Rewrite the YYYY-MM-DD dates of the 1st sentence as DD/MM/YYYY
date_matches = list(re.finditer(r"\d{4}-\d{2}-\d{2}", text_1, re.M))
date_ranges = []
new_dates = []
for date_match in date_matches:
    date_ranges.append(date_match.span())
    # The pattern guarantees exactly three "-"-separated numeric fields
    year, month, day = date_match.group(0).split("-")
    new_dates.append("/".join((day, month, year)))
text_1, spans_1 = replace(text_1, spans_1, date_ranges, new_dates)
print(text_1)
# In the 2nd sentence, substitute the "(-)" marker with the word "negatif"
negation_marker = re.search(r"\(-\)", text_2, re.M)
marker_range = negation_marker.span()
text_2, spans_2 = replace(text_2, spans_2, [marker_range], ["negatif"])
print(text_2)
# Find the person entity in the 1st sentence: "M" followed by a space and
# a run of ASCII letters
match = re.search(r"M [a-zA-Z]+", text_1)
person_text, person_spans = extract(text_1, spans_1, [match.span()])
# Find the date entities in the 1st sentence (NN/NN/NNNN pattern) and keep
# each one as a (date_text, date_spans) pair
dates = []
for match in re.finditer(r"\d{2}/\d{2}/\d{4}", text_1):
    date_text, date_spans = extract(text_1, spans_1, [match.span()])
    dates.append((date_text, date_spans))
from medkit.core.text.span_utils import normalize_spans

# Build a list of (label, spans) pairs, passing the spans of every entity
# through `normalize_spans` first.
# NOTE(review): presumably normalization yields plain spans expressed in
# raw-text coordinates — confirm against the medkit documentation.
entities = []
person_spans = normalize_spans(person_spans)
entities.append(("person", person_spans))
for _, date_spans in dates:
    date_spans = normalize_spans(date_spans)
    entities.append(("date", date_spans))
print(entities)
from spacy import displacy

# One dict per entity span, holding its start/end offsets and its label
entities_data = [
    {"start": entity_span.start, "end": entity_span.end, "label": entity_label}
    for entity_label, entity_spans in entities
    for entity_span in entity_spans
]
# Order the entities by start offset before rendering
entities_data.sort(key=lambda entity: entity["start"])
# The entities are displayed on top of the untouched raw text
data = {"text": raw_text, "ents": entities_data, "uuid": 0}
displacy.render(data, style="ent", manual=True, jupyter=True, minify=True)