# Audio transcription
This demo shows how to transcribe an audio document and then perform text operations on it.
## Init audio document

Instantiate an `AudioDocument` with a `FileAudioBuffer`:
```python
from pathlib import Path

import IPython.display

from medkit.core.audio import AudioDocument, FileAudioBuffer

audio_file = Path("input/voice.ogg")
audio_doc = AudioDocument(audio=FileAudioBuffer(audio_file))

IPython.display.Audio(data=audio_doc.audio.read(), rate=audio_doc.audio.sample_rate)
```
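Before going further, it can be handy to check basic properties of the loaded buffer. A minimal sketch, assuming the buffer exposes a `duration` attribute in seconds alongside the `sample_rate` used above:

```python
# Quick sanity check on the loaded audio
# (`duration` is an assumed attribute; `sample_rate` is used in the cell above)
print(f"sample_rate={audio_doc.audio.sample_rate}Hz, duration={audio_doc.audio.duration:.2f}s")
```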
## Voice detection
Prepare a pipeline to perform voice detection on audio documents, using a `Downmixer` chained with a `WebRTCVoiceDetector` (you can also use other segmentation operations, such as `PASpeakerDetector`):
```python
from medkit.core import Pipeline, PipelineStep, DocPipeline
from medkit.audio.preprocessing import Downmixer
from medkit.audio.segmentation.webrtc_voice_detector import WebRTCVoiceDetector

# init operations
downmixer = Downmixer(output_label="mono")
voice_detector = WebRTCVoiceDetector(output_label="voice")

# put them in a pipeline
audio_pipeline = Pipeline(
    steps=[
        PipelineStep(
            downmixer,
            input_keys=["full_audio"],
            output_keys=["full_mono_audio"],
        ),
        PipelineStep(
            voice_detector,
            input_keys=["full_mono_audio"],
            output_keys=["voice_segs"],
        ),
    ],
    input_keys=["full_audio"],
    output_keys=["voice_segs"],
)

# wrap pipeline in doc-level pipeline
audio_doc_pipeline = DocPipeline(audio_pipeline)
```
Run voice detection on the audio document:
```python
audio_doc_pipeline.run([audio_doc])

for seg in audio_doc.anns.get(label="voice"):
    print(f"label={seg.label}, span={seg.span}")
```
## Transcription
Prepare a `DocTranscriber` that will convert audio documents to text documents, using `HFTranscriber` as the actual audio transcriber that creates text segments from audio segments (you can also use other transcription operations, such as `SBTranscriber`; a sketch follows the code below):
```python
from medkit.audio.transcription import DocTranscriber
from medkit.audio.transcription.hf_transcriber import HFTranscriber

transcriber = HFTranscriber(
    model="openai/whisper-small",
    language="english",
    add_trailing_dot=False,
    capitalize=False,
)
doc_transcriber = DocTranscriber(
    input_label="voice",
    output_label="transcription",
    transcription_operation=transcriber,
)
```
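Swapping in `SBTranscriber` might look like the following sketch. The module path mirrors `hf_transcriber` and the constructor parameters are assumptions, so check the medkit API reference before relying on them:

```python
# Sketch only: module path and constructor parameters are assumed, not verified
# from medkit.audio.transcription.sb_transcriber import SBTranscriber
#
# transcriber = SBTranscriber(
#     model="speechbrain/asr-wav2vec2-commonvoice-en",  # hypothetical model name
#     needs_decoder=False,
# )
```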
Transcribe the audio document:
```python
transcribed_doc = doc_transcriber.run([audio_doc])[0]

print(f"fulltext={transcribed_doc.text!r}", end="\n\n")
for seg in transcribed_doc.anns.get(label="transcription"):
    print(f"label={seg.label}, text={seg.text!r}")
```
```
fulltext=' I have headaches.\n I also have high blood pressure.'

label=transcription, text=' I have headaches.'
label=transcription, text=' I also have high blood pressure.'
```
## Entity matching on text

Run text entity matching on the transcribed document:
```python
from medkit.core.text import TextDocument
from medkit.text.ner import RegexpMatcher, RegexpMatcherRule

rules = [
    RegexpMatcherRule(label="problem", regexp=r"\bheadaches?\b"),
    RegexpMatcherRule(label="problem", regexp=r"\bhigh\s+blood\s+pressure\b"),
]
matcher = RegexpMatcher(rules)

text_pipeline = Pipeline(
    steps=[PipelineStep(matcher, input_keys=["full_text"], output_keys=["entities"])],
    input_keys=["full_text"],
    output_keys=["entities"],
)
text_doc_pipeline = DocPipeline(
    text_pipeline,
    labels_by_input_key={"full_text": [TextDocument.RAW_LABEL]},
)

text_doc_pipeline.run([transcribed_doc])
```
Locate the matched entities in the original audio:
```python
entities = transcribed_doc.anns.get_entities()
for entity in entities:
    print(f"label={entity.label}, text={entity.text!r}")
    audio_spans = transcribed_doc.get_containing_audio_spans(entity.spans)
    print(f"audio_spans={audio_spans}", end="\n\n")
    audio = audio_doc.audio.trim_duration(audio_spans[0].start, audio_spans[0].end)
    IPython.display.display(IPython.display.Audio(data=audio.read(), rate=audio.sample_rate))
```
```python
# hardcoded display of audio spans to work around
# the fact that cells are not executed
print("label=problem, text='headaches'")
entity_1_audio = audio_doc.audio.trim_duration(0.99, 2.73)
IPython.display.display(IPython.display.Audio(data=entity_1_audio.read(), rate=entity_1_audio.sample_rate))

print("label=problem, text='high blood pressure'")
entity_2_audio = audio_doc.audio.trim_duration(6.0, 8.73)
IPython.display.display(IPython.display.Audio(data=entity_2_audio.read(), rate=entity_2_audio.sample_rate))
```
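Finally, the transcribed document and its entities can be persisted like any other medkit text document. A minimal sketch, assuming medkit's JSON I/O helpers; the helper name `save_text_document` is an assumption, so verify it against the medkit I/O documentation:

```python
# Persist the transcribed document and its annotations to JSON
# (helper name assumed; check the medkit.io docs)
from medkit.io import medkit_json

medkit_json.save_text_document(transcribed_doc, "output/transcribed_doc.json")
```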