from pathlib import Path
from transformers import (
AutoModelForCTC,
Wav2Vec2Processor,
)
from huggingface_hub import snapshot_download
from easyaligner.text import load_tokenizer
from easyaligner.data.datamodel import SpeechSegment
from easyaligner.pipelines import pipeline
from easyaligner.text import text_normalizer
from easyaligner.vad.pyannote import load_vad_model
# Tutorial script: download a sample recording, then force-align a known
# transcript to it with a wav2vec2 CTC model via easyaligner's pipeline.

# Names shared by the download, model-loading and alignment steps, kept in one
# place so the paths and the checkpoint id cannot drift apart.
DATA_ROOT = "data/tutorials"
TUTORIAL_NAME = "tale-of-two-cities_align-en"
AUDIO_DIR = f"{DATA_ROOT}/{TUTORIAL_NAME}"
MODEL_NAME = "facebook/wav2vec2-base-960h"

# Fetch only this tutorial's folder from the dataset repository.
snapshot_download(
    "Lauler/easytranscriber_tutorials",
    repo_type="dataset",
    local_dir=DATA_ROOT,
    allow_patterns=f"{TUTORIAL_NAME}/*",
)

text = """
It was the best of times, it was the worst of times, it was the age of
wisdom, it was the age of foolishness, it was the epoch of belief, it
was the epoch of incredulity, it was the season of Light, it was the
season of Darkness, it was the spring of hope, it was the winter of
despair, we had everything before us, we had nothing before us, we were
all going direct to Heaven, we were all going direct the other way--in
short, the period was so far like the present period, that some of its
noisiest authorities insisted on its being received, for good or for
evil, in the superlative degree of comparison only.
"""
# Drop the newlines that the triple-quoted literal adds at both ends.
text = text.strip()

# The alignments will be organized according to how the text is tokenized
tokenizer = load_tokenizer(language="english")  # sentence tokenizer
# start, end character indices for each sentence
span_list = list(tokenizer.span_tokenize(text))

# One segment covering the whole transcript; start/end are left unset.
# NOTE(review): the outer list appears to hold one inner list per audio file,
# matched by position to `audio_files` -- confirm against the easyaligner docs.
speeches = [
    [
        SpeechSegment(
            speech_id=0,
            text=text,
            text_spans=span_list,
            start=None,
            end=None,
        )
    ]
]

# Load models and run pipeline
model_vad = load_vad_model()
# Moved to the GPU and cast to half precision; this line requires CUDA.
model = AutoModelForCTC.from_pretrained(MODEL_NAME).to("cuda").half()
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)

# File(s) to align: bare file names, passed to the pipeline together with
# AUDIO_DIR. Directories are skipped and the list is sorted so that the order
# is deterministic across runs and platforms.
audio_files = sorted(
    entry.name for entry in Path(AUDIO_DIR).glob("*") if entry.is_file()
)

pipeline(
    vad_model=model_vad,
    emissions_model=model,
    processor=processor,
    audio_paths=audio_files,
    audio_dir=AUDIO_DIR,
    speeches=speeches,
    alignment_strategy="speech",
    text_normalizer_fn=text_normalizer,
    tokenizer=tokenizer,
    start_wildcard=True,
    end_wildcard=True,
    # The tokenizer's pad token id is passed as the CTC blank id.
    blank_id=processor.tokenizer.pad_token_id,
    # NOTE(review): "|" is presumably the tokenizer's word-delimiter token.
    word_boundary="|",
)

Overview
TODO: Use a shorter text for the tutorial, one that can simply be pasted into the example.
Installation
With GPU support
pip install easyaligner --extra-index-url https://download.pytorch.org/whl/cu128
Using uv
When installing with uv, the appropriate PyTorch version is selected automatically (CPU for macOS, CUDA for Linux/Windows/ARM):
uv pip install easyaligner
Usage
Demo
The text transcript below the audio player is highlighted in sync with the words spoken in the audio.
Sample audio
A Tale of Two Cities — Chapter 1 (LibriVox)
Tip
You can click anywhere in the text to jump to that point in the audio. The text is also highlighted when you drag the audio slider!