pipelines.pipeline

pipelines.pipeline(
    vad_model,
    emissions_model,
    transcription_model,
    audio_paths,
    audio_dir,
    backend='ct2',
    speeches=None,
    metadata=None,
    sample_rate=16000,
    chunk_size=30,
    alignment_strategy='chunk',
    text_normalizer_fn=text_normalizer,
    tokenizer=None,
    language=None,
    task='transcribe',
    beam_size=5,
    max_length=250,
    max_new_tokens=256,
    repetition_penalty=1.0,
    length_penalty=1.0,
    patience=1.0,
    no_repeat_ngram_size=0,
    punctuation=True,
    generate_kwargs=None,
    start_wildcard=False,
    end_wildcard=False,
    blank_id=None,
    word_boundary=None,
    indent=2,
    ndigits=5,
    num_workers_files=2,
    prefetch_factor_files=2,
    batch_size_features=8,
    num_workers_features=4,
    streaming=True,
    save_json=True,
    save_msgpack=False,
    save_emissions=True,
    return_alignments=False,
    delete_emissions=False,
    output_vad_dir='output/vad',
    output_transcriptions_dir='output/transcriptions',
    output_emissions_dir='output/emissions',
    output_alignments_dir='output/alignments',
    cache_dir='models',
    hf_token=None,
    device='cuda',
)

Run the full transcription pipeline (VAD -> Transcribe -> Emissions -> Align).

Parameters

Name	Type	Description	Default
vad_model	str	Voice Activity Detection model: “pyannote” or “silero”.	required
emissions_model	str	Hugging Face model ID for the emissions model (“org_name/model_name”).	required
transcription_model	str	Path to Hugging Face model ID for the transcription model (“org_name/model_name”).	required
audio_paths	list	List of audio file paths.	required
audio_dir	str	Directory containing audio files.	required
speeches	list[list[SpeechSegment]] or None	Existing speech segments for alignment.	`None`
metadata	list[dict] or None	Optional list of file-level metadata dicts (one per audio file, same order as `audio_paths`). The metadata is attached to each file’s `AudioMetadata` during VAD and propagates unchanged through transcription, emissions, and alignment, so it is retained in the final output.	`None`
backend	str	Backend to use for the transcription model: “ct2”, “hf”, or “cohere”. Default is “ct2”. The “cohere” backend requires `transformers>=5.4.0`, `streaming=True`, and an explicit `language` (Cohere has no language detection).	`'ct2'`
sample_rate	int	Sample rate.	`16000`
chunk_size	int	Chunk size in seconds.	`30`
alignment_strategy	str	Alignment strategy: “speech” or “chunk”.	`'chunk'`
text_normalizer_fn	callable	Function to normalize text before forced alignment.	`text_normalizer`
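tokenizer	object	An `nltk` tokenizer or a custom callable tokenizer that takes a string as input and returns a list of tuples (start_char, end_char), marking the spans/boundaries of sentences, paragraphs, or any other text unit of interest.	`None`
language	str or None	Language code of the audio (e.g. “en”). `None` performs language detection; the “cohere” backend requires an explicit language.	`None`
task	str	Task for the transcription model (e.g. “transcribe”).	`'transcribe'`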
beam_size	int	Number of beams for beam search. Recommended: `5` for ct2 and `1` for hf (beam search is slow in Hugging Face transformers).	`5`
patience	float	Patience. Only implemented in ct2.	`1.0`
length_penalty	float	Length penalty for beam search. See HF source code for details.	`1.0`
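repetition_penalty	float	Penalty applied to previously generated tokens to discourage repetition. See HF source code for details.	`1.0`
no_repeat_ngram_size	int	Prevent n-grams of this size from being repeated in the generated text. `0` disables the constraint.	`0`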
max_length	int	Maximum length of generated text. Applies to Whisper backends (ct2, hf).	`250`
max_new_tokens	int	Maximum number of new tokens to generate per chunk. Applies to the cohere backend.	`256`
punctuation	bool	Emit punctuation in Cohere transcriptions. Applies to the cohere backend only.	`True`
generate_kwargs	dict	Extra kwargs forwarded to `model.generate()` for the cohere backend (e.g. `num_beams`, `length_penalty`).	`None`
start_wildcard	bool	Add start wildcard to forced alignment.	`False`
end_wildcard	bool	Add end wildcard to forced alignment.	`False`
blank_id	int or None	Blank token ID of the emissions model (generally the pad token ID).	`None`
word_boundary	str or None	Word boundary character of the emissions model (usually “|”).	`None`
indent	int	JSON indentation.	`2`
ndigits	int	Number of digits for rounding.	`5`
num_workers_files	int	Number of workers for file loading.	`2`
prefetch_factor_files	int	Prefetch factor for files.	`2`
batch_size_features	int	Batch size for feature extraction.	`8`
num_workers_features	int	Number of workers for feature extraction.	`4`
streaming	bool	Use streaming mode.	`True`
save_json	bool	Save results to JSON.	`True`
save_msgpack	bool	Save results to MessagePack.	`False`
save_emissions	bool	Save emissions.	`True`
return_alignments	bool	Return alignment results.	`False`
delete_emissions	bool	Whether to delete emissions numpy files after processing.	`False`
output_vad_dir	str	Output directory for VAD.	`'output/vad'`
output_transcriptions_dir	str	Output directory for transcriptions.	`'output/transcriptions'`
output_emissions_dir	str	Output directory for emissions.	`'output/emissions'`
output_alignments_dir	str	Output directory for alignments.	`'output/alignments'`
cache_dir	str	Cache directory for transcription and emissions models.	`'models'`
hf_token	str or None	Hugging Face authentication token for gated models.	`None`
device	str	Device to run models on. Default is `cuda`.	`'cuda'`

Examples

from pathlib import Path
from easyaligner.text import load_tokenizer
from easytranscriber.pipelines import pipeline
from easytranscriber.text.normalization import text_normalizer

tokenizer = load_tokenizer("english")
audio_files = [file.name for file in Path("data/en").glob("*.wav")]
pipeline(
    vad_model="pyannote",
    emissions_model="facebook/wav2vec2-base-960h",
    transcription_model="distil-whisper/distil-large-v3.5",
    audio_paths=audio_files,
    audio_dir="data/en",
    backend="ct2",
    language="en", # None to perform language detection
    tokenizer=tokenizer,
    text_normalizer_fn=text_normalizer,
    cache_dir="models",
)

Returns

Name	Type	Description
	list[list[SpeechSegment]] or None	If `return_alignments` is `True`, returns the alignments as one list of `SpeechSegment` objects per audio file. Otherwise, returns `None` (the alignments are saved to disk only).