General Introduction
insanely-fast-whisper is an audio transcription tool that combines OpenAI's Whisper model with optimization libraries such as Transformers, Optimum, and Flash Attention, exposed through a command-line interface (CLI) designed to transcribe large amounts of audio quickly and efficiently. It uses the Whisper Large v3 model and can transcribe 150 minutes of audio in under 98 seconds. Installation guides, usage help, and further details are available in the GitHub repository.
Multi-Speaker Recognition
pyannote.audio is an open-source toolkit for speaker diarization written in Python. Built on the PyTorch machine learning framework, it provides state-of-the-art pre-trained models and pipelines that can be further fine-tuned on your own data for better performance.
faster-whisper + pyannote.audio implements speaker recognition simply by combining the results of the two tools.
Official repository: https://github.com/pyannote/pyannote-audio
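As a point of reference, here is a minimal sketch of running the pretrained diarization pipeline on its own, assuming pyannote.audio 3.x, a Hugging Face access token, and that the model's user conditions have been accepted; the token and file path are placeholders:

from pyannote.audio import Pipeline

# Load the pretrained speaker-diarization pipeline from Hugging Face
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token="YOUR_HF_TOKEN",  # placeholder: your Hugging Face token
)

# Run diarization on a local audio file (placeholder path)
diarization = pipeline("audio.wav")

# Print who speaks when
for turn, _, speaker in diarization.itertracks(yield_label=True):
    print(f"{turn.start:.1f}s -> {turn.end:.1f}s: {speaker}")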
Feature List
Audio transcription using the Whisper Large v3 model
Utilizes Transformers, Optimum, Flash Attention, and more!
Provides a CLI interface
Supports different optimization types and provides benchmarks
Usage Help
Installation: install and configure with pip
Usage: pass parameters and run transcription tasks directly from the command line (see the sketch after this list)
Get help: Visit the GitHub repository to read the documentation and talk to the community.
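A minimal sketch of installing and running the CLI; the exact flags depend on the installed version of insanely-fast-whisper (check --help), and the audio file name below is a placeholder:

pip install insanely-fast-whisper
insanely-fast-whisper --file-name your_audio.mp3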
Google Colab code written for the https://github.com/SYSTRAN/faster-whisper project:
# Installing the necessary libraries
get_ipython().system('pip install faster-whisper')

# Importing the necessary libraries
from faster_whisper import available_models
import torch
import ipywidgets as widgets
from IPython.display import display, clear_output
import os  # operating system library for handling file operations
import gc  # garbage collection library

# Automatically detect the device type and select GPU or CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
model_size = "large-v2"  # default model size
compute_type = "float16" if device == "cuda" else "float32"  # switch to float32 if CPU is used

# Get the list of available models
models_list = available_models()

# Default language list
supported_languages = ['en', 'fr', 'de', 'zh', '...']  # uses the default list of languages
default_language = 'zh' if 'zh' in supported_languages else supported_languages[0]  # if 'zh' is in the list, use it as the default; otherwise use the first value in the list
# Creating the GUI interface
model_label = widgets.Label('Select model:')
model_dropdown = widgets.Dropdown(options=models_list, value=model_size)
language_label = widgets.Label('Language:')
language_dropdown = widgets.Dropdown(options=supported_languages, value=default_language)
beam_size_label = widgets.Label('Beam size:')
beam_size_slider = widgets.IntSlider(value=5, min=1, max=10, step=1)
compute_type_label = widgets.Label('Compute type:')
if device == "cuda":
    compute_type_options = ['float16', 'int8']
else:
    compute_type_options = ['float32']  # if CPU, lock to float32
compute_type_dropdown = widgets.Dropdown(options=compute_type_options, value=compute_type)
mode_label = widgets.Label('Format Mode:')
mode_dropdown = widgets.Dropdown(options=['normal', 'timeline', 'subtitle'], value='normal')
initial_prompt_label = widgets.Label('Initial Prompt:')  # initial prompt label
initial_prompt_text = widgets.Text(value='')  # initial prompt input box
file_name_text = widgets.Text(description='File name:', value='/content/')  # allow the user to enter a file name
transcribe_button = widgets.Button(description='Transcribe')
output_area = widgets.Output()
# Defining the transcription function
def transcribe_audio(b):
    with output_area:
        clear_output()
        print("Starting transcription...")
        from faster_whisper import WhisperModel  # dynamic import of WhisperModel: import when needed to save RAM
        try:
            file_name = file_name_text.value  # use the user-entered file name
            initial_prompt = initial_prompt_text.value  # initial prompt from user input
            # Ensure the file exists
            if not os.path.exists(file_name):
                print(f"File {file_name} does not exist, please check that the file name and path are correct.")
                return
            # Get the selected options
            selected_model = model_dropdown.value
            selected_compute_type = compute_type_dropdown.value
            selected_language = language_dropdown.value
            # Create a new model instance and run the transcription
            model = WhisperModel(selected_model, device=device, compute_type=selected_compute_type)
            try:
                # Transcribe the audio (initial prompt parameter added)
                segments, info = model.transcribe(
                    file_name,
                    beam_size=beam_size_slider.value,
                    language=selected_language,
                    initial_prompt=initial_prompt,
                )
                # Print results
                print("Detected language '%s' with probability %f" % (info.language, info.language_probability))
                for segment in segments:
                    if mode_dropdown.value == 'normal':
                        print("%s " % (segment.text))
                    elif mode_dropdown.value == 'timeline':
                        print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
                    else:  # subtitle
                        start_time = "{:02d}:{:02d}:{:02d},{:03d}".format(
                            int(segment.start // 3600),
                            int((segment.start % 3600) // 60),
                            int(segment.start % 60),
                            int((segment.start % 1) * 1000),
                        )
                        end_time = "{:02d}:{:02d}:{:02d},{:03d}".format(
                            int(segment.end // 3600),
                            int((segment.end % 3600) // 60),
                            int(segment.end % 60),
                            int((segment.end % 1) * 1000),
                        )
                        print("%d\n%s --> %s\n%s\n" % (segment.id, start_time, end_time, segment.text))
            finally:
                # Delete the model instance to free RAM
                del model
        except Exception as e:
            print("An error occurred during transcription:")
            print(str(e))
        finally:
            # Trigger garbage collection
            gc.collect()
            print("Transcription complete.")
# Assembling the GUI interface
display(model_label, model_dropdown, language_label, language_dropdown,
        beam_size_label, beam_size_slider, compute_type_label, compute_type_dropdown,
        mode_label, mode_dropdown, initial_prompt_label, initial_prompt_text,
        file_name_text, transcribe_button, output_area)
transcribe_button.on_click(transcribe_audio)
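If you run this notebook in Colab, the audio file must already exist at the path typed into the File name box (the default is /content/). A minimal sketch, assuming you are inside Colab where the google.colab module is available, of uploading a local file into the runtime:

# Upload a local audio file into the Colab runtime; uploaded files land in the
# current working directory, which is /content/ by default
from google.colab import files

uploaded = files.upload()  # opens a browser file picker
for name in uploaded:
    print(f"Uploaded /content/{name}")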
from pyannote.core import Segment
# Convert faster-whisper segments into (Segment, text) pairs
def get_text_with_timestamp(transcribe_res):
    timestamp_texts = []
    for item in transcribe_res:
        start = item.start
        end = item.end
        text = item.text.strip()
        timestamp_texts.append((Segment(start, end), text))
    return timestamp_texts


# Attach the dominant speaker label from the diarization annotation to each segment
def add_speaker_info_to_text(timestamp_texts, ann):
    spk_text = []
    for seg, text in timestamp_texts:
        spk = ann.crop(seg).argmax()
        spk_text.append((seg, spk, text))
    return spk_text


# Merge cached segments of the same speaker into a single sentence
def merge_cache(text_cache):
    sentence = ''.join([item[-1] for item in text_cache])
    spk = text_cache[0][1]
    start = round(text_cache[0][0].start, 1)
    end = round(text_cache[-1][0].end, 1)
    return Segment(start, end), spk, sentence


# Sentence-ending punctuation (ASCII and full-width)
PUNC_SENT_END = [',', '.', '?', '!', '，', '。', '？', '！']


# Group consecutive segments by speaker, splitting at sentence-ending punctuation
def merge_sentence(spk_text):
    merged_spk_text = []
    pre_spk = None
    text_cache = []
    for seg, spk, text in spk_text:
        if spk != pre_spk and pre_spk is not None and len(text_cache) > 0:
            merged_spk_text.append(merge_cache(text_cache))
            text_cache = [(seg, spk, text)]
            pre_spk = spk
        elif text and len(text) > 0 and text[-1] in PUNC_SENT_END:
            text_cache.append((seg, spk, text))
            merged_spk_text.append(merge_cache(text_cache))
            text_cache = []
            pre_spk = spk
        else:
            text_cache.append((seg, spk, text))
            pre_spk = spk
    if len(text_cache) > 0:
        merged_spk_text.append(merge_cache(text_cache))
    return merged_spk_text


# Combine the ASR result and the diarization result into speaker-attributed sentences
def diarize_text(transcribe_res, diarization_result):
    timestamp_texts = get_text_with_timestamp(transcribe_res)
    spk_text = add_speaker_info_to_text(timestamp_texts, diarization_result)
    res_processed = merge_sentence(spk_text)
    return res_processed


# Write the merged result to a plain-text file
def write_to_txt(spk_sent, file):
    with open(file, 'w') as fp:
        for seg, spk, sentence in spk_sent:
            line = f'{seg.start:.2f} {seg.end:.2f} {spk} {sentence}\n'
            fp.write(line)
import torch
import whisper
import numpy as np
from pydub import AudioSegment
from loguru import logger
from faster_whisper import WhisperModel
from pyannote.audio import Pipeline
from pyannote.audio import Audio
from common.error import ErrorCode  # project-specific import from the original codebase
model_path = config["asr"]["faster-whisper-large-v3"]  # model path read from the project's own config
# Test Audio: https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_speaker_demo.wav
audio = "./test/asr/data/asr_speaker_demo.wav"
asr_model = WhisperModel(model_path, device="cuda", compute_type="float16")
spk_rec_pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", use_auth_token="your huggingface token")
spk_rec_pipeline.to(torch.device("cuda"))

asr_result, info = asr_model.transcribe(audio, language="zh", beam_size=5)
diarization_result = spk_rec_pipeline(audio)
final_result = diarize_text(asr_result, diarization_result)
for segment, spk, sent in final_result:
    print("[%.2fs -> %.2fs] %s %s" % (segment.start, segment.end, sent, spk))
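The write_to_txt helper defined above is not used in this example; here is a short sketch of saving the merged result with it (the output path is a placeholder):

# Persist the speaker-attributed transcript produced by diarize_text()
output_file = "./test/asr/data/asr_speaker_demo.txt"  # placeholder output path
write_to_txt(final_result, output_file)
print(f"Saved {len(final_result)} merged segments to {output_file}")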