Guude! Every one of your t-shirt designs always puts a smile on my digital face.
Guude and thank you, dear Louis 😊. I love wearing this type of shirt. Keep up your great work on Leon AI 👏🏻.
I absolutely love your videos Thorsten, thank you for a new video for the community! ❤
Wow, thanks for your amazing feedback 😊. I am always thankful if people find my videos useful.
Informative content. Thank you 😊
Hi Thorsten, you should take a look at Retrieval-based-Voice-Conversion-WebUI. It's an STS tool, though it can be adapted to TTS, and it trains a voice model much faster - in just a few minutes!
Thanks for your topic suggestion 😊. I've put it on my (growing) TODO list :-).
Informative. Thx and like
Hello Thorsten,
have you had time to implement this:
"textCleaned": text.lower() # TODO: Add textcleaner library (multilanguage support)
in your script?
No, not yet 🙃.
If you have a good idea feel free to send a pull request.
@@ThorstenMueller
I have no programming skills, but this is how I modified your script to use the GPU and a Polish dataset:
from pydub import AudioSegment
from pydub.silence import split_on_silence
import pandas as pd
import os
import glob
import whisper
import torch
import cleantext # import text cleaning library
# Initialize the Whisper model
model = whisper.load_model("large-v3")
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32  # note: not used below; transcribe() handles fp16 itself
model.to(device)
# Directory containing the input WAV files
input_dir = "./data_to_LJSpeech"
# Output directory
output_dir = "output"
audio_dir = os.path.join(output_dir, "audio")
if not os.path.exists(audio_dir):
    os.makedirs(audio_dir)
metadata = []
# Parameters for silence-based splitting
min_silence_len = 500 # minimum length of silence (in ms) to be used for a split
silence_thresh = None # will be calculated per file below
keep_silence = 200 # amount of silence (in ms) to leave at the beginning and end of each chunk
# Get the list of all WAV files in the directory
wav_files = sorted(glob.glob(os.path.join(input_dir, "*.wav")))
total_files = len(wav_files) # Total number of files to process
for idx, wav_file in enumerate(wav_files, start=1):
    # Load audio file
    print(f"--> Processing file {idx}/{total_files}: {wav_file}")
    audio = AudioSegment.from_wav(wav_file)
    # Calculate the silence threshold relative to this file's loudness
    silence_thresh = audio.dBFS - 14
    # Split the audio into chunks based on silence
    audio_chunks = split_on_silence(audio, min_silence_len=min_silence_len, silence_thresh=silence_thresh, keep_silence=keep_silence)
    # Transcribe each chunk and save with metadata
    for i, chunk in enumerate(audio_chunks):
        # Export chunk as a temporary wav file
        chunk_path = os.path.join(output_dir, f"chunk_{i}.wav")
        chunk.export(chunk_path, format="wav")
        # Transcribe the chunk in Polish
        result = model.transcribe(chunk_path, language="pl")  # set language to Polish
        # Get the transcribed text
        text = result['text'].strip()
        # Save chunk with a unique ID
        sentence_id = f"LJ{str(len(metadata) + 1).zfill(4)}"
        sentence_path = os.path.join(audio_dir, f"{sentence_id}.wav")
        chunk.export(sentence_path, format="wav")
        # Add metadata, including cleaned text
        metadata.append({
            "ID": sentence_id,
            "text": text,
            "textCleaned": cleantext.clean(text, extra_spaces=True, lowercase=True)  # cleaning via the cleantext package
        })
        # Remove the temporary chunk file
        os.remove(chunk_path)
# Create metadata.csv file with audio file IDs and corresponding sentences
metadata_df = pd.DataFrame(metadata)
metadata_csv_path = os.path.join(output_dir, "metadata.csv")
metadata_df.to_csv(metadata_csv_path, sep="|", header=False, index=False)
print(f"Processed {len(metadata)} sentences.")
print(f"CSV file saved at {metadata_csv_path}")
And I think it would be good to save each transcription to the CSV as soon as its WAV file is written, because last night my PC crashed and I lost almost 3k transcription lines :(
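A minimal sketch of that incremental-save idea, reusing the names from the script above (metadata_csv_path, sentence_id, text); the helper name append_metadata_row is hypothetical. Appending and flushing one row per chunk means a crash loses at most the chunk currently being processed:
import csv
import os

def append_metadata_row(metadata_csv_path, sentence_id, text, text_cleaned):
    # Append one "ID|text|textCleaned" row and force it to disk immediately.
    with open(metadata_csv_path, "a", newline="", encoding="utf-8") as f:
        writer = csv.writer(f, delimiter="|")
        writer.writerow([sentence_id, text, text_cleaned])
        f.flush()
        os.fsync(f.fileno())

# Inside the chunk loop, right after chunk.export(sentence_path, format="wav"):
# append_metadata_row(metadata_csv_path, sentence_id, text,
#                     cleantext.clean(text, extra_spaces=True, lowercase=True))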
Thank you very much for the script!
Running it creates files that are mostly as long as the originals.
Is there a variable I missed that would create files around 2-12 s long?
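There is no length variable as such; in the script above chunk length falls out of min_silence_len and silence_thresh. A hedged sketch of one workaround (the 300 ms, +4 dB and 12 s values are illustrative guesses, not tested settings): re-split any chunk that comes out too long with more aggressive parameters.
from pydub.silence import split_on_silence

MAX_LEN_MS = 12_000  # target upper bound of 12 s (illustrative)

def resplit_long_chunks(chunks, base_thresh):
    # pydub's len(AudioSegment) is the duration in milliseconds.
    result = []
    for chunk in chunks:
        if len(chunk) <= MAX_LEN_MS:
            result.append(chunk)
            continue
        # A shorter required silence and a higher (less negative) threshold
        # make split_on_silence cut more often.
        sub = split_on_silence(chunk, min_silence_len=300,
                               silence_thresh=base_thresh + 4,
                               keep_silence=200)
        result.extend(sub if sub else [chunk])
    return result

# Usage in the script above:
# audio_chunks = resplit_long_chunks(audio_chunks, silence_thresh)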
Do you know where to find a dataset for the Polish language to train a model for Piper?
Your videos inspired me to build a high-quality model for my needs.
You mean a ready-to-use public Polish voice dataset, or just public phrases that can be used for recording?
A dataset for the Polish language to train a high-quality model.
@@ŁukaszMadajczyk No, I don't know of a specific open voice dataset for Polish to train a model on. Maybe you can use a Polish voice from the Mozilla Common Voice project (if that's allowed?!), but this might be difficult due to the quality of the audio recordings.
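If Common Voice turns out to be an option, one possible way to pull the Polish split is via the Hugging Face datasets library; a sketch assuming the mozilla-foundation/common_voice_11_0 dataset (gated: you must accept its terms on huggingface.co and log in with huggingface-cli login first), and note the clips are mp3, not wav:
from datasets import load_dataset

# Polish ("pl") configuration of Common Voice.
cv_pl = load_dataset("mozilla-foundation/common_voice_11_0", "pl", split="train")
sample = cv_pl[0]
print(sample["sentence"])       # transcript text
print(sample["audio"]["path"])  # path to the audio clip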
Thanks! Can I run it on the CPU? Because I don't have a GPU.
In general - yes. But CPU is (mostly) way slower than GPU.
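For the script above, a minimal sketch of a CPU-only setup (the "small" checkpoint is an illustrative speed/quality trade-off, not a recommendation from this thread):
import whisper

# Load a smaller checkpoint and pin it to the CPU.
model = whisper.load_model("small", device="cpu")
# fp16 is not supported on CPU, so disable it to avoid the fallback warning.
result = model.transcribe("chunk_0.wav", language="pl", fp16=False)
print(result["text"])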
💪
Thank you very much, dear Benno 😊.
Hello sir, I'm a new subscriber. Could you make a video about the forked Coqui TTS by idiap/coqui-ai-TTS? It says this fork supports using the Fairseq models by Meta, which support 1100 languages.
Thanks for joining the community 😊. I added your topic suggestion to my TODO list, but it might take some time as the list is constantly growing 😉.
@@ThorstenMueller thank you, I really appreciate that!