diff --git a/preprocessor/aishell3.py b/preprocessor/aishell3.py index 27b92aab12..e32607b6e9 100644 --- a/preprocessor/aishell3.py +++ b/preprocessor/aishell3.py @@ -2,7 +2,7 @@ import librosa import numpy as np -from scipy.io import wavfile +import soundfile as sf from tqdm import tqdm @@ -10,7 +10,7 @@ def prepare_align(config): in_dir = config["path"]["corpus_path"] out_dir = config["path"]["raw_path"] sampling_rate = config["preprocessing"]["audio"]["sampling_rate"] - max_wav_value = config["preprocessing"]["audio"]["max_wav_value"] + # max_wav_value = config["preprocessing"]["audio"]["max_wav_value"] for dataset in ["train", "test"]: print("Processing {}ing set...".format(dataset)) with open(os.path.join(in_dir, dataset, "content.txt"), encoding="utf-8") as f: @@ -22,11 +22,12 @@ def prepare_align(config): if os.path.exists(wav_path): os.makedirs(os.path.join(out_dir, speaker), exist_ok=True) wav, _ = librosa.load(wav_path, sampling_rate) - wav = wav / max(abs(wav)) * max_wav_value - wavfile.write( + wav = wav / max(abs(wav)) # * max_wav_value + sf.write( os.path.join(out_dir, speaker, wav_name), + wav, sampling_rate, - wav.astype(np.int16), + subtype='PCM_16' ) with open( os.path.join(out_dir, speaker, "{}.lab".format(wav_name[:11])), diff --git a/preprocessor/libritts.py b/preprocessor/libritts.py index a90fcdee5d..6d13d30f4e 100644 --- a/preprocessor/libritts.py +++ b/preprocessor/libritts.py @@ -2,7 +2,7 @@ import librosa import numpy as np -from scipy.io import wavfile +import soundfile as sf from tqdm import tqdm from text import _clean_text @@ -12,7 +12,7 @@ def prepare_align(config): in_dir = config["path"]["corpus_path"] out_dir = config["path"]["raw_path"] sampling_rate = config["preprocessing"]["audio"]["sampling_rate"] - max_wav_value = config["preprocessing"]["audio"]["max_wav_value"] + # max_wav_value = config["preprocessing"]["audio"]["max_wav_value"] cleaners = config["preprocessing"]["text"]["text_cleaners"] for speaker in tqdm(os.listdir(in_dir)): for chapter in os.listdir(os.path.join(in_dir, speaker)): @@ -32,11 +32,12 @@ def prepare_align(config): os.makedirs(os.path.join(out_dir, speaker), exist_ok=True) wav, _ = librosa.load(wav_path, sampling_rate) - wav = wav / max(abs(wav)) * max_wav_value - wavfile.write( + wav = wav / max(abs(wav)) # * max_wav_value + sf.write( os.path.join(out_dir, speaker, "{}.wav".format(base_name)), + wav, sampling_rate, - wav.astype(np.int16), + subtype='PCM_16' ) with open( os.path.join(out_dir, speaker, "{}.lab".format(base_name)), diff --git a/preprocessor/ljspeech.py b/preprocessor/ljspeech.py index a2f664873a..7803770fb7 100644 --- a/preprocessor/ljspeech.py +++ b/preprocessor/ljspeech.py @@ -2,7 +2,7 @@ import librosa import numpy as np -from scipy.io import wavfile +import soundfile as sf from tqdm import tqdm from text import _clean_text @@ -12,7 +12,7 @@ def prepare_align(config): in_dir = config["path"]["corpus_path"] out_dir = config["path"]["raw_path"] sampling_rate = config["preprocessing"]["audio"]["sampling_rate"] - max_wav_value = config["preprocessing"]["audio"]["max_wav_value"] + # max_wav_value = config["preprocessing"]["audio"]["max_wav_value"] cleaners = config["preprocessing"]["text"]["text_cleaners"] speaker = "LJSpeech" with open(os.path.join(in_dir, "metadata.csv"), encoding="utf-8") as f: @@ -26,11 +26,12 @@ def prepare_align(config): if os.path.exists(wav_path): os.makedirs(os.path.join(out_dir, speaker), exist_ok=True) wav, _ = librosa.load(wav_path, sampling_rate) - wav = wav / max(abs(wav)) * max_wav_value - wavfile.write( + wav = wav / max(abs(wav)) # * max_wav_value + sf.write( os.path.join(out_dir, speaker, "{}.wav".format(base_name)), + wav, sampling_rate, - wav.astype(np.int16), + subtype='PCM_16' ) with open( os.path.join(out_dir, speaker, "{}.lab".format(base_name)),