# Voicefilter/encoder_inference.py at master · deciding/Voicefilter · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import argparse
from pathlib import Path
import numpy as np
from model.embedder import SpeechEmbedder
import torch
from utils.hparams import HParam
import librosa
from utils.audio import Audio

# Usage examples:
#   Evaluation:
#     python encoder_inference.py --in_dir ../vox1_test/wav/ --out_dir spkid --gpu_str 0
#   Generate embeddings for LibriSpeech:
#     python encoder_inference.py --in_dir ../datasets/raw_libri/LibriSpeech --out_dir '' --gpu_str 0
#   Speaker extraction for ConvTasNet (VoiceFilter produces its embedding on the fly):
#     python encoder_inference.py --in_dir training_prepared/train --out_dir '' --gpu_str 0
if __name__ == '__main__':
    # Script entry point: for every ``*dvec.wav`` file located directly inside
    # --in_dir, compute the mel spectrogram, run it through the speech embedder
    # and save the resulting embedding as ``<name>.npy`` in the same directory.
    import os
    from glob import glob
    from tqdm import tqdm

    ## Info & args
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("-e", "--enc_model_fpath", type=Path,
                        default="embedder.pt",
                        help="Path to a saved encoder")
    parser.add_argument('-c', '--config', type=str,
                        default="config/config.yaml",
                        help="yaml file for configuration")
    parser.add_argument("--in_dir", type=str, required=True,
                        help="directory containing the input *dvec.wav files")
    # NOTE: --out_dir stays required so existing command lines keep working,
    # but the active (convtasnet) code path never reads it.
    parser.add_argument("--out_dir", type=str, required=True,
                        help="output dir (currently unused: the .npy files "
                             "are written into --in_dir)")
    parser.add_argument('--gpu_str', default='0',
                        help="value assigned to CUDA_VISIBLE_DEVICES")
    args = parser.parse_args()

    # Must be set before the first CUDA call so that only the requested
    # device(s) are visible to torch.
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
    os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_str)

    print("Preparing the encoder...")
    hp = HParam(args.config)
    # Load onto the CPU first: a checkpoint saved from another GPU index would
    # otherwise fail to deserialize when that device is not visible.
    # load_state_dict() then copies the weights into the CUDA model.
    embedder_pt = torch.load(args.enc_model_fpath, map_location="cpu")
    embedder = SpeechEmbedder(hp).cuda()
    embedder.load_state_dict(embedder_pt)
    embedder.eval()
    audio = Audio(hp)

    # Alternative input selections used for other datasets:
    #   glob('%s/**/*.wav' % args.in_dir, recursive=True)
    #   glob('%s/*dvec3.wav' % args.in_dir)  # convtasnet
    # The active pattern has no '**', so the search is not recursive.
    wav_list = glob(os.path.join(args.in_dir, '*dvec.wav'))  # convtasnet
    if not wav_list:
        print("No '*dvec.wav' files found in %s" % args.in_dir)

    # Pure inference: disable autograd so no graph is built per utterance.
    with torch.no_grad():
        for wav_file in tqdm(wav_list):
            dvec_wav, _ = librosa.load(wav_file, sr=hp.audio.sample_rate)
            dvec_mel = audio.get_mel(dvec_wav)
            dvec_mel = torch.from_numpy(dvec_mel).float().cuda()
            norm_mean_dvector = embedder(dvec_mel)

            # convtasnet: store the embedding next to its source wav.
            # splitext strips only the trailing extension, unlike
            # str.replace('.wav', '') which removes every occurrence.
            stem = os.path.splitext(os.path.basename(wav_file))[0]
            npy_save_path = os.path.join(args.in_dir, '%s.npy' % stem)
            np.save(npy_save_path, norm_mean_dvector.cpu().numpy())