# GenVC/train_vocoder.py at main · caizexin/GenVC · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
from configs.genVC_train_configs import GPTArgs, genVCAudioConfig, GPTTrainerConfig
from trainers.hifigan_trainer import HiFiGANTrainer
from configs.vocoder_configs import BaseVocoderConfig
from configs.vae_config import VAEConfig
from configs.base_configs import BaseAudioConfig
from trainer import Trainer, TrainerArgs

# Paths to the pre-trained assets this script consumes.
# Mel normalisation statistics; passed as `mel_norm_file` to both DVAE configs
# and to the GPT arguments.
MEL_NORM_FILE = "pre_trained/mel_stats.pth"
# Acoustic DVAE weights (passed as `acoustic_dvae_checkpoint`).
DVAE_CHECKPOINT = "pre_trained/acoustic_dvae.pth"
# Content DVAE weights (passed as `content_dvae_checkpoint`).
CONTENT_DVAE_CHECKPOINT = "pre_trained/content_dvae.pth"
# ContentVec model file (passed as `contentvec_model_path`).
CONTENTVEC_MODEL_PATH = "pre_trained/contentVec.pt"
# GPT weights (passed as `gpt_checkpoint`).
GPT_CHECKPOINT = "pre_trained/gpt.pth"
# Optional HiFi-GAN checkpoint (passed as `hifigan_checkpoint`); None means no
# vocoder checkpoint is supplied.
VOCODER_CHECKPOINT = None

# Acoustic DVAE settings, copied from train_audio_dvae.py.
# NOTE(review): presumably these have to stay in sync with the run that
# produced DVAE_CHECKPOINT -- confirm before changing any value.
acousticDVAE_audio_config = BaseAudioConfig(dvae_sample_rate=24000)
acousticDVAE_config = VAEConfig(
    # --- input side ---
    audio=acousticDVAE_audio_config,
    mel_norm_file=MEL_NORM_FILE,
    num_channels=80,
    # --- codebook ---
    num_tokens=1024,
    codebook_dim=512,
    # --- network shape ---
    hidden_dim=512,
    num_layers=2,
    num_resnet_blocks=3,
    kernel_size=3,
)

# Content DVAE settings, copied from train_content_dvae.py.
# NOTE(review): presumably these have to stay in sync with the run that
# produced CONTENT_DVAE_CHECKPOINT -- confirm before changing any value.
contentDVAE_audio_config = BaseAudioConfig(dvae_sample_rate=16000)
contentDVAE_config = VAEConfig(
    # --- input side ---
    audio=contentDVAE_audio_config,
    mel_norm_file=MEL_NORM_FILE,
    num_channels=256,
    # --- codebook ---
    num_tokens=256,
    codebook_dim=512,
    # --- network shape ---
    hidden_dim=512,
    num_layers=2,
    num_resnet_blocks=3,
    kernel_size=3,
)

# Arguments for the GPT part of the model that the vocoder trainer builds on.
model_args = GPTArgs(
    # --- checkpoints / statistics ---
    mel_norm_file=MEL_NORM_FILE,
    gpt_checkpoint=GPT_CHECKPOINT,
    hifigan_checkpoint=VOCODER_CHECKPOINT,
    # --- audio token vocabulary ---
    # 1024 matches num_tokens of the acoustic DVAE config in this file; the
    # start/stop ids are presumably the two extra entries that follow it.
    gpt_start_audio_token=1024,
    gpt_stop_audio_token=1025,
    gpt_num_audio_tokens=1026,
    # --- "text" (content) token vocabulary ---
    # 256 matches num_tokens of the content DVAE config in this file; the
    # start/stop ids are presumably the two extra entries that follow it.
    gpt_start_text_token=256,
    gpt_stop_text_token=257,
    gpt_number_text_tokens=258,
    # --- sequence length window (min == max, i.e. a fixed length of 8) ---
    # Original author's note: 8 tokens = 0.64 seconds for 20ms frame rate.
    min_text_length=8,
    max_text_length=8,
    # --- GPT behaviour ---
    gpt_n_heads=4,
    gpt_fix_condition_embeddings=True,
    gpt_use_masking_gt_prompt_approach=True,
)

# Default audio and vocoder settings; no overrides are applied here.
audio_config = genVCAudioConfig()
vocoder_config = BaseVocoderConfig()

# Full trainer configuration for the HiFi-GAN vocoder run.
config = GPTTrainerConfig(
    # --- pre-trained components ---
    contentvec_model_path=CONTENTVEC_MODEL_PATH,
    acoustic_dvae_checkpoint=DVAE_CHECKPOINT,
    content_dvae_checkpoint=CONTENT_DVAE_CHECKPOINT,  # pre-trained content DVAE weights
    # --- sub-configs ---
    model_args=model_args,
    audio=audio_config,
    acoustic_dvae_config=acousticDVAE_config,
    content_dvae_config=contentDVAE_config,
    vocoder_config=vocoder_config,
    # --- data ---
    train_metafile='metafiles/libritts/train.txt',
    test_metafile='metafiles/libritts/test.txt',
    batch_size=64,
    eval_batch_size=64,
    num_loader_workers=24,
    # --- schedule ---
    epochs=50,
    warmup_steps=1000,
    # --- optimisation ---
    optimizer="AdamW",
    lr=2e-4,
    optimizer_params={
        "betas": [0.8, 0.99],
        "eps": 1e-8,
        "weight_decay": 1e-6,
    },
    # NOTE(review): weight decay is set both here and inside optimizer_params
    # (same value); which one the trainer actually uses is not visible from
    # this file -- confirm before changing only one of them.
    weight_decay=1e-6,
    optimizer_wd_only_on_weights=True,
    max_grad_norm=1.0,
    # --- console / plot logging ---
    print_step=50,
    plot_step=500,
    log_model_step=100,
    print_eval=False,
    # --- checkpointing and output ---
    run_name="hifi-gan",
    output_path="exp/HiFiGAN_LibriTTS",
    save_step=5000,
    save_checkpoints=True,
    save_n_checkpoints=2,
    # --- Weights & Biases ---
    use_wandb=True,
    wandb_project='hifi-gan',
    wandb_run_name='libritts',
)

if __name__ == '__main__':
    vocoder_model = HiFiGANTrainer.init_from_config(config)

    # Resuming from a Coqui Vocoder Trainer checkpoint is currently not
    # supported for the HiFi-GAN Trainer, so no restore path is given.
    # To start from existing HiFi-GAN weights, point `hifigan_checkpoint`
    # at the checkpoint instead.
    restore_path = None

    trainer = Trainer(
        TrainerArgs(restore_path=restore_path),
        config,
        model=vocoder_model,
        output_path=config.output_path,
    )
    trainer.fit()