Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 7 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,13 @@ See the note section for the differences between this repo and the paper.
- Tensorflow-gpu 1.5.0
- PyWorld
- librosa
- soundfile
<br/>


### Note:
1. Be sure to create a virtual environment (using `conda` or `virtualenv`)
2. If your Tensorflow is the CPU version, you might have to replace all the `NCHW` ops in my code because Tensorflow-CPU only supports `NHWC` ops and will report an error: `InvalidArgumentError (see above for traceback): Conv2DCustomBackpropInputOp only supports NHWC.`
3. `soundfile` might require installing some codecs with `sudo apt-get install`.

<br/>
<br/>

Expand All @@ -34,12 +33,12 @@ pip install -r requirements.txt

# feature extraction
python analyzer.py \
--dir_to_wav dataset/vcc2016/wav \
--dir_to_bin dataset/vcc2016/bin
--dir_to_wav /fastdata/ac1zy/data/vcc2016/wav \
--dir_to_bin /fastdata/ac1zy/data/vcc2016/bin

# collect stats
python build.py \
--train_file_pattern "dataset/vcc2016/bin/Training Set/*/*.bin" \
--train_file_pattern "/fastdata/ac1zy/data/vcc2016/bin/training/*/*.bin" \
--corpus_name vcc2016

# training
Expand All @@ -53,7 +52,7 @@ python convert-vawgan.py \
--module model.vawgan \
--model VAWGAN \
--checkpoint logdir/train/[timestamp]/model.ckpt-[modelid] \
--file_pattern "./dataset/vcc2016/bin/Testing Set/{}/*.bin"
--file_pattern "/fastdata/ac1zy/data/vcc2016/bin/testing/{}/*.bin"
# Just an example; please fill in `timestamp` and `modelid`.
```

Expand All @@ -62,9 +61,9 @@ Description:
2. Run `analyzer.py` to extract features and write features into binary files. (This takes a few minutes.)
3. Run `build.py` to collect stats, such as spectral extrema and pitch.
4. To train a VAE or VAWGAN, for example, run
5. You can find your models in `./logdir/train/[timestamp]`
5. You can find your models in `/fastdata/ac1zy/data/vcc2016/logdir/train/[timestamp]`
6. To convert the voice, run
7. You can find the converted wav files in `./logdir/output/[timestamp]`
7. You can find the converted wav files in `/fastdata/ac1zy/data/vcc2016/logdir/output/[timestamp]`
8. The VAE in `model.vae` supports multiple speaker training while that in `model.vawgan` does not.
The VAE can be trained and used with the following snippets:

Expand Down
4 changes: 2 additions & 2 deletions analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@


args = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('dir_to_wav', './dataset/vcc2016/wav', 'Dir to *.wav')
tf.app.flags.DEFINE_string('dir_to_bin', './dataset/vcc2016/bin', 'Dir to output *.bin')
tf.app.flags.DEFINE_string('dir_to_wav', '/fastdata/ac1zy/data/vcc2016/wav', 'Dir to *.wav')
tf.app.flags.DEFINE_string('dir_to_bin', '/fastdata/ac1zy/data/vcc2016/bin', 'Dir to output *.bin')
tf.app.flags.DEFINE_integer('fs', 16000, 'Global sampling frequency')
tf.app.flags.DEFINE_float('f0_ceil', 500, 'Global f0 ceiling')

Expand Down
14 changes: 7 additions & 7 deletions build.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import numpy as np
import pyworld as pw
import soundfile as sf
import librosa
import tensorflow as tf
from analyzer import pw2wav, read, read_whole_features

Expand All @@ -12,7 +12,7 @@
)
tf.app.flags.DEFINE_string(
'train_file_pattern',
'./dataset/vcc2016/bin/Training Set/*/*.bin',
'/fastdata/ac1zy/data/vcc2016/bin/training/*/*.bin',
'training dir (to *.bin)'
)

Expand Down Expand Up @@ -76,7 +76,7 @@ def main():

def test():
# ==== Test: batch mixer (conclusion: capacity should be larger to make sure good mixing) ====
x, y = read('./dataset/vcc2016/bin/*/*/1*001.bin', 32, min_after_dequeue=1024, capacity=2048)
x, y = read('/fastdata/ac1zy/data/vcc2016/bin/*/*/1*001.bin', 32, min_after_dequeue=1024, capacity=2048)
sv = tf.train.Supervisor()
with sv.managed_session() as sess:
for _ in range(200):
Expand All @@ -85,23 +85,23 @@ def test():


# ===== Read binary ====
features = read_whole_features('./dataset/vcc2016/bin/Training Set/SF1/*001.bin')
features = read_whole_features('/fastdata/ac1zy/data/vcc2016/bin/training/SF1/*001.bin')

sv = tf.train.Supervisor()
with sv.managed_session() as sess:
features = sess.run(features)

y = pw2wav(features)
sf.write('test1.wav', y, 16000) # TODO fs should be specified externally.
librosa.output.write_wav('test1.wav', y, 16000) # TODO fs should be specified externally.


# ==== Direct read =====
f = './dataset/vcc2016/bin/Training Set/SF1/100001.bin'
f = '/fastdata/ac1zy/data/vcc2016/bin/training/SF1/100001.bin'
features = np.fromfile(f, np.float32)
features = np.reshape(features, [-1, 513*2 + 1 + 1 + 1]) # f0, en, spk

y = pw2wav(features)
sf.write('test2.wav', y, 16000)
librosa.output.write_wav('test2.wav', y, 16000)


if __name__ == '__main__':
Expand Down
2 changes: 1 addition & 1 deletion util/wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ def get_default_logdir(logdir_root):
'You can only specify either --logdir or --logdir_root')

if args.logdir_root is None:
logdir_root = 'logdir'
logdir_root = '/fastdata/ac1zy/data/vcc2016/logdir'

if args.logdir is None:
logdir = get_default_logdir(logdir_root)
Expand Down