-
Notifications
You must be signed in to change notification settings - Fork 46
/
Copy pathdefault.yaml
77 lines (77 loc) · 3.36 KB
/
default.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# Dataset selection: a base path, a dataset name and a dataset directory name.
data:
  datasets_path: "./datasets"
  dataset: "ljspeech"
  # Presumably resolved relative to `datasets_path`; verify against the loader.
  dataset_dir: "LJSpeech-1.1"
# Symbol inventories for text input.
#
# The four literal inventories are written as native YAML sequences: each one
# loads as the same list of strings, in the same order, that the former
# `eval` expressions produced, without executing Python for plain data.
# Every symbol is quoted so the parser cannot re-type it (e.g. "-", "?", " ").
text:
  # The 26 lowercase Latin letters.
  # NOTE(review): the anchor &gs is not aliased anywhere in this file.
  graphemes: &gs ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
                  "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"]
  # Phoneme symbols, 70 entries; the order is kept exactly as before.
  phonemes: &ps ["AA0", "AA1", "AA2", "AE0", "AE1", "AE2", "AH0", "AH1", "AH2",
                 "AO0", "AO1", "AO2", "AW0", "AW1", "AW2", "AY0", "AY1", "AY2",
                 "B", "CH", "D", "DH",
                 "EH0", "EH1", "EH2", "ER0", "ER1", "ER2", "EY0", "EY1", "EY2",
                 "F", "G", "HH",
                 "IH0", "IH1", "IH2", "IY0", "IY1", "IY2",
                 "JH", "K", "L", "M", "N", "NG",
                 "OW0", "OW1", "OW2", "OY0", "OY1", "OY2",
                 "P", "R", "S", "SH", "T", "TH",
                 "UH0", "UH1", "UH2", "UW", "UW0", "UW1", "UW2",
                 "V", "W", "Y", "Z", "ZH"]
  specials: &sp ["<pad>", "<unk>"]
  punctuations: &pt [".", ",", "?", "!", " ", "-"]
  # Concatenation phonemes + specials + punctuations, in that order. YAML has
  # no way to concatenate sequences, so this entry still runs Python `eval` at
  # load time: the file needs a loader that permits `python/object/apply` and
  # must only be loaded from a trusted source.
  # NOTE(review): the list is always built from `phonemes` (*ps), whatever
  # `use_phonemes` is set to -- confirm this is intended.
  units_list: &ul !!python/object/apply:eval ['us+sp+pt', {'us': *ps, 'sp': *sp, 'pt': *pt}]
  # NOTE(review): the anchor &up is not aliased anywhere in this file.
  use_phonemes: &up true
# Audio and mel-spectrogram settings.
# NOTE(review): the lines below carry no indentation in this copy; they are
# presumably children of `audio:` -- confirm against the repository version.
audio:
# Anchored as &nm; aliased by the `out_channels` keys that follow `parallel:`
# and `duration:`.
n_mel_channels: &nm 80
filter_length: 1024
hop_length: 256 # WARNING: this can't be changed.
win_length: 1024
# Anchored as &sr; `segment_length` below is an alias of it, so both are 22050.
sampling_rate: &sr 22050
segment_length: *sr
pad_short: 2000
mel_fmin: 0.0
mel_fmax: 8000.0
# Precomputed statistics of the log-mel spectrograms of the speech dataset
spec_mean: -5.522 # for LJSpeech dataset
spec_std: 2.063 # for LJSpeech dataset
spec_min: -11.5129 # for LJSpeech dataset
spec_max: 2.0584 # for LJSpeech dataset
# Others
force_frame_rate: true # force match sampling rate
# NOTE(review): `normalize:` has no value here. With the indentation lost it is
# unclear whether `match_volume` and `trim_silence` are nested under it or are
# its siblings (which would leave `normalize` null) -- confirm before editing.
normalize:
match_volume: false
trim_silence: false
reduction_rate: 4
# Settings for the `parallel` section.
parallel:
  ground_truth: false
  # Alias of audio.n_mel_channels (anchor &nm).
  out_channels: *nm
  # Length of text.units_list (anchor &ul), computed by `eval` at load time.
  alphabet_size: !!python/object/apply:eval ["len(ul)", {"ul": *ul}]
  channels: 128
  enc_kernel_size: 4
  dec_kernel_size: 4
  # Dilation schedules, expanded by `eval` at load time. The receptive-field
  # figures are the original author's notes.
  # NOTE(review): confirm they still hold for the kernel sizes set above.
  enc_dilations: !!python/object/apply:eval ["4 * [1,2,4] + [1]"]  # receptive field is max 15
  dec_dilations: !!python/object/apply:eval ["4 * [1,2,4,8] + [1]"]  # receptive field is max 32
  # NOTE(review): the option names in the two notes below ('freq', 'relu', ...)
  # do not match the class-style values actually set -- confirm what the
  # consumer accepts.
  normalize: "FreqNorm"  # original note: 'freq', 'layer', 'batch'
  activation: "torch.nn.ReLU"  # original note: 'relu', 'linear', 'sigmoid'
  final_activation: "torch.nn.Identity"
  pos_mode: "duration"  # original note: 'standard', 'duration'
  interpolate: false  # original note: true (presumably the alternative value)
  separate_duration_grad: true
  speaker_dim: 128  # speaker embedding dim
# Settings for the `duration` section, including spectrogram augmentation.
duration:
  # Alias of audio.n_mel_channels (anchor &nm).
  out_channels: *nm
  # Length of text.units_list (anchor &ul), computed by `eval` at load time.
  alphabet_size: !!python/object/apply:eval ["len(ul)", {"ul": *ul}]
  channels: 40
  hidden_channels: 80
  kernel_size: 3
  # The three dilation schedules evaluate the same expression at load time.
  text_enc_dilations: !!python/object/apply:eval ["2 * [1,3,9,27] + [1,1]"]
  spec_enc_dilations: !!python/object/apply:eval ["2 * [1,3,9,27] + [1,1]"]
  spec_dec_dilations: !!python/object/apply:eval ["2 * [1,3,9,27] + [1,1]"]
  # NOTE(review): torch.nn.Linear is not a normalization layer, and the
  # original note mentions 'layer' -- confirm this value is intended.
  normalize: "torch.nn.Linear"  # original note: 'layer'; then 'freq', 'layer', 'batch'
  activation: "torch.nn.ReLU"  # original note: 'relu', 'linear', 'sigmoid'
  final_activation: "torch.nn.Sigmoid"
  att_noise: 0.1
  att_hidden_channels: 80
  pos_mode: "duration"  # original note: 'standard', 'duration'

  # Spectrogram augmentation options; the notes number three techniques.
  enable_augment: true
  # (1) Add normal noise to the input spectrograms.
  noise: 0.01
  # (2) Feed spectrograms through the model `feed_repeat` times and use the
  #     degraded output as input for training.
  feed_repeat: 2
  # How many items in the batch are degraded (given as a ratio).
  feed_ratio: 0.5
  # (3) Replace random spectrogram frames with random noise.
  replace_ratio: 0.1
# Checkpoint file name for the `melgan` section.
melgan:
  checkpoint: "melgan-epoch3200.pth"
# Trainer options: progress-bar switch and log directory.
trainer:
  disable_progress_bar: false
  logdir: "./logdir"
# Synthesizer paths: the input text file and the output directory.
synthesizer:
  inputs_file_path: "./outputs/text/synthesize.txt"
  outputs_dir: "./outputs/audio"