tacotron.py (forked from keithito/tacotron)
import tensorflow as tf
from tensorflow.contrib.rnn import GRUCell, MultiRNNCell, OutputProjectionWrapper, ResidualWrapper
from tensorflow.contrib.seq2seq import BasicDecoder, AttentionWrapper
from text.symbols import symbols
from util.infolog import log
from .attention import LocationSensitiveAttention
from .helpers import TacoTestHelper, TacoTrainingHelper
from .modules import encoder_cbhg, post_cbhg, prenet
from .rnn_wrappers import DecoderPrenetWrapper, ConcatOutputAndAttentionWrapper


class Tacotron():
  def __init__(self, hparams):
    self._hparams = hparams

  def initialize(self, inputs, input_lengths, mel_targets=None, linear_targets=None):
    '''Initializes the model for inference.

    Sets "mel_outputs", "linear_outputs", and "alignments" fields.

    Args:
      inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
        steps in the input time series, and values are character IDs
      input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
        of each sequence in inputs.
      mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
        of steps in the output time series, M is num_mels, and values are entries in the mel
        spectrogram. Only needed for training.
      linear_targets: float32 Tensor with shape [N, T_out, F] where N is batch size, T_out is number
        of steps in the output time series, F is num_freq, and values are entries in the linear
        spectrogram. Only needed for training.
    '''
    with tf.variable_scope('inference') as scope:
      is_training = linear_targets is not None
      batch_size = tf.shape(inputs)[0]
      hp = self._hparams

      # Embeddings
      embedding_table = tf.get_variable(
        'embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32,
        initializer=tf.truncated_normal_initializer(stddev=0.5))
      embedded_inputs = tf.nn.embedding_lookup(embedding_table, inputs)            # [N, T_in, 256]

      # Encoder
      prenet_outputs = prenet(embedded_inputs, is_training)                        # [N, T_in, 128]
      encoder_outputs = encoder_cbhg(prenet_outputs, input_lengths, is_training)   # [N, T_in, 256]

      # Attention
      attention_cell = AttentionWrapper(
        DecoderPrenetWrapper(GRUCell(256), is_training),
        LocationSensitiveAttention(256, encoder_outputs),
        alignment_history=True,
        output_attention=False)                                                    # [N, T_in, 256]

      # Concatenate attention context vector and RNN cell output into a 512D vector.
      concat_cell = ConcatOutputAndAttentionWrapper(attention_cell)                # [N, T_in, 512]

      # Decoder (layers specified bottom to top):
      decoder_cell = MultiRNNCell([
        OutputProjectionWrapper(concat_cell, 256),
        ResidualWrapper(GRUCell(256)),
        ResidualWrapper(GRUCell(256))
      ], state_is_tuple=True)                                                      # [N, T_in, 256]

      # Project onto r mel spectrograms (predict r outputs at each RNN step):
      output_cell = OutputProjectionWrapper(decoder_cell, hp.num_mels * hp.outputs_per_step)
      decoder_init_state = output_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

      if is_training:
        helper = TacoTrainingHelper(inputs, mel_targets, hp.num_mels, hp.outputs_per_step)
      else:
        helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step)

      (decoder_outputs, _), final_decoder_state, _ = tf.contrib.seq2seq.dynamic_decode(
        BasicDecoder(output_cell, helper, decoder_init_state),
        maximum_iterations=hp.max_iters)                                           # [N, T_out/r, M*r]

      # Reshape outputs to be one output per entry
      mel_outputs = tf.reshape(decoder_outputs, [batch_size, -1, hp.num_mels])     # [N, T_out, M]
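      # Illustrative note: with the upstream defaults num_mels=80 and outputs_per_step=5,
      # the decoder emits 80*5 = 400 values per step; the reshape above unpacks each step
      # into 5 consecutive 80-dim mel frames, so T_out = outputs_per_step * decoder steps.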

      # Add post-processing CBHG:
      post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training)              # [N, T_out, 256]
      linear_outputs = tf.layers.dense(post_outputs, hp.num_freq)                  # [N, T_out, F]

      # Grab alignments from the final decoder state:
      alignments = tf.transpose(final_decoder_state[0].alignment_history.stack(), [1, 2, 0])

      self.inputs = inputs
      self.input_lengths = input_lengths
      self.mel_outputs = mel_outputs
      self.linear_outputs = linear_outputs
      self.alignments = alignments
      self.mel_targets = mel_targets
      self.linear_targets = linear_targets
      log('Initialized Tacotron model. Dimensions: ')
      log('  embedding:               %d' % embedded_inputs.shape[-1])
      log('  prenet out:              %d' % prenet_outputs.shape[-1])
      log('  encoder out:             %d' % encoder_outputs.shape[-1])
      log('  attention out:           %d' % attention_cell.output_size)
      log('  concat attn & out:       %d' % concat_cell.output_size)
      log('  decoder cell out:        %d' % decoder_cell.output_size)
      log('  decoder out (%d frames): %d' % (hp.outputs_per_step, decoder_outputs.shape[-1]))
      log('  decoder out (1 frame):   %d' % mel_outputs.shape[-1])
      log('  postnet out:             %d' % post_outputs.shape[-1])
      log('  linear out:              %d' % linear_outputs.shape[-1])
def add_loss(self):
'''Adds loss to the model. Sets "loss" field. initialize must have been called.'''
with tf.variable_scope('loss') as scope:
hp = self._hparams
self.mel_loss = tf.reduce_mean(tf.abs(self.mel_targets - self.mel_outputs))
l1 = tf.abs(self.linear_targets - self.linear_outputs)
# Prioritize loss for frequencies under 3000 Hz.
n_priority_freq = int(3000 / (hp.sample_rate * 0.5) * hp.num_freq)
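      # Worked example (illustrative, assuming the upstream defaults sample_rate=20000 and
      # num_freq=1025): n_priority_freq = int(3000 / 10000 * 1025) = 307, i.e. roughly the
      # bins covering 0-3 kHz, which contribute to an extra loss term below.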
      self.linear_loss = 0.5 * tf.reduce_mean(l1) + 0.5 * tf.reduce_mean(l1[:, :, 0:n_priority_freq])
      self.loss = self.mel_loss + self.linear_loss

  def add_optimizer(self, global_step):
    '''Adds optimizer. Sets "gradients" and "optimize" fields. add_loss must have been called.

    Args:
      global_step: int32 scalar Tensor representing current global step in training
    '''
    with tf.variable_scope('optimizer') as scope:
      hp = self._hparams
      self.learning_rate = tf.train.exponential_decay(
        hp.initial_learning_rate, global_step, hp.learning_rate_decay_halflife, 0.5)
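      # exponential_decay computes lr = initial_learning_rate * 0.5 ** (global_step / halflife),
      # so the learning rate halves every learning_rate_decay_halflife steps (the decay is
      # smooth, since staircase defaults to False).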
      optimizer = tf.train.AdamOptimizer(self.learning_rate, hp.adam_beta1, hp.adam_beta2)
      gradients, variables = zip(*optimizer.compute_gradients(self.loss))
      self.gradients = gradients
      clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1.0)

      # Add dependency on UPDATE_OPS; otherwise batchnorm won't work correctly. See:
      # https://github.com/tensorflow/tensorflow/issues/1122
      with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
        self.optimize = optimizer.apply_gradients(zip(clipped_gradients, variables),
          global_step=global_step)
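

# A minimal usage sketch of how the three methods fit together in a TF1-style training setup,
# assuming a repo-level `hparams` module with the usual fields (num_mels, num_freq, etc.).
# The placeholder names below are illustrative; the fork's actual feeder/train script may differ.
if __name__ == '__main__':
  from hparams import hparams  # assumed: default hyperparameter set at the repo root

  # Feed character IDs and target spectrograms as placeholders (a real trainer would use a
  # data feeder/queue instead).
  inputs = tf.placeholder(tf.int32, [None, None], 'inputs')                       # [N, T_in]
  input_lengths = tf.placeholder(tf.int32, [None], 'input_lengths')               # [N]
  mel_targets = tf.placeholder(tf.float32, [None, None, hparams.num_mels], 'mel_targets')
  linear_targets = tf.placeholder(tf.float32, [None, None, hparams.num_freq], 'linear_targets')

  global_step = tf.Variable(0, name='global_step', trainable=False)
  model = Tacotron(hparams)
  model.initialize(inputs, input_lengths, mel_targets, linear_targets)  # builds outputs/alignments
  model.add_loss()                                                      # sets model.loss
  model.add_optimizer(global_step)                                      # sets model.optimize

  with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # One training step per batch: run model.loss and model.optimize with a feed_dict that
    # supplies the four placeholders above.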