# server.py
import socket
import os
import sys
import argparse
import time as t

import numpy as np
import torch
import tensorflow as tf
from scipy.io import wavfile

from model.models import ForwardTransformer
from utils.training_config_manager import TrainingConfigManager
from data.audio import Audio
from generator import Generator
# Limit GPU memory: enable memory growth on every GPU and cap the first GPU at 16000 MB.
gpus = tf.config.experimental.list_physical_devices(device_type='GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
if gpus:
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],
        [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=16000)])
def socket_service(FFT_model, AR_model, vocoder):
    # `s` is the module-level listening socket created in __main__.
    try:
        socket.setdefaulttimeout(20)
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        s.bind(('***.***.**.***', 4000))  # Enter your IP
    except ConnectionResetError:
        print("==> ConnectionResetError")
    except socket.error as msg:
        print(msg)
        sys.exit(1)
    print('Socket bind complete')
    s.listen(10)
    print('Socket now listening')
    print("Wait for Connection.....................")
    while True:
        sock, addr = s.accept()  # addr is an (ip, port) tuple
        print("Accept connection from {}".format(addr))  # show the client's IP and port
        # Protocol: the client sends the text, then a 4-byte speed value,
        # then the model name ("FFT" or "TransformerTTS"); the server replies
        # with the synthesized wav file.
        text = str(sock.recv(2000), encoding="utf-8")
        if text:
            sp = float(str(sock.recv(4), encoding="utf-8"))
            model = str(sock.recv(100), encoding="utf-8")
            text = text_normalization(text)
            if model == "FFT":
                FFT_synthesis(text, FFT_model, vocoder, sp)
                with open("./FFT_melgan.wav", 'rb') as f:
                    for data in f:
                        sock.send(data)
            elif model == "TransformerTTS":
                AR_synthesis(text, AR_model, vocoder)
                with open("./AR_melgan.wav", 'rb') as f:
                    for data in f:
                        sock.send(data)
        sock.close()
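
# A minimal client sketch matching the protocol above (illustrative only: the
# host is a placeholder, the speed value must encode to exactly 4 UTF-8 bytes
# such as "1.00", and short pauses between the sends may be needed because the
# server reads each field with a separate recv()):
#
#   import socket, time
#   c = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
#   c.connect(('***.***.**.***', 4000))
#   c.send("今天天氣很好".encode("utf-8"))     # text to synthesize
#   time.sleep(0.1)
#   c.send("1.00".encode("utf-8"))             # speed regulator, 4 bytes
#   time.sleep(0.1)
#   c.send("FFT".encode("utf-8"))              # "FFT" or "TransformerTTS"
#   with open("received.wav", "wb") as f:
#       while True:
#           chunk = c.recv(4096)
#           if not chunk:
#               break
#           f.write(chunk)
#   c.close()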
def text_normalization(text):
    """Replace heteronym characters with phonetic stand-ins and convert ASCII
    digits to Chinese numerals so the TTS models pronounce them correctly."""
    temp = list(text)
    digits = {"0": "零", "1": "一", "2": "二", "3": "三", "4": "四",
              "5": "五", "6": "六", "7": "七", "8": "八", "9": "九"}
    for i in range(len(temp)):
        # Look ahead safely; an empty string means "no next character".
        nxt = temp[i + 1] if i + 1 < len(temp) else ""
        nxt2 = temp[i + 2] if i + 2 < len(temp) else ""
        if temp[i] == "奇":
            if nxt == "數":
                temp[i] = "雞"
            elif nxt == "偶" and nxt2 == "數":
                temp[i] = "雞"
        if temp[i] == "澄":
            temp[i] = "成"
        if temp[i] == "放" and nxt == "假":
            temp[i + 1] = "價"
        if temp[i] == "幾" and nxt == "乎":
            temp[i] = "雞"
        if temp[i] == "暖" and nxt == "和":
            temp[i + 1] = "活"
        if temp[i] == "著":
            if nxt == "急":
                temp[i] = "昭"
            elif nxt == "名" or nxt == "作":
                temp[i] = "住"
        if temp[i] == "音" and nxt == "樂":
            temp[i + 1] = "月"
        if temp[i] == "樂" and nxt == "器":
            temp[i] = "月"
        # ---------------------------------------------------
        if temp[i] in digits:
            temp[i] = digits[temp[i]]
    return "".join(temp)
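
# Example (illustrative): text_normalization("音樂課在2點") returns "音月課在二點",
# since "樂" following "音" is swapped for the phonetic stand-in "月" and the
# digit "2" becomes "二".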
def FFT_synthesis(text, FFT_model, vocoder, sp):
    print(text, sp, FFT_model)
    print("----------------------start----------------------")
    start = t.time()
    # Predict the mel spectrogram with the ForwardTransformer, then vocode it with MelGAN.
    FFT_out = FFT_model.predict(text, speed_regulator=sp)
    FFT_mel = torch.tensor(FFT_out['mel'].numpy().T[np.newaxis, :, :])
    if torch.cuda.is_available():
        vocoder = vocoder.cuda()
        FFT_mel = FFT_mel.cuda()
    with torch.no_grad():
        FFT_speech = vocoder.inference(FFT_mel)
    wavfile.write("./FFT_melgan.wav", 22050, FFT_speech.cpu().numpy())
    ete = t.time()
    print(ete - start, "s for FFT end-to-end synthesis,", FFT_out["mel"].shape[0], "mel frames")
def AR_synthesis(text, AR_model, vocoder):
    print(text, AR_model)
    print("----------------------start----------------------")
    start = t.time()
    # Predict the mel spectrogram autoregressively, then vocode it with MelGAN.
    AR_out = AR_model.predict(text)
    AR_mel = torch.tensor(AR_out['mel'].numpy().T[np.newaxis, :, :])
    if torch.cuda.is_available():
        vocoder = vocoder.cuda()
        AR_mel = AR_mel.cuda()
    with torch.no_grad():
        AR_speech = vocoder.inference(AR_mel)
    wavfile.write("./AR_melgan.wav", 22050, AR_speech.cpu().numpy())
    ete = t.time()
    print(ete - start, "s for AR end-to-end synthesis,", AR_out["mel"].shape[0], "mel frames")
if __name__ == "__main__":
    # Load the non-autoregressive (ForwardTransformer) and autoregressive (TransformerTTS) models.
    FFT_model = ForwardTransformer.load_model(
        './logdir/bznsyp/tts_swap_conv_dims.alinger_extralayer_layernorm/weights/step_85000/')
    config_loader = TrainingConfigManager(config_path='./config/training_config.yaml', aligner=True)
    AR_model = config_loader.load_model()
    # Load the MelGAN vocoder from a local checkout.
    vocoder = torch.hub.load('../melgan-train2', 'melgan', source='local')
    vocoder.eval()
    # Create the listening socket (used as a module-level global by socket_service).
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    print('Socket created')
    socket_service(FFT_model, AR_model, vocoder)
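
# Note: the checkpoint directory, training_config.yaml and the local
# ../melgan-train2 checkout above are project-specific paths; once they exist,
# start the server with `python server.py` and connect with a client such as
# the sketch after socket_service().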