-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
128 lines (104 loc) · 4.03 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import html
import os
import subprocess

import google.cloud.texttospeech as tts
import pdfplumber
# TODO: implement a page offset so front matter (table of contents, etc.) can be skipped
# File name of the PDF to convert; it is opened from `inpath`.
infile = "myPdfFile.pdf"
# Directory prefix the input PDF is read from (concatenated directly with `infile`).
inpath = "in/"
# Directory prefix the per-chunk MP3 files are written to.
outpath = "out/"
# Directory prefix the final, ffmpeg-joined MP3 is written to.
joinedpath = "joined/"
# Service-account key file handed to TextToSpeechClient.from_service_account_file().
serviceaccount = "google.json"
# Only referenced by the commented-out list_voices() call further down;
# ssml_to_mp3() derives its own language code from the voice name.
language_code = "de-DE"
# Voice passed to ssml_to_mp3() for every chunk.
voice_name = "de-DE-Wavenet-B"
# Running total of SSML characters collected from all pages (spelling kept: other code uses this name).
lenght = 0
# Accumulated SSML produced from all pages of the PDF.
text = ""
# Maximum chunk length, in characters, passed to get_chunks().
# NOTE(review): presumably mirrors the Text-to-Speech per-request input limit - confirm.
apilengthlimit = 5000
def ssml_to_mp3(voice_name: str, text: str, dest: str, filename: str):
    """Synthesize an SSML string and store the resulting audio as an MP3 file.

    Args:
        voice_name: Voice identifier such as "de-DE-Wavenet-B". The language
            code sent to the API is its first two dash-separated parts.
        text: SSML markup to synthesize.
        dest: Path prefix of the output file; it is concatenated directly
            with the file name, so it needs its own trailing separator.
        filename: Output file name without extension; ".mp3" is appended.
    """
    # "de-DE-Wavenet-B" -> "de-DE"
    locale = "-".join(voice_name.split("-")[:2])

    ssml_input = tts.SynthesisInput(ssml=text)
    voice = tts.VoiceSelectionParams(language_code=locale, name=voice_name)
    mp3_config = tts.AudioConfig(
        audio_encoding=tts.AudioEncoding.MP3,
        sample_rate_hertz=44100,
    )

    # A client is built from the module-level service-account file on each call.
    synthesizer = tts.TextToSpeechClient.from_service_account_file(serviceaccount)
    reply = synthesizer.synthesize_speech(
        input=ssml_input,
        voice=voice,
        audio_config=mp3_config,
    )

    mp3_name = f"{filename}.mp3"
    with open(dest + mp3_name, "wb") as mp3_file:
        mp3_file.write(reply.audio_content)
        print(f'Generated speech saved to "{mp3_name}"')
def text_to_ssml(inputfile):
    """Convert plain text into an SSML document with pauses after punctuation.

    Args:
        inputfile: The plain text to convert (despite the name, this is the
            text itself, not a file object or path).

    Returns:
        A string of the form "<speak>...</speak>" in which the text has been
        HTML-escaped, break tags have been inserted after paragraph gaps,
        commas, sentence ends, question marks and exclamation marks,
        underscores and "http(s)://" prefixes have been removed, and
        typographic ligatures have been expanded to plain letters.
    """
    # Escape first so that literal '<', '>', '&' and quote characters in the
    # text become entities ('<' -> '&lt;', '&' -> '&amp;') and cannot be
    # mistaken for SSML markup.
    escaped = html.escape(inputfile)

    # Applied strictly in this order: later rules see the output of earlier
    # ones (e.g. removing "_" can expose an "https://" prefix).
    ordered_rules = (
        ("\n\n", '\n\n<break time="0.3s"/>'),
        (",", ',<break time="0.2s"/>'),
        (". ", '. <break time="0.3s"/>'),
        ("?", '?<break time="0.3s"/>'),
        ("!", '!<break time="0.3s"/>'),
        ("_", ''),
        ("https://", ''),
        ("http://", ''),
    )
    for needle, replacement in ordered_rules:
        escaped = escaped.replace(needle, replacement)

    # Single-code-point ligatures are spelled as escapes so that editors,
    # copy/paste or Unicode normalization cannot silently turn a key into
    # its own replacement (which would make the rule a no-op).
    ligatures = str.maketrans({
        "\ufb02": "fl",   # LATIN SMALL LIGATURE FL
        "\ufb00": "ff",   # LATIN SMALL LIGATURE FF
        "\ufb01": "fi",   # LATIN SMALL LIGATURE FI
        "\ufb03": "ffi",  # LATIN SMALL LIGATURE FFI
        "\ufb04": "ffl",  # LATIN SMALL LIGATURE FFL
        "\ufb06": "st",   # LATIN SMALL LIGATURE ST
        "\u0133": "ij",   # LATIN SMALL LIGATURE IJ
        "\ufb05": "st",   # LATIN SMALL LIGATURE LONG S T
        "\u00e6": "ae",   # LATIN SMALL LETTER AE
        "\u0153": "oe",   # LATIN SMALL LIGATURE OE
    })
    body = escaped.translate(ligatures)

    return "<speak>{}</speak>".format(body)
def list_voices(language_code=None):
    """Print a name-sorted table of the voices the Text-to-Speech API reports.

    Args:
        language_code: Optional language filter forwarded to the API's
            list_voices call; None is forwarded unchanged.
    """
    api = tts.TextToSpeechClient.from_service_account_file(serviceaccount)
    listing = api.list_voices(language_code=language_code)

    def by_name(entry):
        return entry.name

    ordered = sorted(listing.voices, key=by_name)

    banner = f" Voices: {len(ordered)} "
    print(banner.center(60, "-"))

    for entry in ordered:
        # One row per voice: languages | name | gender | native sample rate.
        languages, name, gender, rate = (
            ", ".join(entry.language_codes),
            entry.name,
            tts.SsmlVoiceGender(entry.ssml_gender).name,
            entry.natural_sample_rate_hertz,
        )
        print(f"{languages:<8} | {name:<24} | {gender:<8} | {rate:,} Hz")
def get_chunks(s, maxlength):
    """Yield consecutive pieces of *s*, each at most *maxlength* characters.

    Pieces are cut at the last space that fits into the current window; that
    space is dropped. If the space belongs to an opening "<break " tag, the
    cut is moved in front of the tag so the tag stays intact and starts the
    next piece, and nothing is dropped. If the window offers no usable cut
    point (no space at all, or only the space of a tag that starts the
    window), the piece is cut hard at *maxlength* characters, so every
    iteration makes progress.

    Args:
        s: The string to split.
        maxlength: Maximum number of characters per yielded piece.

    Yields:
        The pieces in order. The final remainder is always yielded, so an
        empty *s* produces a single empty string.

    Raises:
        ValueError: If *maxlength* is smaller than 1 (raised when iteration
            starts, because this is a generator).
    """
    if maxlength < 1:
        raise ValueError("maxlength must be a positive integer")

    tag = "<break"
    total = len(s)
    start = 0

    while total - start > maxlength:
        limit = start + maxlength
        # A space at index `limit` is still usable: the piece s[start:limit]
        # has exactly maxlength characters.
        space = s.rfind(" ", start, limit + 1)
        tag_start = space - len(tag)

        if tag_start >= start and s.startswith(tag, tag_start, space):
            # The space sits inside '<break time=...'; cut before '<' and
            # keep the whole tag for the next piece.
            cut = resume = tag_start
        else:
            # Ordinary word boundary: cut at the space and skip over it.
            cut = space
            resume = space + 1

        if cut <= start:
            # No space found (rfind returned -1) or the cut would produce an
            # empty piece without advancing: fall back to a hard split.
            cut = resume = limit

        yield s[start:cut]
        start = resume

    yield s[start:]
# print(list_voices(language_code))
#
# ---------------------------------------------------------------------------
# Script: PDF -> SSML -> size-limited chunks -> one MP3 per chunk -> joined MP3
# ---------------------------------------------------------------------------

# text_to_ssml() wraps its result in these tags. The wrapper is removed per
# page and re-applied per chunk, because every synthesis request must carry
# one well-formed SSML document; slicing a concatenation of per-page
# documents by length would leave chunks with unbalanced <speak> tags.
SPEAK_OPEN = "<speak>"
SPEAK_CLOSE = "</speak>"

# Make sure the output locations exist before anything is written to them.
os.makedirs(outpath, exist_ok=True)
os.makedirs(joinedpath, exist_ok=True)

page_bodies = []
with pdfplumber.open(inpath + infile) as pdf:
    for page in pdf.pages:
        # Extract once per page; pages without extractable text are skipped.
        currenttext = page.extract_text()
        if not currenttext:
            continue
        newtext = text_to_ssml(currenttext)
        if newtext.startswith(SPEAK_OPEN) and newtext.endswith(SPEAK_CLOSE):
            newtext = newtext[len(SPEAK_OPEN):-len(SPEAK_CLOSE)]
        page_bodies.append(newtext)

# A space between pages keeps the last word of one page from fusing with the
# first word of the next and gives get_chunks() a cut point at page borders.
text = " ".join(page_bodies)
lenght = len(text)

if not text.strip():
    raise SystemExit(f"No extractable text found in {inpath + infile}")

stem = infile.replace(".pdf", "")

# Reserve room for the <speak> wrapper added around every chunk.
# NOTE(review): get_chunks() counts characters, while the API documents its
# input limit in bytes; text with many non-ASCII characters may still exceed
# the limit - confirm and lower apilengthlimit if needed.
chunk_budget = apilengthlimit - len(SPEAK_OPEN) - len(SPEAK_CLOSE)

processedlenght = 0
print(lenght)
generated_files = []
for index, chunk in enumerate(get_chunks(text, chunk_budget)):
    processedlenght += len(chunk)
    print("prodessing", processedlenght, "of", lenght)
    print(index, len(chunk), "\n", chunk.replace("\n", ""), "\n")
    if not chunk.strip():
        # Nothing to speak; do not spend a request on an empty document.
        continue
    part_name = f"{stem}_{index}"
    ssml_to_mp3(voice_name, SPEAK_OPEN + chunk + SPEAK_CLOSE, outpath, part_name)
    # ssml_to_mp3() writes to dest + filename + ".mp3".
    generated_files.append(f"{outpath}{part_name}.mp3")

# Join exactly the files produced above, in chunk order. Listing the output
# directory instead would also pick up leftovers from earlier runs or other
# PDFs, and ordering by modification time is not a reliable chunk order.
joined_file = f"{joinedpath}{stem}.mp3"
ffmpeg_cmd = [
    "ffmpeg",
    "-i", "concat:" + "|".join(generated_files),
    "-acodec", "copy",
    joined_file,
]
# An argument list (no shell) keeps file names containing spaces, quotes or
# other shell metacharacters from being misinterpreted.
try:
    completed = subprocess.run(ffmpeg_cmd, check=False)
except FileNotFoundError:
    # The per-chunk MP3s are already on disk, so a missing ffmpeg only costs
    # the join step.
    print("ffmpeg executable not found; skipping the join step")
else:
    if completed.returncode != 0:
        print(f"ffmpeg exited with status {completed.returncode}")