-
Notifications
You must be signed in to change notification settings - Fork 0
/
create_segments_and_text.py
108 lines (88 loc) · 3.81 KB
/
create_segments_and_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# Author: Judy Fong (Reykjavik University)
# Edits: Inga Rún Helgadóttir (Reykjavik University)
# Description:
# Create segments file from subtitle timestamps
# create a corresponding text file
# create a single segment for each timestamp set
# TODO: normalize the text
# TODO: create corpus file
from itertools import groupby
from decimal import Decimal
import os
import argparse
from pathlib import Path
def parse_arguments():
parser = argparse.ArgumentParser(
description="""Create segments file from subtitle timestamps and a corresponding text file\n
Create a single segment for each timestamp set\n
Usage: python create_segments_and_text.py <input-file> <output-dir>\n
E.g. python local/create_segments_and_text.py data/vtt_transcripts/4886083R7.vtt data
"""
)
parser.add_argument(
"subtitle_file", type=file_path, help="Input subtitle file",
)
parser.add_argument(
"outdir", type=str, help="base path for output files",
)
return parser.parse_args()
def file_path(path):
if os.path.isfile(path):
return path
else:
raise argparse.ArgumentTypeError(f"readable_file:{path} is not a valid file")
# Convert timestamps to seconds and partial seconds so hh:mm:ss.ff
def time_in_seconds(a_time):
hrs, mins, secs = a_time.split(":")
return Decimal(hrs) * 3600 + Decimal(mins) * 60 + Decimal(secs.replace(",", "."))
def no_transcripts(subtitle_path):
HEADER_ONLY = 10
ACCESS_ERROR_ONLY = 36
return os.stat(subtitle_path).st_size in (HEADER_ONLY, ACCESS_ERROR_ONLY)
def create_segm_and_text(subtitle_filename, outdir):
if no_transcripts(subtitle_filename):
print(f"{subtitle_filename} is presumed to not have transcripts")
else:
base = os.path.basename(subtitle_filename)
(filename, ext) = os.path.splitext(base.replace("_", ""))
# Assume the directory of the subtitle file is the show title
show_title = os.path.basename(os.path.dirname(subtitle_filename))
with open(subtitle_filename, "r") as fin, open(
outdir + "/" + filename + ".segments", "w"
) as fseg, open(outdir + "/" + filename + ".text", "w") as ftext:
if ext == ".vtt":
# skip header(first two) lines in file
next(fin)
next(fin)
groups = groupby(fin, str.isspace)
count = 0
for (_, *rest) in (map(str.strip, v) for g, v in groups if not g):
# write to text file
# better to create individual speaker ids per episode or shows,
# more speaker ids, because a global one would create problems for
# cepstral mean normalization ineffective in training
string = " ".join([*rest[1:]])
ftext.write(f"{show_title}-{filename}_{count:05d} {string}\n")
if ext == ".vtt":
start_time, _, end_time, _ = (str(*rest[:1])).split(" ", 3)
elif ext == ".srt":
start_time, _, end_time = (str(*rest[:1])).split(" ", 2)
else:
print(
"The file was not recognized as a subtitle file(\
.vtt or .srt)."
)
exit(1)
start_seconds = time_in_seconds(start_time)
end_seconds = time_in_seconds(end_time)
# write to segments file
fseg.write(
f"{show_title}-{filename}_{count:05d} {filename} {start_seconds} {end_seconds}\n"
)
count = count + 1
def main():
args = parse_arguments()
Path(args.outdir).mkdir(parents=True, exist_ok=True)
create_segm_and_text(args.subtitle_file, args.outdir)
if __name__ == "__main__":
main()