-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsttalign.py
65 lines (46 loc) · 2.12 KB
/
sttalign.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import re, json, difflib, numpy
def interpolate(data):
    """Fill in missing 'start' times by linear interpolation over list position.

    An entry counts as missing when it has no 'start' key or its 'start' is
    None. Missing times are interpolated linearly between the nearest known
    times on either side, using the entry's index in the list as the x-axis.
    Entries before the first known time or after the last one receive that
    nearest known time, because numpy.interp clamps outside the sample range.

    Args:
        data: list of dicts, each optionally holding a numeric 'start'.
            The dicts are updated in place.

    Returns:
        The same list, with every entry's 'start' set.

    Raises:
        ValueError: if data is non-empty but no entry has a start time.
    """
    if not data:
        # numpy.interp rejects an empty set of sample points; nothing to do.
        return data
    # Builtin float is used as the dtype: the numpy.float alias was
    # deprecated in NumPy 1.20 and removed in 1.24. None converts to NaN.
    times = numpy.array([datum.get('start') for datum in data], dtype=float)
    known = numpy.logical_not(numpy.isnan(times))
    if not known.any():
        raise ValueError('cannot interpolate: no entry has a start time')
    # linearly interpolate the missing times over the entry positions
    indices = numpy.arange(len(times))
    timesInterp = numpy.interp(indices, indices[known], times[known])
    for datum, start in zip(data, timesInterp):
        datum['start'] = start
    return data
def alignWords(sttData, transcriptWords):
    """Transfer speech-to-text word data onto the words of a transcript.

    Both word sequences are normalised (lowercased, everything except a-z
    removed) and aligned with difflib.SequenceMatcher. Each transcript word
    that falls inside an 'equal' run receives a copy of the corresponding
    STT entry; every other transcript word starts as an empty dict. The
    'word' field of every entry is then set to the original transcript word,
    and missing 'start' times are filled in by interpolate().

    Args:
        sttData: list of dicts, each with a 'word' string and whatever else
            the recogniser emitted (interpolate() reads 'start'). Not
            modified.
        transcriptWords: list of transcript words, in order.

    Returns:
        A list with one dict per transcript word, as returned by
        interpolate().
    """
    nonLetters = re.compile('[^a-z]')

    def normalise(word):
        # convert to lowercase and remove numbers and special characters
        return nonLetters.sub('', word.lower())

    sttWordsStripped = [normalise(entry.get('word')) for entry in sttData]
    transcriptWordsStripped = [normalise(word) for word in transcriptWords]

    # one (initially empty) entry per transcript word
    transcriptData = [{} for _ in transcriptWords]

    # populate transcriptData with matching words
    matcher = difflib.SequenceMatcher(None, sttWordsStripped, transcriptWordsStripped)
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == 'equal':
            # Copy each matched entry: storing the caller's own dicts would
            # let the 'word'/'start' writes below leak back into sttData.
            transcriptData[j1:j2] = [dict(entry) for entry in sttData[i1:i2]]

    # replace words with originals
    for entry, word in zip(transcriptData, transcriptWords):
        entry['word'] = word

    # fill in missing timestamps
    return interpolate(transcriptData)
def alignJSON(sttFilename, transcriptFilename):
    """Align a transcript JSON file against a speech-to-text JSON file.

    Args:
        sttFilename: path to a JSON file whose top-level object has a
            'words' list (passed to alignWords as sttData).
        transcriptFilename: path to a JSON file whose top-level object has
            a 'text' string holding the transcript.

    Returns:
        A dict with 'text' (the transcript text as read) and 'words' (the
        result of alignWords on the whitespace-split transcript).
    """
    # Read as UTF-8 explicitly: the platform default encoding would
    # misdecode non-ASCII text on systems whose locale is not UTF-8.
    with open(sttFilename, encoding='utf-8') as sttFile:
        sttData = json.load(sttFile)['words']
    with open(transcriptFilename, encoding='utf-8') as transcriptFile:
        transcriptText = json.load(transcriptFile)['text']
    # align words
    aligned = alignWords(sttData, transcriptText.split())
    return {'text': transcriptText, 'words': aligned}
def alignJSONText(sttFilename, transcriptText):
    """Align an in-memory transcript string against a speech-to-text JSON file.

    Args:
        sttFilename: path to a JSON file whose top-level object has a
            'words' list (passed to alignWords as sttData).
        transcriptText: the transcript as a single string; it is split on
            whitespace to obtain the words.

    Returns:
        A dict with 'text' (transcriptText unchanged) and 'words' (the
        result of alignWords on the split transcript).
    """
    # Read as UTF-8 explicitly: the platform default encoding would
    # misdecode non-ASCII text on systems whose locale is not UTF-8.
    with open(sttFilename, encoding='utf-8') as sttFile:
        sttData = json.load(sttFile)['words']
    # align words
    aligned = alignWords(sttData, transcriptText.split())
    return {'text': transcriptText, 'words': aligned}