# loudness_spectral_decomposition.py
from scipy.signal import argrelextrema
import numpy as np
import itertools
import librosa
import matplotlib.pyplot as plt
from skimage.segmentation import mark_boundaries
class LoudnessSpectralDecomposition(object):
def __init__(self, audio, fs, threshold=75, num_spectral=5):
"""
init function for Loudness-spectral decomposition
:param audio: numpy array, audio signal
:param fs: int, sample rate
        :param threshold: int, power level (dB) below which a local loudness minimum starts a new loudness component
:param num_spectral: int, number of spectral components per loudness component
"""
self.audio = audio
self.fs = fs
self.threshold = threshold
self.num_spectral = num_spectral
self.decomposition_type = 'ls'
# components stored in a list of length num_loudness. Each element contains the corresponding spectral
# components in a 3d numpy array. shape: (num_spectral, 128, length_spectrogram)
self.components, self.indices_components, self.loudness = self.initialize_components()
self.num_loudness = len(self.components)
self.num_components = self.num_loudness * self.num_spectral
self.fudged_components = self.initialize_fudged_components()
def initialize_components(self):
"""
initializes the components decomposed according to loudness and in further spectral components
:return: list, length is number of loudness components
        each element is a numpy array of shape (num_spectral, 128, spectrogram_length) containing
        the spectral components for that part of the audio file
"""
audio = self.audio
indices_min, loudness = self.get_loudness_indices()
temp_components = []
previous = 0
current_index = 0
audio_length = np.size(audio)
number_indices = np.size(indices_min)
if number_indices == 0:
print("No minimum detected, initializing one component")
temp_components.append(self.get_spectral_comp(audio))
else:
while previous < audio_length and current_index < number_indices:
temp_components.append(self.get_spectral_comp(audio[previous:indices_min[current_index]]))
previous = indices_min[current_index]
current_index += 1
temp_components.append(self.get_spectral_comp(audio[previous:]))
return temp_components, indices_min, loudness
def get_loudness_indices(self, min_length=4096):
"""
        calculate the indices of the audio array at which the power curve has local minima below self.threshold
        :param min_length: int, minimum length (in samples) of each loudness component, so that the inverse
                           spectrogram transform does not break
        :return: list of minima indices, loudness power array in dB
"""
loudness = self.compute_power_db()
loudness_rounded = np.around(loudness, decimals=-1)
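        # collapse consecutive runs of equal rounded loudness values, keeping each run's value
        # and the index at which the run starts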
li = [[k, next(g)[0]] for k, g in itertools.groupby(enumerate(loudness_rounded), key=lambda k: k[1])]
loudness_no_dups = [item[0] for item in li]
indices = [item[1] for item in li]
        # discard minima whose power is above the threshold, and minima that would create a component shorter than min_length
minima = np.array((argrelextrema(np.array(loudness_no_dups), np.less))).flatten()
to_delete = []
for i, m in enumerate(minima):
if loudness[int(indices[int(m)])] > self.threshold:
to_delete.append(i)
elif (i < (len(minima) - 1)) and (int(indices[int(minima[i+1])]) - int(indices[int(m)]) < min_length):
to_delete.append(i)
minima = np.delete(minima, to_delete)
indices_min = []
for k in minima:
indices_min.append(int(indices[int(k)]))
return indices_min, loudness
def get_spectral_comp(self, audio):
"""
decomposes a given audio array into its spectral components
:param audio: np.array, audio
        :return: np.array of shape (num_spectral, 128, spectrogram_length)
"""
spectrogram = librosa.feature.melspectrogram(y=audio, sr=self.fs, n_mels=128)
shape_components = (self.num_spectral,) + np.shape(spectrogram)
temp = np.zeros(shape_components)
        # split the 128 mel bands into num_spectral adjacent bands, e.g. num_spectral=5 gives
        # bands of 26, 26, 26, 26 and 24 bins
        if 128 % self.num_spectral == 0:
            length = 128 // self.num_spectral
            for i in range(self.num_spectral):
                temp[i, i*length:(i+1)*length, :] = spectrogram[i*length:(i+1)*length, :]
        else:
            length = 128 // self.num_spectral + 1
            for i in range(self.num_spectral - 1):
                temp[i, i*length:(i+1)*length, :] = spectrogram[i*length:(i+1)*length, :]
            # last component takes the remaining mel bands
            temp[self.num_spectral-1, (self.num_spectral-1)*length:, :] = spectrogram[(self.num_spectral-1)*length:, :]
return temp
def compute_power_db(self, win_len_sec=0.1, power_ref=10**(-12)):
"""Computation of the signal power in dB
Notebook: C1/C1S3_Dynamics.ipynb, retrieved from
https://www.audiolabs-erlangen.de/resources/MIR/FMP/C1/C1S3_Dynamics.html
Accessed 28.09.2021
        Uses self.audio as the signal and self.fs as the sampling rate.
        Args:
            win_len_sec (float): Length (seconds) of the averaging window (Default value = 0.1)
            power_ref (float): Reference power level (0 dB) (Default value = 10**(-12))
        Returns:
            power_db (np.ndarray): Signal power in dB"""
win_len = round(win_len_sec * self.fs)
win = np.ones(win_len) / win_len
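        # moving average of the squared signal gives the local power, expressed in dB relative to power_ref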
power_db = 10 * np.log10(np.convolve(self.audio**2, win, mode='same') / power_ref)
power_db[np.where(power_db == -np.inf)] = 0 # accounts for errors in calculation because of division by 0
power_db = np.abs(power_db)
return power_db
def initialize_fudged_components(self):
"""
initializes the fudged components that are needed for training the linear model
        :return: list of arrays with the same shapes as the original components, all set to 0
"""
temp = []
for comp in self.components:
temp.append(np.zeros(np.shape(comp)))
return temp
def get_number_components(self):
"""
:return: int, number of components generated during the decomposition
"""
return self.num_components
def get_components_mask(self, mask, spec=False):
"""
        combine components according to a mask: the original component is used where the mask is True,
        the fudged (zeroed) component where it is False
        :param mask: 1D boolean np.array of length num_components
        :param spec: bool, if True return the combined spectrogram, otherwise the reconstructed audio array
        :return: concatenated components as spectrogram or audio
"""
# mask: array of false and true, length of num_components
# get components for true and fudged for false
indices = list(range(1, self.num_loudness))
current_mask = mask[0:self.num_spectral]
temp = np.sum(self.components[0][current_mask, :, :], axis=0)
for i in indices:
current_mask = mask[i*self.num_spectral:(i+1)*self.num_spectral]
combined_spec = np.sum(self.components[i][current_mask, :, :], axis=0)
temp = np.concatenate((temp, combined_spec), axis=1)
if spec:
return temp
        temp_audio = librosa.feature.inverse.mel_to_audio(temp, sr=self.fs)  # invert with the original sample rate
return temp_audio
def return_components(self, indices):
"""
return audio array for given component indices, all other components set to 0
:param indices: list of indices for which to return the original audio components
:return: audio
"""
# make mask setting true for indices
mask = np.zeros((self.num_components,)).astype(bool)
mask[indices] = True
audio = self.get_components_mask(mask)
return audio
def return_mask_boundaries(self, positive_indices, negative_indices):
"""
        builds a mask over the combined spectrogram in which the regions of all components in
        positive_indices are set to 1 and the regions of the negative components to -1
:param positive_indices: list, contains indices of positive components
:param negative_indices: list, contains indices of negative components
:return: 2d numpy array, marks components in spectrogram
"""
temp = self.components[0][0, :, :]
indices_l = [0, np.shape(temp)[1]]
for i in range(1, self.num_loudness):
combined_spec = self.components[i][0, :, :]
temp = np.concatenate((temp, combined_spec), axis=1)
indices_l.append(np.shape(temp)[1])
mask = np.zeros(np.shape(temp), dtype=np.byte)
        # len_s: height (in mel bins) of each spectral component band
        if 128 % self.num_spectral == 0:
            len_s = 128 // self.num_spectral
        else:
            len_s = 128 // self.num_spectral + 1
for i in range(self.num_components):
if i in positive_indices:
i_s = i % self.num_spectral
i_l = int(i / self.num_spectral)
if (i_s == (self.num_spectral - 1)) and (i_l == (self.num_loudness - 1)):
# last comps
mask[(i_s*len_s+1):-1, indices_l[i_l]+1:-1] = 1
elif i_s == (self.num_spectral - 1):
mask[i_s*len_s+1:-1, indices_l[i_l]+1:indices_l[i_l+1]-1] = 1
elif i_l == (self.num_loudness - 1):
mask[i_s*len_s+1:((i_s+1)*len_s-1), indices_l[i_l]+1:-1] = 1
else:
mask[i_s*len_s+1:((i_s+1)*len_s-1), indices_l[i_l]+1:indices_l[i_l+1]-1] = 1
elif i in negative_indices:
i_s = i % self.num_spectral
i_l = int(i / self.num_spectral)
                if (i_s == (self.num_spectral - 1)) and (i_l == (self.num_loudness - 1)):
                    # last comps
                    mask[(i_s*len_s+1):-1, indices_l[i_l]+1:-1] = -1
elif i_s == (self.num_spectral - 1):
mask[i_s*len_s+1:-1, indices_l[i_l]+1:indices_l[i_l+1]-1] = -1
elif i_l == (self.num_loudness - 1):
mask[i_s*len_s+1:((i_s+1)*len_s-1), indices_l[i_l]+1:-1] = -1
else:
mask[i_s*len_s+1:((i_s+1)*len_s-1), indices_l[i_l]+1:indices_l[i_l+1]-1] = -1
return mask
def visualize_decomp(self, save_path=None):
"""
visualizes the generated components in a spectrogram image
:param save_path: if not None, path to save the generated image
:return: nothing, shows and possibly saves image
"""
indices = [0] + self.indices_components + [np.size(self.audio)]
loudness = self.loudness
fig, (ax1, ax2) = plt.subplots(2, figsize=(6, 6), gridspec_kw={'height_ratios': [2, 1]})
# get combined spectrogram
# get a mask for the spectrogram
spectrogram_indices = range(self.num_components)
mask = self.return_mask_boundaries(spectrogram_indices, [])
mask_s = np.ones(self.num_components).astype(bool)
spectrogram = self.get_components_mask(mask_s, spec=True)
spec_db = librosa.power_to_db(spectrogram, ref=np.max)
marked = mark_boundaries(spec_db, mask)
img = ax1.imshow(marked[:, :, 2], origin="lower", cmap=plt.get_cmap("magma"))
fig.colorbar(img, ax=ax1)
ax1.set_xlabel("Time")
ax1.set_ylabel("Frequency")
ax1.axes.xaxis.set_ticks([])
ax1.axes.yaxis.set_ticks([])
        fig.suptitle(f'Spectral-Loudness Decomposition with {self.num_spectral}\nspectral components and threshold={self.threshold}')
ax2.plot(loudness, color='c')
for line in indices:
ax2.axvline(x=line, color='m')
ax2.set(xlabel='Time', ylabel='Power (db)', xlim=[0, np.size(self.audio)])
for i in range(0, len(indices) - 1, 2):
ax2.axvspan(indices[i], indices[i+1], facecolor='m', alpha=0.1)
if save_path is not None:
plt.savefig(save_path)
plt.show()
print("visualized :)")
def return_weighted_components(self, used_features, weights):
"""
        return audio with the loudness-spectral (ls) components weighted according to their absolute importance
:param used_features: array of indices of features to include
:param weights: array of their corresponding weights
:return: 1d array with weighted audio
"""
# normalize weights
sum_weights = np.sum(np.abs(weights))
weights = np.abs(weights) / sum_weights
mask_weights = np.zeros((self.num_components,))
mask_include = np.zeros((self.num_components,)).astype(bool)
# make weighted sum
for index, feature in enumerate(used_features):
mask_weights[feature] = weights[index]
mask_include[feature] = True
indices = list(range(1, self.num_loudness))
current_mask = mask_include[0:self.num_spectral]
temp = np.zeros(np.shape(np.sum(self.components[0][current_mask, :, :], axis=0)))
for comp in range(self.num_spectral):
if mask_weights[comp] != 0:
temp += abs(mask_weights[comp]) * self.components[0][comp, :, :]
        for i in indices:
            current_mask = mask_include[i*self.num_spectral:(i+1)*self.num_spectral]
            combined_spec = np.zeros(np.shape(np.sum(self.components[i][current_mask, :, :], axis=0)))
            for comp in range(self.num_spectral):
                # weights are indexed by the global component number (loudness block i, spectral band comp)
                weight = mask_weights[i*self.num_spectral + comp]
                if weight != 0:
                    combined_spec += abs(weight) * self.components[i][comp, :, :]
            temp = np.concatenate((temp, combined_spec), axis=1)
        temp_audio = librosa.feature.inverse.mel_to_audio(temp, sr=self.fs)  # invert with the original sample rate
return temp_audio
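

if __name__ == "__main__":
    # Minimal usage sketch: "example.wav" is a placeholder path and the chosen
    # component indices are illustrative only.
    audio, fs = librosa.load("example.wav", sr=None, mono=True)
    lsd = LoudnessSpectralDecomposition(audio, fs, threshold=75, num_spectral=5)
    print("number of components:", lsd.get_number_components())
    # reconstruct audio from only the lowest spectral band of every loudness segment
    lowest_band = list(range(0, lsd.get_number_components(), lsd.num_spectral))
    reconstructed = lsd.return_components(lowest_band)
    # plot the combined spectrogram with component boundaries and the power curve
    lsd.visualize_decomp()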