-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathy_speaker_train_v1.py
125 lines (114 loc) · 5.25 KB
/
y_speaker_train_v1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import pyaudio
import wave
import soundfile
import librosa
import numpy as np
import os, glob, pickle
import matplotlib as plt
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler,RobustScaler
from sklearn.multiclass import OneVsRestClassifier
from sklearn import metrics
from y_audio_utils import read_sounfile, extract_feature, aug_speed, aug_add_noise, aug_shift_zero, extract_feature2,aug_shift,extract_feature3
import matplotlib.pyplot as plt
def load_data(test_size=0.2, dataset_dir=None):
    """Extract features for every recording and split them into train/test sets.

    Each ``G*`` directory found under ``dataset_dir`` is treated as one class:
    the directory name is used as the label of every ``*.wav`` file inside it.
    A recording is padded/cut to two seconds with ``librosa.util.fix_length``,
    passed through ``aug_add_noise`` and turned into a feature vector by
    ``extract_feature3(..., mfcc=True, chroma=True)``. Files that contain no
    samples are reported and skipped.

    Args:
        test_size: Share of the samples held out for testing; forwarded to
            ``train_test_split``.
        dataset_dir: Directory that contains the ``G*`` folders. Defaults to
            ``Dataset_04_07_2020/Dataset/speaker`` relative to the current
            working directory.

    Returns:
        The ``[X_train, X_test, y_train, y_test]`` list produced by
        ``train_test_split`` (stratified by label, seeded with 1).

    Raises:
        ValueError: If no non-empty ``.wav`` file was found.
    """
    if dataset_dir is None:
        # Built with os.path.join so the script is not tied to Windows
        # path separators (and avoids invalid "\D"-style string escapes).
        dataset_dir = os.path.join("Dataset_04_07_2020", "Dataset", "speaker")

    x, y = [], []
    # Sorted so the sample order (and therefore the seeded split) does not
    # depend on the order in which the file system lists entries.
    for base_path in sorted(glob.glob(os.path.join(dataset_dir, "G*"))):
        speaker = os.path.basename(base_path)
        print("###################" + os.path.basename(os.path.dirname(base_path)))
        for file in sorted(glob.glob(os.path.join(base_path, "*.wav"))):
            print(speaker)
            # Skip recordings without any samples; the context manager makes
            # sure the handle is released before the file is read again.
            with soundfile.SoundFile(file) as sound_file:
                is_empty = len(sound_file.read(dtype='float32')) == 0
            if is_empty:
                print("Empty File : " + file)
                continue

            # Raw wave, padded or cut to exactly two seconds.
            sound_frame, sr = read_sounfile(file)
            # Keyword arguments work on both old and new librosa releases
            # (positional use was removed in librosa 0.10).
            sound_clipped = librosa.util.fix_length(sound_frame, size=sr * 2)
            # Add noise
            sound_2s = aug_add_noise(sound_clipped)
            # The trimmed signal is only used for the duration diagnostic
            # below; features are taken from the full two-second clip.
            trimmed, _ = librosa.effects.trim(sound_2s, top_db=30)
            print(librosa.get_duration(y=sound_2s, sr=sr), librosa.get_duration(y=trimmed, sr=sr))

            features = extract_feature3(sound_2s, sr, mfcc=True, chroma=True)
            print(len(features))
            x.append(features)
            y.append(speaker)
            # NOTE: further augmentations (time shift, pitch, slower/faster
            # speed, extra noise) were experimented with here and are
            # currently disabled.

    if not x:
        raise ValueError("No usable .wav files found under " + repr(dataset_dir))
    return train_test_split(np.array(x), y, test_size=test_size, stratify=y, random_state=1)
# Extract the features and obtain the train/test split.
X_train, X_test, Y_train, Y_test = load_data(test_size=0.25)

# Standardise the features; the statistics come from the training set only.
#https://scikit-learn.org/stable/modules/neural_networks_supervised.html#tips-on-practical-use
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# NOTE(review): the RobustScaler is fitted on the already standardised
# training matrix and is only persisted, never applied in this script.
# Confirm that whatever loads it feeds it standardised features.
transformer = RobustScaler().fit(X_train)
#X_train = transformer.transform(X_train)
#X_test = transformer.transform(X_test)

# Persist both scalers; "with" guarantees the files are flushed and closed.
os.makedirs("utils_mfcc", exist_ok=True)
with open('utils_mfcc/scaler_speaker_aug_13mfcc.bin', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)
with open('utils_mfcc/scaler_speaker_robust_aug_13mfcc.bin', 'wb') as transformer_file:
    pickle.dump(transformer, transformer_file)

print("[+] Number of training samples:", X_train.shape[0])  # rows of the training matrix
print("[+] Number of testing samples:", X_test.shape[0])  # rows of the testing matrix
print("[+] Number of features:", X_train.shape[1])  # length of each extracted feature vector
# Single hidden layer with 300 units. hidden_layer_sizes is written as a real
# one-element tuple: "(300)" is just the integer 300.
# NOTE(review): per the scikit-learn docs learning_rate is only used when
# solver='sgd'; with the default solver it has no effect — confirm intent.
model = MLPClassifier(alpha=0.0001, batch_size=256, epsilon=1e-08, hidden_layer_sizes=(300,), learning_rate='adaptive', max_iter=500)
print("[*] Training the model...")
model.fit(X_train, Y_train)
#clf = OneVsRestClassifier(model)
#clf= clf.fit(X_train, Y_train)

# Evaluate on the held-out split.
Y_predict = model.predict(X_test)
#Y_predict = clf.predict(X_test)
cm = metrics.confusion_matrix(Y_test, Y_predict)
print("Confusion Matrix:")
print(cm)
prfs = metrics.precision_recall_fscore_support(Y_test, Y_predict)
print("Precision Recall Fscor Support:")
print(prfs)
accuracy = metrics.accuracy_score(Y_test, Y_predict)
print("Accuracy:")
print(accuracy)
cr = metrics.classification_report(Y_test, Y_predict)
print("Classification Report:")
print(cr)

# Save the trained model, creating the output directory if it is missing.
# "with" guarantees the pickle is flushed and the handle closed.
os.makedirs("utils_mfcc", exist_ok=True)
with open("utils_mfcc/classifier_speaker_aug_13mfcc.model", "wb") as model_file:
    pickle.dump(model, model_file)
stop = 0  # presumably a convenient line for a debugger breakpoint