-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathtest_SVHFNet.m
112 lines (87 loc) · 3.64 KB
/
test_SVHFNet.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
function test_SVHFNet(varargin)
%TEST_SVHFNET Minimal test demo with the SVHFNet model pretrained on
%   the VoxCeleb dataset for binary cross-modal matching.
%
%   TEST_SVHFNET() locates the model file 'static-RGB+VGGFace.mat'
%   (downloading it when no local copy is found), loads it as a DagNN,
%   evaluates it on one voice segment and two face images read from the
%   'files' directory and prints the winning class with its confidence.
%
%   NOTE: VARARGIN is accepted for interface compatibility but is not
%   used by this function.

% ----------------------------------------------------
% download model
% ----------------------------------------------------
opts.modelPath = '';
modelName = 'static-RGB+VGGFace.mat' ;
importDir = fullfile(vl_rootnn, 'data', 'models-import') ;
defaultModelPath = fullfile(importDir, modelName) ;

% candidate locations, searched in order; the first existing one wins
candidates = {opts.modelPath, modelName, defaultModelPath} ;
hit = find(cellfun(@(x) exist(x, 'file'), candidates), 1) ;

if isempty(hit)
  fprintf('Downloading SVHFNet for binary face-voice matching ... this may take a while\n') ;
  opts.modelPath = defaultModelPath ;
  % only create the directory when it is missing (mkdir warns otherwise)
  if exist(importDir, 'dir') ~= 7
    mkdir(importDir) ;
  end
  base = 'http://www.robots.ox.ac.uk' ;
  url = sprintf('%s/~vgg/research/CMBiometrics/models/%s', base, modelName) ;
  urlwrite(url, opts.modelPath) ;
else
  opts.modelPath = candidates{hit} ;
end

netStruct = load(opts.modelPath);
net = dagnn.DagNN.loadobj(netStruct.net);
net.mode = 'test';

% ----------------------------------------------------
% settings
% ----------------------------------------------------
opts.gpu = 1;              % set to [] to keep the network on the CPU
opts.numThreads = 4;
opts.imageSize = net.meta.face.normalization.imageSize;
opts.subtractAverage = net.meta.face.normalization.averageImage;
opts.cropSize = 0.85;
opts.dataDir = 'files';

% buckets.width lists the supported voice-input widths; buckets.pool(k)
% is the pooling width used for an input of width buckets.width(k)
buckets.pool = [2 5 8 11 14 17 20 23 27 30];
buckets.width = [100 200 300 400 500 600 700 800 900 1000];

useGpu = ~isempty(opts.gpu) ;
if useGpu, net.move('gpu'); end
net.conserveMemory = false;

% ----------------------------------------------------
% read test files and evaluate
% ----------------------------------------------------
audiopath = fullfile(opts.dataDir, 'audio1.wav');
facepath1 = fullfile(opts.dataDir, 'face1.jpg');
facepath2 = fullfile(opts.dataDir, 'face2.jpg');

inp_a = test_getinput_audio({audiopath},net.meta.voice, buckets);
inp_f1 = test_getinput_face({facepath1}, opts);
inp_f2 = test_getinput_face({facepath2}, opts);

faceInput1 = inp_f1{1} ;
faceInput2 = inp_f2{1} ;
voiceInput = inp_a{1} ;
% transfer the inputs to the GPU only when the network lives there too
if useGpu
  faceInput1 = gpuArray(faceInput1) ;
  faceInput2 = gpuArray(faceInput2) ;
  voiceInput = gpuArray(voiceInput) ;
end

%x contains softmax predictions for both faces
x = evaluate_net_avgpool(net, buckets, faceInput1, faceInput2, voiceInput) ;

% "predictedClass" rather than "class" so the builtin is not shadowed
[score, predictedClass] = max(x) ;
fprintf('prediction for triplet | class %d, confidence: %g\n', predictedClass, score) ;
function inp = test_getinput_face(images,opts)
%TEST_GETINPUT_FACE Read face images through vl_imreadjpeg.
%   INP = TEST_GETINPUT_FACE(IMAGES, OPTS) forwards the cell array of
%   file names IMAGES to vl_imreadjpeg and returns its output unchanged.
%   Fields of OPTS that are read:
%     OPTS.numThreads       value passed as 'NumThreads'
%     OPTS.imageSize        its first two entries are passed as 'Resize'
%     OPTS.cropSize         value passed as 'CropSize'
%     OPTS.subtractAverage  value passed as 'SubtractAverage'
%     OPTS.gpu              optional; when the field exists and is empty
%                           the 'Gpu' flag is NOT passed, otherwise it is

% keep the historical behaviour (always request GPU output) unless the
% caller explicitly signals CPU mode with an empty opts.gpu
requestGpu = ~isfield(opts, 'gpu') || ~isempty(opts.gpu) ;

readerArgs = {images, ...
              'NumThreads', opts.numThreads, ...
              'Pack', ...
              'Interpolation', 'bicubic', ...
              'Resize', opts.imageSize(1:2), ...
              'CropSize', opts.cropSize} ;
if requestGpu
  readerArgs{end+1} = 'Gpu' ;
end
readerArgs = [readerArgs, {'SubtractAverage', opts.subtractAverage}] ;

inp = vl_imreadjpeg(readerArgs{:}) ;
end
function inp = test_getinput_audio(images,meta,buckets)
%TEST_GETINPUT_AUDIO Turn audio files into normalised spectrogram crops.
%   INP = TEST_GETINPUT_AUDIO(IMAGES, META, BUCKETS) processes every file
%   name in the cell array IMAGES (audio files, despite the name):
%     1. the signal is read with audioread and converted by
%        runSpec(signal, META.audio);
%     2. every row of the result is normalised to zero mean and unit
%        standard deviation along the second dimension;
%     3. a crop from the middle of the second dimension is taken whose
%        width is the largest entry of BUCKETS.width that does not exceed
%        the spectrogram width (BUCKETS.width is assumed ascending).
%   INP is a 1-by-numel(IMAGES) cell array of single-precision crops.
%
%   An error is raised when a spectrogram is narrower than every entry
%   of BUCKETS.width.

inp = cell(1, numel(images)) ;
for i = 1:numel(images)
  audfile = images{i} ;
  z = audioread(audfile) ;
  SPEC = runSpec(z, meta.audio) ;

  % per-row mean / variance normalisation
  mu = mean(SPEC, 2) ;
  stdev = std(SPEC, [], 2) ;
  nSPEC = bsxfun(@minus, SPEC, mu) ;
  nSPEC = bsxfun(@rdivide, nSPEC, stdev) ;

  % pick the widest bucket that still fits into the spectrogram
  specWidth = size(nSPEC, 2) ;
  bucketIdx = find(buckets.width(:) <= specWidth, 1, 'last') ;
  if isempty(bucketIdx)
    error('test_getinput_audio:segmentTooShort', ...
          'Spectrogram of "%s" has %d columns, fewer than the smallest bucket width (%d).', ...
          audfile, specWidth, min(buckets.width(:))) ;
  end
  rsize = buckets.width(bucketIdx) ;

  % MATLAB indices start at 1: without the clamp an exact fit
  % (specWidth == rsize) would give a start column of 0 and fail
  rstart = max(1, round((specWidth - rsize) / 2)) ;
  inp{i} = single(nSPEC(:, rstart:rstart+rsize-1)) ;
end
end
function x = evaluate_net_avgpool(net,buckets,image_f1,image_f2,image_v)
%EVALUATE_NET_AVGPOOL Run the network on one face/face/voice triplet.
%   X = EVALUATE_NET_AVGPOOL(NET, BUCKETS, IMAGE_F1, IMAGE_F2, IMAGE_V)
%   sets the pool size of layer 'pool6_voice' according to the width of
%   IMAGE_V, evaluates NET with the inputs 'input_face1', 'input_face2'
%   and 'input_voice', and returns the gathered value of the variable
%   'softmax'.
%
%   size(IMAGE_V, 2) must equal one entry of BUCKETS.width; the entry of
%   BUCKETS.pool at the same position is used as the pooling width.
%   Note that the 'pool6_voice' layer of NET and the 'precious' flag of
%   its 'softmax' variable are modified by this call.

s1 = size(image_v, 2) ;
bucketMatch = find(buckets.width == s1, 1) ;
if isempty(bucketMatch)
  error('evaluate_net_avgpool:unsupportedWidth', ...
        'Voice input width %d does not match any entry of buckets.width.', s1) ;
end
p1 = buckets.pool(bucketMatch) ;

poolLayerIdx = find(strcmp({net.layers(:).name}, 'pool6_voice'), 1) ;
if isempty(poolLayerIdx)
  error('evaluate_net_avgpool:missingLayer', ...
        'The network has no layer named ''pool6_voice''.') ;
end
% change the average pool layer size depending on the length of the test audio segment
net.layers(poolLayerIdx).block.poolSize = [1 p1] ;

softmaxVarIdx = net.getVarIndex('softmax') ;
% flag the output as precious before evaluating (presumably so that its
% value is still available after eval - confirm against DagNN docs)
net.vars(softmaxVarIdx).precious = true ;
net.eval({'input_face1',image_f1, 'input_face2', image_f2,'input_voice', image_v}) ;
x = gather(net.vars(softmaxVarIdx).value) ;
end
end