forkAlexNet.m
function [net] = forkAlexNet(net, new_layers, varargin)
% FORKALEXNET construct a graphnn from the AlexNet or VGG-16 by branching off new task-specific layers.
% Input:
% NEW_LAYERS (scalar) the number of task-specific weight (conv/fc) layers to branch off and copy.
% !!! Not to be confused with the 'new_layers' option, which is the number of brand-new layers added on top of that!
% Options:
% See code comments
%
% Authors: Zhizhong Li
%
% See the COPYING file.
opts.mode = 'multiclass'; % type of last layer of the new path
opts.newtaskdim = 20; % # output of last layer
opts.init_scale = 0.01; % scale for randomly initializing new neurons
opts.weightInitMethod = 'gaussian'; % when randomly initializing, gaussian (fixed scale) or glorot (xavier; derive the scale from the layer's input/output dimensions)
opts.init_bias = 0; % what to init the biases as
opts.redo_layers = 2; % the number of existing layers to be re-initialized (old weights thrown away)
opts.new_layers = 0; % the number of **additional** layers to be added in the new-task-specific layers
opts.copy_source = 1; % which net.paths{} path should we copy from
opts.orig_loss = 'for_training'; % for_training: softmaxloss, for_eval: softmax, for_keep: custom, else: don't touch
opts.keep_response_loss = 'MI'; % MI for mutual information, L1 for L1-norm. only works when orig_loss is 'for_keep'.
opts.distillation_temp = 2; % only used when opts.keep_response_loss is 'MI'.
opts.origfc_adddropout = true;
opts = vl_argparse(opts, varargin);
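% Usage sketch (illustrative only; the model file name and option values
% below are assumptions, not prescribed by this repository):
%   tmp = load('imagenet-caffe-alex.mat') ;  % a matconvnet-format AlexNet
%   net = forkAlexNet(tmp, 2, 'mode', 'multiclass', 'newtaskdim', 20, ...
%                     'orig_loss', 'for_keep', 'keep_response_loss', 'MI') ;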
networkver = isfield(net, 'paths'); % 1 for vl_graphnn, 0 for vl_simplenn
% modify net to add dropouts... because the given networks do not have them
if opts.origfc_adddropout
assert(networkver == 0, 'graphnn should already have dropout');
n_layers_orig = numel(net.layers);
assert(n_layers_orig == 21 || n_layers_orig == 37);
net.layers{end+1} = struct('type', 'dropout', 'rate', 0.5) ;
net.layers{end+1} = struct('type', 'dropout', 'rate', 0.5) ;
% net.layers = net.layers([ 1:15, 16,17,22, 18,19,23, 20,21 ]); % for AlexNet
net.layers = net.layers([ 1:(n_layers_orig-6), n_layers_orig + [-5,-4,1, -3,-2,2, -1,0] ]); % AlexNet and VGG
end
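% (The permutation above slots the two fresh dropout layers in right after
% the ReLUs that follow fc6 and fc7; the commented index list shows the
% 21-layer AlexNet case, and the generic form covers 37-layer VGG-16 too.)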
if networkver == 0, net.paths = {1:numel(net.layers)}; end
n_paths_orig = numel(net.paths);
n_layers_orig = numel(net.paths{opts.copy_source});
assert( isnumeric(new_layers) && numel(new_layers)==1 );
assert(new_layers > 0, 'Must copy SOME layer; otherwise, what are you doing');
assert(opts.redo_layers > 0, 'Assuming the last weight should be replaced.');
% determine num of copied-over new layers
n_layers_conv = 0;
n_layers_copied = 0;
n_layers_redo = 0;
for i=1:n_layers_orig
if strcmp(net.layers{net.paths{opts.copy_source}(end-i+1)}.type, 'conv')
n_layers_conv = n_layers_conv + 1;
end
if n_layers_conv == new_layers && ~n_layers_copied, n_layers_copied = i; end
if n_layers_conv == opts.redo_layers && ~n_layers_redo, n_layers_redo = i; end
if n_layers_copied && n_layers_redo
break;
end
end
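% Worked trace (illustrative, assuming new_layers == 2 and the default
% redo_layers == 2 on the dropout-augmented AlexNet above): walking back
% from the end, i=1 is the loss layer (not 'conv'); i=2 is fc8, which
% matconvnet stores as a 'conv' layer; dropout7 and relu7 follow; i=5 is
% fc7, so n_layers_copied = n_layers_redo = 5.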
assert(n_layers_copied > 0, 'Cannot branch and copy more layers than there are');
assert(n_layers_redo > 0, 'Cannot re-initialize more layers than there are');
assert(n_layers_redo <= n_layers_copied, 'Cannot re-initialize layers that are not branched');
% deal with "for_keep"'s adding a keep response loss layer.
% [ HACK ] Now, only adding the LAST task's keep response loss layer is supported.
% "Last task" means the original task for the first new task, and the last new task for all following new tasks.
if strcmp(opts.orig_loss, 'for_keep')
% task "n+1" [ HACK ] GUESSING WHAT THE LAST LOSS SHOULD BE without specifying it via an input!
% [ HACK ] for now, we guess multiclass for the 1st task and the others follow the new task's mode!
net.layers{end+1}.type = 'custom';
net.layers{end}.name = 'keepold';
if n_paths_orig > 1 % the previous task was itself a new task, so keep its response in the new-task form!
switch opts.mode
case {'multiclass', 'multiclasshinge'}
layeropts.origstyle = 'multiclass';
case 'multilabel'
layeropts.origstyle = 'multilabel';
end
else % the previous task is the original task, so keep its response in the old-task (multiclass) form!
layeropts.origstyle = 'multiclass';
end
layeropts.temperature = opts.distillation_temp;
layeropts.mode = opts.keep_response_loss;
net.layers{end}.forward = getFwdHandle( @vl_nnsoftmaxdiff, layeropts );
net.layers{end}.backward = getBkwdHandle( @vl_nnsoftmaxdiff, layeropts );
% insert before new task
net.paths = { net.paths{:}, [ net.paths{end}(1:end-1), numel(net.layers) ] };
end
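% (With keep_response_loss 'MI' and distillation_temp 2, vl_nnsoftmaxdiff
% presumably penalizes the difference between temperature-softened softmax
% responses, in the spirit of knowledge distillation; 'L1' instead matches
% the raw responses under an L1 penalty. This reading is inferred from the
% option names, not from vl_nnsoftmaxdiff itself.)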
% copy layers
n_layers_orig_allpath = numel(net.layers);
net.layers(end+1:end+n_layers_copied) = net.layers(net.paths{opts.copy_source}(end-n_layers_copied+1:end));
net.layers{end}.type = 'softmaxloss';
net.layers{end}.name = 'newtaskloss';
if n_paths_orig > 1, net.layers{end}.name = sprintf('newtask%dloss', n_paths_orig+1); end
% brand-new layers to add to the new-task-specific branch
newlayers = {};
for i=1:opts.new_layers
newlayers = [newlayers, constructNewLayer(sprintf('+%d', i), [1 1 4096 4096])]; % only construct them (a hedged sketch of constructNewLayer appears at the end of this file); the weights are initialized later
end
n_layers_new = n_layers_copied + numel(newlayers);
n_layers_redo = n_layers_redo + numel(newlayers);
assert(n_layers_copied >= 2, 'Assuming copying over at least a loss layer and its corresponding weight layer');
net.layers = [ net.layers(1:end-2), newlayers, net.layers(end-1:end) ];
% update paths to reflect newly added path and newly added layers
net.paths = { net.paths{:}, [net.paths{opts.copy_source}(1:(n_layers_orig-n_layers_copied)), n_layers_orig_allpath+(1:n_layers_new)] };
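% Schematically, for a net forked once from a converted simplenn:
%   net.paths{1}   = [shared trunk ... old fc8, old loss]
%   net.paths{end} = [shared trunk ... copied/new layers, new loss]
% where the new path reuses the first (n_layers_orig - n_layers_copied)
% indices of copy_source and then points at the freshly appended layers.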
% change new task loss and # classes
new_fc_last = n_layers_orig_allpath+n_layers_new-1; assert(new_fc_last == net.paths{end}(end-1));
switch opts.mode
case {'multiclass', 'multiclasshinge'}
for i = 1:numel(net.layers{new_fc_last}.weights)
weight_size = size(net.layers{new_fc_last}.weights{i});
weight_size(end) = opts.newtaskdim;
net.layers{new_fc_last}.weights{i} = zeros(weight_size, 'single');
end
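% e.g. for a stock fc8 this turns weights{1} from [1 1 4096 1000] into
% zeros of size [1 1 4096 opts.newtaskdim] (and the biases likewise);
% the zeros are placeholders overwritten by the random init further down.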
% type defaults to 'softmaxloss', according to what we did when copying layers
if strcmp(opts.mode, 'multiclasshinge')
net.layers{end}.type = 'custom';
net.layers{end}.name = 'multiclasshinge';
if n_paths_orig > 1, net.layers{end}.name = sprintf('multiclasshinge%d', n_paths_orig+1); end
layeropts.loss = 'mshinge';
net.layers{end}.forward = getFwdHandle( @vl_nnloss_future, layeropts );
net.layers{end}.backward = getBkwdHandle( @vl_nnloss_future, layeropts );
end
case 'multilabel'
for i = 1:numel(net.layers{new_fc_last}.weights)
weight_size = size(net.layers{new_fc_last}.weights{i});
weight_size(end) = opts.newtaskdim;
net.layers{new_fc_last}.weights{i} = zeros(weight_size, 'single');
end
net.layers{end}.type = 'custom';
net.layers{end}.name = 'multilogreg';
if n_paths_orig > 1, net.layers{end}.name = sprintf('multilogreg%d', n_paths_orig+1); end
net.layers{end}.forward = getFwdHandle( @vl_nnlogregloss );
net.layers{end}.backward = getBkwdHandle( @vl_nnlogregloss );
otherwise
end
% rand init (not sure what to do about other fields e.g. momentum)
for i_layers = net.paths{end}(end-n_layers_redo+1:end)
if ~isfield(net.layers{i_layers}, 'weights'), continue; end
switch opts.weightInitMethod
case 'gaussian'
sc = opts.init_scale ;
case 'glorot'
if ~isa(net.layers{i_layers}.weights, 'cell')
sz = size(net.layers{i_layers}.weights);
else
sz = size(net.layers{i_layers}.weights{1});
end
sc = sqrt(1/(sz(1)*sz(2)*(sz(3)+sz(4))/opts.init_scale)) ;
otherwise
throw(MException('forkAlexNet:weightInitMethod','Unrecognized weightInitMethod value'));
end
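% e.g. (illustrative) a 1x1x4096x4096 fc layer with the default
% init_scale = 0.01 under 'glorot' gives sc = sqrt(0.01/8192) ~= 1.1e-3.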
if ~isa(net.layers{i_layers}.weights, 'cell')
% bare array: the layer stores a single weight tensor (no separate biases)
net.layers{i_layers}.weights = sc * ...
randn(size(net.layers{i_layers}.weights), 'single');
else
% cell pair: weights{1} is the filter bank, weights{2} the biases
assert(numel(net.layers{i_layers}.weights) == 2);
net.layers{i_layers}.weights{1} = single(sc) * ...
randn(size(net.layers{i_layers}.weights{1}), 'single');
net.layers{i_layers}.weights{2} = single(opts.init_bias) + ...
zeros(size(net.layers{i_layers}.weights{2}), 'single');
end
end
% set the original task's loss layer according to opts.orig_loss
switch opts.orig_loss
case 'for_training'
net.layers{n_layers_orig}.type = 'softmaxloss';
net.layers{n_layers_orig}.name = 'loss';
case 'for_eval'
net.layers{n_layers_orig}.type = 'softmax';
net.layers{n_layers_orig}.name = 'prob';
case 'for_keep'
net.layers{n_layers_orig}.type = 'softmaxloss';
net.layers{n_layers_orig}.name = 'origloss';
end
% -------------------------------------------------------------------------
function [ fo ] = getFwdHandle( fi, opts )
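% GETFWDHANDLE binds FI (and optional OPTS) into a handle with the
% (layer, resi, resi_next) signature that matconvnet expects from the
% forward pass of a 'custom' layer.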
if nargin < 2, opts = []; end
fo = @(layer, resi, resi_next) Forward(layer, resi, resi_next, fi, opts);
% -------------------------------------------------------------------------
function [ resi_next ] = Forward(layer, resi, resi_next, fi, opts)
if isempty(opts)
resi_next.x = fi(resi.x, layer.class);
else
resi_next.x = fi(resi.x, layer.class, [], opts);
end
% -------------------------------------------------------------------------
function [ fo ] = getBkwdHandle( fi, opts )
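% GETBKWDHANDLE same wrapping for the backward pass: the returned handle
% fills resi.dzdx from resi_next.dzdx via FI.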
if nargin < 2, opts = []; end
fo = @(layer, resi, resi_next) Backward(layer, resi, resi_next, fi, opts);
% -------------------------------------------------------------------------
function [ resi ] = Backward(layer, resi, resi_next, fi, opts)
if isempty(opts)
resi.dzdx = fi(resi.x, layer.class, resi_next.dzdx);
else
resi.dzdx = fi(resi.x, layer.class, resi_next.dzdx, opts);
end
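% -------------------------------------------------------------------------
% NOTE: constructNewLayer, called above, lives elsewhere in the repository.
% Judging only from its call site (a name suffix plus a 1x1x4096x4096 fc
% size, weights initialized later), a minimal sketch might look like the
% following; the ReLU/dropout companions and all names are assumptions:
%
%   function layers = constructNewLayer(suffix, sz)
%   layers = { ...
%     struct('type', 'conv', 'name', ['fc' suffix], ...
%            'weights', {{zeros(sz, 'single'), zeros(1, sz(4), 'single')}}, ...
%            'stride', 1, 'pad', 0), ...
%     struct('type', 'relu', 'name', ['relu' suffix]), ...
%     struct('type', 'dropout', 'name', ['drop' suffix], 'rate', 0.5) } ;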