# ==============================================================================
# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================
'''
Blocks in the network that are used layer-like, i.e. layered on top of each other,
e.g. a fully-connected layer with non-linearity.
'''
from __future__ import division
import numpy as np
from ..ops.functions import Function, BlockFunction
from ..variables import Parameter, Record, Constant
import cntk as C
from ..ops import times, convolution, convolution_transpose, pooling, unpooling, batch_normalization, dropout, splice, reshape, sequence, reduce_mean, sqrt
from cntk.internal import _as_tuple
from cntk.cntk_py import sentinel_value_for_auto_select_random_seed as SentinelValueForAutoSelectRandomSeed
from .blocks import _initializer_for, _INFERRED, identity, UntestedBranchError # helpers
from cntk.default_options import is_default_override, get_default_override, default_override_or

def Dense(shape, activation=default_override_or(identity), init=default_override_or(C.glorot_uniform()),
input_rank=None, map_rank=None,
bias=default_override_or(True), init_bias=default_override_or(0),
name=''):
'''
Dense(shape, activation=identity, init=glorot_uniform(), input_rank=None, map_rank=None, bias=True, init_bias=0, name='')
Layer factory function to create an instance of a fully-connected linear layer of the form
`activation(input @ W + b)` with weights `W` and bias `b`, and `activation` and `b` being optional.
`shape` may describe a tensor as well.
A ``Dense`` layer instance owns its parameter tensors `W` and `b`, and exposes them as attributes ``.W`` and ``.b``.
Example:
>>> f = Dense(5, activation=C.relu)
>>> x = C.input_variable(3)
>>> h = f(x)
>>> h.shape
(5,)
>>> f.W.shape
(3, 5)
>>> f.b.value
array([ 0., 0., 0., 0., 0.], dtype=float32)
>>> # activation through default options
>>> with C.default_options(activation=C.relu):
... f = Dense(500)
The ``Dense`` layer can be applied to inputs that are tensors, not just vectors.
    This is useful, e.g., at the top of an image-processing cascade, where after many
    convolutions with padding and strides it is difficult to know the precise output dimensions.
For this case, CNTK has an extended definition of matrix product, in which
the input tensor will be treated as if it had been automatically flattened.
The weight matrix will be a tensor that reflects the "flattened" dimensions in its axes.
Example:
>>> f = Dense(5, activation=C.softmax) # a 5-class classifier
>>> x = C.input_variable((64,16,16)) # e.g. an image reduced by a convolution stack
>>> y = f(x)
>>> y.shape
(5,)
>>> f.W.shape # "row" dimension of "matrix" consists of 3 axes that match the input
(64, 16, 16, 5)
This behavior can be modified by telling CNTK either the number of axes that should not be projected (``map_rank``)
or the rank of the input (``input_rank``). If neither is specified, all input dimensions are
projected, as in the example above.
Example:
>>> f = Dense(5, activation=C.softmax, input_rank=2) # a 5-class classifier
>>> x = C.input_variable((10, 3, 3)) # e.g. 10 parallel 3x3 objects. Input has input_rank=2 axes
>>> y = f(x)
>>> y.shape # the 10 parallel objects are classified separately, the "10" dimension is retained
(10, 5)
>>> f.W.shape # "row" dimension of "matrix" consists of (3,3) matching the input axes to project
(3, 3, 5)
>>> f = Dense(5, activation=C.softmax, map_rank=2)
>>> x = C.input_variable((4, 6, 3, 3, 3)) # e.g. 24 parallel 3x3x3 objects arranged in a 4x6 grid. The grid is to be retained
>>> y = f(x)
>>> y.shape # the 4x6 elements are classified separately, the grid structure is retained
(4, 6, 5)
>>> f.W.shape # "row" dimension of "matrix" consists of (3,3) matching the input axes to project
(3, 3, 3, 5)
>>> z = y([np.zeros(x.shape)])
>>> assert z.shape == (1, 4, 6, 5)
Args:
shape (`int` or `tuple` of `ints`): vector or tensor dimension of the output of this layer
activation (:class:`~cntk.ops.functions.Function`, defaults to identity): optional function to apply at the end, e.g. `relu`
init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
input_rank (int, defaults to `None`): number of inferred axes to add to W (`map_rank` must not be given)
map_rank (int, defaults to `None`): expand W to leave exactly `map_rank` axes (`input_rank` must not be given)
bias (bool, optional, defaults to `True`): the layer will have no bias if `False` is passed here
init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
name (str, defaults to ''): the name of the function instance in the network
Returns:
cntk.ops.functions.Function:
A function that accepts one argument and applies the operation to it
'''
activation = get_default_override(Dense, activation=activation)
init = get_default_override(Dense, init=init)
bias = get_default_override(Dense, bias=bias)
init_bias = get_default_override(Dense, init_bias=init_bias)
output_shape = _as_tuple(shape)
if input_rank is not None and map_rank is not None:
raise ValueError("Dense: input_rank and map_rank cannot be specified at the same time.")
# determine meaning of axes
# W gets dimension (input_shape + shape)
# where input_shape is determined as:
# - by default, equal to the dimensions of the input passed to Dense()
# - if input_rank is given, then the last 'input_rank' dimensions of the input (all others are not reduced over)
    # - if map_rank is given, then all but the first 'map_rank' dimensions of the input (those are not reduced over)
# where input_rank and map_rank are mutually exclusive.
output_rank = len(output_shape) # support outputs with tensor layouts
# If input_rank not given then pass a single _INFERRED; map_rank if given will determine the input_rank.
# The dimension inference may still create multiple axes.
input_shape = _INFERRED * (input_rank if input_rank is not None else 1)
if input_rank is not None:
infer_input_rank_to_map = -1 # means map_rank is not specified; input_rank rules
elif map_rank is None:
infer_input_rank_to_map = 0 # neither given: default to 'infer W to use all input dims'
else:
infer_input_rank_to_map = map_rank # infer W to use all input dims except the first static 'map_rank' ones
# parameters bound to this Function
init_weights = _initializer_for(init, Record(output_rank=output_rank))
W = Parameter(input_shape + output_shape, init=init_weights, name='W')
b = Parameter( output_shape, init=init_bias, name='b') if bias else None
# expression of this function
@BlockFunction('Dense', name)
def dense(x):
r = times(x, W, output_rank=output_rank, infer_input_rank_to_map=infer_input_rank_to_map)
if b:
r = r + b
if activation is not None:
r = activation(r)
return r
return dense

def Embedding(shape=None, init=default_override_or(C.glorot_uniform()), weights=None, name=''):
'''
Embedding(shape=None, init=glorot_uniform(), weights=None, name='')
    Layer factory function to create an embedding layer.
An embedding is conceptually a lookup table. For every input token (e.g. a word or any category label), the corresponding
entry in the lookup table is returned.
In CNTK, discrete items such as words are represented as one-hot vectors.
The table lookup is realized as a matrix product, with a matrix
whose rows are the embedding vectors.
Note that multiplying a matrix from the left with a one-hot vector is the same as copying
out the row for which the input vector is 1.
CNTK has special optimizations to make this operation as efficient as an actual table lookup if the input is sparse.
The lookup table in this layer is learnable,
unless a user-specified one is supplied through the ``weights`` parameter.
For example, to use an existing embedding table from a file in numpy format, use this::
Embedding(weights=np.load('PATH.npy'))
To initialize a learnable lookup table with a given numpy array that is to be used as
the initial value, pass that array to the ``init`` parameter (not ``weights``).
An ``Embedding`` instance owns its weight parameter tensor `E`, and exposes it as an attribute ``.E``.
Example:
>>> # learnable embedding
>>> f = Embedding(5)
>>> x = C.input_variable(3)
>>> e = f(x)
>>> e.shape
(5,)
>>> f.E.shape
(3, 5)
>>> # user-supplied embedding
>>> f = Embedding(weights=[[.5, .3, .1, .4, .2], [.7, .6, .3, .2, .9]])
>>> f.E.value
array([[ 0.5, 0.3, 0.1, 0.4, 0.2],
[ 0.7, 0.6, 0.3, 0.2, 0.9]], dtype=float32)
>>> x = C.input_variable(2, is_sparse=True)
>>> e = f(x)
>>> e.shape
(5,)
>>> e(C.Value.one_hot([[1], [0], [0], [1]], num_classes=2))
array([[ 0.7, 0.6, 0.3, 0.2, 0.9],
[ 0.5, 0.3, 0.1, 0.4, 0.2],
[ 0.5, 0.3, 0.1, 0.4, 0.2],
[ 0.7, 0.6, 0.3, 0.2, 0.9]], dtype=float32)
Args:
shape (`int` or `tuple` of `ints`): vector or tensor dimension of the output of this layer
init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): (learnable embedding only) initial value of weights `E`
     weights (NumPy array, mutually exclusive with ``init``, defaults to `None`): (user-supplied embedding only) the lookup table.
The matrix rows are the embedding vectors, ``weights[i,:]`` being the embedding that corresponds to input category `i`.
name (str, defaults to ''): the name of the function instance in the network
Returns:
cntk.ops.functions.Function:
A function that accepts one argument and applies the embedding operation to it
'''
if not is_default_override(init) and weights is not None:
raise ValueError('Embedding: init and weights options are mutually exclusive')
# parameters bound to this Function:
# no weights given: learn the embedding
if weights is None:
if shape is None:
raise ValueError('Embedding: output shape must be specified')
init = get_default_override(Embedding, init=init)
shape = _as_tuple(shape)
weight_shape = _INFERRED + shape
E = Parameter(weight_shape, init=init, name='E')
# weights given: use them as constant
else:
        weights = np.array(weights)
weight_shape = np.shape(weights)
if shape is not None: # user may give shape, then it must match
raise ValueError('Embedding: output shape must not be specified when weights are given')
E = Constant(weights, name='E')
# expression
@BlockFunction('Embedding', name)
def embed(x):
return times(x,E)
return embed

def _window(x, axis, begin, end, step, stride, initial_state=None):
'''
helper to expand a sequence into a window, splicing them along the given axis (which must already exist)
'''
shifted = [
sequence.past_value(x, initial_state=initial_state, time_step=-t) if t < 0 else
x if t == 0 else
sequence.future_value(x, initial_state=initial_state, time_step=t)
for t in range(begin, end, step)
]
r = splice(*shifted, axis=axis)
if stride != 1:
raise NotImplementedError('windowed convolution with stride not yet implemented')
return r
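# For example, _window(x, axis=0, begin=-1, end=2, step=1, stride=1) splices
# [sequence.past_value(x), x, sequence.future_value(x)] along axis 0, i.e. a centered
# width-3 window per sequence step (an illustrative sketch; `axis` must already exist in x).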

# helper to expand options that can be specified as a single value
def _pad_to_shape(filter_shape, param, what):
param = _as_tuple(param)
if len(param) == 1: # broadcast
while len(param) < len(filter_shape):
param = (param[0],) + param
if len(param) != len(filter_shape):
raise ValueError("{} parameter ({}) must be a scalar or have same number of elements as the filter_shape parameter ({})".format(what, param, filter_shape))
return param
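# For example, _pad_to_shape((3,3), 2, 'strides') broadcasts the scalar stride to (2, 2),
# while _pad_to_shape((3,3), (2,1), 'strides') passes the per-axis tuple through unchanged.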

# Sequential Convolution -- create a sequential convolution layer with optional non-linearity
# This is the newer version that supports ND sequential convolution with arbitrary strides.
# ( (sample shape) + (output shape) + (reduction shape) + (spatial shape) )
# in : ( (sample shape) + + (reduction shape) + (spatial shape) )
# kernel : ( + (output shape) + (reduction shape) + (rec field shape) )
# out : ( (sample shape) + (output shape) + + (spatial shape) )
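# e.g. with a 3-channel input, num_filters=128 and filter_shape=(5,4) (5 along the sequence
# axis), the kernel has shape (128, 3, 5, 4) and each output step has shape (128, 637) for
# input steps of shape (3, 640), as in the docstring example below.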
def SequentialConvolution(filter_shape, # shape of receptive field, e.g. (3,3). filter_shape[0] is for sequence axis.
num_filters=None, # e.g. 64 or None (which means 1 channel and don't add a dimension)
activation=default_override_or(identity),
init=default_override_or(C.glorot_uniform()),
pad=default_override_or(False),
strides=1,
sharing=True, # (must be True currently)
bias=default_override_or(True),
init_bias=default_override_or(0),
reduction_rank=1, # (0 means input has no depth dimension, e.g. audio signal or B&W image) --TODO: call it item_rank?
transpose_weight=False, # (must be False currently)
dilation=1,
groups = 1,
max_temp_mem_size_in_samples=0,
op_name='Convolution', name=''):
'''
SequentialConvolution(filter_shape, num_filters=None, activation=identity, init=glorot_uniform(), pad=False, strides=1, sharing=True, bias=True, init_bias=0, reduction_rank=1, transpose_weight=False, dilation=1, groups=1, max_temp_mem_size_in_samples=0, op_name='Convolution', name='')
Layer factory function to create a sequential convolution layer.
    This implements a sequential convolution operation that is almost identical to the regular convolution operation,
    except that it also convolves along the dynamic (sequence) axis, with ``filter_shape[0]`` applied to that axis.
The dimension of the input items (input feature-map depth) is not to be specified. It is known from the input.
The dimension of the output items (output feature-map depth) generated for each item position is given by ``num_filters``.
This is useful for variable-length inputs, such as video
or natural-language processing (word n-grams).
Note, however, that convolution does not support sparse inputs.
    Both input and output items can be scalars instead of vectors. For scalar-valued input items,
such as pixels on a black-and-white image, or samples of an audio clip, specify ``reduction_rank=0``.
If the output items are scalar, pass ``num_filters=()`` or ``None``.
    A ``SequentialConvolution`` instance owns its weight parameter tensors `W` and `b`, and exposes them as attributes ``.W`` and ``.b``.
The weights will have the shape ``(num_filters, input_feature_map_depth, *filter_shape)``
Example:
     >>> # 2D sequential convolution of 5x4 receptive field with output feature-map depth 128:
     >>> from cntk.layers.typing import Sequence, Tensor
     >>> f = SequentialConvolution((5,4), 128, activation=C.relu)
     >>> x = C.input_variable(**Sequence[Tensor[3,640]]) # 3-channel color image; the actual shape is [width] x [channel, height], where width lies on the sequence axis and can take arbitrary length
>>> h = f(x)
>>> h.shape
(128, 637)
>>> f.W.shape # will have the form (num_filters, input_depth, *filter_shape)
(128, 3, 5, 4)
     >>> # 2D sequential convolution over a one-channel black-and-white image, with padding and stride 2 along the height dimension
     >>> f = SequentialConvolution((3,3), 128, reduction_rank=0, pad=True, strides=(1,2), activation=C.relu)
     >>> x = C.input_variable(**Sequence[Tensor[640]]) # as above, the image height is 640 while the width is arbitrary and lies on the sequence axis
>>> h = f(x)
>>> h.shape
(128, 320)
>>> f.W.shape
(128, 1, 3, 3)
Args:
filter_shape (`int` or `tuple` of `ints`): shape (spatial extent) of the receptive field, *not* including the input feature-map depth. E.g. (3,3) for a 2D convolution.
num_filters (int, defaults to `None`): number of filters (output feature-map depth), or ``()`` to denote scalar output items (output shape will have no depth axis).
activation (:class:`~cntk.ops.functions.Function`, defaults to `identity`): optional function to apply at the end, e.g. `relu`
init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
     pad (`bool` or `tuple` of `bools`, defaults to `False`): if `False`, the filter is shifted only over the "valid"
      area of the input, that is, no value outside the area is used. If ``pad=True``, on the other hand,
      the filter is applied to all input positions, and positions outside the valid region are treated as zero.
      Use a `tuple` to specify a per-axis value.
strides (`int` or `tuple` of `ints`, defaults to 1): stride of the convolution (increment when sliding the filter over the input). Use a `tuple` to specify a per-axis value.
     sharing (bool, defaults to `True`): when `True`, every position uses the same convolution kernel. `False` (a different kernel per position) is not currently supported.
bias (bool, optional, defaults to `True`): the layer will have no bias if `False` is passed here
init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
reduction_rank (`int`, defaults to 1): set to 0 if input items are scalars (input has no depth axis), e.g. an audio signal or a black-and-white image
that is stored with tensor shape (H,W) instead of (1,H,W)
     transpose_weight (bool, defaults to `False`): when `True`, the operation is a true convolution; otherwise it is a correlation (the common choice in most toolkits)
     dilation (tuple, optional): the dilation value along each axis; the default of 1 means no dilation.
     groups (`int`, defaults to 1): number of groups during convolution, which controls the connections between input and output channels. The default value of 1
      means that all input channels are convolved to produce all output channels. A value of N means that the input (and output) channels are
      divided into N groups, with the input channels of one group (say, the i-th input group) contributing to the output channels of only one group (the i-th output group).
      The numbers of input and output channels must be divisible by the value of the groups argument, and the value must be strictly positive, i.e. groups > 0.
     max_temp_mem_size_in_samples (int, defaults to 0): limits the amount of memory for intermediate convolution results. A value of 0 means memory is managed automatically.
name (str, defaults to ''): the name of the function instance in the network
Returns:
cntk.ops.functions.Function:
A function that accepts one argument and applies the sequential convolution operation to it
'''
activation = get_default_override(SequentialConvolution, activation=activation)
init = get_default_override(SequentialConvolution, init=init)
pad = get_default_override(SequentialConvolution, pad=pad)
bias = get_default_override(SequentialConvolution, bias=bias)
init_bias = get_default_override(SequentialConvolution, init_bias=init_bias)
# tuplify all tuple inputs that can also be given as scalars if rank 1
filter_shape = _as_tuple(filter_shape)
num_filters = _as_tuple(num_filters or ())
filter_rank = len(filter_shape)
strides = _pad_to_shape(filter_shape, strides, 'strides')
sharing = _pad_to_shape(filter_shape, sharing, 'sharing')
pad = _pad_to_shape(filter_shape, pad, 'pad')
dilation = _pad_to_shape(filter_shape, dilation, 'dilation')
if (reduction_rank != 0) and (reduction_rank != 1):
raise NotImplementedError("Convolution: reduction_rank must be 0 or 1")
if transpose_weight:
raise NotImplementedError("Convolution: transpose_weight option currently not supported")
if not sharing:
raise NotImplementedError("Convolution: sharing option currently must be True")
if (groups <= 0):
raise ValueError("Convolution: groups must be strictly positive, i.e. groups > 0.")
if (groups > 1):
raise ValueError("Convolution: groups > 1, is not currently supported by Convolution layer. For group convolution with groups > 1, use CNTK's low-level convolution node (cntk.convolution).")
# The convolution() function currently requires exactly one input and one output depth axis.
    # So we emulate those dimensions on this level. TODO: Once this is supported by the C++ code, remove the emulation here.
emulating_output_depth = num_filters == ()
emulating_input_depth = reduction_rank == 0
actual_output_channels_shape = num_filters if not emulating_output_depth else (1,)
actual_reduction_shape = _INFERRED
actual_filter_shape = filter_shape
# add the dimension to the options as well
num_emulated_axes = emulating_input_depth
strides = (1,) * num_emulated_axes + strides
sharing = (True,) * num_emulated_axes + sharing
pad = (False,) * num_emulated_axes + pad
kernel_shape = actual_reduction_shape + actual_filter_shape # kernel := filter plus reductionDims
# init can be an np.array, which must have the correct dimensions subject to faking depth
# Once we no longer fake depth at this outer level, we can remove this.
if isinstance(init, np.ndarray):
if reduction_rank != 0:
raise ValueError("a constant initializer can currently only used without reduction dimension")
# TODO: Test whether this is needed. We should instead just take whatever reduction dimension is given here as that of the input.
nominal_W_shape = num_filters + filter_shape
if init.shape != nominal_W_shape:
raise ValueError("a constant initializer was passed that is of wrong shape")
init_kernel = init.reshape(actual_output_channels_shape + kernel_shape) # make it fit
else:
init_kernel = _initializer_for(init, Record(filter_rank=filter_rank, output_rank=-len(actual_output_channels_shape)))
# parameters bound to this Function
# For sequential we must reduce bias filter rank by 1, as we get the rank from kernel filter shape, and that contains the seq axis which should be omitted.
bias_filter_rank = len(actual_filter_shape) - 1
W = Parameter(actual_output_channels_shape + kernel_shape, init=init_kernel, name='W') # (K, C, H, W) aka [ W x H x C x K ]
b = Parameter(actual_output_channels_shape + (1,) * bias_filter_rank, init=init_bias, name='b') if bias else None # (K, 1, 1) aka [ 1 x 1 x K ]
# TODO: Should we cater to the special case of 1D convolution for text? I.e. sequential only (filter_shape=()).
# In that case, the convolution is the embedding, and we should use a matrix product to support sparse inputs.
# Or add sparse support to splice().
# expression
@BlockFunction('Convolution', name)
def convolve(x):
# insert additional axes for various purposes
        filter_rank_without_seq = filter_rank - 1 # spatial part of the filter: the first of the filter_rank axes belongs to the sequential dimension, so subtract 1
        num_inserted_axes = num_emulated_axes # the sequential reshape is handled on the C++ side now
if num_inserted_axes != 0:
# x: (in_depth, spatial_shape)
x = reshape(x, (1,) * num_inserted_axes, # e.g. (2000, 480, 640) -> (2000, 1, 480, 640)
begin_axis=-filter_rank_without_seq if filter_rank_without_seq != 0 else C.Axis.new_leading_axis(),
end_axis =-filter_rank_without_seq if filter_rank_without_seq != 0 else None)
# x: (in_depth or emulated_in_depth, emulated_1D_extra, seq_filter_shape, spatial_shape)
# actual convolution
r = convolution (W, x,
strides=strides, sharing=sharing,
auto_padding=pad,
sequential=True,
dilation=dilation,
groups=groups,
max_temp_mem_size_in_samples=max_temp_mem_size_in_samples)
if bias:
r = r + b
# if no output dimension is desired, then strip it
# also need to strip the fake singleton axes, since they are not reduced away
# TODO: We still have those axes in the kernel. Solve this once the C++ implementation supports 1D directly.
num_axes_to_remove = emulating_output_depth
if num_axes_to_remove > 0:
# (out_depth, emulated axes, spatial_shape)
r = reshape(r, (), # e.g. (2000, 1, 480, 640) -> (2000, 480, 640)
begin_axis=-filter_rank_without_seq - num_axes_to_remove, # no need for Axis.new_leading_axis() since expression < 0 guaranteed
end_axis =-filter_rank_without_seq if filter_rank_without_seq != 0 else None)
# (out_depth, spatial_shape)
if activation is not None:
r = activation(r)
return r
return convolve

# Convolution -- create a convolution layer with optional non-linearity
# ( (sample shape) + (output shape) + (reduction shape) + (spatial shape) )
# in : ( (sample shape) + + (reduction shape) + (spatial shape) )
# kernel : ( + (output shape) + (reduction shape) + (rec field shape) )
# out : ( (sample shape) + (output shape) + + (spatial shape) )
# TODO: sharing = false? I'd need that for speech feature extraction.
# TODO: should we allow to pass fixed weights instead? Like for Embedding? E.g. audio filters
# TODO: this is not a convolution but a correlation, and W's shape has input and output depth swapped.
# Transposition of the weight matrix would do the right thing for both cases. Should we default to correctness, i.e. transpose_weight?
# TODO: conflict of parameter order: filter_shape or num_filters first?
# - filter_shape first is logical for non-NN applications such as straight image filtering
# - num_filters first is what Keras does
# TODO: add a test case for passing a numpy array as initial values
def Convolution(filter_shape, # shape of receptive field, e.g. (3,3)
num_filters=None, # e.g. 64 or None (which means 1 channel and don't add a dimension)
sequential=False, # time convolution if True (filter_shape[0] corresponds to dynamic axis)
activation=default_override_or(identity),
init=default_override_or(C.glorot_uniform()),
pad=default_override_or(False),
strides=1,
sharing=True, # (must be True currently)
bias=default_override_or(True),
init_bias=default_override_or(0),
reduction_rank=1, # (0 means input has no depth dimension, e.g. audio signal or B&W image) --TODO: call it item_rank?
transpose_weight=False, # (must be False currently)
dilation=1,
groups = 1,
max_temp_mem_size_in_samples=0,
op_name='Convolution', name=''):
'''
Convolution(filter_shape, num_filters=None, sequential=False, activation=identity, init=glorot_uniform(), pad=False, strides=1, sharing=True, bias=True, init_bias=0, reduction_rank=1, transpose_weight=False, dilation=1, groups=1, max_temp_mem_size_in_samples=0, op_name='Convolution', name='')
Layer factory function to create a convolution layer.
This implements a convolution operation over items arranged on an N-dimensional grid, such as pixels in an image.
Typically, each item is a vector (e.g. pixel: R,G,B), and the result is, in turn, a vector.
The item-grid dimensions are referred to as the *spatial* dimensions (e.g. dimensions of an image),
while the vector dimension of the individual items is often called *feature-map depth*.
For each item, convolution gathers a window ("receptive field") of items surrounding the item's position on the grid,
and applies a little fully-connected network to it (the same little network is applied to all item positions).
The size (spatial extent) of the receptive field is given by ``filter_shape``.
E.g. to specify a 2D convolution, ``filter_shape`` should be a tuple of two integers, such as `(5,5)`;
an example for a 3D convolution (e.g. video or an MRI scan) would be ``filter_shape=(3,3,3)``;
while for a 1D convolution (e.g. audio or text), ``filter_shape`` has one element, such as (3,) or just 3.
The dimension of the input items (input feature-map depth) is not to be specified. It is known from the input.
The dimension of the output items (output feature-map depth) generated for each item position is given by ``num_filters``.
If the input is a sequence, the sequence elements are by default treated independently.
To convolve along the sequence dimension as well, pass ``sequential=True``.
This is useful for variable-length inputs, such as video
or natural-language processing (word n-grams).
Note, however, that convolution does not support sparse inputs.
    Both input and output items can be scalars instead of vectors. For scalar-valued input items,
such as pixels on a black-and-white image, or samples of an audio clip, specify ``reduction_rank=0``.
If the output items are scalar, pass ``num_filters=()`` or ``None``.
    A ``Convolution`` instance owns its weight parameter tensors `W` and `b`, and exposes them as attributes ``.W`` and ``.b``.
The weights will have the shape ``(num_filters, input_feature_map_depth, *filter_shape)``
Example:
>>> # 2D convolution of 5x4 receptive field with output feature-map depth 128:
>>> f = Convolution((5,4), 128, activation=C.relu)
>>> x = C.input_variable((3,480,640)) # 3-channel color image
>>> h = f(x)
>>> h.shape
(128, 476, 637)
>>> f.W.shape # will have the form (num_filters, input_depth, *filter_shape)
(128, 3, 5, 4)
>>> # 2D convolution over a one-channel black-and-white image, padding, and stride 2 along width dimension
>>> f = Convolution((3,3), 128, reduction_rank=0, pad=True, strides=(1,2), activation=C.relu)
>>> x = C.input_variable((480,640))
>>> h = f(x)
>>> h.shape
(128, 480, 320)
>>> f.W.shape
(128, 1, 3, 3)
>>> # 3D convolution along dynamic axis over a sequence of 2D color images
>>> from cntk.layers.typing import Sequence, Tensor
>>> f = Convolution((2,5,4), 128, sequential=True, activation=C.relu) # over 2 consecutive frames
>>> x = C.input_variable(**Sequence[Tensor[3,480,640]]) # a variable-length video of 640x480 RGB images
>>> h = f(x)
>>> h.shape # this is the shape per video frame: 637x476 activation vectors of length 128 each
(128, 476, 637)
     >>> f.W.shape # (output feature-map depth, input depth, and the three filter dimensions)
(128, 3, 2, 5, 4)
Args:
filter_shape (`int` or `tuple` of `ints`): shape (spatial extent) of the receptive field, *not* including the input feature-map depth. E.g. (3,3) for a 2D convolution.
num_filters (int, defaults to `None`): number of filters (output feature-map depth), or ``()`` to denote scalar output items (output shape will have no depth axis).
sequential (bool, defaults to `False`): if `True`, also convolve along the dynamic axis. ``filter_shape[0]`` corresponds to dynamic axis.
activation (:class:`~cntk.ops.functions.Function`, defaults to `identity`): optional function to apply at the end, e.g. `relu`
init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
     pad (`bool` or `tuple` of `bools`, defaults to `False`): if `False`, the filter is shifted only over the "valid"
      area of the input, that is, no value outside the area is used. If ``pad=True``, on the other hand,
      the filter is applied to all input positions, and positions outside the valid region are treated as zero.
      Use a `tuple` to specify a per-axis value.
strides (`int` or `tuple` of `ints`, defaults to 1): stride of the convolution (increment when sliding the filter over the input). Use a `tuple` to specify a per-axis value.
     sharing (bool, defaults to `True`): when `True`, every position uses the same convolution kernel. `False` (a different kernel per position) is not currently supported.
bias (bool, optional, defaults to `True`): the layer will have no bias if `False` is passed here
init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
reduction_rank (`int`, defaults to 1): set to 0 if input items are scalars (input has no depth axis), e.g. an audio signal or a black-and-white image
that is stored with tensor shape (H,W) instead of (1,H,W)
     transpose_weight (bool, defaults to `False`): when `True`, the operation is a true convolution; otherwise it is a correlation (the common choice in most toolkits)
     dilation (tuple, optional): the dilation value along each axis; the default of 1 means no dilation.
     groups (`int`, defaults to 1): number of groups during convolution, which controls the connections between input and output channels. The default value of 1
      means that all input channels are convolved to produce all output channels. A value of N means that the input (and output) channels are
      divided into N groups, with the input channels of one group (say, the i-th input group) contributing to the output channels of only one group (the i-th output group).
      The numbers of input and output channels must be divisible by the value of the groups argument, and the value must be strictly positive, i.e. groups > 0.
     max_temp_mem_size_in_samples (int, defaults to 0): limits the amount of memory for intermediate convolution results. A value of 0 means memory is managed automatically.
name (str, defaults to ''): the name of the function instance in the network
Returns:
cntk.ops.functions.Function:
A function that accepts one argument and applies the convolution operation to it
'''
activation = get_default_override(Convolution, activation=activation)
init = get_default_override(Convolution, init=init)
pad = get_default_override(Convolution, pad=pad)
bias = get_default_override(Convolution, bias=bias)
init_bias = get_default_override(Convolution, init_bias=init_bias)
# tuplify all tuple inputs that can also be given as scalars if rank 1
filter_shape = _as_tuple(filter_shape)
num_filters = _as_tuple(num_filters or ())
filter_rank = len(filter_shape)
strides = _pad_to_shape(filter_shape, strides, 'strides')
sharing = _pad_to_shape(filter_shape, sharing, 'sharing')
pad = _pad_to_shape(filter_shape, pad, 'pad')
dilation = _pad_to_shape(filter_shape, dilation, 'dilation')
if (reduction_rank != 0) and (reduction_rank != 1):
raise NotImplementedError("Convolution: reduction_rank must be 0 or 1")
if transpose_weight:
raise NotImplementedError("Convolution: transpose_weight option currently not supported")
if not sharing:
raise NotImplementedError("Convolution: sharing option currently must be True")
if (groups <= 0):
raise ValueError("Convolution: groups must be strictly positive, i.e. groups > 0.")
if (groups > 1):
raise ValueError("Convolution: groups > 1, is not currently supported by Convolution layer. For group convolution with groups > 1, use CNTK's low-level convolution node (cntk.convolution).")
# The convolution() function currently requires exactly one input and one output depth axis.
    # So we emulate those dimensions on this level. TODO: Once this is supported by the C++ code, remove the emulation here.
emulating_output_depth = num_filters == ()
emulating_input_depth = reduction_rank == 0
actual_output_channels_shape = num_filters if not emulating_output_depth else (1,)
actual_reduction_shape = _INFERRED
actual_filter_shape = filter_shape
# add the dimension to the options as well
num_emulated_axes = emulating_input_depth
strides = (1,) * num_emulated_axes + strides
sharing = (True,) * num_emulated_axes + sharing
pad = (False,) * num_emulated_axes + pad
kernel_shape = actual_reduction_shape + actual_filter_shape # kernel := filter plus reductionDims
# init can be an np.array, which must have the correct dimensions subject to faking depth
# Once we no longer fake depth at this outer level, we can remove this.
if isinstance(init, np.ndarray):
if reduction_rank != 0:
raise ValueError("a constant initializer can currently only used without reduction dimension")
# TODO: Test whether this is needed. We should instead just take whatever reduction dimension is given here as that of the input.
nominal_W_shape = num_filters + filter_shape
if init.shape != nominal_W_shape:
raise ValueError("a constant initializer was passed that is of wrong shape")
init_kernel = init.reshape(actual_output_channels_shape + kernel_shape) # make it fit
else:
init_kernel = _initializer_for(init, Record(filter_rank=filter_rank, output_rank=-len(actual_output_channels_shape)))
# parameters bound to this Function
W = Parameter(actual_output_channels_shape + kernel_shape, init=init_kernel, name='W') # (K, C, H, W) aka [ W x H x C x K ]
b = Parameter(actual_output_channels_shape + (1,) * len(actual_filter_shape), init=init_bias, name='b') if bias else None # (K, 1, 1) aka [ 1 x 1 x K ]
# TODO: Should we cater to the special case of 1D convolution for text? I.e. sequential only (filter_shape=()).
# In that case, the convolution is the embedding, and we should use a matrix product to support sparse inputs.
# Or add sparse support to splice().
# expression
@BlockFunction('Convolution', name)
def convolve(x):
# insert additional axes for various purposes
        filter_rank_without_seq = filter_rank - sequential # spatial part of the filter: if sequential, the first of the filter_rank axes belongs to the sequential dimension, so subtract 1
num_inserted_axes = sequential + num_emulated_axes
if num_inserted_axes != 0:
# x: (in_depth, spatial_shape)
x = reshape(x, (1,) * num_inserted_axes, # e.g. (2000, 480, 640) -> (2000, 1, 480, 640)
begin_axis=-filter_rank_without_seq if filter_rank_without_seq != 0 else C.Axis.new_leading_axis(),
end_axis =-filter_rank_without_seq if filter_rank_without_seq != 0 else None)
# x: (in_depth or emulated_in_depth, emulated_1D_extra, seq_filter_shape, spatial_shape)
# sequential convolution is implemented through explicit stacking for now, since the C++ cannot handle it
# TODO: if reduction_rank==0 and sequential, we don't need the fake reduction axis, just use the sequential axis instead
if sequential:
lpad = (filter_shape[-filter_rank]-1) // 2 # even frames: take from right; odd frames: symmetric
x = _window(x, axis=-filter_rank, begin=-lpad, end=-lpad+filter_shape[-filter_rank], step=1, stride=strides[-filter_rank], initial_state=None)
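            # x now carries filter_shape[0] time-shifted copies of itself, stacked along the
            # emulated sequential axis, so the static convolution below can reduce over them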
# actual convolution
sequential_emulated_axis = len(pad) - filter_rank if sequential else None # static-axis convolution must not pad the simulated sequential dimension (it must reduce to 1)
r = convolution (W, x,
strides=strides, sharing=sharing,
auto_padding=(False,) * reduction_rank # convolution() currently has no reduction_rank parameter, so we must pass an explicit False for the reduction axis
+ tuple(p if i != sequential_emulated_axis else False for i, p in enumerate(pad)),
dilation=dilation,
groups=groups,
max_temp_mem_size_in_samples=max_temp_mem_size_in_samples)
# if sequential and not padding, then strip the extraneous boundary values
if sequential and not pad[-filter_rank]:
r = sequence.slice(r, begin_index=lpad, end_index=-(filter_shape[-filter_rank]-1-lpad))
if bias:
r = r + b
# if no output dimension is desired, then strip it
# also need to strip the fake singleton axes, since they are not reduced away
# TODO: We still have those axes in the kernel. Solve this once the C++ implementation supports 1D directly.
num_axes_to_remove = sequential + emulating_output_depth
if num_axes_to_remove > 0:
# (out_depth, emulated axes, spatial_shape)
r = reshape(r, (), # e.g. (2000, 1, 480, 640) -> (2000, 480, 640)
begin_axis=-filter_rank_without_seq - num_axes_to_remove, # no need for Axis.new_leading_axis() since expression < 0 guaranteed
end_axis =-filter_rank_without_seq if filter_rank_without_seq != 0 else None)
# (out_depth, spatial_shape)
if activation is not None:
r = activation(r)
return r
return convolve

# TODO: make sure the xD versions have all the needed parameters
def Convolution1D(filter_shape, # shape of receptive field, e.g. (3)
num_filters=None, # e.g. 64 or None (which means 1 channel and don't add a dimension)
activation=default_override_or(identity),
init=default_override_or(C.glorot_uniform()),
pad=default_override_or(False),
strides=1,
bias=default_override_or(True),
init_bias=default_override_or(0),
reduction_rank=1, # (0 means input has no depth dimension, e.g. audio signal or B&W image)
dilation=1,
name=''):
'''
    Convolution1D(filter_shape, num_filters=None, activation=identity, init=glorot_uniform(), pad=False, strides=1, bias=True, init_bias=0, reduction_rank=1, dilation=1, name='')
Layer factory function to create a 1D convolution layer with optional non-linearity.
Same as `Convolution()` except that filter_shape is verified to be 1-dimensional.
See `Convolution()` for extensive documentation.
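    Example (an illustrative sketch; the input shape is assumed, and the output sizes follow the valid-padding arithmetic of :func:`Convolution`):
     >>> f = Convolution1D(3, 64, activation=C.relu)
     >>> x = C.input_variable((4, 100)) # e.g. a 4-channel 1D signal of length 100
     >>> h = f(x)
     >>> h.shape
     (64, 98)
     >>> f.W.shape # (num_filters, input_depth, filter width)
     (64, 4, 3)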
Args:
filter_shape (`int` or `tuple` of `ints`): shape (spatial extent) of the receptive field, *not* including the input feature-map depth. E.g. (3,3) for a 2D convolution.
num_filters (int, defaults to `None`): number of filters (output feature-map depth), or ``()`` to denote scalar output items (output shape will have no depth axis).
activation (:class:`~cntk.ops.functions.Function`, defaults to `identity`): optional function to apply at the end, e.g. `relu`
init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
     pad (`bool` or `tuple` of `bools`, defaults to `False`): if `False`, the filter is shifted only over the "valid"
      area of the input, that is, no value outside the area is used. If ``pad=True``, on the other hand,
      the filter is applied to all input positions, and positions outside the valid region are treated as zero.
      Use a `tuple` to specify a per-axis value.
strides (`int` or `tuple` of `ints`, defaults to 1): stride of the convolution (increment when sliding the filter over the input). Use a `tuple` to specify a per-axis value.
bias (bool, defaults to `True`): the layer will have no bias if `False` is passed here
init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
reduction_rank (`int`, defaults to 1): set to 0 if input items are scalars (input has no depth axis), e.g. an audio signal or a black-and-white image
that is stored with tensor shape (H,W) instead of (1,H,W)
     dilation (tuple, optional): the dilation value along each axis; the default of 1 means no dilation.
name (str, defaults to ''): the name of the function instance in the network
Returns:
cntk.ops.functions.Function:
A function that accepts one argument and applies the convolution operation to it
'''
activation = get_default_override(Convolution1D, activation=activation)
init = get_default_override(Convolution1D, init=init)
pad = get_default_override(Convolution1D, pad=pad)
bias = get_default_override(Convolution1D, bias=bias)
init_bias = get_default_override(Convolution1D, init_bias=init_bias)
if len(_as_tuple(filter_shape)) != 1:
raise ValueError('Convolution1D: filter_shape must be a scalar')
return Convolution(filter_shape, num_filters=num_filters, activation=activation, init=init, pad=pad, sequential=False, strides=strides, sharing=True, bias=bias, init_bias=init_bias, reduction_rank=reduction_rank, dilation=dilation, op_name='Convolution1D', name=name)

def Convolution2D(filter_shape, # shape of receptive field, e.g. (3,3). Must be a 2-element tuple.
num_filters=None, # e.g. 64 or None (which means 1 channel and don't add a dimension)
activation=default_override_or(identity),
init=default_override_or(C.glorot_uniform()),
pad=default_override_or(False),
strides=1,
bias=default_override_or(True),
init_bias=default_override_or(0),
reduction_rank=1, # (0 means input has no depth dimension, e.g. audio signal or B&W image)
dilation=1,
groups=1,
name=''):
'''
    Convolution2D(filter_shape, num_filters=None, activation=identity, init=glorot_uniform(), pad=False, strides=1, bias=True, init_bias=0, reduction_rank=1, dilation=1, groups=1, name='')
Layer factory function to create a 2D convolution layer with optional non-linearity.
Same as `Convolution()` except that filter_shape is verified to be 2-dimensional.
See `Convolution()` for extensive documentation.
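    Example (an illustrative sketch; the input shape is assumed, and the output sizes follow the valid-padding arithmetic of :func:`Convolution`):
     >>> f = Convolution2D((3,3), 64, activation=C.relu)
     >>> x = C.input_variable((3, 32, 32)) # e.g. a 32x32 RGB image
     >>> h = f(x)
     >>> h.shape
     (64, 30, 30)
     >>> f.W.shape # (num_filters, input_depth, *filter_shape)
     (64, 3, 3, 3)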
Args:
filter_shape (`int` or `tuple` of `ints`): shape (spatial extent) of the receptive field, *not* including the input feature-map depth. E.g. (3,3) for a 2D convolution.
num_filters (int, defaults to `None`): number of filters (output feature-map depth), or ``()`` to denote scalar output items (output shape will have no depth axis).
activation (:class:`~cntk.ops.functions.Function`, defaults to `identity`): optional function to apply at the end, e.g. `relu`
init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
     pad (`bool` or `tuple` of `bools`, defaults to `False`): if `False`, the filter is shifted only over the "valid"
      area of the input, that is, no value outside the area is used. If ``pad=True``, on the other hand,
      the filter is applied to all input positions, and positions outside the valid region are treated as zero.
      Use a `tuple` to specify a per-axis value.
strides (`int` or `tuple` of `ints`, defaults to 1): stride of the convolution (increment when sliding the filter over the input). Use a `tuple` to specify a per-axis value.
bias (bool, defaults to `True`): the layer will have no bias if `False` is passed here
init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
reduction_rank (`int`, defaults to 1): set to 0 if input items are scalars (input has no depth axis), e.g. an audio signal or a black-and-white image
that is stored with tensor shape (H,W) instead of (1,H,W)
     dilation (tuple, optional): the dilation value along each axis; the default of 1 means no dilation.
     groups (`int`, defaults to 1): number of groups during convolution, which controls the connections between input and output channels. The default value of 1
      means that all input channels are convolved to produce all output channels. A value of N means that the input (and output) channels are
      divided into N groups, with the input channels of one group (say, the i-th input group) contributing to the output channels of only one group (the i-th output group).
      The numbers of input and output channels must be divisible by the value of the groups argument, and the value must be strictly positive, i.e. groups > 0.
name (str, defaults to ''): the name of the function instance in the network
Returns:
cntk.ops.functions.Function:
A function that accepts one argument and applies the convolution operation to it
'''
activation = get_default_override(Convolution2D, activation=activation)
init = get_default_override(Convolution2D, init=init)
pad = get_default_override(Convolution2D, pad=pad)
bias = get_default_override(Convolution2D, bias=bias)
init_bias = get_default_override(Convolution2D, init_bias=init_bias)
if len(_as_tuple(filter_shape)) > 2:
raise ValueError('Convolution2D: filter_shape must be a scalar or a 2D tuple, e.g. 3 or (3,3)')
filter_shape = _pad_to_shape((0,0), filter_shape, 'filter_shape')
return Convolution(filter_shape, num_filters=num_filters, activation=activation, init=init, pad=pad, sequential=False,
strides=strides, sharing=True, bias=bias, init_bias=init_bias, reduction_rank=reduction_rank,
dilation=dilation, groups=groups, op_name='Convolution2D', name=name)

def Convolution3D(filter_shape, # shape of receptive field, e.g. (3,3,3). Must be a 3-element tuple.
num_filters=None, # e.g. 64 or None (which means 1 channel and don't add a dimension)
activation=default_override_or(identity),
init=default_override_or(C.glorot_uniform()),
pad=default_override_or(False),
strides=1,
bias=default_override_or(True),
init_bias=default_override_or(0),
reduction_rank=1, # (0 means input has no depth dimension, e.g. audio signal or B&W image)
dilation=1,
groups=1,
name=''):
'''
    Convolution3D(filter_shape, num_filters=None, activation=identity, init=glorot_uniform(), pad=False, strides=1, bias=True, init_bias=0, reduction_rank=1, dilation=1, groups=1, name='')
Layer factory function to create a 3D convolution layer with optional non-linearity.
Same as `Convolution()` except that filter_shape is verified to be 3-dimensional.
See `Convolution()` for extensive documentation.
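    Example (an illustrative sketch; the input shape is assumed, and the output sizes follow the valid-padding arithmetic of :func:`Convolution`):
     >>> f = Convolution3D((2,3,3), 16)
     >>> x = C.input_variable((3, 8, 32, 32)) # e.g. a clip of 8 RGB frames of 32x32 pixels
     >>> h = f(x)
     >>> h.shape
     (16, 7, 30, 30)
     >>> f.W.shape # (num_filters, input_depth, *filter_shape)
     (16, 3, 2, 3, 3)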
Args:
filter_shape (`int` or `tuple` of `ints`): shape (spatial extent) of the receptive field, *not* including the input feature-map depth. E.g. (3,3) for a 2D convolution.
num_filters (int, defaults to `None`): number of filters (output feature-map depth), or ``()`` to denote scalar output items (output shape will have no depth axis).
activation (:class:`~cntk.ops.functions.Function`, defaults to `identity`): optional function to apply at the end, e.g. `relu`
init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
     pad (`bool` or `tuple` of `bools`, defaults to `False`): if `False`, the filter is shifted only over the "valid"
      area of the input, that is, no value outside the area is used. If ``pad=True``, on the other hand,
      the filter is applied to all input positions, and positions outside the valid region are treated as zero.
      Use a `tuple` to specify a per-axis value.
strides (`int` or `tuple` of `ints`, defaults to 1): stride of the convolution (increment when sliding the filter over the input). Use a `tuple` to specify a per-axis value.
bias (bool, defaults to `True`): the layer will have no bias if `False` is passed here
init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
reduction_rank (`int`, defaults to 1): set to 0 if input items are scalars (input has no depth axis), e.g. an audio signal or a black-and-white image
that is stored with tensor shape (H,W) instead of (1,H,W)
     dilation (tuple, optional): the dilation value along each axis; the default of 1 means no dilation.
     groups (`int`, defaults to 1): number of groups during convolution, which controls the connections between input and output channels. The default value of 1
      means that all input channels are convolved to produce all output channels. A value of N means that the input (and output) channels are
      divided into N groups, with the input channels of one group (say, the i-th input group) contributing to the output channels of only one group (the i-th output group).
      The numbers of input and output channels must be divisible by the value of the groups argument, and the value must be strictly positive, i.e. groups > 0.
name (str, defaults to ''): the name of the function instance in the network
Returns:
cntk.ops.functions.Function:
A function that accepts one argument and applies the convolution operation to it
'''
activation = get_default_override(Convolution3D, activation=activation)
init = get_default_override(Convolution3D, init=init)
pad = get_default_override(Convolution3D, pad=pad)
bias = get_default_override(Convolution3D, bias=bias)
init_bias = get_default_override(Convolution3D, init_bias=init_bias)
if len(_as_tuple(filter_shape)) > 3:
raise ValueError('Convolution3D: filter_shape must be a scalar or a 3D tuple, e.g. 3 or (3,3,3)')
filter_shape = _pad_to_shape((0,0,0), filter_shape, 'filter_shape')
return Convolution(filter_shape, num_filters=num_filters, activation=activation, init=init, pad=pad, sequential=False,
strides=strides, sharing=True, bias=bias, init_bias=init_bias, reduction_rank=reduction_rank,
dilation=dilation, groups=groups, op_name='Convolution3D', name=name)

# ConvolutionTranspose -- create a deconvolution layer with optional non-linearity
# TODO: need to merge with above. Can it simply be transpose=True?
def ConvolutionTranspose(filter_shape, # shape of receptive field, e.g. (3,3)
num_filters,
activation=default_override_or(identity),
init=default_override_or(C.glorot_uniform()),
pad=default_override_or(False),
strides=1,
sharing=True, # (must be True currently)
bias=default_override_or(True),
init_bias=default_override_or(0),
output_shape=None,
reduction_rank=1, # (0 means input has no depth dimension, e.g. audio signal or B&W image)
dilation = 1,
max_temp_mem_size_in_samples=0,
name=''):
'''
ConvolutionTranspose(filter_shape, num_filters, activation=identity, init=glorot_uniform(), pad=False, strides=1, sharing=True, bias=True, init_bias=0, output_shape=None, reduction_rank=1, max_temp_mem_size_in_samples=0, name='')
Layer factory function to create a convolution transpose layer.
This implements a convolution_transpose operation over items arranged on an N-dimensional grid, such as pixels in an image.
Typically, each item is a vector (e.g. pixel: R,G,B), and the result is, in turn, a vector.
The item-grid dimensions are referred to as the *spatial* dimensions (e.g. dimensions of an image),
while the vector dimensions of the individual items are often called *feature-map depth*.
    Convolution transpose is also known as ``fractionally strided convolution`` or ``deconvolution``.
This operation is used in image and language processing applications. It supports arbitrary
dimensions, strides, and padding.
    The forward and backward computations of convolution transpose are the inverse of those of convolution. That is, during the forward
    pass the input items are spread into the output the same way gradients are spread backward in convolution, while the
    backward pass performs a convolution just like convolution's forward pass.
The size (spatial extent) of the receptive field for convolution transpose is given by ``filter_shape``.
E.g. to specify a 2D convolution transpose, ``filter_shape`` should be a tuple of two integers, such as `(5,5)`;
an example for a 3D convolution transpose (e.g. video or an MRI scan) would be ``filter_shape=(3,3,3)``;
while for a 1D convolution transpose (e.g. audio or text), ``filter_shape`` has one element, such as (3,).
The dimension of the input items (feature-map depth) is not specified, but known from the input.
The dimension of the output items generated for each item position is given by ``num_filters``.
    A ``ConvolutionTranspose`` instance owns its weight parameter tensors `W` and `b`, and exposes them as attributes ``.W`` and ``.b``.
The weights will have the shape ``(input_feature_map_depth, num_filters, *filter_shape)``.
Example:
>>> # 2D convolution transpose of 3x4 receptive field with output feature-map depth 128:
>>> f = ConvolutionTranspose((3,4), 128, activation=C.relu)
>>> x = C.input_variable((3,480,640)) # 3-channel color image
>>> h = f(x)
>>> h.shape
(128, 482, 643)
>>> f.W.shape # will have the form (input_depth, num_filters, *filter_shape)
(3, 128, 3, 4)
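    When ``strides`` is greater than 1, the spatial output size is ambiguous; ``output_shape`` pins it down.
    An illustrative sketch (the input and requested output shapes here are assumptions):
     >>> f = ConvolutionTranspose((3,3), 64, strides=2, pad=True, output_shape=(480,640))
     >>> x = C.input_variable((3,240,320)) # upsample a 3-channel 240x320 feature map by 2x
     >>> h = f(x)
     >>> h.shape
     (64, 480, 640)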
Args:
filter_shape (`int` or tuple of `int`\ s): shape (spatial extent) of the receptive field, *not* including the input feature-map depth. E.g. (3,3) for a 2D convolution.
num_filters (int): number of filters (output feature-map depth), or ``()`` to denote scalar output items (output shape will have no depth axis).
activation (:class:`~cntk.ops.functions.Function`, optional): optional function to apply at the end, e.g. `relu`
init (scalar or :mod:`cntk.initializer`, default :func:`~cntk.initializer.glorot_uniform`): initial value of weights `W`
     pad (`bool` or tuple of `bool`\ s, default `False`): if `False`, the filter is shifted only over the "valid"
      area of the input, that is, no value outside the area is used. If ``pad=True``, on the other hand,
      the filter is applied to all input positions, and positions outside the valid region are treated as zero.
      Use a `tuple` to specify a per-axis value.
strides (`int` or tuple of `int`\ s, default 1): stride of the convolution (increment when sliding the filter over the input). Use a `tuple` to specify a per-axis value.
sharing (`bool`, default `True`): weight sharing, must be True for now.
bias (`bool`, optional, default `True`): the layer will have no bias if `False` is passed here
init_bias (scalar or NumPy array or :mod:`cntk.initializer`): initial value of weights `b`
     output_shape (`int` or tuple of `int`\ s): output shape. When strides > 1, the output shape is ambiguous, so the user can specify the desired output shape. Note that the
      specified shape must satisfy the condition that a convolution performed from the output with the same settings produces a result with the same shape as the input.
     reduction_rank (`int`, defaults to 1): set to 0 if input items are scalars (input has no depth axis), e.g. an audio signal or a black-and-white image.
     dilation (tuple, optional): the dilation value along each axis; the default of 1 means no dilation.
max_temp_mem_size_in_samples (`int`, default 0): set to a positive number to define the maximum workspace memory for convolution.
name (str, optional): the name of the Function instance in the network
Returns:
:class:`~cntk.ops.functions.Function` that accepts one argument and applies the convolution operation to it
'''
activation = get_default_override(ConvolutionTranspose, activation=activation)
init = get_default_override(ConvolutionTranspose, init=init)
pad = get_default_override(ConvolutionTranspose, pad=pad)
bias = get_default_override(ConvolutionTranspose, bias=bias)
init_bias = get_default_override(ConvolutionTranspose, init_bias=init_bias)
output_shape = get_default_override(ConvolutionTranspose, output_shape=output_shape)
# tuplify all tuple inputs that can also be given as scalars if rank 1
filter_shape = _as_tuple(filter_shape)
num_filters = _as_tuple(num_filters)
strides = _pad_to_shape(filter_shape, strides, 'strides')
sharing = _pad_to_shape(filter_shape, sharing, 'sharing')
pad = _pad_to_shape(filter_shape, pad, 'pad')
dilation = _pad_to_shape(filter_shape, dilation, 'dilation')
if (reduction_rank != 0) and (reduction_rank != 1):
raise NotImplementedError("ConvolutionTranspose: reduction_rank must be 0 or 1")
    if not sharing:
        raise NotImplementedError("ConvolutionTranspose: sharing option currently must be True")
emulating_input_depth = reduction_rank == 0
# add the dimension to the options as well
num_emulated_axes = emulating_input_depth
strides = (1,) * num_emulated_axes + strides
sharing = (True,) * num_emulated_axes + sharing
pad = (False,) * num_emulated_axes + pad
output_channels_shape = _as_tuple(num_filters)
    kernel_shape = _INFERRED + output_channels_shape + filter_shape # [I × O × m1 × m2 × ... × mn]
output_full_shape = output_shape
if output_shape is not None:
output_full_shape = output_channels_shape + output_shape
filter_rank = len(filter_shape)
init_kernel = _initializer_for(init, Record(filter_rank=filter_rank, output_rank=-1))
W = Parameter(kernel_shape, init=init_kernel, name='W')
b = Parameter(output_channels_shape + (1,) * len(filter_shape), init=init_bias, name='b') if bias else None
# expression
@BlockFunction('ConvolutionTranspose', name)
def convolve_transposed(x):
# insert additional axes for various purposes
num_inserted_axes = num_emulated_axes
if num_inserted_axes != 0:
# x: (in_depth, spatial_shape)
x = reshape(x, (1,) * num_inserted_axes, # e.g. (2000, 480, 640) -> (2000, 1, 480, 640)
begin_axis=-filter_rank if filter_rank != 0 else C.Axis.new_leading_axis(),
end_axis =-filter_rank if filter_rank != 0 else None)
r = convolution_transpose(W, x,
strides=strides,