# PyTorch
# Gradient computation
grad can be implicitly created only for scalar outputs. If the output is non-scalar, pass the upstream gradient dL/doutput to `backward()` explicitly:
```python
import torch
import torch.nn.functional as F

a = torch.arange(0., 14., requires_grad=True)
# a[None, None]: shape (14,) => (1, 1, 14)
b = F.adaptive_avg_pool1d(a[None, None], 4)
# b.backward()  # RuntimeError: grad can be implicitly created only for scalar outputs
b.backward(torch.arange(1., 1 + b.size(-1))[None, None])  # pass dL/db, same shape as b
```
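The tensor passed to `backward()` must have the same shape as the output. An equivalent pattern, sketched below under the same setup as above, is to reduce the output to a scalar first, after which no explicit gradient is needed:

```python
import torch
import torch.nn.functional as F

a = torch.arange(0., 14., requires_grad=True)
b = F.adaptive_avg_pool1d(a[None, None], 4)
weights = torch.arange(1., 1 + b.size(-1))[None, None]  # same shape as b
(b * weights).sum().backward()  # scalar loss, so backward() needs no argument
print(a.grad)  # same result as calling b.backward(weights)
```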
# Global average pooling
`F.adaptive_avg_pool2d(x, 1)`
: has the same effect as global average pooling; `x` is the input and `1` is the output spatial size.
: its gradient with respect to each input element is 1/n, where n = H * W is the number of averaged elements.
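A quick check of the 1/n claim (a minimal sketch; the input shape here is arbitrary):

```python
import torch
import torch.nn.functional as F

x = torch.randn(1, 3, 4, 5, requires_grad=True)  # (N, C, H, W), so n = 4 * 5 = 20
y = F.adaptive_avg_pool2d(x, 1)                  # (1, 3, 1, 1): one mean per channel
y.sum().backward()
print(x.grad[0, 0])  # every entry is 1/n = 0.05
```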
# Conversion to TensorFlow
Export the model to ONNX first, then convert ONNX to TensorFlow (e.g. with `onnx-tf`), using an ONNX opset version that the converter supports.
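A minimal sketch of the pipeline; the model, input shape, and `opset_version=11` are assumptions, and the `onnx-tf` conversion step is shown commented since it requires the `onnx` and `onnx-tf` packages to be installed:

```python
import torch

model = torch.nn.Linear(8, 4).eval()  # placeholder model
dummy = torch.randn(1, 8)             # placeholder input with the right shape
torch.onnx.export(model, dummy, "model.onnx", opset_version=11)

# Then convert with onnx-tf:
# import onnx
# from onnx_tf.backend import prepare
# prepare(onnx.load("model.onnx")).export_graph("model_tf")
```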