r""" Backprop from layer outputs to inputs. Parameters ---------- dLdY : :py:class:`ndarray ` of shape `(n_ex, *)` The gradient of the loss wrt. the layer output `Y`. retain_grads : bool Whether to include the intermedia
(self, dLdY, retain_grads=True)
| 711 | return self.act_fn(out) |
| 712 | |
def backward(self, dLdY, retain_grads=True):
    r"""
    Propagate the loss gradient from the layer output back to its inputs.

    Parameters
    ----------
    dLdY : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, *)` or list
        The gradient of the loss wrt. the layer output `Y`. A value that is
        not already a list is wrapped in a single-element list before
        processing.
    retain_grads : bool
        Whether to include the intermediate parameter gradients computed
        during the backward pass in the final parameter update. Default is
        True. Note that this method itself never reads the flag.

    Returns
    -------
    dX : list, or a single element of that list
        One `_bwd` result for each `(dLdY, X, sum)` triple obtained by
        pairing the incoming gradients with the cached entries of `self.X`
        and `self.derived_variables["sum"]`. If `self.X` holds exactly one
        entry, that lone result is returned directly instead of a list.
    """
    # Normalize the incoming gradient so it can be paired entry-by-entry
    # with the cached inputs and sums.
    upstream = dLdY if isinstance(dLdY, list) else [dLdY]

    cached_inputs = self.X
    cached_sums = self.derived_variables["sum"]

    # zip stops at the shortest of the three sequences.
    results = []
    for dy, x, ss in zip(upstream, cached_inputs, cached_sums):
        results.append(self._bwd(dy, x, ss))

    if len(cached_inputs) == 1:
        return results[0]
    return results
| 738 | |
| 739 | def _bwd(self, dLdY, X, _sum): |
| 740 | """Actual computation of gradient of the loss wrt. each input""" |