For computational efficiency, we do not compute second derivatives.
(self, minibatches, opt, sch)
| 15 | self.args = args |
| 16 | |
def update(self, minibatches, opt, sch):
    """
    Perform one first-order MLDG meta-update of ``self.network``.

    For computational efficiency, we do not compute second derivatives.

    For every (meta-train, meta-test) pair of minibatches, a deep copy of
    the network takes a single optimizer step on the meta-train batch.
    The gradient of that step, plus ``self.args.mldg_beta`` times the
    gradient of the meta-test loss evaluated on the updated copy, is
    accumulated into ``self.network``'s ``.grad`` buffers.  ``opt`` then
    applies the accumulated gradient in a single step.

    Args:
        minibatches: Sized collection of ``(x, y)`` batches.  Its length is
            used to scale every accumulated gradient and the reported
            objective.
        opt: Optimizer that is zeroed before and stepped after the
            accumulation (presumably built over ``self.network``'s
            parameters -- confirm at the call site).
        sch: Learning-rate scheduler stepped after ``opt``; pass a falsy
            value to skip scheduling.

    Returns:
        dict: ``{'total': objective}`` where ``objective`` is the summed
        meta-train loss plus ``mldg_beta``-weighted meta-test loss, divided
        by ``len(minibatches)``.  It is populated for reporting only.
    """
    num_mb = len(minibatches)
    objective = 0

    # Gradients are written into ``p.grad`` by hand below, so every
    # parameter needs an allocated buffer even if ``zero_grad`` left it
    # as ``None``.
    opt.zero_grad()
    for p in self.network.parameters():
        if p.grad is None:
            p.grad = torch.zeros_like(p)

    # Move data to the device the network actually lives on instead of
    # hard-coding CUDA, so the update also works on CPU-only hosts and on
    # non-default GPUs.  A parameter-free network keeps the old CUDA
    # behaviour.
    first_param = next(iter(self.network.parameters()), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device('cuda')

    # NOTE(review): the gradients are scaled by 1 / num_mb, which presumes
    # the helper yields one pair per minibatch -- confirm against its
    # definition.
    pairs = random_pairs_of_minibatches_by_domainperm(minibatches)
    for (xi, yi), (xj, yj) in pairs:
        xi, yi = xi.to(device).float(), yi.to(device).long()
        xj, yj = xj.to(device).float(), yj.to(device).long()

        # Throw-away clone: the inner step must not touch the real weights.
        inner_net = copy.deepcopy(self.network)

        inner_opt = get_optimizer(inner_net, self.args, True)
        inner_sch = get_scheduler(inner_opt, self.args)

        # Meta-train: one optimizer step of the clone on batch i.
        inner_obj = F.cross_entropy(inner_net(xi), yi)

        inner_opt.zero_grad()
        inner_obj.backward()
        inner_opt.step()
        if inner_sch:
            inner_sch.step()

        # Accumulate the meta-train gradient Gi held by the clone into the
        # real network's gradient buffers.
        for p_tgt, p_src in zip(self.network.parameters(),
                                inner_net.parameters()):
            if p_src.grad is not None:
                p_tgt.grad.data.add_(p_src.grad.data / num_mb)

        # `objective` is populated for reporting purposes.
        objective += inner_obj.item()

        # Meta-test: gradient Gj of batch j's loss at the updated clone.
        # ``autograd.grad`` is called without ``create_graph``, so no
        # second-order terms are built.
        loss_inner_j = F.cross_entropy(inner_net(xj), yj)
        grad_inner_j = autograd.grad(loss_inner_j, inner_net.parameters(),
                                     allow_unused=True)

        objective += (self.args.mldg_beta * loss_inner_j).item()

        # Accumulate beta * Gj; parameters unused by the loss yield None.
        for p, g_j in zip(self.network.parameters(), grad_inner_j):
            if g_j is not None:
                p.grad.data.add_(
                    self.args.mldg_beta * g_j.data / num_mb)

    objective /= num_mb

    # The network now holds the accumulated Gi + beta * Gj; apply it once.
    opt.step()
    if sch:
        sch.step()
    return {'total': objective}
nothing calls this directly
no test coverage detected