r"""Node embedding optimizer using the Adam algorithm. This optimizer implements a sparse version of the Adam algorithm for optimizing :class:`dgl.nn.NodeEmbedding`. Being sparse means it only updates the embeddings whose gradients have updates, which are usually a very small portion of the total embeddings.
| 645 | |
| 646 | |
| 647 | class SparseAdam(SparseGradOptimizer): |
| 648 | r"""Node embedding optimizer using the Adam algorithm. |
| 649 | |
| 650 | This optimizer implements a sparse version of the Adam algorithm for |
| 651 | optimizing :class:`dgl.nn.NodeEmbedding`. Being sparse means it only |
| 652 | updates the embeddings whose gradients have updates, which are usually |
| 653 | a very small portion of the total embeddings. |
| 654 | |
| 655 | Adam maintains :math:`Gm_{t,i,j}` and :math:`Gp_{t,i,j}` for every parameter |
| 656 | in the embeddings, where |
| 657 | :math:`Gm_{t,i,j}=beta1 * Gm_{t-1,i,j} + (1-beta1) * g_{t,i,j}`, |
| 658 | :math:`Gp_{t,i,j}=beta2 * Gp_{t-1,i,j} + (1-beta2) * g_{t,i,j}^2`, |
| 659 | the update step is :math:`lr * Gm_{t,i,j} / (1 - beta1^t) / \sqrt{Gp_{t,i,j} / (1 - beta2^t)}` and |
| 660 | :math:`g_{t,i,j}` is the gradient of the dimension :math:`j` of embedding :math:`i` |
| 661 | at step :math:`t`. |
| 662 | |
| 663 | NOTE: The support of sparse Adam optimizer is experimental. |
| 664 | |
| 665 | Parameters |
| 666 | ---------- |
| 667 | params : list[dgl.nn.NodeEmbedding] |
| 668 | The list of dgl.nn.NodeEmbeddings. |
| 669 | lr : float |
| 670 | The learning rate. |
| 671 | betas : tuple[float, float], Optional |
| 672 | Coefficients used for computing running averages of gradient and its square. |
| 673 | Default: (0.9, 0.999) |
| 674 | eps : float, Optional |
| 675 | The term added to the denominator to improve numerical stability |
| 676 | Default: 1e-8 |
| 677 | use_uva : bool, Optional |
| 678 | Whether to use pinned memory for storing 'mem' and 'power' parameters, |
| 679 | when the embedding is stored on the CPU. This will improve training |
| 680 | speed, but will require locking a large number of virtual memory pages. |
| 681 | For embeddings which are stored in GPU memory, this setting will have |
| 682 | no effect. |
| 683 | Default: True if the gradients are generated on the GPU, and False |
| 684 | if the gradients are on the CPU. |
| 685 | dtype : torch.dtype, Optional |
| 686 | The type to store optimizer state with. Default: th.float32. |
| 687 | |
| 688 | Examples |
| 689 | -------- |
| 690 | >>> def initializer(emb): |
| 691 | th.nn.init.xavier_uniform_(emb) |
| 692 | return emb |
| 693 | >>> emb = dgl.nn.NodeEmbedding(g.num_nodes(), 10, 'emb', init_func=initializer) |
| 694 | >>> optimizer = dgl.optim.SparseAdam([emb], lr=0.001) |
| 695 | >>> for blocks in dataloader: |
| 696 | ... ... |
| 697 | ... feats = emb(nids, gpu_0) |
| 698 | ... loss = F.sum(feats + 1, 0) |
| 699 | ... loss.backward() |
| 700 | ... optimizer.step() |
| 701 | """ |
| 702 | |
| 703 | def __init__( |
| 704 | self, |
no outgoing calls