()
| 655 | |
| 656 | |
| 657 | def main(): |
| 658 | parser = argparse.ArgumentParser(description="Launch a distributed job") |
| 659 | parser.add_argument("--ssh_port", type=int, default=22, help="SSH Port.") |
| 660 | parser.add_argument( |
| 661 | "--ssh_username", |
| 662 | default="", |
| 663 | help="Optional. When issuing commands (via ssh) to cluster, use the provided username in the ssh cmd. " |
| 664 | "Example: If you provide --ssh_username=bob, then the ssh command will be like: 'ssh bob@1.2.3.4 CMD' " |
| 665 | "instead of 'ssh 1.2.3.4 CMD'", |
| 666 | ) |
| 667 | parser.add_argument( |
| 668 | "--workspace", |
| 669 | type=str, |
| 670 | help="Path of user directory of distributed tasks. \ |
| 671 | This is used to specify a destination location where \ |
| 672 | the contents of current directory will be rsyncd", |
| 673 | ) |
| 674 | parser.add_argument( |
| 675 | "--num_trainers", |
| 676 | type=int, |
| 677 | help="The number of trainer processes per machine", |
| 678 | ) |
| 679 | parser.add_argument( |
| 680 | "--num_omp_threads", |
| 681 | type=int, |
| 682 | help="The number of OMP threads per trainer", |
| 683 | ) |
| 684 | parser.add_argument( |
| 685 | "--num_samplers", |
| 686 | type=int, |
| 687 | default=0, |
| 688 | help="The number of sampler processes per trainer process", |
| 689 | ) |
| 690 | parser.add_argument( |
| 691 | "--num_servers", |
| 692 | type=int, |
| 693 | help="The number of server processes per machine", |
| 694 | ) |
| 695 | parser.add_argument( |
| 696 | "--part_config", |
| 697 | type=str, |
| 698 | help="The file (in workspace) of the partition config", |
| 699 | ) |
| 700 | parser.add_argument( |
| 701 | "--ip_config", |
| 702 | type=str, |
| 703 | help="The file (in workspace) of IP configuration for server processes", |
| 704 | ) |
| 705 | parser.add_argument( |
| 706 | "--num_server_threads", |
| 707 | type=int, |
| 708 | default=1, |
| 709 | help="The number of OMP threads in the server process. \ |
| 710 | It should be small if server processes and trainer processes run on \ |
| 711 | the same machine. By default, it is 1.", |
| 712 | ) |
| 713 | parser.add_argument( |
| 714 | "--graph_format", |
no test coverage detected