()
| 756 | |
| 757 | |
def _get_sequence_data_parallel_group():
    """Return the process group used for zero sharding and gradient allreduce.

    When sequence parallelism is enabled, that group must span both the data
    and the sequence parallel dimensions. If the registered ``mpu`` object
    provides ``get_sequence_data_parallel_group``, its result is returned;
    otherwise this falls back to ``_get_data_parallel_group()``.
    """
    global mpu
    # No mpu registered, or the mpu has no combined sequence+data accessor:
    # fall back to the plain data parallel group.
    if mpu is None or not hasattr(mpu, 'get_sequence_data_parallel_group'):
        return _get_data_parallel_group()
    return mpu.get_sequence_data_parallel_group()
| 765 | |
| 766 | |
| 767 | def _get_expert_model_parallel_world_size(): |
no test coverage detected
searching dependent graphs…