| 88 | |
| 89 | |
| 90 | class ExpManager: |
| 91 | |
| 92 | RESULT_FILE = 'result.jsonl' |
| 93 | |
def __init__(self):
    """Create a manager that is not tracking any experiment yet."""
    # Experiments started through ``run`` are appended to this list.
    self.exps: list = []
| 96 | |
def assert_gpu_not_overlap(self):
    """Verify that no GPU id is claimed by more than one tracked experiment.

    Each experiment's ``runtime['env']['CUDA_VISIBLE_DEVICES']`` is read as a
    comma-separated list of device ids. Whitespace around each id is
    stripped and empty entries are dropped, so ``'0, 1'`` conflicts with
    ``'1'`` while an empty value claims no device at all.

    Raises:
        ValueError: If two experiments in ``self.exps`` list the same id.
        KeyError: If an experiment's runtime has no ``env`` or no
            ``CUDA_VISIBLE_DEVICES`` entry.
    """
    claimed = set()
    for exp in self.exps:
        raw_ids = exp.runtime['env']['CUDA_VISIBLE_DEVICES'].split(',')
        # Normalise before comparing: raw tokens such as ' 1' and '1' would
        # otherwise be treated as different devices, and '' (no device)
        # would be treated as a device shared by every GPU-less experiment.
        devices = {token.strip() for token in raw_ids if token.strip()}
        if claimed & devices:
            raise ValueError(f'GPU overlap: {self.exps}!')
        claimed |= devices
| 104 | |
def run(self, exp: Experiment):
    """Start ``exp`` as a background shell subprocess and track it.

    If ``<exp.input_args.save_dir>/<exp.name>.json`` already exists, the
    saved JSON decides what happens:

    * When the experiment has an eval dataset and the saved ``record`` has
      no ``eval_result``, the experiment is switched to evaluation (the
      saved state is loaded first unless ``exp.do_eval`` was already set).
    * Otherwise the experiment is considered finished and is skipped.

    The command is obtained from ``self._build_eval_cmd`` (evaluation) or
    ``self._build_cmd`` (any other command). Its output is redirected to
    ``exp/<name>.<eval|cmd>.log``. The resulting ``Popen`` object is stored
    on ``exp.handler`` and ``exp`` is appended to ``self.exps``.

    Raises:
        ValueError: If a non-eval experiment reuses the name of a tracked
            experiment, or if ``assert_gpu_not_overlap`` detects a clash.
        AssertionError: If an export task is requested while another
            export task is already tracked.
    """
    result_path = os.path.join(exp.input_args.save_dir, exp.name + '.json')
    if os.path.exists(result_path):
        with open(result_path, 'r', encoding='utf-8') as f:
            saved = json.load(f)
        if exp.eval_dataset and 'eval_result' not in saved['record']:
            if not exp.do_eval:
                logger.info(f'Experiment {exp.name} need eval, load from file.')
                exp.load(saved)
                exp.do_eval = True
        else:
            # ``Logger.warn`` is a deprecated alias that newer Python
            # versions no longer provide; ``warning`` is the supported name.
            logger.warning(f'Experiment {exp.name} already done, skip')
            return

    if exp.do_eval:
        runtime = self._build_eval_cmd(exp)
        log_tag = 'eval'
    else:
        if any(exp.name == e.name for e in self.exps):
            raise ValueError(f'Why exp name duplicate? {exp.name}')
        if exp.cmd == 'export' and any(e.cmd == 'export' for e in self.exps):
            raise AssertionError('Cannot run parallel export task.')
        exp.create_time = time.time()
        runtime = self._build_cmd(exp)
        log_tag = exp.cmd

    exp.runtime = runtime
    # Start from the parent environment and let the experiment-specific
    # variables win. Merging in the opposite order would let a variable
    # inherited from the parent (e.g. CUDA_VISIBLE_DEVICES) silently replace
    # the value prepared for this experiment.
    envs = dict(os.environ)
    envs.update(runtime.get('env', {}))
    logger.info(f'Running cmd: {runtime["running_cmd"]}, env: {runtime.get("env", {})}')
    os.makedirs('exp', exist_ok=True)
    log_file = os.path.join('exp', f'{exp.name}.{log_tag}.log')
    exp.handler = subprocess.Popen(runtime['running_cmd'] + f' > {log_file} 2>&1', env=envs, shell=True)
    self.exps.append(exp)
    # NOTE: this check runs after the process has been started and after
    # ``exp`` has been added to ``self.exps``.
    self.assert_gpu_not_overlap()
| 147 | |