(device: int, alloc: int, device_alloc: int, device_free: int)
| 133 | ensure_folder_is_empty(self.folder_name) |
| 134 | |
def oom_observer(device: int, alloc: int, device_alloc: int, device_free: int):
    """Dump a CUDA memory snapshot and its visualizations after an OOM.

    This is a closure: ``self``, ``state`` and ``logger`` are taken from the
    enclosing scope. The four parameters are accepted but never read in this
    body (presumably they are dictated by the out-of-memory observer hook
    this function is registered with -- the registration is not part of
    this block, so confirm against the caller).

    Files written, all derived from ``self.filename_config``:
      * the pickled raw snapshot,
      * the trace plot and segment plot,
      * the segment and memory flamegraphs.

    When ``self.remote_path_in_bucket`` is set, every file listed by
    ``self.filename_config.list_filenames()`` is then uploaded via
    ``logger.upload_file``.

    Error handling: the two ``assert`` statements run before the ``try`` and
    therefore propagate. Everything else is best-effort -- any ``Exception``
    (including the ``FileExistsError`` re-raised for a failed upload) is
    caught at the bottom and reported with ``log.error`` only.
    """
    # Snapshot right after an OOM happened
    log.warning('Out Of Memory (OOM) observed')

    assert self.filename
    assert self.folder_name, 'folder_name must be set in init'
    filename = Path(self.folder_name) / Path(
        format_name_with_dist_and_time(self.filename, run_name=state.run_name, timestamp=state.timestamp),
    )

    try:
        self.filename_config = SnapshotFileNameConfig.from_file_name(str(filename))
        log.info('Dumping OOMObserver visualizations')

        snapshot = torch.cuda.memory._snapshot()
        # No data was recorded - avoids a `ValueError` in `trace_plot`
        if all(len(t) == 0 for t in snapshot['device_traces']):
            log.info('No allocation is recorded in memory snapshot')
            return

        # Raw snapshot first, so it exists even if a later rendering step fails.
        with open(self.filename_config.snapshot_file, 'wb') as fd:
            pickle.dump(snapshot, fd)

        # The rendered outputs are only written, never read back, so plain
        # write mode is sufficient.
        with open(self.filename_config.trace_plot_file, 'w') as fd:
            fd.write(torch.cuda._memory_viz.trace_plot(snapshot))  # type: ignore

        with open(self.filename_config.segment_plot_file, 'w') as fd:
            fd.write(torch.cuda._memory_viz.segment_plot(snapshot))  # type: ignore

        with open(self.filename_config.segment_flamegraph_file, 'w') as fd:
            fd.write(torch.cuda._memory_viz.segments(snapshot))  # type: ignore

        with open(self.filename_config.memory_flamegraph_file, 'w') as fd:
            fd.write(torch.cuda._memory_viz.memory(snapshot))  # type: ignore

        log.info(f'Saved memory visualizations to local files with prefix = {filename} during OOM')

        if self.remote_path_in_bucket is not None:
            for f in self.filename_config.list_filenames():
                base_file_name = os.path.basename(f)
                remote_file_name = os.path.join(self.remote_path_in_bucket, base_file_name)
                remote_file_name = remote_file_name.lstrip('/')  # remove leading slashes
                log.info(f'Uploading memory visualization to remote: {remote_file_name} from {f}')
                try:
                    logger.upload_file(remote_file_name=remote_file_name, file_path=f, overwrite=self.overwrite)
                except FileExistsError as e:
                    # Re-raised with a more actionable message; note that the
                    # outer handler below still catches it, so it ends up
                    # logged rather than propagated, and the remaining
                    # uploads are skipped.
                    raise FileExistsError(
                        f'Uploading memory visualizations failed with error: {e}. overwrite was set to {self.overwrite}. To overwrite memory visualizations with Trainer, set save_overwrite to True.',
                    ) from e

    except Exception as e:
        # Deliberately broad: a failure while dumping diagnostics is logged
        # and swallowed instead of being raised from this observer.
        log.error(f'Failed to capture memory snapshot {e}')
| 187 | |
| 188 | if self._enabled: |
| 189 | torch.cuda.memory._record_memory_history( |
nothing calls this directly
no test coverage detected