MCPcopy
hub / github.com/mosaicml/composer / oom_observer

Method oom_observer

composer/callbacks/oom_observer.py:135–186  ·  view source on GitHub ↗
(device: int, alloc: int, device_alloc: int, device_free: int)

Source from the content-addressed store, hash-verified

133 ensure_folder_is_empty(self.folder_name)
134
def oom_observer(device: int, alloc: int, device_alloc: int, device_free: int) -> None:
    """Dump a CUDA memory snapshot and its visualizations after an OOM.

    Writes the pickled snapshot plus the trace plot, segment plot, segment
    flamegraph and memory flamegraph to the local paths described by
    ``self.filename_config``. When ``self.remote_path_in_bucket`` is set,
    each file listed by ``self.filename_config.list_filenames()`` is then
    uploaded through ``logger.upload_file``.

    Any exception raised while capturing, writing or uploading is logged
    and swallowed; this function does not propagate errors.

    Args:
        device: Unused by this function.
        alloc: Unused by this function.
        device_alloc: Unused by this function.
        device_free: Unused by this function.

    NOTE(review): the four parameters presumably mirror the signature of the
    OOM-observer hook this closure is registered with -- confirm at the
    registration site.
    """
    # Snapshot right after an OOM happened
    log.warning('Out Of Memory (OOM) observed')

    assert self.filename
    assert self.folder_name, 'folder_name must be set in init'
    filename = Path(self.folder_name) / Path(
        format_name_with_dist_and_time(self.filename, run_name=state.run_name, timestamp=state.timestamp),
    )

    try:
        self.filename_config = SnapshotFileNameConfig.from_file_name(str(filename))
        log.info('Dumping OOMObserver visualizations')

        snapshot = torch.cuda.memory._snapshot()
        # No data was recorded - avoids a `ValueError` in `trace_plot`
        if all(len(t) == 0 for t in snapshot['device_traces']):
            log.info('No allocation is recorded in memory snapshot')
            return

        # Raw snapshot first, then each rendered view of it.
        with open(self.filename_config.snapshot_file, 'wb') as fd:
            pickle.dump(snapshot, fd)

        with open(self.filename_config.trace_plot_file, 'w+') as fd:
            fd.write(torch.cuda._memory_viz.trace_plot(snapshot))  # type: ignore

        with open(self.filename_config.segment_plot_file, 'w+') as fd:
            fd.write(torch.cuda._memory_viz.segment_plot(snapshot))  # type: ignore

        with open(self.filename_config.segment_flamegraph_file, 'w+') as fd:
            fd.write(torch.cuda._memory_viz.segments(snapshot))  # type: ignore

        with open(self.filename_config.memory_flamegraph_file, 'w+') as fd:
            fd.write(torch.cuda._memory_viz.memory(snapshot))  # type: ignore

        log.info(f'Saved memory visualizations to local files with prefix = {filename} during OOM')

        if self.remote_path_in_bucket is not None:
            for f in self.filename_config.list_filenames():
                base_file_name = os.path.basename(f)
                remote_file_name = os.path.join(self.remote_path_in_bucket, base_file_name)
                remote_file_name = remote_file_name.lstrip('/')  # remove leading slashes
                log.info(f'Uploading memory visualization to remote: {remote_file_name} from {f}')
                try:
                    logger.upload_file(remote_file_name=remote_file_name, file_path=f, overwrite=self.overwrite)
                except FileExistsError as e:
                    # This re-raise happens inside the outer `try`, so it is caught by
                    # the `except Exception` below and ends up logged, not propagated.
                    raise FileExistsError(
                        f'Uploading memory visualizations failed with error: {e}. overwrite was set to {self.overwrite}. To overwrite memory visualizations with Trainer, set save_overwrite to True.',
                    ) from e

    except Exception as e:
        # Best-effort: a failure here is reported through the log only.
        log.error(f'Failed to capture memory snapshot {e}')
187
188 if self._enabled:
189 torch.cuda.memory._record_memory_history(

Callers

nothing calls this directly

Calls 5

from_file_nameMethod · 0.80
list_filenamesMethod · 0.80
writeMethod · 0.45
upload_fileMethod · 0.45

Tested by

no test coverage detected