MCPcopy Index your code
hub / github.com/pytorch/pytorch / test_ckpt_save_failure

Method test_ckpt_save_failure

caffe2/python/checkpoint_test.py:241–263  ·  view source on GitHub ↗
(self)

Source from the content-addressed store, hash-verified

239 shutil.rmtree(tmpdir)
240
def test_ckpt_save_failure(self):
    """Check that a job still runs every epoch when checkpoint saving fails.

    The checkpoint manager is pointed at a directory that does not exist,
    which is meant to emulate a failure while saving checkpoints. The job
    itself is still expected to complete on every node.
    """
    node_count = 3
    # Deliberately non-existent directory, to emulate a failure while
    # saving checkpoints.
    missing_dir = "/tmp/path_does_not_exist/"

    workspace.ResetWorkspace()
    for node_id in range(node_count):
        node_ws = workspace.C.Workspace()
        node_session = LocalSession(node_ws)
        ckpt_manager = MultiNodeCheckpointManager(missing_dir, 'minidb')
        with Cluster():
            with Job() as job:
                build_pipeline(node_id)
            job.compile(LocalSession)
            epochs_run = JobRunner(job, ckpt_manager).train(node_session)
        # A failed checkpoint save should not cause a job failure: every
        # epoch must still have been executed.
        self.assertEqual(epochs_run, len(EXPECTED_TOTALS))
264
265 def test_download_group_simple(self):
266 """

Callers

nothing calls this directly

Calls 10

train — Method · 0.95
LocalSession — Class · 0.90
Cluster — Class · 0.90
Job — Class · 0.90
JobRunner — Class · 0.90
build_pipeline — Function · 0.85
range — Function · 0.50
compile — Method · 0.45
assertEqual — Method · 0.45

Tested by

no test coverage detected