(sctx context.Context, k *kernel.Kernel, cudaCheckpointPath string, cudaProcs []*kernel.ThreadGroup, timeline *timing.Timeline, sequential bool)
| 319 | } |
| 320 | |
| 321 | func toggleCudaProcs(sctx context.Context, k *kernel.Kernel, cudaCheckpointPath string, cudaProcs []*kernel.ThreadGroup, timeline *timing.Timeline, sequential bool) error { |
| 322 | start := time.Now() |
| 323 | |
| 324 | // Open /dev/null once for the stdin of all cuda-checkpoint processes. |
| 325 | nullVD := k.VFS().NewAnonVirtualDentry("null") |
| 326 | defer nullVD.DecRef(sctx) |
| 327 | nullFD, err := memdev.NewNullFD(sctx, nullVD.Mount(), nullVD.Dentry(), vfs.OpenOptions{}) |
| 328 | if err != nil { |
| 329 | log.Warningf("Failed to open /dev/null for cuda-checkpoint stdin: %v", err) |
| 330 | } else { |
| 331 | defer nullFD.DecRef(sctx) |
| 332 | } |
| 333 | |
| 334 | // Call cuda-checkpoint for each CUDA PID. |
| 335 | ckptTimerNames := make([]string, len(cudaProcs)) |
| 336 | for i, cudaProc := range cudaProcs { |
| 337 | ckptTimerNames[i] = fmt.Sprintf("cuda-ckpt %s", cudaProc.ID()) |
| 338 | } |
| 339 | var ckptTimings []*timing.Lease |
| 340 | if !sequential { |
| 341 | ckptTimelines := timeline.MultiFork(ckptTimerNames) |
| 342 | ckptTimings = make([]*timing.Lease, len(cudaProcs)) |
| 343 | for i := range cudaProcs { |
| 344 | ckptTimings[i] = ckptTimelines[i].Lease() |
| 345 | } |
| 346 | } |
| 347 | defer func() { |
| 348 | for _, t := range ckptTimings { |
| 349 | t.End() |
| 350 | } |
| 351 | }() |
| 352 | proc := &Proc{Kernel: k} |
| 353 | ckptProcs := make(map[*kernel.ThreadGroup]checkpointProc) |
| 354 | var errs []error |
| 355 | for i, cudaProc := range cudaProcs { |
| 356 | var ckptTiming *timing.Lease |
| 357 | if sequential { |
| 358 | ckptTiming = timeline.Fork(ckptTimerNames[i]).Lease() |
| 359 | } else { |
| 360 | ckptTiming = ckptTimings[i] |
| 361 | } |
| 362 | ckptProc, cleanup, err := invokeCudaCheckpoint(sctx, k, proc, cudaCheckpointPath, cudaProc, "--toggle", nullFD) |
| 363 | if err != nil { |
| 364 | ckptTiming.Reached("invoke error") |
| 365 | errs = append(errs, err) |
| 366 | break |
| 367 | } |
| 368 | if ckptProc.tg == nil { |
| 369 | ckptTiming.Reached("tg nil") |
| 370 | continue |
| 371 | } |
| 372 | ckptProcs[cudaProc] = ckptProc |
| 373 | ckptTimeline := ckptTiming.Transfer() |
| 374 | if sequential { |
| 375 | ckptProc.tg.WaitExited() |
| 376 | if status := ckptProc.tg.ExitStatus(); status != 0 { |
| 377 | ckptTimeline.Reached("exec error") |
| 378 | } |
no test coverage detected
searching dependent graphs…