| 24 | |
| 25 | |
def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module,
                    data_loader: Iterable, optimizer: torch.optim.Optimizer,
                    device: torch.device, epoch: int, max_norm: float = 0):
    """Train ``model`` for one epoch and return the averaged logged metrics.

    Both ``model`` and ``criterion`` are switched to training mode. Batches are
    pulled from a ``data_prefetcher`` wrapped around ``data_loader``; the loop
    runs ``len(data_loader)`` steps. Each step computes the weighted sum of the
    criterion's losses (weights come from ``criterion.weight_dict``; losses
    without a weight are left out of the sum), back-propagates it and applies
    one optimizer step.

    Args:
        model: Called as ``model([samples, targets])``; expected to return
            ``(outputs, pre_outputs, pre_targets)``.
        criterion: Called as
            ``criterion(outputs, targets, pre_outputs, pre_targets)``; must
            expose ``weight_dict`` and return a mapping that contains a
            ``'class_error'`` entry.
        data_loader: Source of batches; must support ``len()``.
        optimizer: Optimizer stepped once per batch.
        device: Passed through to ``data_prefetcher``.
        epoch: Epoch index, used only in the log header.
        max_norm: When greater than 0, gradients are clipped to this total
            norm. Otherwise the norm is only obtained through
            ``utils.get_total_grad_norm`` for logging.

    Returns:
        A dict mapping every meter name to that meter's ``global_avg``.

    Note:
        If the reduced, weighted loss is not finite, the loss dict is printed
        and the process terminates through ``sys.exit(1)``.
    """
    model.train()
    criterion.train()

    metric_logger = utils.MetricLogger(delimiter=" ")
    for meter_name, meter_fmt in (
        ('lr', '{value:.6f}'),
        ('class_error', '{value:.2f}'),
        ('grad_norm', '{value:.2f}'),
    ):
        metric_logger.add_meter(
            meter_name, utils.SmoothedValue(window_size=1, fmt=meter_fmt))
    header = 'Epoch: [{}]'.format(epoch)
    log_interval = 10

    prefetcher = data_prefetcher(data_loader, device, prefetch=True)
    samples, targets = prefetcher.next()

    # Batches come from the prefetcher rather than from iterating the loader
    # directly, so the logging iterator is only used to count steps.
    num_steps = len(data_loader)
    for _ in metric_logger.log_every(range(num_steps), log_interval, header):
        outputs, pre_outputs, pre_targets = model([samples, targets])
        loss_dict = criterion(outputs, targets, pre_outputs, pre_targets)
        weights = criterion.weight_dict
        total_loss = sum(
            value * weights[name]
            for name, value in loss_dict.items()
            if name in weights
        )

        # Reduce losses over all GPUs for logging purposes only; the backward
        # pass below uses the un-reduced ``total_loss``.
        reduced = utils.reduce_dict(loss_dict)
        reduced_unscaled = {}
        reduced_scaled = {}
        for name, value in reduced.items():
            reduced_unscaled[f'{name}_unscaled'] = value
            if name in weights:
                reduced_scaled[name] = value * weights[name]

        loss_value = sum(reduced_scaled.values()).item()

        # Abort before touching the weights if the loss is NaN or infinite.
        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(reduced)
            sys.exit(1)

        optimizer.zero_grad()
        total_loss.backward()
        params = model.parameters()
        grad_total_norm = (
            torch.nn.utils.clip_grad_norm_(params, max_norm)
            if max_norm > 0
            else utils.get_total_grad_norm(params, max_norm)
        )
        optimizer.step()

        metric_logger.update(loss=loss_value, **reduced_scaled, **reduced_unscaled)
        metric_logger.update(class_error=reduced['class_error'])
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])
        metric_logger.update(grad_norm=grad_total_norm)

        # Fetch the batch for the next step.
        samples, targets = prefetcher.next()

    # Gather the stats from all processes.
    metric_logger.synchronize_between_processes()
    print("Averaged stats:", metric_logger)
    return {name: meter.global_avg
            for name, meter in metric_logger.meters.items()}
| 81 | |
| 82 | |
| 83 | @torch.no_grad() |