(args)
| 91 | |
| 92 | |
| 93 | def demo_fn(args): |
| 94 | # Print configuration |
| 95 | print("Arguments:", vars(args)) |
| 96 | |
| 97 | # Set seed for reproducibility |
| 98 | np.random.seed(args.seed) |
| 99 | torch.manual_seed(args.seed) |
| 100 | random.seed(args.seed) |
| 101 | if torch.cuda.is_available(): |
| 102 | torch.cuda.manual_seed(args.seed) |
| 103 | torch.cuda.manual_seed_all(args.seed) # for multi-GPU |
| 104 | print(f"Setting seed as: {args.seed}") |
| 105 | |
| 106 | # Set device and dtype |
| 107 | dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] >= 8 else torch.float16 |
| 108 | device = "cuda" if torch.cuda.is_available() else "cpu" |
| 109 | print(f"Using device: {device}") |
| 110 | print(f"Using dtype: {dtype}") |
| 111 | |
| 112 | # Run VGGT for camera and depth estimation |
| 113 | model = VGGT() |
| 114 | _URL = "https://huggingface.co/facebook/VGGT-1B/resolve/main/model.pt" |
| 115 | model.load_state_dict(torch.hub.load_state_dict_from_url(_URL)) |
| 116 | model.eval() |
| 117 | model = model.to(device) |
| 118 | print(f"Model loaded") |
| 119 | |
| 120 | # Get image paths and preprocess them |
| 121 | image_dir = os.path.join(args.scene_dir, "images") |
| 122 | image_path_list = glob.glob(os.path.join(image_dir, "*")) |
| 123 | if len(image_path_list) == 0: |
| 124 | raise ValueError(f"No images found in {image_dir}") |
| 125 | base_image_path_list = [os.path.basename(path) for path in image_path_list] |
| 126 | |
| 127 | # Load images and original coordinates |
| 128 | # Load Image in 1024, while running VGGT with 518 |
| 129 | vggt_fixed_resolution = 518 |
| 130 | img_load_resolution = 1024 |
| 131 | |
| 132 | images, original_coords = load_and_preprocess_images_square(image_path_list, img_load_resolution) |
| 133 | images = images.to(device) |
| 134 | original_coords = original_coords.to(device) |
| 135 | print(f"Loaded {len(images)} images from {image_dir}") |
| 136 | |
| 137 | # Run VGGT to estimate camera and depth |
| 138 | # Run with 518x518 images |
| 139 | extrinsic, intrinsic, depth_map, depth_conf = run_VGGT(model, images, dtype, vggt_fixed_resolution) |
| 140 | points_3d = unproject_depth_map_to_point_map(depth_map, extrinsic, intrinsic) |
| 141 | |
| 142 | if args.use_ba: |
| 143 | image_size = np.array(images.shape[-2:]) |
| 144 | scale = img_load_resolution / vggt_fixed_resolution |
| 145 | shared_camera = args.shared_camera |
| 146 | |
| 147 | with torch.cuda.amp.autocast(dtype=dtype): |
| 148 | # Predicting Tracks |
| 149 | # Using VGGSfM tracker instead of VGGT tracker for efficiency |
| 150 | # VGGT tracker requires multiple backbone runs to query different frames (this is a problem caused by the training process) |
no test coverage detected