Function main

examples/graphbolt/disk_based_feature/node_classification.py:399–532 · view source on GitHub ↗

()

Source from the content-addressed store, hash-verified

397
398
399	def main():
400	torch.set_float32_matmul_precision(args.precision)
401	if not torch.cuda.is_available():
402	args.mode = "cpu-cpu-cpu"
403	print(f"Training in {args.mode} mode.")
404	args.graph_device, args.feature_device, args.device = args.mode.split("-")
405	args.overlap_feature_fetch = args.feature_device == "pinned"
406	args.overlap_graph_fetch = args.graph_device == "pinned"
407
408	"""
409	Load and preprocess on-disk dataset.
410	We inspect the in_memory field of the feature_data in the YAML file and modify
411	it to False. This will make sure the feature_data is loaded as DiskBasedFeature.
412	"""
413	print("Loading data...")
414	disk_based_feature_keys = None
415	if args.cpu_cache_size_in_gigabytes > 0:
416	disk_based_feature_keys = [("node", None, "feat")]
417
418	dataset = gb.BuiltinDataset(args.dataset, root=args.root)
419	if disk_based_feature_keys is None:
420	disk_based_feature_keys = set()
421	for feature in dataset.yaml_data["feature_data"]:
422	feature_key = (feature["domain"], feature["type"], feature["name"])
423	# Set the in_memory setting to False without modifying YAML file.
424	if feature_key in disk_based_feature_keys:
425	feature["in_memory"] = False
426	dataset = dataset.load()
427
428	# Move the dataset to the selected storage.
429	graph = (
430	dataset.graph.pin_memory_()
431	if args.graph_device == "pinned"
432	else dataset.graph.to(args.graph_device)
433	)
434	features = (
435	dataset.feature.pin_memory_()
436	if args.feature_device == "pinned"
437	else dataset.feature.to(args.feature_device)
438	)
439
440	train_set = dataset.tasks[0].train_set
441	valid_set = dataset.tasks[0].validation_set
442	test_set = dataset.tasks[0].test_set
443	all_nodes_set = dataset.all_nodes_set
444	args.fanout = list(map(int, args.fanout.split(",")))
445	num_classes = dataset.tasks[0].metadata["num_classes"]
446
447	"""
448	If the CPU cache size is greater than 0, we wrap the DiskBasedFeature to be
449	a CPUCachedFeature. This internally manages the CPU feature cache by the
450	specified cache replacement policy. This will reduce the amount of data
451	transferred during disk read operations for this feature.
452
453	Note: It is advised to set the CPU cache size to be at least 4 times the number
454	of sampled nodes in a mini-batch, otherwise the feature fetcher might get into
455	a deadlock, causing a hang.
456	"""

Callers 1

node_classification.py — File · 0.70

Calls 9

load_state_dict — Method · 0.80

create_dataloader — Function · 0.70

SAGE — Class · 0.70

train — Function · 0.70

layerwise_infer — Function · 0.70

load — Method · 0.45

pin_memory_ — Method · 0.45

to — Method · 0.45

size — Method · 0.45

Tested by

no test coverage detected