hub / github.com/e-p-armstrong/augmentoolkit / example_pipeline

Function example_pipeline

generation/example_pipeline/example.py:39–339 · view source on GitHub ↗

( # NOTE requirement: the pipeline must have the same argument names as the fields in the config.
use_subset: bool, # a common parameter that tells us whether to use a subset of the total input (good for testing, dev, cost estimation, and very large input sets)
subset_size: int, # a common parameter indicating the number of items from the total read input items to take.
chunk_size: int,
input_dir: str,
concurrency_limit: int, # how many concurrent requests you want to have active at once. Good for avoiding rate limits.
small_model: str,
small_api_key: str,
small_base_url: str,
small_mode: str,
large_model: str, # NOTE convention: pipeline arguments should be positional, not keyword arguments, when DEFINED (unless they are things like a task ID or seed or do_meta_datagen which is unlikely to appear in the config but we want to take it as an arg and have backward-compatibility with older configs anyway). The reason we use positionals is because that way Python catches us if we miss a critical argument. However for safety/reliability since there are so many args, whenever CALLING pipelines, we use keyword arguments for everything.
large_api_key: str,
large_base_url: str,
large_mode: str,
output_dir: str,
default_prompts: str,
prompts: str,
completion_mode: bool,
use_stop: bool, # Not all APIs (take OpenAI for instance) support more than 4 stop tokens. use_stop is passed to pipeline executions and if it is False, then the number of stop tokens is truncated to 4.
example_heading,
key3,
do_meta_datagen: bool = False,
meta_datagen_keys: list[str] = [],
meta_datagen_extras: list[str] = [],
read_files_manually: bool = True,
text_chunks_passed_in: list[str] = [],
cost_per_million_small_input: float = 0.0,
cost_per_million_small_output: float = 0.0,
cost_per_million_large_input: float = 0.0,
cost_per_million_large_output: float = 0.0,
chunking_output_dir=None, # Augmentoolkit caches the results of file reading and chunking. If you want your reading/chunking cache dir to be different than your output dir, you can add an option for that.
task_id=None, # task_id is a special argument used to help set the progress throughout the pipeline's execution. It's optional for you to add, but if you want a progress bar to show up properly when your pipeline is used with the interface, then you have to add this as well as a few set_progress calls.
seed=11037,
**kwargs, # All pipelines MUST have **kwargs to ensure forward compatibility with new common arguments.
)

Source from the content-addressed store, hash-verified

37
38
39	async def example_pipeline( # NOTE requirement: the pipeline must have the same argument names as the fields in the config.
40	use_subset: bool, # a common parameter that tells us whether to use a subset of the total input (good for testing, dev, cost estimation, and very large input sets)
41	subset_size: int, # a common parameter indicating the number of items from the total read input items to take.
42	chunk_size: int,
43	input_dir: str,
44	concurrency_limit: int, # how many concurrent requests you want to have active at once. Good for avoiding rate limits.
45	small_model: str,
46	small_api_key: str,
47	small_base_url: str,
48	small_mode: str,
49	large_model: str, # NOTE convention: pipeline arguments should be positional, not keyword arguments, when DEFINED (unless they are things like a task ID or seed or do_meta_datagen which is unlikely to appear in the config but we want to take it as an arg and have backward-compatibility with older configs anyway). The reason we use positionals is because that way Python catches us if we miss a critical argument. However for safety/reliability since there are so many args, whenever CALLING pipelines, we use keyword arguments for everything.
50	large_api_key: str,
51	large_base_url: str,
52	large_mode: str,
53	output_dir: str,
54	default_prompts: str,
55	prompts: str,
56	completion_mode: bool,
57	use_stop: bool, # Not all APIs (take OpenAI for instance) support more than 4 stop tokens. use_stop is passed to pipeline executions and if it is False, then the number of stop tokens is truncated to 4.
58	example_heading,
59	key3,
60	do_meta_datagen: bool = False,
61	meta_datagen_keys: list[str] = [],
62	meta_datagen_extras: list[str] = [],
63	read_files_manually: bool = True,
64	text_chunks_passed_in: list[str] = [],
65	cost_per_million_small_input: float = 0.0,
66	cost_per_million_small_output: float = 0.0,
67	cost_per_million_large_input: float = 0.0,
68	cost_per_million_large_output: float = 0.0,
69	chunking_output_dir=None, # Augmentoolkit caches the results of file reading and chunking. If you want your reading/chunking cache dir to be different than your output dir, you can add an option for that.
70	task_id=None, # task_id is a special argument used to help set the progress throughout the pipeline's execution. It's optional for you to add, but if you want a progress bar to show up properly when your pipeline is used with the interface, then you have to add this as well as a few set_progress calls.
71	seed=11037,
72	**kwargs, # All pipelines MUST have **kwargs to ensure forward compatibility with new common arguments.
73	):
74	# Check if kwargs is not empty and print all keys and values if present
75	if (
76	kwargs
77	): # NOTE standard anti-footgun measure reminding people of excess args they are passing.
78	print("Additional arguments provided:")
79	for key, value in kwargs.items():
80	print(f" {key}: {value}")
81
82	print("Demonstrating example heading and flattening vs no_flatten")
83	print("Example heading:")
84	print(example_heading)
85	print("key3")
86	print(key3)
87
88	# NOTE the make_relative_to_self calls on the prompts is key so that the prompt directories are not looked for relative to the root of the whole project.
89	# Prompt files are by convention co-located with their pipeline in the same folder. So we need to adjust the paths to make sure they reflect this.
90	default_prompts = make_relative_to_self(default_prompts)
91	prompts = make_relative_to_self(prompts)
92
93	# These two things are used for pipeline cost estimation. This is their initialization.
94	small_token_counter = {
95	"input_tokens": 0,
96	"input_cost": 0.0,

Callers

nothing calls this directly

Calls 14

execute_pipeline · Method · 0.95

make_relative_to_self · Function · 0.90

setup_semaphore_and_engines · Function · 0.90

create_input_token_counter · Function · 0.90

create_output_token_counter · Function · 0.90

create_log_observer · Function · 0.90

set_progress · Function · 0.90

read_and_chunk_text · Function · 0.90

chunk_text_list · Function · 0.90

subset_text_list · Function · 0.90

hash_input_list · Function · 0.90

count_total_tokens · Function · 0.90

Tested by

no test coverage detected