( # NOTE requirement: the pipeline must have the same argument names as the fields in the config.
use_subset: bool, # a common parameter that tells us whether to use a subset of the total input (good for testing, dev, cost estimation, and very large input sets)
subset_size: int, # a common parameter indicating the number of items from the total read input items to take.
chunk_size: int,
input_dir: str,
concurrency_limit: int, # how many concurrent requests you want to have active at once. Good for avoiding rate limits.
small_model: str,
small_api_key: str,
small_base_url: str,
small_mode: str,
large_model: str, # NOTE convention: pipeline arguments should be positional, not keyword arguments, when DEFINED (unless they are things like a task ID or seed or do_meta_datagen which is unlikely to appear in the config but we want to take it as an arg and have backward-compatibility with older configs anyway). The reason we use positionals is because that way Python catches us if we miss a critical argument. However for safety/reliability since there are so many args, whenever CALLING pipelines, we use keyword arguments for everything.
large_api_key: str,
large_base_url: str,
large_mode: str,
output_dir: str,
default_prompts: str,
prompts: str,
completion_mode: bool,
use_stop: bool, # Not all APIs (take OpenAI for instance) support more than 4 stop tokens. use_stop is passed to pipeline executions and if it is False, then the number of stop tokens is truncated to 4.
example_heading,
key3,
do_meta_datagen: bool = False,
meta_datagen_keys: list[str] = [],
meta_datagen_extras: list[str] = [],
read_files_manually: bool = True,
text_chunks_passed_in: list[str] = [],
cost_per_million_small_input: float = 0.0,
cost_per_million_small_output: float = 0.0,
cost_per_million_large_input: float = 0.0,
cost_per_million_large_output: float = 0.0,
chunking_output_dir=None, # Augmentoolkit caches the results of file reading and chunking. If you want your reading/chunking cache dir to be different than your output dir, you can add an option for that.
task_id=None, # task_id is a special argument used to help set the progress throughout the pipeline's execution. It's optional for you to add, but if you want a progress bar to show up properly when your pipeline is used with the interface, then you have to add this as well as a few set_progress calls.
seed=11037,
**kwargs, # All pipelines MUST have **kwargs to ensure forward compatibility with new common arguments.
)
| 37 | |
| 38 | |
| 39 | async def example_pipeline( # NOTE requirement: the pipeline must have the same argument names as the fields in the config. |
| 40 | use_subset: bool, # a common parameter that tells us whether to use a subset of the total input (good for testing, dev, cost estimation, and very large input sets) |
| 41 | subset_size: int, # a common parameter indicating the number of items from the total read input items to take. |
| 42 | chunk_size: int, |
| 43 | input_dir: str, |
| 44 | concurrency_limit: int, # how many concurrent requests you want to have active at once. Good for avoiding rate limits. |
| 45 | small_model: str, |
| 46 | small_api_key: str, |
| 47 | small_base_url: str, |
| 48 | small_mode: str, |
| 49 | large_model: str, # NOTE convention: pipeline arguments should be positional, not keyword arguments, when DEFINED (unless they are things like a task ID or seed or do_meta_datagen which is unlikely to appear in the config but we want to take it as an arg and have backward-compatibility with older configs anyway). The reason we use positionals is because that way Python catches us if we miss a critical argument. However for safety/reliability since there are so many args, whenever CALLING pipelines, we use keyword arguments for everything. |
| 50 | large_api_key: str, |
| 51 | large_base_url: str, |
| 52 | large_mode: str, |
| 53 | output_dir: str, |
| 54 | default_prompts: str, |
| 55 | prompts: str, |
| 56 | completion_mode: bool, |
| 57 | use_stop: bool, # Not all APIs (take OpenAI for instance) support more than 4 stop tokens. use_stop is passed to pipeline executions and if it is False, then the number of stop tokens is truncated to 4. |
| 58 | example_heading, |
| 59 | key3, |
| 60 | do_meta_datagen: bool = False, |
| 61 | meta_datagen_keys: list[str] = [], |
| 62 | meta_datagen_extras: list[str] = [], |
| 63 | read_files_manually: bool = True, |
| 64 | text_chunks_passed_in: list[str] = [], |
| 65 | cost_per_million_small_input: float = 0.0, |
| 66 | cost_per_million_small_output: float = 0.0, |
| 67 | cost_per_million_large_input: float = 0.0, |
| 68 | cost_per_million_large_output: float = 0.0, |
| 69 | chunking_output_dir=None, # Augmentoolkit caches the results of file reading and chunking. If you want your reading/chunking cache dir to be different than your output dir, you can add an option for that. |
| 70 | task_id=None, # task_id is a special argument used to help set the progress throughout the pipeline's execution. It's optional for you to add, but if you want a progress bar to show up properly when your pipeline is used with the interface, then you have to add this as well as a few set_progress calls. |
| 71 | seed=11037, |
| 72 | **kwargs, # All pipelines MUST have **kwargs to ensure forward compatibility with new common arguments. |
| 73 | ): |
| 74 | # Check if kwargs is not empty and print all keys and values if present |
| 75 | if ( |
| 76 | kwargs |
| 77 | ): # NOTE standard anti-footgun measure reminding people of excess args they are passing. |
| 78 | print("Additional arguments provided:") |
| 79 | for key, value in kwargs.items(): |
| 80 | print(f" {key}: {value}") |
| 81 | |
| 82 | print("Demonstrating example heading and flattening vs no_flatten") |
| 83 | print("Example heading:") |
| 84 | print(example_heading) |
| 85 | print("key3") |
| 86 | print(key3) |
| 87 | |
| 88 | # NOTE the make_relative_to_self calls on the prompts are key, so that the prompt directories are not looked up relative to the root of the whole project. |
| 89 | # Prompt files are by convention co-located with their pipeline in the same folder. So we need to adjust the paths to make sure they reflect this. |
| 90 | default_prompts = make_relative_to_self(default_prompts) |
| 91 | prompts = make_relative_to_self(prompts) |
| 92 | |
| 93 | # These two things are used for pipeline cost estimation. This is their initialization. |
| 94 | small_token_counter = { |
| 95 | "input_tokens": 0, |
| 96 | "input_cost": 0.0, |
nothing calls this directly
no test coverage detected