MCPcopy
hub / github.com/e-p-armstrong/augmentoolkit / example_pipeline

Function example_pipeline

generation/example_pipeline/example.py:39–339  ·  view source on GitHub ↗
(  # NOTE requirement: the pipeline must have the same argument names as the fields in the config.
    use_subset: bool,  # a common parameter that tells us whether to use a subset of the total input (good for testing, dev, cost estimation, and very large input sets)
    subset_size: int,  # a common parameter indicating the number of items from the total read input items to take.
    chunk_size: int,
    input_dir: str,
    concurrency_limit: int,  # how many concurrent requests you want to have active at once. Good for avoiding rate limits.
    small_model: str,
    small_api_key: str,
    small_base_url: str,
    small_mode: str,
    large_model: str,  # NOTE convention: pipeline arguments should be positional, not keyword arguments, when DEFINED (unless they are things like a task ID or seed or do_meta_datagen which is unlikely to appear in the config but we want to take it as an arg and have backward-compatibility with older configs anyway). The reason we use positionals is because that way Python catches us if we miss a critical argument. However for safety/reliability since there are so many args, whenever CALLING pipelines, we use keyword arguments for everything.
    large_api_key: str,
    large_base_url: str,
    large_mode: str,
    output_dir: str,
    default_prompts: str,
    prompts: str,
    completion_mode: bool,
    use_stop: bool,  # Not all APIs (take OpenAI for instance) support more than 4 stop tokens. use_stop is passed to pipeline executions and if it is False, then the number of stop tokens is truncated to 4.
    example_heading,
    key3,
    do_meta_datagen: bool = False,
    meta_datagen_keys: list[str] = [],
    meta_datagen_extras: list[str] = [],
    read_files_manually: bool = True,
    text_chunks_passed_in: list[str] = [],
    cost_per_million_small_input: float = 0.0,
    cost_per_million_small_output: float = 0.0,
    cost_per_million_large_input: float = 0.0,
    cost_per_million_large_output: float = 0.0,
    chunking_output_dir=None,  # Augmentoolkit caches the results of file reading and chunking. If you want your reading/chunking cache dir to be different than your output dir, you can add an option for that.
    task_id=None,  # task_id is a special argument used to help set the progress throughout the pipeline's execution. It's optional for you to add, but if you want a progress bar to show up properly when your pipeline is used with the interface, then you have to add this as well as a few set_progress calls.
    seed=11037,
    **kwargs,  # All pipelines MUST have **kwargs to ensure forward compatibility with new common arguments.
)

Source from the content-addressed store, hash-verified

37
38
39async def example_pipeline( # NOTE requirement: the pipeline must have the same argument names as the fields in the config.
40 use_subset: bool, # a common parameter that tells us whether to use a subset of the total input (good for testing, dev, cost estimation, and very large input sets)
41 subset_size: int, # a common parameter indicating the number of items from the total read input items to take.
42 chunk_size: int,
43 input_dir: str,
44 concurrency_limit: int, # how many concurrent requests you want to have active at once. Good for avoiding rate limits.
45 small_model: str,
46 small_api_key: str,
47 small_base_url: str,
48 small_mode: str,
49 large_model: str, # NOTE convention: pipeline arguments should be positional, not keyword arguments, when DEFINED (unless they are things like a task ID or seed or do_meta_datagen which is unlikely to appear in the config but we want to take it as an arg and have backward-compatibility with older configs anyway). The reason we use positionals is because that way Python catches us if we miss a critical argument. However for safety/reliability since there are so many args, whenever CALLING pipelines, we use keyword arguments for everything.
50 large_api_key: str,
51 large_base_url: str,
52 large_mode: str,
53 output_dir: str,
54 default_prompts: str,
55 prompts: str,
56 completion_mode: bool,
57 use_stop: bool, # Not all APIs (take OpenAI for instance) support more than 4 stop tokens. use_stop is passed to pipeline executions and if it is False, then the number of stop tokens is truncated to 4.
58 example_heading,
59 key3,
60 do_meta_datagen: bool = False,
61 meta_datagen_keys: list[str] = [],
62 meta_datagen_extras: list[str] = [],
63 read_files_manually: bool = True,
64 text_chunks_passed_in: list[str] = [],
65 cost_per_million_small_input: float = 0.0,
66 cost_per_million_small_output: float = 0.0,
67 cost_per_million_large_input: float = 0.0,
68 cost_per_million_large_output: float = 0.0,
69 chunking_output_dir=None, # Augmentoolkit caches the results of file reading and chunking. If you want your reading/chunking cache dir to be different than your output dir, you can add an option for that.
70 task_id=None, # task_id is a special argument used to help set the progress throughout the pipeline's execution. It's optional for you to add, but if you want a progress bar to show up properly when your pipeline is used with the interface, then you have to add this as well as a few set_progress calls.
71 seed=11037,
72 **kwargs, # All pipelines MUST have **kwargs to ensure forward compatibility with new common arguments.
73):
74 # Check if kwargs is not empty and print all keys and values if present
75 if (
76 kwargs
77 ): # NOTE standard anti-footgun measure reminding people of excess args they are passing.
78 print("Additional arguments provided:")
79 for key, value in kwargs.items():
80 print(f" {key}: {value}")
81
82 print("Demonstrating example heading and flattening vs no_flatten")
83 print("Example heading:")
84 print(example_heading)
85 print("key3")
86 print(key3)
87
88 # NOTE the make_relative_to_self calls on the prompts is key so that the prompt directories are not looked for relative to the root of the whole project.
89 # Prompt files are by convention co-located with their pipeline in the same folder. So we need to adjust the paths to make sure they reflect this.
90 default_prompts = make_relative_to_self(default_prompts)
91 prompts = make_relative_to_self(prompts)
92
93 # These two things are used for pipeline cost estimation. This is their initialization.
94 small_token_counter = {
95 "input_tokens": 0,
96 "input_cost": 0.0,

Callers

nothing calls this directly

Calls 14

execute_pipeline · Method · 0.95
make_relative_to_self · Function · 0.90
create_log_observer · Function · 0.90
set_progress · Function · 0.90
read_and_chunk_text · Function · 0.90
chunk_text_list · Function · 0.90
subset_text_list · Function · 0.90
hash_input_list · Function · 0.90
count_total_tokens · Function · 0.90

Tested by

no test coverage detected