(
samples: str,
eofs: List[str] = [],
inplace: bool = False,
rm_prefix_lines: str = None,
debug_task: str = None,
mbpp_version: str = "default",
)
| 117 | |
| 118 | |
| 119 | def script( |
| 120 | samples: str, |
| 121 | eofs: List[str] = [], |
| 122 | inplace: bool = False, |
| 123 | rm_prefix_lines: str = None, |
| 124 | debug_task: str = None, |
| 125 | mbpp_version: str = "default", |
| 126 | ): |
| 127 | # task_id -> entry_point |
| 128 | entry_point = {} |
| 129 | dataset = {**get_human_eval_plus(), **get_mbpp_plus(version=mbpp_version)} |
| 130 | |
| 131 | for task_id, problem in dataset.items(): |
| 132 | entry_point[task_id] = problem["entry_point"] |
| 133 | |
| 134 | # make a new folder with "-sanitized" suffix |
| 135 | is_folder = os.path.isdir(samples) |
| 136 | target_path = pathlib.Path(samples) |
| 137 | if not inplace: |
| 138 | if is_folder: |
| 139 | new_name = target_path.name + "-sanitized" |
| 140 | else: |
| 141 | new_name = target_path.name.replace(".jsonl", "-sanitized.jsonl") |
| 142 | target_path = target_path.parent / new_name |
| 143 | target_path = str(target_path) |
| 144 | |
| 145 | nsan = 0 |
| 146 | ntotal = 0 |
| 147 | |
| 148 | new_solutions = [] |
| 149 | |
| 150 | for solution in tqdm(load_solutions(samples)): |
| 151 | task_id = solution["task_id"] |
| 152 | dbg_identifier = solution["_identifier"] |
| 153 | if debug_task is not None and task_id != debug_task: |
| 154 | continue |
| 155 | |
| 156 | ntotal += 1 |
| 157 | if "solution" in solution: |
| 158 | old_code = solution["solution"] |
| 159 | else: |
| 160 | assert "completion" in solution |
| 161 | old_code = dataset[task_id]["prompt"] + "\n" + solution["completion"] |
| 162 | |
| 163 | old_code = old_code.strip() |
| 164 | |
| 165 | new_code = sanitize( |
| 166 | old_code=old_code, |
| 167 | entry_point=entry_point[task_id], |
| 168 | rm_prefix_lines=rm_prefix_lines, |
| 169 | eofs=eofs, |
| 170 | ).strip() |
| 171 | |
| 172 | # if changed, print the message |
| 173 | if new_code != old_code: |
| 174 | msg = "Sanitized: " + dbg_identifier |
| 175 | if is_folder: |
| 176 | msg += " -> " + dbg_identifier.replace(samples, target_path) |
Nothing calls `script` directly.
No test coverage detected.