In the synthesizer file, each line includes a set of input generators for a task. The goal of this script is to use these generators to sample inputs for each task. The generated inputs are expected to be valid.
(input: str, output: str)
| 179 | |
| 180 | |
| 181 | def main(input: str, output: str): |
| 182 | """In the synthesizer file, each line includes a set of input generators for a task. |
| 183 | The goal of this script is to use these generators to sample inputs for each task. |
| 184 | The generated inputs are expected to be valid. |
| 185 | """ |
| 186 | assert output.endswith(".jsonl"), "output must be a .jsonl file" |
| 187 | |
| 188 | id2task = {} |
| 189 | for task_id, item in get_human_eval_plus().items(): |
| 190 | id2task[task_id] = item |
| 191 | |
| 192 | for task_id, item in get_mbpp_plus().items(): |
| 193 | id2task[task_id] = item |
| 194 | |
| 195 | # loading the synthesizers |
| 196 | with open(input, "r") as f: |
| 197 | synthesizers = [json.loads(l) for l in f] |
| 198 | |
| 199 | n_total = 0 |
| 200 | n_parsed = 0 |
| 201 | n_dedup = 0 |
| 202 | |
| 203 | for item in synthesizers: |
| 204 | item["synthesizers"] = [post_process(s) for s in item["synthesizers"]] |
| 205 | n_total += len(item["synthesizers"]) |
| 206 | item["synthesizers"] = [s for s in item["synthesizers"] if s is not None] |
| 207 | n_parsed += len(item["synthesizers"]) |
| 208 | |
| 209 | dedup_set = set() |
| 210 | for s in item["synthesizers"]: |
| 211 | dedup_set.add( |
| 212 | "\n".join( |
| 213 | [l for l in s.splitlines() if l.strip() and not l.startswith("#")] |
| 214 | ) |
| 215 | ) |
| 216 | item["synthesizers"] = list(dedup_set) |
| 217 | n_dedup += len(item["synthesizers"]) |
| 218 | |
| 219 | print( |
| 220 | colored( |
| 221 | f"#Total {n_total} with {n_parsed} parsed => {100 * (1 - n_parsed / n_total) :.1f}% syntax err", |
| 222 | "green", |
| 223 | ) |
| 224 | ) |
| 225 | |
| 226 | print( |
| 227 | colored( |
| 228 | f"#Parsed {n_parsed} with {n_dedup} dedup => {100 * (1 - n_dedup / n_parsed) :.1f}% duplicate", |
| 229 | "green", |
| 230 | ) |
| 231 | ) |
| 232 | |
| 233 | # resume mode check finished tasks |
| 234 | finished_tasks = set() |
| 235 | if os.path.isfile(output): |
| 236 | with open(output, "r") as f: |
| 237 | for l in f: |
| 238 | item = json.loads(l) |
nothing calls this directly
no test coverage detected