()
| 430 | |
| 431 | |
| 432 | def parse_args(): |
| 433 | parser = argparse.ArgumentParser(description='Benchmark guided decoding (response_format) overhead vs. baseline', |
| 434 | formatter_class=DefaultsAndTypesHelpFormatter) |
| 435 | parser.add_argument('dataset', type=str, help='Path to the ShareGPT dataset') |
| 436 | parser.add_argument('model_path', |
| 437 | type=str, |
| 438 | help='Path of the model in localhost or repo_id on huggingface.co') |
| 439 | parser.add_argument('-c', '--concurrency', type=int, help='Max batch size', default=256) |
| 440 | parser.add_argument('-n', '--num-prompts', type=int, help='Number of prompts', default=1000) |
| 441 | parser.add_argument('--csv', type=str, help='Save results to CSV', default='') |
| 442 | parser.add_argument('--seed', type=int, default=0) |
| 443 | parser.add_argument('--stream-output', action='store_true', help='Use streaming output') |
| 444 | parser.add_argument('--dataset-name', type=str, default='sharegpt', choices=['sharegpt', 'random']) |
| 445 | parser.add_argument('--sharegpt-output-len', type=int, default=None) |
| 446 | parser.add_argument('--random-input-len', type=int, default=None) |
| 447 | parser.add_argument('--random-output-len', type=int, default=None) |
| 448 | parser.add_argument('--random-range-ratio', type=float, default=0.0) |
| 449 | |
| 450 | # guided decoding |
| 451 | guided = parser.add_argument_group('Guided decoding arguments') |
| 452 | guided.add_argument( |
| 453 | '--response-format', |
| 454 | type=str, |
| 455 | required=True, |
| 456 | choices=['json_schema', 'json_object', 'regex_schema'], |
| 457 | help='Type of response_format (required).') |
| 458 | guided.add_argument( |
| 459 | '--json-schema-path', |
| 460 | type=str, |
| 461 | default=None, |
| 462 | help='Path to a JSON schema file. Uses a built-in default if omitted.') |
| 463 | guided.add_argument( |
| 464 | '--regex-schema', |
| 465 | type=str, |
| 466 | default=None, |
| 467 | help='Regex pattern. Uses a built-in default if omitted.') |
| 468 | guided.add_argument( |
| 469 | '--ignore-eos', |
| 470 | action='store_true', |
| 471 | help='Force max_new_tokens output (ignore EOS). ' |
| 472 | 'Isolates pure per-step grammar bitmask overhead. ' |
| 473 | 'Without this flag, both runs stop naturally (production mode).') |
| 474 | guided.add_argument( |
| 475 | '--no-baseline', |
| 476 | action='store_true', |
| 477 | help='Skip the baseline run; only benchmark guided decoding.') |
| 478 | |
| 479 | # engine / sampling |
| 480 | ArgumentHelper.top_p(parser) |
| 481 | ArgumentHelper.temperature(parser) |
| 482 | ArgumentHelper.top_k(parser) |
| 483 | ArgumentHelper.log_level(parser) |
| 484 | ArgumentHelper.backend(parser) |
| 485 | |
| 486 | pt_group = parser.add_argument_group('PyTorch engine arguments') |
| 487 | ArgumentHelper.eager_mode(pt_group) |
| 488 | ArgumentHelper.enable_return_routed_experts(pt_group) |
| 489 | tp_act = ArgumentHelper.tp(pt_group) |
no test coverage detected