(responses: list[dict[int, SimpleNamespace]], block_size: int)
| 118 | |
| 119 | |
def _print_decode_summary(responses: list[dict[int, SimpleNamespace]], block_size: int) -> None:
    """Print throughput, speedup and acceptance-length statistics to stdout.

    Args:
        responses: One dict per evaluated sample. Key ``1`` is read as the
            baseline run and key ``block_size`` as the DFlash run. Both
            entries must expose ``time_per_output_token``; the
            ``block_size`` entry must also expose ``acceptance_lengths``
            (an iterable of numbers).
        block_size: Key of the DFlash entry in each response dict, and the
            largest acceptance length given its own histogram bin.

    The histogram has ``block_size + 1`` bins (lengths ``0..block_size``),
    each printed as the percentage of all recorded acceptance lengths.
    """
    if not responses:
        # With nothing to average, every statistic below would be NaN and the
        # histogram normalisation would divide by zero.
        print("No responses to summarize.")
        return

    baseline_tpot = np.mean([r[1].time_per_output_token for r in responses])
    dflash_tpot = np.mean([r[block_size].time_per_output_token for r in responses])
    # Throughput is the reciprocal of the mean time per output token.
    print(f"Baseline throughput: {1 / baseline_tpot:.2f} tok/s")
    print(f"DFlash throughput: {1 / dflash_tpot:.2f} tok/s")
    print(f"Decoding speedup: {baseline_tpot / dflash_tpot:.2f}")

    # Mean of per-response means: every response carries equal weight no
    # matter how many decoding steps it recorded.
    mean_accept = np.mean([np.mean(r[block_size].acceptance_lengths) for r in responses])
    print(f"Average Acceptance length: {mean_accept:.2f}")

    # The histogram, in contrast, pools every step across all responses.
    acceptance_lengths = list(chain.from_iterable(r[block_size].acceptance_lengths for r in responses))
    total = len(acceptance_lengths)
    # Guard the normalisation: with no recorded steps every bin is reported
    # as 0 instead of raising ZeroDivisionError. ``list.count`` is kept so
    # matching stays ``==``-based regardless of the element type.
    histogram = [
        acceptance_lengths.count(b) / total if total else 0.0
        for b in range(block_size + 1)
    ]
    print(f"Acceptance length histogram: {[f'{x * 100:.1f}%' for x in histogram]}")
| 133 | |
| 134 | |
| 135 | def _env_int(name: str, default: int) -> int: |
no outgoing calls
no test coverage detected